comfy-qa 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -6
- package/package.json +16 -2
- package/src/agent/browser-agent.ts +298 -0
- package/src/agent/demo-editor.ts +450 -0
- package/src/agent/demo-research.ts +725 -0
- package/src/agent/orchestrator.ts +268 -0
- package/src/agent/qa-research.ts +813 -0
- package/src/agent/research.ts +221 -0
- package/src/browser/hud.ts +136 -0
- package/src/browser/recorder.ts +131 -0
- package/src/cli.ts +69 -28
- package/src/commands/full.ts +40 -0
- package/src/commands/issue.ts +23 -0
- package/src/commands/pr.ts +23 -0
- package/src/commands/setup.ts +46 -0
- package/src/recorder/narration.ts +176 -0
- package/src/recorder/post-mix.ts +81 -0
- package/src/report/e2e-test.ts +132 -0
- package/src/report/generate.ts +271 -0
- package/src/utils/comfyui.ts +349 -0
- package/src/utils/github.ts +87 -0
- package/src/utils/parse-url.ts +11 -0
- package/src/utils/qa-skill.ts +376 -0
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Demo Research Agent ā explores a website guided by a feature checklist,
|
|
3
|
+
* narrates discoveries via TTS, and logs every action with timestamps.
|
|
4
|
+
*
|
|
5
|
+
* Output:
|
|
6
|
+
* - raw_video.webm (full Playwright recording)
|
|
7
|
+
* - actions.jsonl (timestamped action log with feature markers)
|
|
8
|
+
* - screenshots/ (per-feature screenshots)
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* bun src/agent/demo-research.ts demo/checklists/registry-web.yaml
|
|
12
|
+
*/
|
|
13
|
+
import { chromium, type Page, type BrowserContext } from "playwright";
|
|
14
|
+
import * as fs from "node:fs";
|
|
15
|
+
import * as path from "node:path";
|
|
16
|
+
import * as yaml from "yaml";
|
|
17
|
+
import { applyHud } from "../../lib/demowright/dist/setup.mjs";
|
|
18
|
+
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Types
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
/** One checklist entry: a single feature the agent must demonstrate on camera. */
interface FeatureItem {
  /** Stable identifier; used in the action log and as the screenshot filename. */
  id: string;
  /** Human-readable description of the feature. */
  description: string;
  /** Optional suggested action; a "navigate to <path>" value is auto-executed before the LLM is consulted. */
  action?: string;
  /** Hint for what the spoken narration should convey. */
  narration_hint: string;
  /** Optional criteria the LLM uses to judge the demonstration successful. */
  success_hint?: string;
}
|
|
30
|
+
|
|
31
|
+
/** A named group of related features with a shared demo goal. */
interface Chapter {
  /** Display name; echoed in logs as "Chapter: <name>". */
  name: string;
  /** What this chapter is meant to show. */
  goal: string;
  /** Features demonstrated in order. */
  features: FeatureItem[];
}
|
|
36
|
+
|
|
37
|
+
/**
 * Parsed YAML demo checklist: the product under demo, where to find it,
 * and the ordered chapters/features to walk through.
 */
interface Checklist {
  /** Product name; slugified to build the output directory path. */
  product: string;
  /** Default URL to open. */
  url: string;
  /** Optional env-var name whose value overrides `url` (falls back to `url` when unset). */
  staging_url_env?: string;
  /** Persona the narrator adopts; interpolated into the LLM system prompt. */
  persona: string;
  /** Free-form style directives. NOTE(review): not read anywhere in this file — confirm it is consumed elsewhere. */
  narration_style: Record<string, string>;
  /** Ordered chapters to demonstrate. */
  chapters: Chapter[];
  /** Closing narration spoken after all chapters complete. */
  conclusion: { narration: string };
}
|
|
46
|
+
|
|
47
|
+
/**
 * One line of actions.jsonl: a timestamped event emitted during the run.
 * `offsetMs` aligns events with the recorded video's timeline.
 */
interface ActionLogEntry {
  /** Absolute wall-clock time (Date.now()) when the event was logged. */
  ts: number;
  /** Milliseconds elapsed since the run started. */
  offsetMs: number;
  /** Event kind; drives both the JSONL record and the console rendering. */
  type: "narrate" | "action" | "feature_start" | "feature_end" | "chapter_start" | "chapter_end" | "screenshot" | "error";
  feature?: string;    // feature id, when the event belongs to a feature
  chapter?: string;    // chapter name, for chapter_* events
  action?: string;     // human-readable action summary
  selector?: string;   // CSS selector targeted by the action, if any
  text?: string;       // narration text or descriptive text
  success?: boolean;   // outcome for action / feature_end events
  error?: string;      // error message for error events or failed actions
  screenshot?: string; // path to the saved screenshot file
}
|
|
60
|
+
|
|
61
|
+
interface AgentDecision {
|
|
62
|
+
narration: string;
|
|
63
|
+
actions: Array<{
|
|
64
|
+
type: "click" | "type" | "scroll" | "hover" | "wait" | "key";
|
|
65
|
+
selector?: string;
|
|
66
|
+
text?: string;
|
|
67
|
+
x?: number;
|
|
68
|
+
y?: number;
|
|
69
|
+
key?: string;
|
|
70
|
+
ms?: number;
|
|
71
|
+
}>;
|
|
72
|
+
done: boolean;
|
|
73
|
+
observation: string;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
// ---------------------------------------------------------------------------
|
|
77
|
+
// LLM integration (Anthropic direct or OpenRouter)
|
|
78
|
+
// ---------------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
/**
 * Route an LLM request to whichever backend is configured, in priority
 * order: OpenRouter (OPENROUTER_API_KEY), then Anthropic (ANTHROPIC_API_KEY),
 * then the local `claude` CLI as a last resort.
 *
 * @param systemPrompt system instructions for the model
 * @param messages Anthropic-style messages (string or content-block arrays)
 * @returns the model's raw text response
 */
async function callLLM(
  systemPrompt: string,
  messages: Array<{ role: "user" | "assistant"; content: any }>,
): Promise<string> {
  const openRouterKey = process.env.OPENROUTER_API_KEY;
  const anthropicKey = process.env.ANTHROPIC_API_KEY;

  if (openRouterKey) {
    return callOpenRouter(systemPrompt, messages, openRouterKey);
  }
  if (anthropicKey) {
    return callAnthropic(systemPrompt, messages);
  }

  // Fallback: claude CLI
  // Non-string content (e.g. image blocks) is flattened to JSON text here,
  // so the CLI path loses vision input; only the hosted APIs see screenshots.
  const prompt = `${systemPrompt}\n\n${messages.map((m) => (typeof m.content === "string" ? m.content : JSON.stringify(m.content))).join("\n")}`;
  const proc = Bun.spawn(["claude", "--print", "--model", "claude-sonnet-4-6"], {
    stdin: new TextEncoder().encode(prompt),
    stdout: "pipe",
    stderr: "pipe",
  });
  // Drain stdout fully, then wait for exit. The exit code and stderr are not
  // checked — a failing CLI silently yields an empty or partial string.
  const output = await new Response(proc.stdout).text();
  await proc.exited;
  return output;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Call the Anthropic Messages API directly. The SDK is imported lazily so
 * the dependency is only loaded when this backend is selected; credentials
 * come from ANTHROPIC_API_KEY via the SDK's default resolution.
 */
async function callAnthropic(
  systemPrompt: string,
  messages: Array<{ role: "user" | "assistant"; content: any }>,
): Promise<string> {
  const Anthropic = (await import("@anthropic-ai/sdk")).default;
  const client = new Anthropic();
  const response = await client.messages.create({
    model: "claude-sonnet-4-6",
    max_tokens: 2048,
    system: systemPrompt,
    messages,
  });
  // Only the first content block is inspected; a non-text first block yields "".
  return response.content[0].type === "text" ? response.content[0].text : "";
}
|
|
120
|
+
|
|
121
|
+
async function callOpenRouter(
|
|
122
|
+
systemPrompt: string,
|
|
123
|
+
messages: Array<{ role: "user" | "assistant"; content: any }>,
|
|
124
|
+
apiKey: string,
|
|
125
|
+
): Promise<string> {
|
|
126
|
+
// Convert Anthropic-style messages to OpenAI-style for OpenRouter
|
|
127
|
+
const openaiMessages: any[] = [{ role: "system", content: systemPrompt }];
|
|
128
|
+
|
|
129
|
+
for (const msg of messages) {
|
|
130
|
+
if (typeof msg.content === "string") {
|
|
131
|
+
openaiMessages.push({ role: msg.role, content: msg.content });
|
|
132
|
+
} else if (Array.isArray(msg.content)) {
|
|
133
|
+
// Convert Anthropic content blocks to OpenAI format
|
|
134
|
+
const parts: any[] = [];
|
|
135
|
+
for (const block of msg.content) {
|
|
136
|
+
if (block.type === "text") {
|
|
137
|
+
parts.push({ type: "text", text: block.text });
|
|
138
|
+
} else if (block.type === "image") {
|
|
139
|
+
parts.push({
|
|
140
|
+
type: "image_url",
|
|
141
|
+
image_url: {
|
|
142
|
+
url: `data:${block.source.media_type};base64,${block.source.data}`,
|
|
143
|
+
},
|
|
144
|
+
});
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
openaiMessages.push({ role: msg.role, content: parts });
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
|
|
152
|
+
method: "POST",
|
|
153
|
+
headers: {
|
|
154
|
+
Authorization: `Bearer ${apiKey}`,
|
|
155
|
+
"Content-Type": "application/json",
|
|
156
|
+
"HTTP-Referer": "https://github.com/comfy-org/comfy-qa",
|
|
157
|
+
"X-Title": "Comfy QA Demo Research",
|
|
158
|
+
},
|
|
159
|
+
body: JSON.stringify({
|
|
160
|
+
model: "anthropic/claude-sonnet-4",
|
|
161
|
+
max_tokens: 2048,
|
|
162
|
+
messages: openaiMessages,
|
|
163
|
+
}),
|
|
164
|
+
});
|
|
165
|
+
|
|
166
|
+
if (!res.ok) {
|
|
167
|
+
const errText = await res.text();
|
|
168
|
+
throw new Error(`OpenRouter API error ${res.status}: ${errText.slice(0, 300)}`);
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
const json = (await res.json()) as any;
|
|
172
|
+
return json.choices?.[0]?.message?.content ?? "";
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
/**
 * Ask the LLM what to do next for one feature: it returns a narration line,
 * up to two browser actions, and a done flag. The page screenshot is sent as
 * vision input alongside a truncated accessibility tree and recent history.
 * Unparseable model output degrades to a no-op decision (done=false) rather
 * than throwing, so the retry loop in runDemoResearch can continue.
 */
async function askAgent(
  checklist: Checklist,
  chapter: Chapter,
  feature: FeatureItem,
  pageState: { screenshot: string; a11yTree: string; url: string; title: string },
  history: string[],
  attempt: number,
): Promise<AgentDecision> {
  // NOTE(review): the "ā" glyphs in the prompt text below look mojibake-mangled
  // in this published copy (likely em dashes originally) — confirm against repo source.
  const systemPrompt = `You are a demo presenter exploring a website to create a narrated video demo.

Product: ${checklist.product}
Persona: ${checklist.persona}

You are currently demonstrating a feature. Your job is to:
1. Write a short narration (1-2 sentences, first person, conversational) explaining what you're doing and WHY it matters to the user.
2. Decide what Playwright actions to take to demonstrate the feature.
3. Report whether you've successfully demonstrated the feature.

CRITICAL RULES FOR "done":
- Set "done": true as soon as you have NARRATED the feature and it is VISIBLE on the page. You do NOT need to click every element.
- If the relevant content is already visible in the screenshot or accessibility tree, narrate it and set "done": true immediately.
- Scrolling to see content and narrating it IS a successful demonstration. You don't need to interact further.
- If you've already narrated the feature on a previous attempt, set "done": true.
- Do NOT keep trying different selectors if the content is already visible. Just narrate and finish.
- Maximum 2 actions per response. Prefer scroll + wait over complex click sequences.

IMPORTANT: You are in a headless browser ā there is NO URL bar. To navigate to a different page, use {"type": "navigate", "text": "https://full-url-here"}. Do NOT try to use keyboard shortcuts like Ctrl+L or type URLs into input fields.

Respond with ONLY a JSON object (no markdown):
{
  "narration": "What to say (first person, explain user benefit)",
  "actions": [
    {"type": "click", "selector": "CSS selector"},
    {"type": "type", "selector": "CSS selector", "text": "text to type"},
    {"type": "scroll", "y": 300},
    {"type": "hover", "selector": "CSS selector"},
    {"type": "wait", "ms": 1000},
    {"type": "key", "key": "Enter"},
    {"type": "navigate", "text": "https://example.com/page"}
  ],
  "done": true/false,
  "observation": "What I see on the page"
}`;

  const userContent: any[] = [
    {
      type: "text",
      text: `## Current Task
Chapter: ${chapter.name} ā ${chapter.goal}
Feature: ${feature.id} ā ${feature.description}
${feature.action ? `Suggested action: ${feature.action}` : ""}
Narration hint: ${feature.narration_hint}
${feature.success_hint ? `Success criteria: ${feature.success_hint}` : ""}
Attempt: ${attempt}/3
${attempt > 1 ? "IMPORTANT: If you can see the relevant content on the page, just narrate it and set done=true. Do not keep retrying." : ""}

## Page State
URL: ${pageState.url}
Title: ${pageState.title}

## Accessibility Tree (first 2000 chars)
${pageState.a11yTree.slice(0, 2000)}

## Recent History
${history.slice(-8).join("\n") || "(start)"}`,
    },
  ];

  // Include screenshot as vision input
  if (pageState.screenshot) {
    userContent.push({
      type: "image",
      source: {
        type: "base64",
        media_type: "image/png",
        data: pageState.screenshot,
      },
    });
  }

  const text = await callLLM(systemPrompt, [{ role: "user", content: userContent }]);

  // Parse JSON response: grab the outermost brace span (greedy), since the
  // model may wrap the JSON in extra prose despite instructions.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    return {
      narration: "",
      actions: [],
      done: false,
      observation: `Could not parse agent response: ${text.slice(0, 200)}`,
    };
  }
  try {
    return JSON.parse(jsonMatch[0]) as AgentDecision;
  } catch {
    return {
      narration: "",
      actions: [],
      done: false,
      observation: "JSON parse failed",
    };
  }
}
|
|
278
|
+
|
|
279
|
+
// ---------------------------------------------------------------------------
|
|
280
|
+
// Page state capture
|
|
281
|
+
// ---------------------------------------------------------------------------
|
|
282
|
+
|
|
283
|
+
async function getA11ySnapshot(page: Page): Promise<string> {
|
|
284
|
+
try {
|
|
285
|
+
const tree = await page.accessibility.snapshot();
|
|
286
|
+
return tree ? formatA11y(tree, 0) : "(empty)";
|
|
287
|
+
} catch {
|
|
288
|
+
return "(unavailable)";
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
function formatA11y(node: any, depth: number): string {
|
|
293
|
+
const indent = " ".repeat(depth);
|
|
294
|
+
let line = `${indent}[${node.role}]`;
|
|
295
|
+
if (node.name) line += ` "${node.name}"`;
|
|
296
|
+
if (node.value) line += ` val="${node.value}"`;
|
|
297
|
+
let result = line + "\n";
|
|
298
|
+
if (node.children) {
|
|
299
|
+
for (const child of node.children.slice(0, 40)) {
|
|
300
|
+
result += formatA11y(child, depth + 1);
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
return result;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
async function captureState(page: Page) {
|
|
307
|
+
const buf = await page.screenshot({ type: "png" });
|
|
308
|
+
return {
|
|
309
|
+
screenshot: buf.toString("base64"),
|
|
310
|
+
a11yTree: await getA11ySnapshot(page),
|
|
311
|
+
url: page.url(),
|
|
312
|
+
title: await page.title(),
|
|
313
|
+
};
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// ---------------------------------------------------------------------------
|
|
317
|
+
// Action execution
|
|
318
|
+
// ---------------------------------------------------------------------------
|
|
319
|
+
|
|
320
|
+
/**
 * Execute one agent-decided browser action against the page.
 * Never throws: every failure (bad selector, timeout, missing argument,
 * unknown action type) is reported as { success: false, result } so the
 * research loop can log it and keep going.
 */
async function executeAction(
  page: Page,
  action: AgentDecision["actions"][0],
): Promise<{ success: boolean; result: string }> {
  try {
    switch (action.type) {
      case "click":
        // Prefer a selector click; fall back to raw coordinates.
        if (action.selector) {
          await page.click(action.selector, { timeout: 5000 });
          return { success: true, result: `Clicked: ${action.selector}` };
        }
        if (action.x !== undefined && action.y !== undefined) {
          await page.mouse.click(action.x, action.y);
          return { success: true, result: `Clicked (${action.x},${action.y})` };
        }
        return { success: false, result: "Click: no target" };

      case "type":
        // With a selector: replace the field's value via fill(). Without one:
        // type at the current focus with a human-ish 80ms inter-key delay.
        if (action.selector && action.text) {
          await page.fill(action.selector, action.text, { timeout: 5000 });
          return { success: true, result: `Typed "${action.text}" ā ${action.selector}` };
        }
        if (action.text) {
          await page.keyboard.type(action.text, { delay: 80 });
          return { success: true, result: `Typed: "${action.text}"` };
        }
        return { success: false, result: "Type: no text" };

      case "scroll":
        // `y` is the wheel delta in pixels; defaults to 300.
        await page.mouse.wheel(0, action.y ?? 300);
        return { success: true, result: `Scrolled ${action.y ?? 300}px` };

      case "hover":
        if (action.selector) {
          await page.hover(action.selector, { timeout: 5000 });
          return { success: true, result: `Hovered: ${action.selector}` };
        }
        return { success: false, result: "Hover: no target" };

      case "wait":
        await page.waitForTimeout(action.ms ?? 1000);
        return { success: true, result: `Waited ${action.ms ?? 1000}ms` };

      case "key":
        if (action.key) {
          await page.keyboard.press(action.key);
          return { success: true, result: `Key: ${action.key}` };
        }
        return { success: false, result: "Key: none" };

      case "navigate":
        // URL travels in `text`; brief settle pause after DOMContentLoaded.
        if (action.text) {
          await page.goto(action.text, { waitUntil: "domcontentloaded", timeout: 15000 });
          await page.waitForTimeout(1500);
          return { success: true, result: `Navigated: ${action.text}` };
        }
        return { success: false, result: "Navigate: no URL" };

      default:
        return { success: false, result: `Unknown: ${action.type}` };
    }
  } catch (err: any) {
    return { success: false, result: `Failed: ${err.message?.slice(0, 150)}` };
  }
}
|
|
385
|
+
|
|
386
|
+
// ---------------------------------------------------------------------------
|
|
387
|
+
// TTS (reuse Gemini TTS from fixture)
|
|
388
|
+
// ---------------------------------------------------------------------------
|
|
389
|
+
|
|
390
|
+
/**
 * Synthesize speech for `text` with Gemini TTS (voice "Kore").
 * Returns a complete WAV buffer, or null when GEMINI_API_KEY is unset, the
 * text is blank, or the request fails in any way — narration is best-effort
 * and must never abort the research run.
 */
async function generateTTS(text: string): Promise<Buffer | null> {
  const apiKey = process.env.GEMINI_API_KEY;
  if (!apiKey || !text.trim()) return null;

  const url = `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent?key=${apiKey}`;
  const body = {
    contents: [{ parts: [{ text }] }],
    generationConfig: {
      responseModalities: ["AUDIO"],
      speechConfig: {
        voiceConfig: { prebuiltVoiceConfig: { voiceName: "Kore" } },
      },
    },
  };

  try {
    const res = await fetch(url, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(body),
    });
    if (!res.ok) return null;
    const json = (await res.json()) as any;
    const b64 = json?.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
    if (!b64) return null;
    // The API returns raw PCM; wrap it in a WAV header.
    // NOTE(review): 24 kHz mono is assumed here — confirm against the
    // response's declared audio format.
    const pcm = Buffer.from(b64, "base64");
    return pcmToWav(pcm, 24000, 1);
  } catch {
    return null;
  }
}
|
|
421
|
+
|
|
422
|
+
function pcmToWav(pcm: Buffer, sampleRate: number, channels: number): Buffer {
|
|
423
|
+
const header = Buffer.alloc(44);
|
|
424
|
+
const dataSize = pcm.length;
|
|
425
|
+
header.write("RIFF", 0);
|
|
426
|
+
header.writeUInt32LE(36 + dataSize, 4);
|
|
427
|
+
header.write("WAVE", 8);
|
|
428
|
+
header.write("fmt ", 12);
|
|
429
|
+
header.writeUInt32LE(16, 16);
|
|
430
|
+
header.writeUInt16LE(1, 20);
|
|
431
|
+
header.writeUInt16LE(channels, 22);
|
|
432
|
+
header.writeUInt32LE(sampleRate, 24);
|
|
433
|
+
header.writeUInt32LE(sampleRate * channels * 2, 28);
|
|
434
|
+
header.writeUInt16LE(channels * 2, 32);
|
|
435
|
+
header.writeUInt16LE(16, 34);
|
|
436
|
+
header.write("data", 36);
|
|
437
|
+
header.writeUInt32LE(dataSize, 40);
|
|
438
|
+
return Buffer.concat([header, pcm]);
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
// ---------------------------------------------------------------------------
|
|
442
|
+
// Play audio in browser (inject base64 WAV)
|
|
443
|
+
// ---------------------------------------------------------------------------
|
|
444
|
+
|
|
445
|
+
async function playAudioInBrowser(page: Page, wavBuf: Buffer): Promise<number> {
|
|
446
|
+
const b64 = wavBuf.toString("base64");
|
|
447
|
+
const durationMs = await page.evaluate(async (data: string) => {
|
|
448
|
+
const binary = atob(data);
|
|
449
|
+
const bytes = new Uint8Array(binary.length);
|
|
450
|
+
for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
|
|
451
|
+
const blob = new Blob([bytes], { type: "audio/wav" });
|
|
452
|
+
const url = URL.createObjectURL(blob);
|
|
453
|
+
const audio = new Audio(url);
|
|
454
|
+
await audio.play();
|
|
455
|
+
const dur = audio.duration * 1000;
|
|
456
|
+
return isFinite(dur) ? dur : 3000;
|
|
457
|
+
}, b64);
|
|
458
|
+
return durationMs;
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
// ---------------------------------------------------------------------------
|
|
462
|
+
// Main research loop
|
|
463
|
+
// ---------------------------------------------------------------------------
|
|
464
|
+
|
|
465
|
+
/**
 * Run a full demo-research session: load the YAML checklist, open the target
 * site in a recorded headless browser, walk every chapter/feature (letting
 * the LLM drive the page and narrating via TTS), then write raw_video.webm,
 * narration.wav, per-feature screenshots, and actions.jsonl to
 * `.comfy-qa/.research/<product-slug>/`.
 */
export async function runDemoResearch(checklistPath: string) {
  // Load checklist
  const raw = fs.readFileSync(checklistPath, "utf-8");
  const checklist = yaml.parse(raw) as Checklist;

  // Resolve URL: a named staging env var (when set) overrides the default.
  const baseUrl = checklist.staging_url_env
    ? process.env[checklist.staging_url_env] ?? checklist.url
    : checklist.url;

  // Setup output dir
  const productSlug = checklist.product.toLowerCase().replace(/\s+/g, "-");
  const outputDir = path.resolve(`.comfy-qa/.research/${productSlug}`);
  fs.mkdirSync(outputDir, { recursive: true });
  fs.mkdirSync(path.join(outputDir, "screenshots"), { recursive: true });

  // Action log: one JSON object per line, timestamps relative to startMs so
  // entries can be aligned with the video.
  const logPath = path.join(outputDir, "actions.jsonl");
  const logStream = fs.createWriteStream(logPath, { flags: "w" });
  const startMs = Date.now();

  // Append an entry to actions.jsonl and echo a human-readable console line.
  // NOTE(review): the status glyphs below appear mojibake-mangled in this
  // published copy — confirm the intended emoji against the repo source.
  function log(entry: Omit<ActionLogEntry, "ts" | "offsetMs">) {
    const now = Date.now();
    const full: ActionLogEntry = { ts: now, offsetMs: now - startMs, ...entry };
    logStream.write(JSON.stringify(full) + "\n");
    const prefix = `[${((now - startMs) / 1000).toFixed(1)}s]`;
    if (entry.type === "narrate") console.log(` ${prefix} š¤ ${entry.text}`);
    else if (entry.type === "action") console.log(` ${prefix} ā¶ ${entry.action} ${entry.success ? "ā" : "ā"}`);
    else if (entry.type === "feature_start") console.log(` ${prefix} š ${entry.feature}: ${entry.text}`);
    else if (entry.type === "feature_end") console.log(` ${prefix} ${entry.success ? "ā" : "ā"} ${entry.feature}`);
    else if (entry.type === "chapter_start") console.log(`\n ${prefix} š Chapter: ${entry.chapter}`);
    else if (entry.type === "error") console.log(` ${prefix} ā ${entry.error}`);
  }

  // Launch browser
  console.log(`\nš¬ Research Agent: ${checklist.product}`);
  console.log(` URL: ${baseUrl}`);
  console.log(` Output: ${outputDir}\n`);

  const browser = await chromium.launch({ headless: true });
  const context = await browser.newContext({
    viewport: { width: 1280, height: 720 },
    recordVideo: { dir: outputDir, size: { width: 1280, height: 720 } },
  });

  // Apply HUD overlay (cursor, keystrokes)
  await applyHud(context, {
    cursor: true,
    keyboard: true,
    cursorStyle: "default",
    actionDelay: 200,
  });

  const page = await context.newPage();
  const history: string[] = [];

  // Navigate to site
  await page.goto(baseUrl, { waitUntil: "domcontentloaded", timeout: 30000 });
  await page.waitForTimeout(2000);

  // Audio segments for later muxing
  const audioSegments: Array<{ offsetMs: number; wavBuf: Buffer }> = [];

  // -- Research loop --
  for (const chapter of checklist.chapters) {
    log({ type: "chapter_start", chapter: chapter.name, text: chapter.goal });

    for (const feature of chapter.features) {
      log({ type: "feature_start", feature: feature.id, text: feature.description });

      // Auto-execute navigation actions from checklist before asking the agent
      if (feature.action) {
        const navMatch = feature.action.match(/navigate\s+to\s+(\S+)/i);
        if (navMatch) {
          // Relative paths are resolved against the base URL.
          const target = navMatch[1].startsWith("http") ? navMatch[1] : new URL(navMatch[1], baseUrl).href;
          try {
            await page.goto(target, { waitUntil: "domcontentloaded", timeout: 15000 });
            await page.waitForTimeout(1500);
            log({ type: "action", feature: feature.id, action: `navigate ${target}`, success: true });
            history.push(`[${feature.id}] Navigated to ${target}`);
          } catch (err: any) {
            log({ type: "error", feature: feature.id, error: `Auto-navigate failed: ${err.message?.slice(0, 100)}` });
          }
        }
      }

      // Up to 3 LLM-driven attempts per feature; each attempt may narrate and
      // run a couple of actions, then either declares done or retries.
      let demonstrated = false;
      for (let attempt = 1; attempt <= 3 && !demonstrated; attempt++) {
        try {
          // Capture page state
          const state = await captureState(page);

          // Ask Claude what to do
          const decision = await askAgent(checklist, chapter, feature, state, history, attempt);

          // Narrate: log it, synthesize TTS, play in-page so the recording's
          // timeline lines up, and hold roughly for the audio duration.
          if (decision.narration) {
            log({ type: "narrate", feature: feature.id, text: decision.narration });
            const wav = await generateTTS(decision.narration);
            if (wav) {
              audioSegments.push({ offsetMs: Date.now() - startMs, wavBuf: wav });
              const durMs = await playAudioInBrowser(page, wav).catch(() => 3000);
              await page.waitForTimeout(Math.max(durMs - 500, 500));
            }
          }

          // Execute actions
          for (const action of decision.actions) {
            const result = await executeAction(page, action);
            log({
              type: "action",
              feature: feature.id,
              action: `${action.type} ${action.selector ?? action.text ?? action.key ?? ""}`.trim(),
              selector: action.selector,
              success: result.success,
              error: result.success ? undefined : result.result,
            });
            history.push(`[${feature.id}] ${result.result}`);
            await page.waitForTimeout(400); // visual pause
          }

          if (decision.observation) {
            history.push(`[observe] ${decision.observation}`);
          }

          demonstrated = decision.done;
        } catch (err: any) {
          log({ type: "error", feature: feature.id, error: err.message?.slice(0, 200) });
        }
      }

      // Screenshot for this feature
      const ssPath = path.join(outputDir, "screenshots", `${feature.id}.png`);
      await page.screenshot({ path: ssPath }).catch(() => {});
      log({ type: "screenshot", feature: feature.id, screenshot: ssPath });

      log({ type: "feature_end", feature: feature.id, success: demonstrated });
    }

    log({ type: "chapter_end", chapter: chapter.name });
  }

  // Conclusion narration
  if (checklist.conclusion?.narration) {
    log({ type: "narrate", text: checklist.conclusion.narration });
    const wav = await generateTTS(checklist.conclusion.narration);
    if (wav) {
      audioSegments.push({ offsetMs: Date.now() - startMs, wavBuf: wav });
      await playAudioInBrowser(page, wav).catch(() => {});
      await page.waitForTimeout(3000);
    }
  }

  // Save audio track: mix each narration segment at its offset into one WAV.
  if (audioSegments.length > 0) {
    const totalMs = Date.now() - startMs;
    const wavPath = path.join(outputDir, "narration.wav");
    const wavBuf = buildWavTrack(audioSegments, totalMs);
    if (wavBuf) fs.writeFileSync(wavPath, wavBuf);
    console.log(`\n š Audio: ${wavPath}`);
  }

  // Stop recording
  await page.waitForTimeout(1000);
  const videoPath = await page.video()?.path();
  await context.close();
  await browser.close();

  // Playwright picks the video filename itself; move it to a stable name.
  if (videoPath) {
    const dest = path.join(outputDir, "raw_video.webm");
    fs.renameSync(videoPath, dest);
    console.log(` š¬ Video: ${dest}`);
  }

  logStream.end();
  console.log(` š Log: ${logPath}`);
  console.log(`\nā Research complete for ${checklist.product}\n`);
}
|
|
643
|
+
|
|
644
|
+
// ---------------------------------------------------------------------------
|
|
645
|
+
// WAV track builder (stereo mix of all narration segments)
|
|
646
|
+
// ---------------------------------------------------------------------------
|
|
647
|
+
|
|
648
|
+
function parseWav(buf: Buffer) {
|
|
649
|
+
const dataOff = buf.indexOf("data") + 8;
|
|
650
|
+
if (dataOff < 8) return { float32: new Float32Array(0), sampleRate: 24000, channels: 1, sampleCount: 0, durationMs: 0 };
|
|
651
|
+
const sr = buf.readUInt32LE(24);
|
|
652
|
+
const ch = buf.readUInt16LE(22);
|
|
653
|
+
const pcm = buf.subarray(dataOff);
|
|
654
|
+
const count = pcm.length / 2;
|
|
655
|
+
const f32 = new Float32Array(count);
|
|
656
|
+
for (let i = 0; i < count; i++) f32[i] = pcm.readInt16LE(i * 2) / 32768;
|
|
657
|
+
return { float32: f32, sampleRate: sr, channels: ch, sampleCount: count, durationMs: (count / ch / sr) * 1000 };
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
/**
 * Mix all narration segments into one stereo (2ch) 24 kHz 16-bit WAV track
 * spanning the whole run. Each segment is placed at its capture offset; mono
 * sources are duplicated onto both channels; overlapping segments are summed
 * and hard-clipped to [-1, 1]. Returns null when there are no segments.
 *
 * NOTE(review): assumes every segment is 24 kHz (as produced by pcmToWav for
 * Gemini TTS); a segment at a different rate would be placed/played at the
 * wrong speed — confirm if other sources are ever mixed in.
 */
function buildWavTrack(segments: Array<{ offsetMs: number; wavBuf: Buffer }>, totalMs: number): Buffer | null {
  if (!segments.length) return null;
  const sr = 24000;
  const ch = 2;
  const totalSamples = Math.ceil((totalMs / 1000) * sr * ch);
  const track = new Float32Array(totalSamples);

  for (const seg of segments) {
    const p = parseWav(seg.wavBuf);
    // Interleaved sample index where this segment starts.
    const off = Math.floor((seg.offsetMs / 1000) * sr) * ch;
    // Upmix mono to interleaved stereo by duplicating each sample.
    const stereo =
      p.channels === 1
        ? (() => {
            const s = new Float32Array(p.sampleCount * 2);
            for (let i = 0; i < p.sampleCount; i++) {
              s[i * 2] = p.float32[i];
              s[i * 2 + 1] = p.float32[i];
            }
            return s;
          })()
        : p.float32;
    // Additive mix, truncated at the end of the track.
    for (let i = 0; i < stereo.length && off + i < track.length; i++) {
      track[off + i] += stereo[i];
    }
  }

  // Clamp to [-1, 1] and convert to signed 16-bit PCM.
  const int16 = new Int16Array(track.length);
  for (let i = 0; i < track.length; i++) {
    const s = Math.max(-1, Math.min(1, track[i]));
    int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }

  // Canonical 44-byte WAV header followed by the PCM payload.
  const dataBytes = int16.length * 2;
  const buf = Buffer.alloc(44 + dataBytes);
  buf.write("RIFF", 0);
  buf.writeUInt32LE(36 + dataBytes, 4);
  buf.write("WAVE", 8);
  buf.write("fmt ", 12);
  buf.writeUInt32LE(16, 16);
  buf.writeUInt16LE(1, 20);
  buf.writeUInt16LE(ch, 22);
  buf.writeUInt32LE(sr, 24);
  buf.writeUInt32LE(sr * ch * 2, 28);
  buf.writeUInt16LE(ch * 2, 32);
  buf.writeUInt16LE(16, 34);
  buf.write("data", 36);
  buf.writeUInt32LE(dataBytes, 40);
  Buffer.from(int16.buffer).copy(buf, 44);
  return buf;
}
|
|
710
|
+
|
|
711
|
+
// ---------------------------------------------------------------------------
|
|
712
|
+
// CLI entrypoint
|
|
713
|
+
// ---------------------------------------------------------------------------
|
|
714
|
+
|
|
715
|
+
// Allow direct execution: `bun src/agent/demo-research.ts <checklist.yaml>`.
// (import.meta.main is Bun's "run as entrypoint" check.)
if (import.meta.main) {
  const checklistPath = process.argv[2];
  if (!checklistPath) {
    console.error("Usage: bun src/agent/demo-research.ts <checklist.yaml>");
    process.exit(1);
  }
  runDemoResearch(path.resolve(checklistPath)).catch((err) => {
    console.error(err);
    process.exit(1);
  });
}
|