comfy-qa 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,813 @@
1
+ /**
2
+ * QA Research Agent — explores a website, tests CRUD operations,
3
+ * generates a demowright spec, debugs it, and records QA evidence video.
4
+ *
5
+ * Phase 1: Explore site headlessly, test each operation, pass/fail scoring
6
+ * Phase 2: Generate .spec.ts, debug without video (demowright fast mode)
7
+ * Phase 3: Record with video (demowright demo mode + TTS)
8
+ *
9
+ * Usage:
10
+ * bun src/agent/qa-research.ts demo/checklists/registry-web.yaml
11
+ */
12
+ import * as fs from "node:fs";
13
+ import * as path from "node:path";
14
+ import { $ } from "bun";
15
+ import { chromium, type Page } from "playwright";
16
+ import * as yaml from "yaml";
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Types
20
+ // ---------------------------------------------------------------------------
21
+
22
/** One testable operation from the checklist file. */
interface Operation {
  /** Identifier shown in logs, generated spec comments and the scorecard. */
  id: string;
  /** Free-form operation category; copied verbatim into results. */
  type: string;
  /** Human description sent to the LLM together with the operation id. */
  description: string;
  /** Hint text passed to the LLM as "Steps hint". */
  steps_hint: string;
  /** Narration used for the generated video segment. */
  narration: string;
  /** Criteria text the LLM is asked to judge success against. */
  success_criteria: string;
}

/** A named group of operations. */
interface Feature {
  name: string;
  operations: Array<Operation>;
}

/** Root shape of the checklist document parsed from YAML. */
interface Checklist {
  product: string;
  url: string;
  persona: string;
  features: Array<Feature>;
  conclusion?: { narration: string };
}

/** Verbs the agent knows how to execute and to translate into spec code. */
type StepActionType =
  | "goto"
  | "click"
  | "type"
  | "scroll"
  | "hover"
  | "wait"
  | "key"
  | "safeMove";

/** A single browser action proposed by the LLM. */
interface StepAction {
  type: StepActionType;
  /** Selector for click / type / hover / safeMove. */
  selector?: string;
  /** URL for goto, text for type, key name for key. */
  text?: string;
  /** Wheel delta for scroll, milliseconds for wait. */
  value?: number;
}

/** Outcome of testing one operation. */
interface OperationResult {
  id: string;
  feature: string;
  type: string;
  narration: string;
  success: boolean;
  /** Actions that executed without error, in execution order. */
  actions: Array<StepAction>;
  /** Truncated message of an error caught while testing, if any. */
  error?: string;
}

/** Per-feature roll-up stored in {@link ResearchResults}. */
interface FeatureResult {
  name: string;
  operations: Array<OperationResult>;
  /** "passed/total" text. */
  score: string;
  passed: number;
  total: number;
}

/** Everything Phase 1 produces for one checklist run. */
interface ResearchResults {
  product: string;
  url: string;
  features: Array<FeatureResult>;
  totalPassed: number;
  totalOperations: number;
  scorePercent: number;
}
75
+
76
+ // ---------------------------------------------------------------------------
77
+ // LLM
78
+ // ---------------------------------------------------------------------------
79
+
80
/** Anthropic key; the QA-specific variable takes precedence. An empty string disables the provider. */
const ANTHROPIC_KEY = process.env.ANTHROPIC_API_KEY_QA ?? process.env.ANTHROPIC_API_KEY ?? "";
/** OpenRouter key, used as the fallback provider. An empty string disables the provider. */
const OPENROUTER_KEY = process.env.OPENROUTER_API_KEY ?? "";
/** Completion budget shared by both providers so a full-spec rewrite is not cut short on the fallback. */
const LLM_MAX_TOKENS = 8192;

/**
 * Convert Anthropic-style message content to the OpenAI-compatible shape used
 * by OpenRouter. Only base64 image blocks differ; every other part is passed
 * through untouched, as is non-array (plain string) content.
 */
function toOpenAIContent(content: unknown): unknown {
  if (!Array.isArray(content)) return content;
  return content.map((part: any) => {
    if (part?.type === "image" && part.source?.type === "base64") {
      return {
        type: "image_url",
        image_url: { url: `data:${part.source.media_type};base64,${part.source.data}` },
      };
    }
    return part;
  });
}

/**
 * Send a chat request to the first configured provider and return the reply text.
 *
 * Anthropic is tried first when a key is present; if that request fails (network
 * error or non-2xx status) the failure is logged and OpenRouter is tried when a
 * key for it exists.
 *
 * @param system   System prompt.
 * @param messages Messages in Anthropic format (`{ role, content }`).
 * @returns The reply text, or `""` when the provider answered without text.
 * @throws Error when no key is configured, or when every configured provider failed.
 */
async function callLLM(system: string, messages: any[]): Promise<string> {
  let anthropicFailure: Error | undefined;

  if (ANTHROPIC_KEY) {
    try {
      const res = await fetch("https://api.anthropic.com/v1/messages", {
        method: "POST",
        headers: {
          "x-api-key": ANTHROPIC_KEY,
          "anthropic-version": "2023-06-01",
          "content-type": "application/json",
        },
        body: JSON.stringify({
          model: "claude-sonnet-4-20250514",
          max_tokens: LLM_MAX_TOKENS,
          system,
          messages,
        }),
      });
      if (!res.ok) {
        throw new Error(`Anthropic API responded ${res.status}: ${(await res.text()).slice(0, 200)}`);
      }
      const json = (await res.json()) as any;
      return json.content?.[0]?.text ?? "";
    } catch (err: unknown) {
      // Remember the real cause instead of swallowing it; it is re-thrown below
      // if there is no fallback provider to try.
      anthropicFailure = err instanceof Error ? err : new Error(String(err));
      console.warn(`Anthropic request failed: ${anthropicFailure.message.slice(0, 200)}`);
    }
  }

  if (OPENROUTER_KEY) {
    const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${OPENROUTER_KEY}`,
        "content-type": "application/json",
      },
      body: JSON.stringify({
        model: "anthropic/claude-sonnet-4-20250514",
        messages: [
          { role: "system", content: system },
          ...messages.map((m) => ({ ...m, content: toOpenAIContent(m?.content) })),
        ],
        max_tokens: LLM_MAX_TOKENS,
      }),
    });
    if (!res.ok) {
      throw new Error(`OpenRouter API responded ${res.status}: ${(await res.text()).slice(0, 200)}`);
    }
    const json = (await res.json()) as any;
    return json.choices?.[0]?.message?.content ?? "";
  }

  if (anthropicFailure) throw anthropicFailure;
  throw new Error("No API key (ANTHROPIC_API_KEY_QA or OPENROUTER_API_KEY)");
}
124
+
125
+ // ---------------------------------------------------------------------------
126
+ // Page helpers
127
+ // ---------------------------------------------------------------------------
128
+
129
/**
 * Render an accessibility-tree node and its descendants as indented text,
 * one node per line: `role "name" [value]`.
 *
 * @param node  Node with `role` and optional `name`, `value`, `children`.
 *              A null/undefined node renders as an empty string.
 * @param depth Indentation level of `node`; children are rendered one level deeper.
 * @returns Newline-terminated lines, or `""` for a missing node.
 */
function formatA11y(node: any, depth: number): string {
  const lines: string[] = [];

  const visit = (current: any, level: number): void => {
    if (!current) return;
    let line = `${" ".repeat(level)}${current.role}`;
    if (current.name) line += ` "${current.name}"`;
    // Compare explicitly so a numeric 0 is still shown; only absent/empty values are skipped.
    const value = current.value;
    if (value !== undefined && value !== null && value !== "") line += ` [${value}]`;
    lines.push(line);
    for (const child of current.children ?? []) visit(child, level + 1);
  };

  visit(node, depth);
  return lines.map((line) => `${line}\n`).join("");
}
140
+
141
/**
 * Snapshot what the LLM needs to reason about the current page: a PNG
 * screenshot (base64), a text outline of the page, the URL and the title.
 *
 * The outline is the formatted accessibility snapshot when one can be taken.
 * When `page.accessibility` is missing, the snapshot is empty, or taking it
 * throws, the first 3000 characters of `document.body.innerText` are used
 * instead. Every step is best-effort; failures degrade to empty values or
 * `"(unavailable)"` rather than throwing.
 *
 * @param page Page to inspect.
 * @returns `{ screenshotBase64, a11yTree, url, title }`.
 */
async function captureState(page: Page) {
  const screenshot = await page.screenshot({ type: "png" }).catch(() => null);

  let outline = "";
  try {
    // The property itself may be undefined, hence the guard before calling snapshot().
    const snapshot = page.accessibility ? await page.accessibility.snapshot() : null;
    if (snapshot) outline = formatA11y(snapshot, 0).slice(0, 3000);
  } catch {
    // Snapshot failed; the innerText fallback below takes over.
  }

  if (!outline) {
    // Fallback runs for EVERY "no snapshot" outcome, not only when snapshot() threw.
    try {
      outline = await page.evaluate(() => document.body?.innerText?.slice(0, 3000) ?? "");
    } catch {
      // Page not evaluable (e.g. mid-navigation); report the outline as unavailable.
    }
  }

  return {
    screenshotBase64: screenshot ? screenshot.toString("base64") : "",
    a11yTree: outline || "(unavailable)",
    url: page.url(),
    title: await page.title().catch(() => ""),
  };
}
166
+
167
/**
 * Perform one LLM-proposed action on the page.
 *
 * Actions come from parsed LLM output, so required fields are checked at
 * runtime instead of being asserted with `!`, and an action type this
 * function does not know is reported as a failure (it did nothing).
 *
 * @param page   Page to act on.
 * @param action Action to perform.
 * @returns `true` when the action ran without throwing; `false` when it threw,
 *          lacked a required field, or had an unknown type.
 */
async function executeAction(page: Page, action: StepAction): Promise<boolean> {
  const hasSelector = typeof action.selector === "string" && action.selector.length > 0;
  const hasText = typeof action.text === "string";
  // Accept numbers (and numeric strings) for `value`; anything else uses the default.
  const numberOr = (fallback: number): number => {
    if (action.value === undefined || action.value === null) return fallback;
    const n = Number(action.value);
    return Number.isFinite(n) ? n : fallback;
  };

  try {
    switch (action.type) {
      case "goto": {
        if (!hasText) return false;
        await page.goto(action.text as string, { waitUntil: "domcontentloaded", timeout: 15000 });
        await page.waitForTimeout(1500);
        return true;
      }
      case "click": {
        if (!hasSelector) return false;
        await page.locator(action.selector as string).first().click({ timeout: 5000 });
        return true;
      }
      case "type": {
        if (!hasSelector || !hasText) return false;
        await page.locator(action.selector as string).first().fill(action.text as string, { timeout: 5000 });
        return true;
      }
      case "scroll": {
        await page.mouse.wheel(0, numberOr(300));
        await page.waitForTimeout(500);
        return true;
      }
      case "hover": {
        if (!hasSelector) return false;
        await page.locator(action.selector as string).first().hover({ timeout: 5000 });
        return true;
      }
      case "wait": {
        await page.waitForTimeout(numberOr(1000));
        return true;
      }
      case "key": {
        if (!hasText) return false;
        await page.keyboard.press(action.text as string);
        return true;
      }
      case "safeMove": {
        if (!hasSelector) return false;
        const target = page.locator(action.selector as string).first();
        // isVisible() answers immediately; Playwright ignores its timeout option.
        if (await target.isVisible()) {
          const box = await target.boundingBox();
          if (box) await page.mouse.move(box.x + box.width / 2, box.y + box.height / 2);
        }
        return true;
      }
      default:
        // Unknown verb from the LLM: nothing was executed, so do not report success.
        return false;
    }
  } catch {
    // Any Playwright failure (timeout, bad selector, navigation error) counts as "did not work".
    return false;
  }
}
206
+
207
+ // ---------------------------------------------------------------------------
208
+ // Phase 1: Research
209
+ // ---------------------------------------------------------------------------
210
+
211
/**
 * Test one checklist operation by letting the LLM drive the page.
 *
 * Each attempt captures the page state, asks the LLM for a JSON decision
 * (`actions`, `success`, `observation`), executes the proposed actions and
 * records those that ran without error. The operation passes as soon as a
 * decision reports `success: true`; otherwise it fails after the last attempt.
 * The page is not reset between attempts, so recorded actions accumulate.
 *
 * @param page      Page shared by all operations of the run.
 * @param checklist Checklist being tested (its product name goes into the prompt).
 * @param feature   Feature the operation belongs to.
 * @param op        Operation to test.
 * @returns The operation result; `error` holds the last caught problem of a failed run.
 */
async function testOperation(
  page: Page,
  checklist: Checklist,
  feature: Feature,
  op: Operation,
): Promise<OperationResult> {
  const MAX_ATTEMPTS = 3;
  const MAX_ACTIONS = 5; // same limit the prompt announces to the LLM

  const result: OperationResult = {
    id: op.id,
    feature: feature.name,
    type: op.type,
    narration: op.narration,
    success: false,
    actions: [],
  };

  console.log(` Testing: ${op.id}`);

  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    try {
      const state = await captureState(page);

      const systemPrompt = `You are a QA tester. Test a specific operation on a website.

Product: ${checklist.product}

RULES:
- Headless browser, NO URL bar. Use {"type": "goto", "text": "url"} to navigate.
- Use simple CSS selectors. Maximum ${MAX_ACTIONS} actions.
- Set "success": true ONLY if success criteria is met in the current state.
- If content is already visible, set "success": true with empty actions.
- On retry, try a different approach.

Respond with ONLY JSON:
{
  "actions": [{"type": "click", "selector": "..."}],
  "success": true/false,
  "observation": "what I see"
}`;

      const retryNote = attempt > 1 ? " — previous approach failed, try something different" : "";
      const userContent: any[] = [
        {
          type: "text",
          text: `Operation: ${op.id} — ${op.description}
Steps hint: ${op.steps_hint}
Success criteria: ${op.success_criteria}
Attempt: ${attempt}/${MAX_ATTEMPTS}${retryNote}

URL: ${state.url} | Title: ${state.title}

Accessibility Tree:
${state.a11yTree}`,
        },
      ];

      if (state.screenshotBase64) {
        userContent.push({
          type: "image",
          source: { type: "base64", media_type: "image/png", data: state.screenshotBase64 },
        });
      }

      const response = await callLLM(systemPrompt, [{ role: "user", content: userContent }]);
      const jsonMatch = response.match(/\{[\s\S]*\}/);
      if (!jsonMatch) {
        result.error = "LLM response contained no JSON object";
        continue;
      }

      const decision = JSON.parse(jsonMatch[0]);

      // The decision is untrusted: tolerate a missing/non-array list and cap its length.
      const proposed: StepAction[] = Array.isArray(decision?.actions)
        ? decision.actions.slice(0, MAX_ACTIONS)
        : [];

      // Execute actions and record only the ones that worked, so the spec can replay them.
      for (const action of proposed) {
        const ok = await executeAction(page, action);
        if (ok) result.actions.push(action);
      }

      await page.waitForTimeout(1000);

      // Strict comparison: a string such as "false" must not count as success.
      if (decision?.success === true) {
        result.success = true;
        delete result.error; // drop errors left over from earlier, failed attempts
        console.log(` ✅ (attempt ${attempt})`);
        return result;
      }
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      result.error = message.slice(0, 200);
    }
  }

  console.log(` ❌ (${MAX_ATTEMPTS} attempts failed)`);
  return result;
}
297
+
298
/**
 * Phase 1: open the product URL in headless Chromium, test every operation of
 * every feature and aggregate pass/fail counts.
 *
 * The browser is closed even when navigation or a test throws.
 *
 * @param checklist Checklist describing the product, its URL and its features.
 * @returns Per-feature results plus overall totals; `scorePercent` is 0 when
 *          the checklist contains no operations.
 */
async function runPhase1(checklist: Checklist): Promise<ResearchResults> {
  console.log(`\n🔬 Phase 1: Research — ${checklist.product}\n`);

  const results: ResearchResults = {
    product: checklist.product,
    url: checklist.url,
    features: [],
    totalPassed: 0,
    totalOperations: 0,
    scorePercent: 0,
  };

  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({ viewport: { width: 1280, height: 720 } });
    const page = await context.newPage();

    await page.goto(checklist.url, { waitUntil: "domcontentloaded", timeout: 30000 });
    await page.waitForTimeout(2000);

    for (const feature of checklist.features) {
      console.log(`\n 📋 ${feature.name}`);
      const operations = feature.operations ?? []; // a feature without operations scores 0/0
      const outcomes: OperationResult[] = [];
      let passed = 0;

      // Sequential on purpose: all operations share one page.
      for (const op of operations) {
        const outcome = await testOperation(page, checklist, feature, op);
        outcomes.push(outcome);
        if (outcome.success) passed++;
      }

      const total = operations.length;
      const score = `${passed}/${total}`;
      results.features.push({ name: feature.name, operations: outcomes, score, passed, total });
      results.totalPassed += passed;
      results.totalOperations += total;
      console.log(` Score: ${score}`);
    }
  } finally {
    await browser.close();
  }

  // Avoid NaN (0/0) for an empty checklist.
  results.scorePercent =
    results.totalOperations > 0
      ? Math.round((results.totalPassed / results.totalOperations) * 100)
      : 0;
  return results;
}
338
+
339
+ // ---------------------------------------------------------------------------
340
+ // Phase 2: Generate spec
341
+ // ---------------------------------------------------------------------------
342
+
343
/**
 * Build the scorecard page shown full-screen before the outro.
 *
 * Each feature becomes a card listing its operations with a pass (✓) or
 * fail (✗) mark; a feature whose operations all passed is styled "pass",
 * any other "partial". The headline colour is green from 80 %, yellow from
 * 50 %, red below.
 *
 * @param results   Phase 1 results to display.
 * @param checklist Checklist; its product name is used in the title and heading.
 * @returns A self-contained HTML document string (inline CSS, no external assets).
 */
function generateScorecardHtml(results: ResearchResults, checklist: Checklist): string {
  const rows = results.features
    .map((f) => {
      const allPassed = f.passed === f.total;
      const ops = f.operations
        .map((op) => {
          const cls = op.success ? "pass" : "fail";
          const mark = op.success ? "✓" : "✗";
          return `<li class="${cls}"><span class="mark">${mark}</span>${escapeHtml(op.id)}</li>`;
        })
        .join("");
      return `
<section class="feature ${allPassed ? "pass" : "partial"}">
<h3><span class="mark">${allPassed ? "✓" : "⚠"}</span>${escapeHtml(f.name)} <span class="score">${escapeHtml(f.score)}</span></h3>
<ul>${ops}</ul>
</section>`;
    })
    .join("");

  const total = `${results.totalPassed}/${results.totalOperations}`;
  const pct = results.scorePercent;
  const totalColor = pct >= 80 ? "#4ade80" : pct >= 50 ? "#facc15" : "#f87171";
  const generatedOn = new Date().toISOString().split("T")[0];

  return `<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>${escapeHtml(checklist.product)} QA</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
  font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
  background: radial-gradient(ellipse at top, #1a1f3a 0%, #0a0e1f 100%);
  color: #fff;
  min-height: 100vh;
  padding: 40px 60px;
}
header {
  text-align: center;
  margin-bottom: 32px;
}
header h1 {
  font-size: 42px;
  font-weight: 800;
  margin-bottom: 8px;
  background: linear-gradient(135deg, #fff 0%, #8892b0 100%);
  -webkit-background-clip: text;
  background-clip: text;
  -webkit-text-fill-color: transparent;
}
header .total {
  font-size: 28px;
  font-weight: 600;
  color: ${totalColor};
}
.features {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
  gap: 20px;
}
.feature {
  background: rgba(255, 255, 255, 0.04);
  border: 1px solid rgba(255, 255, 255, 0.1);
  border-radius: 12px;
  padding: 20px 24px;
  backdrop-filter: blur(10px);
}
.feature.pass { border-left: 4px solid #4ade80; }
.feature.partial { border-left: 4px solid #facc15; }
.feature h3 {
  font-size: 20px;
  font-weight: 700;
  margin-bottom: 12px;
  display: flex;
  align-items: center;
  gap: 10px;
}
.feature h3 .score {
  margin-left: auto;
  font-family: 'SF Mono', Monaco, monospace;
  font-size: 18px;
  color: #8892b0;
}
.feature ul { list-style: none; }
.feature li {
  padding: 6px 0;
  font-size: 15px;
  display: flex;
  align-items: center;
  gap: 10px;
  color: #ccd6f6;
}
.feature li.fail { color: #8892b0; text-decoration: line-through; }
.mark {
  display: inline-block;
  width: 20px;
  height: 20px;
  line-height: 20px;
  text-align: center;
  border-radius: 50%;
  font-weight: 700;
  font-size: 12px;
  flex-shrink: 0;
}
.feature.pass h3 .mark { background: #4ade80; color: #0a0e1f; }
.feature.partial h3 .mark { background: #facc15; color: #0a0e1f; }
.feature li.pass .mark { background: rgba(74, 222, 128, 0.2); color: #4ade80; }
.feature li.fail .mark { background: rgba(248, 113, 113, 0.2); color: #f87171; }
footer {
  text-align: center;
  margin-top: 40px;
  font-size: 14px;
  color: #8892b0;
}
</style>
</head><body>
<header>
<h1>${escapeHtml(checklist.product)} QA Results</h1>
<div class="total">${total} &nbsp;•&nbsp; ${pct}%</div>
</header>
<div class="features">
${rows}
</div>
<footer>Comfy-QA Evidence • Generated ${generatedOn}</footer>
</body></html>`;
}
466
+
467
/**
 * Escape the characters that are significant in HTML text and attribute
 * values: `&`, `<`, `>`, `"` and `'`.
 *
 * @param s Text to escape; `null`/`undefined` is treated as an empty string.
 * @returns The escaped text.
 */
function escapeHtml(s: string): string {
  const entities: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // Single pass, so an "&" produced by one replacement is never escaped twice.
  return String(s ?? "").replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
}
470
+
471
/**
 * Produce the body lines of a spec segment for an operation that FAILED
 * during research.
 *
 * Only the last recorded action is considered:
 *  - `goto`  → re-navigate to that URL, followed by explanatory comments;
 *  - `click` with a selector → assert that the clicked element is not visible;
 *  - anything else, or no recorded action → two comments documenting the failure.
 *
 * NOTE(review): the recorded actions are the ones that executed successfully
 * during research, so it is not obvious that "element is not visible" describes
 * the broken behaviour — confirm the intended assertion.
 *
 * @param op Result of the failed operation.
 * @returns Source lines joined with `\n`.
 */
function generateFailAssertion(op: OperationResult): string {
  const emitted: string[] = [];
  const finalStep = op.actions.length > 0 ? op.actions[op.actions.length - 1] : undefined;

  if (finalStep !== undefined) {
    // Last action navigated: the page may load, yet its content was judged wrong.
    if (finalStep.type === "goto") {
      emitted.push(
        ` // Navigate to the target page`,
        ` await page.goto(${JSON.stringify(finalStep.text)}, { waitUntil: "domcontentloaded", timeout: 15000 });`,
        ` await page.waitForTimeout(1500);`,
        ` // Assert: operation did not succeed (current known behavior)`,
        ` // When this bug is fixed, this assertion will fail → update the spec`,
      );
    }

    // Last action clicked something: pin the element's current (non-visible) state.
    if (finalStep.type === "click" && finalStep.selector) {
      emitted.push(
        ` // Assert: element is not clickable (current known behavior)`,
        ` const target = page.locator(${JSON.stringify(finalStep.selector)}).first();`,
        ` expect(await target.isVisible().catch(() => false)).toBe(false);`,
      );
    }
  }

  // Nothing specific to assert — leave a note for whoever reads the spec.
  if (emitted.length === 0) {
    emitted.push(
      ` // This operation failed during research — no working selectors found`,
      ` // Current page state is shown in the video as evidence`,
    );
  }

  return emitted.join("\n");
}
508
+
509
/**
 * Translate one recorded action into a line of Playwright spec source.
 *
 * The action originates from LLM output, so nothing is interpolated raw:
 * strings go through `JSON.stringify`, and `value` is emitted only if it is a
 * finite number (otherwise the default for that action is used).
 *
 * @param a Action to translate.
 * @returns The source line, or `null` when the action type is unknown or a
 *          field the action requires (selector / text) is missing.
 */
function actionToCode(a: StepAction): string | null {
  const selector =
    typeof a.selector === "string" && a.selector.length > 0 ? JSON.stringify(a.selector) : null;
  const text = typeof a.text === "string" ? JSON.stringify(a.text) : null;
  const numberOr = (fallback: number): number => {
    if (a.value === undefined || a.value === null) return fallback;
    const n = Number(a.value);
    return Number.isFinite(n) ? n : fallback;
  };

  switch (a.type) {
    case "goto":
      return text === null
        ? null
        : `await page.goto(${text}, { waitUntil: "domcontentloaded", timeout: 15000 }); await page.waitForTimeout(1500);`;
    case "click":
      return selector === null ? null : `await page.locator(${selector}).first().click({ timeout: 5000 });`;
    case "type":
      return selector === null || text === null
        ? null
        : `await page.locator(${selector}).first().fill(${text}, { timeout: 5000 });`;
    case "scroll":
      return `await page.mouse.wheel(0, ${numberOr(300)});`;
    case "hover":
      return selector === null ? null : `await page.locator(${selector}).first().hover({ timeout: 5000 });`;
    case "wait":
      return `await page.waitForTimeout(${numberOr(1000)});`;
    case "key":
      return text === null ? null : `await page.keyboard.press(${text});`;
    case "safeMove":
      return selector === null ? null : `await safeMove(page, ${selector});`;
    default:
      return null;
  }
}
531
+
532
/**
 * Generate the source of the demowright/Playwright spec that replays the
 * research results as a narrated video.
 *
 * For every operation one `.segment(...)` call is emitted:
 *  - PASS: recorded page-changing actions go into `setup` (they run before the
 *    narration), visual actions (`safeMove`, `hover`) run during narration;
 *    without page-changing actions a plain callback segment is emitted.
 *  - FAIL: the narration is suffixed with a failure notice and the body comes
 *    from `generateFailAssertion`.
 * A final segment renders the HTML scorecard, followed by an outro.
 *
 * All checklist/LLM-derived text is embedded via `JSON.stringify` (string
 * literals) or flattened to one line (comments) so it cannot alter the
 * structure of the generated file.
 *
 * @param results   Phase 1 results.
 * @param checklist Checklist the results belong to.
 * @returns Complete TypeScript source of the spec file.
 */
function generateSpec(results: ResearchResults, checklist: Checklist): string {
  // Actions that change the page (must run BEFORE narration starts). Everything
  // else (safeMove, hover) is visual and safe to run DURING narration.
  const SETUP_TYPES = new Set<string>(["goto", "click", "type", "scroll", "key", "wait"]);

  // Make arbitrary text safe inside a `//` or `/* */` comment of the generated file.
  const commentSafe = (value: unknown): string =>
    String(value).replace(/\*\//g, "* /").replace(/[\r\n\u2028\u2029]+/g, " ");
  const indentLines = (lines: string[], pad: string): string =>
    lines.map((line) => `${pad}${line}`).join("\n");

  const segments: string[] = [];

  for (const feature of results.features) {
    segments.push(`\n    // ── ${commentSafe(feature.name)} (${commentSafe(feature.score)}) ──`);

    for (const op of feature.operations) {
      const label = `${commentSafe(op.id)} (${commentSafe(op.type)})`;
      const baseNarration = op.narration ?? "";

      if (!op.success) {
        // ── FAIL: assert the actual broken behavior ──
        const narration = `${baseNarration.replace(/\.$/, "")} — but this operation is currently failing.`;
        segments.push(`    // ❌ ${label} — FAIL (asserts current broken behavior)
    .segment(${JSON.stringify(narration)}, async (pace) => {
${generateFailAssertion(op)}
      await pace();
    })`);
        continue;
      }

      // ── PASS: replay the recorded actions ──
      const setupLines: string[] = [];
      const visualLines: string[] = [];
      for (const action of op.actions) {
        const line = actionToCode(action);
        if (!line) continue;
        (SETUP_TYPES.has(action.type) ? setupLines : visualLines).push(line);
      }
      const pacedBody = [...visualLines, "await pace();"];

      if (setupLines.length > 0) {
        segments.push(`    // ✅ ${label} — PASS
    .segment(${JSON.stringify(baseNarration)}, {
      setup: async () => {
${indentLines(setupLines, "        ")}
      },
      action: async (pace) => {
${indentLines(pacedBody, "        ")}
      },
    })`);
      } else {
        segments.push(`    // ✅ ${label} — PASS
    .segment(${JSON.stringify(baseNarration)}, async (pace) => {
${indentLines(pacedBody, "      ")}
    })`);
      }
    }
  }

  // Scorecard (compact, for the outro subtitle)
  const scoreLines = results.features.map(
    (f) => `${f.name} ${f.score} ${f.passed === f.total ? "✅" : "⚠"}`,
  );

  // Scorecard HTML (for the full-page render before the outro)
  const scorecardHtml = generateScorecardHtml(results, checklist);

  const slug = checklist.product.toLowerCase().replace(/\s+/g, "-");
  const scoreText = `${results.totalPassed}/${results.totalOperations} (${results.scorePercent}%)`;
  const generatedOn = new Date().toISOString().split("T")[0];

  return `/**
 * ${commentSafe(checklist.product)} — QA Evidence Video
 * Auto-generated by Research Agent on ${generatedOn}
 * Score: ${scoreText}
 */
import { test, safeMove, expect } from "./fixtures/fixture";
import { createVideoScript } from "../lib/demowright/dist/index.mjs";

const SCORECARD_HTML = ${JSON.stringify(scorecardHtml)};

test(${JSON.stringify(`${slug} QA evidence`)}, async ({ page }) => {
  test.setTimeout(10 * 60_000);

  const script = createVideoScript()
    .title(${JSON.stringify(checklist.product + " QA")}, {
      subtitle: ${JSON.stringify(`Score: ${scoreText}`)},
      durationMs: 3000,
    })
${segments.join("\n")}

    // Render the full scorecard as the last segment (visible for ~8s)
    .segment("Here are the final QA results for this product.", {
      setup: async () => {
        await page.setContent(SCORECARD_HTML, { waitUntil: "domcontentloaded" });
        await page.waitForTimeout(500);
      },
      action: async (pace) => {
        await pace();
      },
    })

    .outro({
      text: ${JSON.stringify(`QA Results: ${scoreText}`)},
      subtitle: ${JSON.stringify(scoreLines.join(" | "))},
      durationMs: 4000,
    });

  // Pre-generate TTS BEFORE navigating — avoids idle time in recording
  await script.prepare(page);

  // Navigate after TTS is ready — recording is already active
  await page.goto(${JSON.stringify(checklist.url)}, { waitUntil: "domcontentloaded" });
  await page.waitForTimeout(2000);

  await script.render(page, {
    baseName: ${JSON.stringify(slug + "-qa")},
    outputDir: ".comfy-qa/.demos",
  });
});
`;
}
664
+
665
+ // ---------------------------------------------------------------------------
666
+ // Phase 2+3: Debug & Record
667
+ // ---------------------------------------------------------------------------
668
+
669
/**
 * Run a spec with `bunx playwright test` and report whether it passed.
 *
 * Success is decided by the process exit code rather than by searching the
 * output for the word "failed", which can also appear in test titles or logs.
 * The last 1000 characters of the combined output are echoed to the console.
 *
 * @param specPath Path of the spec file to run.
 * @param label    Heading printed before the run.
 * @returns `ok` (exit code 0) and the captured output (stderr is merged into stdout).
 */
async function runSpec(specPath: string, label: string): Promise<{ ok: boolean; output: string }> {
  console.log(`\n${label}\n Running: bunx playwright test ${specPath}\n`);
  try {
    // nothrow(): a non-zero exit is an expected outcome here, not an exception.
    // quiet(): capture the output instead of streaming it to the terminal.
    const run = await $`bunx playwright test ${specPath} --reporter=list 2>&1`.nothrow().quiet();
    const output = run.stdout.toString();
    console.log(output.slice(-1000));
    return { ok: run.exitCode === 0, output };
  } catch (err: unknown) {
    // Reached when the command could not be run at all.
    const failure = err as { stdout?: { toString(): string }; message?: string };
    const output = failure.stdout?.toString() ?? failure.message ?? "";
    console.log(output.slice(-1000));
    return { ok: false, output };
  }
}
682
+
683
/**
 * Pull the file content out of an LLM reply. If the reply contains a Markdown
 * code fence (with any language tag, LF or CRLF newlines, optionally preceded
 * or followed by prose), the text between the first opening fence and the last
 * closing fence is returned; otherwise the trimmed reply itself.
 */
function extractSpecFromReply(reply: string): string {
  const trimmed = reply.trim();
  const fenced = trimmed.match(/```[^\n]*\r?\n([\s\S]*)\r?\n```/);
  return fenced ? fenced[1].trim() : trimmed;
}

/**
 * Phase 2 debug loop: run the spec; while it fails, ask the LLM for a fixed
 * version, overwrite the spec file with it and run again.
 *
 * A proposed fix is written only if it looks like a complete spec (contains an
 * import and a `test(` call, ends with `})`, has balanced braces); otherwise the
 * current file is kept and simply re-run on the next attempt.
 *
 * @param specPath   Path of the spec file; it may be overwritten.
 * @param maxRetries Maximum number of runs (default 3).
 * @returns `true` as soon as a run passes, `false` otherwise.
 */
async function debugLoop(specPath: string, maxRetries = 3): Promise<boolean> {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    const { ok, output } = await runSpec(specPath, `🔧 Phase 2: Debug (attempt ${attempt}/${maxRetries})`);
    if (ok) return true;

    if (attempt === maxRetries) {
      console.log(`\n⚠ Spec still failing after ${maxRetries} debug attempts.`);
      return false;
    }

    // Ask LLM to fix the spec
    console.log(`\n🔧 Asking LLM to fix spec...`);
    const specContent = fs.readFileSync(specPath, "utf-8");
    const errorTail = output.slice(-2000);

    const fixPrompt = `You are fixing a Playwright test spec that failed. Here is the spec and error output.

RULES:
- Only fix the specific error. Don't rewrite the whole spec.
- Keep the same structure (title, segments, outro).
- If a selector timed out, try a more robust selector or add .catch(() => {}).
- If an import is wrong, fix it.
- Return the COMPLETE fixed spec file (not just the diff).

## Current spec:
\`\`\`typescript
${specContent}
\`\`\`

## Error output (last 2000 chars):
\`\`\`
${errorTail}
\`\`\`

Return ONLY the fixed TypeScript file content, no markdown fences.`;

    try {
      const reply = await callLLM("You fix Playwright test specs. Return only the fixed file content.", [
        { role: "user", content: fixPrompt },
      ]);

      // The model is told not to use fences, but tolerate them (and surrounding prose).
      const fixedContent = extractSpecFromReply(reply);

      // Sanity checks — reject truncated / incomplete LLM responses
      const hasImport = fixedContent.includes("import");
      const hasTest = fixedContent.includes("test(");
      const hasEnd = /\}\s*\)\s*;?\s*$/.test(fixedContent);
      const openBraces = fixedContent.match(/\{/g)?.length ?? 0;
      const closeBraces = fixedContent.match(/\}/g)?.length ?? 0;
      const balancedBraces = openBraces === closeBraces;

      if (hasImport && hasTest && hasEnd && balancedBraces) {
        fs.writeFileSync(specPath, fixedContent);
        console.log(` ✏️ Spec updated, retrying...`);
      } else {
        console.log(` ⚠ LLM response incomplete (import=${hasImport}, test=${hasTest}, end=${hasEnd}, balanced=${balancedBraces}), keeping original.`);
      }
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      console.log(` ⚠ LLM fix failed: ${message.slice(0, 100)}`);
    }
  }
  // Only reached when maxRetries < 1 (the loop body never ran).
  return false;
}
752
+
753
+ // ---------------------------------------------------------------------------
754
+ // Main
755
+ // ---------------------------------------------------------------------------
756
+
757
/**
 * Command-line entry point.
 *
 * Reads the checklist given as the first argument, runs Phase 1 (research) and
 * stores the results under `.comfy-qa/.research/<slug>/research-results.json`,
 * prints a scorecard, writes the generated spec to `demo/<slug>-qa.spec.ts`,
 * runs the debug loop and, if the spec passes, runs it once more to record.
 *
 * The process exit code is set to 1 when debugging or recording fails, so
 * callers such as CI can detect an unsuccessful run.
 *
 * @throws Error when the checklist lacks `product`, `url` or `features`.
 */
async function main() {
  const checklistPath = process.argv[2];
  if (!checklistPath) {
    console.error("Usage: bun src/agent/qa-research.ts <checklist.yaml>");
    process.exit(1);
  }

  const raw = fs.readFileSync(path.resolve(checklistPath), "utf-8");
  const checklist = yaml.parse(raw) as Checklist;
  // The cast above checks nothing; verify the fields used below before relying on them.
  if (
    !checklist ||
    typeof checklist.product !== "string" ||
    checklist.product.trim() === "" ||
    typeof checklist.url !== "string" ||
    !Array.isArray(checklist.features)
  ) {
    throw new Error(
      `Invalid checklist ${checklistPath}: expected "product" (string), "url" (string) and "features" (list)`,
    );
  }

  const slug = checklist.product.toLowerCase().replace(/\s+/g, "-");
  const outputDir = path.resolve(".comfy-qa/.research", slug);
  fs.mkdirSync(outputDir, { recursive: true });

  // ── Phase 1 ──
  const results = await runPhase1(checklist);

  fs.writeFileSync(path.join(outputDir, "research-results.json"), JSON.stringify(results, null, 2));

  // Print scorecard. The box is built from one width so borders and rows stay aligned.
  const margin = "  ";
  const innerWidth = 35;
  const boxRow = (text: string, wideGlyphs = 0): string =>
    // wideGlyphs: characters that occupy two terminal columns but count as one in padEnd.
    `${margin}│${` ${text}`.padEnd(innerWidth - wideGlyphs)}│`;

  console.log(`\n${margin}┌${"─".repeat(innerWidth)}┐`);
  console.log(boxRow(`${checklist.product} QA Results`));
  console.log(boxRow(""));
  for (const f of results.features) {
    const allPassed = f.passed === f.total;
    const icon = allPassed ? "✅" : "⚠ ";
    console.log(boxRow(`${icon} ${f.name.slice(0, 22).padEnd(22)} ${f.score.padStart(5)}`, allPassed ? 1 : 0));
  }
  console.log(boxRow(""));
  console.log(boxRow(`Total: ${results.totalPassed}/${results.totalOperations} (${results.scorePercent}%)`));
  console.log(`${margin}└${"─".repeat(innerWidth)}┘`);

  // ── Phase 2: Generate spec ──
  const specContent = generateSpec(results, checklist);
  const specPath = path.join("demo", `${slug}-qa.spec.ts`);
  fs.mkdirSync(path.dirname(specPath), { recursive: true }); // "demo" may not exist yet
  fs.writeFileSync(specPath, specContent);
  console.log(`\n📝 Spec: ${specPath}`);

  // ── Phase 2: Debug loop (LLM fixes failing spec) ──
  const debugOk = await debugLoop(specPath, 3);

  // ── Phase 3: Record ──
  let recorded = false;
  if (debugOk) {
    const { ok: recordOk } = await runSpec(specPath, "🎬 Phase 3: Record (with video)");
    recorded = recordOk;
    if (recordOk) {
      console.log(`\n✅ Video: .comfy-qa/.demos/${slug}-qa.mp4`);
    } else {
      console.log(`\n⚠ Recording run failed. Re-run manually:`);
      console.log(` bunx playwright test ${specPath}`);
    }
  } else {
    console.log(`\n⚠ Debug failed after retries. Fix spec manually:`);
    console.log(` bunx playwright test ${specPath}`);
  }

  if (recorded) {
    console.log("\n✅ Done.");
  } else {
    // Do not report success when no video was produced.
    process.exitCode = 1;
    console.log("\n⚠ Done, but no video was recorded.");
  }
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});