npm - comfy-qa - Versions diffs - 2.3.0 → 2.4.1 - Mend

comfy-qa 2.3.0 → 2.4.1

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between those package versions as they appear in the public registry.

Files changed (7)

package/package.json +1 -1
package/src/agent/browser-agent.ts +130 -55
package/src/agent/orchestrator.ts +111 -70
package/src/agent/research.ts +1 -45
package/src/recorder/narration.ts +9 -2
package/src/recorder/post-mix.ts +77 -57
package/src/utils/llm.ts +41 -0

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "comfy-qa",
-  "version": "2.3.0",
+  "version": "2.4.1",
   "description": "ComfyUI QA automation CLI",
   "repository": {
     "type": "git",

package/src/agent/browser-agent.ts CHANGED

@@ -1,6 +1,7 @@
 import type { Page } from "playwright";
 import type { RecorderSession } from "../browser/recorder";
-import type { TestScenario, QAChecklistItem } from "./research";
+import type { TestScenario } from "./research";
+import { callLLM } from "../utils/llm";
 /** An action the AI agent decides to take */
 interface AgentAction {
@@ -61,70 +62,30 @@ async function capturePageState(page: Page): Promise<{
   return { screenshot, a11yTree, url, title, consoleErrors: [] };
 }
-/** Ask Claude to decide the next action based on the current page state */
+/** Ask Claude to decide the next action based on the current page state (live fallback) */
 async function askAgentForAction(
   scenario: TestScenario,
   stepIndex: number,
   pageState: { screenshot: string; a11yTree: string; url: string; title: string },
   history: string[]
 ): Promise<AgentAction[]> {
-  const prompt = `You are a QA automation agent controlling a browser via Playwright to test ComfyUI.
+  const prompt = `You are a QA automation agent controlling a browser via Playwright.
-## Current Scenario: ${scenario.name}
-${scenario.description}
-## Test Steps
-${scenario.steps.map((s, i) => `${i === stepIndex ? "👉 " : "  "}${i + 1}. ${s}`).join("\n")}
-## Current Step: ${stepIndex + 1}/${scenario.steps.length}
-"${scenario.steps[stepIndex]}"
-## Expected Outcome
-${scenario.expectedOutcome}
-## Playwright Hint
-${scenario.playwrightHint}
-## Page State
-- URL: ${pageState.url}
-- Title: ${pageState.title}
+## Scenario: ${scenario.name}
+## Step ${stepIndex + 1}/${scenario.steps.length}: "${scenario.steps[stepIndex]}"
+## Playwright Hint: ${scenario.playwrightHint}
+## URL: ${pageState.url}
+## A11y Tree (truncated):
+${pageState.a11yTree.slice(0, 2000)}
+## History: ${history.slice(-5).join("; ") || "(start)"}
-## Accessibility Tree (truncated)
-${pageState.a11yTree.slice(0, 3000)}
-## Action History
-${history.slice(-10).join("\n") || "(start)"}
----
-Return a JSON array of 1-5 actions to execute for this step. Each action:
-{
-  "type": "click" | "type" | "scroll" | "hover" | "wait" | "key" | "done",
-  "selector": "CSS selector or text content to target",
-  "text": "text to type (for type action)",
-  "x": number, "y": number (for coordinate-based click),
-  "key": "key name (for key action, e.g. Enter, Tab)",
-  "ms": milliseconds (for wait action),
-  "observation": "what you expect to see / what you observed"
-}
-Use "done" when the current step is complete and we should move to the next step.
-If ComfyUI is not loaded or the page shows something unexpected, include an observation explaining what you see.
-Return ONLY the JSON array.`;
-  const proc = Bun.spawn(["claude", "--print", "--model", "claude-sonnet-4-6"], {
-    stdin: new TextEncoder().encode(prompt),
-    stdout: "pipe",
-    stderr: "pipe",
-  });
-  const output = await new Response(proc.stdout).text();
-  await proc.exited;
+Return a JSON array of 1-5 actions:
+{"type":"click"|"type"|"scroll"|"hover"|"wait"|"key"|"done","selector":"...","text":"...","key":"...","ms":N,"observation":"..."}
+Use "done" when the step is complete. Return ONLY the JSON array.`;
+  const output = await callLLM(prompt);
   const jsonMatch = output.match(/\[[\s\S]*\]/);
-  if (!jsonMatch) {
-    return [{ type: "done", observation: `Agent could not parse response: ${output.slice(0, 200)}` }];
-  }
+  if (!jsonMatch) return [{ type: "done", observation: `No JSON in response` }];
   try {
     return JSON.parse(jsonMatch[0]) as AgentAction[];
   } catch {
@@ -196,6 +157,120 @@ async function executeAction(page: Page, action: AgentAction): Promise<string> {
   }
 }
+// ─── Pre-planning ──────────────────────────────────────────────────────────
+export interface PlannedStep {
+  stepText: string;
+  actions: AgentAction[];
+}
+export interface PlannedScenario {
+  scenarioIndex: number;
+  scenario: TestScenario;
+  steps: PlannedStep[];
+}
+/** Pre-plan all actions for a scenario before recording starts (one LLM call, no browser) */
+export async function prePlanScenario(
+  scenario: TestScenario,
+  scenarioIndex: number,
+  targetUrl: string,
+): Promise<PlannedScenario> {
+  console.log(`  [plan] Scenario ${scenarioIndex + 1}: ${scenario.name}`);
+  const prompt = `You are a Playwright automation expert. Plan concrete browser actions for this QA scenario.
+Site URL: ${targetUrl}
+## Scenario: ${scenario.name}
+${scenario.description}
+## Playwright Hint
+${scenario.playwrightHint}
+## Preconditions
+${scenario.preconditions.join("\n")}
+## Steps to automate
+${scenario.steps.map((s, i) => `${i + 1}. ${s}`).join("\n")}
+For each step, return concrete Playwright actions. Use real CSS selectors or accessible names from the site.
+Prefer: getByRole, getByLabel, getByText, getByPlaceholder over brittle CSS selectors.
+Express selectors as Playwright locator strings (e.g. "button:has-text('Submit')" or "[data-testid='search']").
+Return a JSON array — one object per step:
+[
+  {
+    "stepText": "exact step text",
+    "actions": [
+      {"type": "click"|"type"|"scroll"|"hover"|"wait"|"key", "selector": "...", "text": "...", "key": "...", "ms": N, "observation": "..."}
+    ]
+  }
+]
+Return ONLY the JSON array. Be specific and actionable.`;
+  const output = await callLLM(prompt);
+  const jsonMatch = output.match(/\[[\s\S]*\]/);
+  if (jsonMatch) {
+    try {
+      const steps = JSON.parse(jsonMatch[0]) as PlannedStep[];
+      return { scenarioIndex, scenario, steps };
+    } catch {}
+  }
+  // Fallback: one wait action per step so recording at least proceeds
+  return {
+    scenarioIndex,
+    scenario,
+    steps: scenario.steps.map((stepText) => ({
+      stepText,
+      actions: [{ type: "wait" as const, ms: 1500, observation: stepText }],
+    })),
+  };
+}
+/** Execute pre-planned actions and return actual elapsed ms per step */
+export async function runScenarioWithPlan(
+  session: RecorderSession,
+  plan: PlannedScenario,
+): Promise<{ success: boolean; log: string[]; stepTimingsMs: number[] }> {
+  const log: string[] = [];
+  const stepTimingsMs: number[] = [];
+  await session.step(`Scenario ${plan.scenarioIndex + 1}: ${plan.scenario.name}`);
+  await session.plan(plan.scenario.description);
+  log.push(`=== Scenario: ${plan.scenario.name} ===`);
+  for (let stepIdx = 0; stepIdx < plan.steps.length; stepIdx++) {
+    const planned = plan.steps[stepIdx];
+    if (!planned) continue;
+    const stepText = planned.stepText;
+    await session.status(`Step ${stepIdx + 1}/${plan.steps.length}: ${stepText}`);
+    log.push(`--- Step ${stepIdx + 1}: ${stepText} ---`);
+    const stepStart = Date.now();
+    for (const action of planned.actions) {
+      if (action.observation) {
+        log.push(`  [observe] ${action.observation}`);
+        await session.annotate(200, 300, action.observation, 1500);
+      }
+      const result = await executeAction(session.page, action);
+      log.push(`  [action] ${result}`);
+      await session.page.waitForTimeout(150); // minimal visual pause
+    }
+    stepTimingsMs.push(Date.now() - stepStart);
+    await session.screenshot(
+      `scenario-${String(plan.scenarioIndex + 1).padStart(2, "0")}-step-${String(stepIdx + 1).padStart(2, "0")}`
+    );
+  }
+  log.push(`=== Scenario complete ===`);
+  return { success: true, log, stepTimingsMs };
+}
 /** Run a full test scenario with AI-driven browser automation */
 export async function runScenarioWithAgent(
   session: RecorderSession,

package/src/agent/orchestrator.ts CHANGED

@@ -4,7 +4,7 @@ import { fetchPR, fetchIssue, parseRef, fetchDeploymentPreviewUrl } from "../uti
 import { detectRunningInstance, bootstrapWorkspace, type ComfyUIInstance, COMFYUI_REPOS, REPO_PROD_URLS } from "../utils/comfyui";
 import { researchPR, researchIssue } from "./research";
 import { startRecorder, navigateWithHUD } from "../browser/recorder";
-import { runScenarioWithAgent, runScenarioResearchOnly } from "./browser-agent";
+import { runScenarioWithAgent, runScenarioResearchOnly, prePlanScenario, runScenarioWithPlan, type PlannedScenario } from "./browser-agent";
 import { saveReport } from "../report/generate";
 import { generateE2ETest } from "../report/e2e-test";
 import { ensureQASkill } from "../utils/qa-skill";
@@ -137,111 +137,152 @@ export async function runQA(opts: QAOptions): Promise<void> {
       }
     }
-    // Pre-generate narration BEFORE recording (so durations are known)
-    const narrationSegments: NarrationSegment[] = [
-      { id: "intro", text: `Welcome to comfy QA. Let's review ${targetType} number ${parsed.number}: ${target.title.slice(0, 100)}` },
-      { id: "github", text: `First, let's look at the GitHub ${targetType} page for context.` },
-      { id: "analysis", text: `Severity ${research.severity}. Affected area: ${research.affectedArea}.` },
-      ...research.testScenarios.flatMap((s, i): NarrationSegment[] => [
-        { id: `scenario-${i + 1}-intro`, text: `Scenario ${i + 1}: ${s.name}. ${s.description}` },
-        ...s.steps.slice(0, 5).map((step, j) => ({
-          id: `scenario-${i + 1}-step-${j + 1}`,
-          text: `Step ${j + 1}: ${step.slice(0, 150)}`,
-        })),
-      ]),
-      { id: "outro", text: `QA session complete. Report and video evidence saved.` },
-    ];
-    const narration = await generateNarration(narrationSegments, outputDir);
+    // [3a] Pre-plan all scenario actions before recording (parallel LLM calls, no browser yet)
+    let plans: PlannedScenario[] = [];
+    if (comfyUrl) {
+      console.log(`\n[3a/5] Pre-planning ${research.testScenarios.length} scenarios…`);
+      plans = await Promise.all(
+        research.testScenarios.map((s, i) => prePlanScenario(s, i, comfyUrl!))
+      );
+      console.log(`  ✓ Plans ready`);
+    }
+    // [3b] Record — execute pre-planned actions (no LLM calls on the hot path)
+    // Narration is generated AFTER recording using real step timings for perfect sync.
     const session = await startRecorder(outputDir, `qa-${parsed.number}`);
-    if (narration) session.attachNarration(narration.durations);
-    const ffmpegStartMs = Date.now();
+    // Timing markers collected during recording, used for narration after
+    const introStartMs = Date.now();
+    let githubDoneMs = 0;
+    let analysisDoneMs = 0;
+    const scenarioStartMs: number[] = [];
+    const stepTimings: number[][] = []; // [scenarioIdx][stepIdx] = elapsed ms
     try {
-      // Screenshot the GitHub page for evidence
-      if (narration) await session.narrate("intro", `Opening ${target.url}`);
-      else await session.step(`Opening ${target.url}`);
+      await session.step(`Opening ${target.url}`);
       await navigateWithHUD(session, target.url, `QA: ${targetType.toUpperCase()} #${parsed.number}`);
       await session.plan(`Analyzing: ${target.title}`);
-      if (narration) await session.narrate("github", "Inspecting GitHub page");
-      else await session.page.waitForTimeout(2000);
-      if (narration) await session.narrate("analysis", `${research.severity} severity`);
+      await session.page.waitForTimeout(2000);
+      githubDoneMs = Date.now();
       await session.screenshot("01-github-page");
+      analysisDoneMs = Date.now();
-      if (comfyUrl) {
-        // ── ComfyUI available: AI agent drives the browser ──
-        console.log(`  [mode] Agent-driven QA against ${comfyUrl}`);
-        await session.step(`Navigating to ComfyUI at ${comfyUrl}`);
-        await navigateWithHUD(session, comfyUrl, `ComfyUI QA — ${targetType.toUpperCase()} #${parsed.number}`);
+      if (comfyUrl && plans.length > 0) {
+        console.log(`  [mode] Pre-planned QA against ${comfyUrl}`);
+        await session.step(`Navigating to ${comfyUrl}`);
+        await navigateWithHUD(session, comfyUrl, `QA — ${targetType.toUpperCase()} #${parsed.number}`);
         await session.page.waitForTimeout(2000);
-        await session.screenshot("02-comfyui-loaded");
-        for (let i = 0; i < research.testScenarios.length; i++) {
-          const scenario = research.testScenarios[i];
-          console.log(`  [scenario ${i + 1}/${research.testScenarios.length}] ${scenario.name}`);
+        await session.screenshot("02-target-loaded");
-          const result = await runScenarioWithAgent(session, scenario, i);
+        for (const plan of plans) {
+          scenarioStartMs.push(Date.now());
+          const result = await runScenarioWithPlan(session, plan);
+          stepTimings.push(result.stepTimingsMs);
           allLogs.push(...result.log);
+          if (plan.scenarioIndex < plans.length - 1) {
+            await session.page.goto(comfyUrl, { waitUntil: "domcontentloaded", timeout: 15000 });
+            await session.page.waitForTimeout(500);
+          }
+        }
+      } else if (comfyUrl) {
+        // Fallback to live agent if pre-planning produced no plans
+        console.log(`  [mode] Live agent QA against ${comfyUrl}`);
+        await navigateWithHUD(session, comfyUrl, `QA — ${targetType.toUpperCase()} #${parsed.number}`);
+        await session.page.waitForTimeout(2000);
+        await session.screenshot("02-target-loaded");
+        for (let i = 0; i < research.testScenarios.length; i++) {
+          scenarioStartMs.push(Date.now());
+          const result = await runScenarioWithAgent(session, research.testScenarios[i]!, i);
+          stepTimings.push([]);
+          allLogs.push(...result.log);
           if (i < research.testScenarios.length - 1) {
             await session.page.goto(comfyUrl, { waitUntil: "domcontentloaded", timeout: 15000 });
-            await session.page.waitForTimeout(1000);
+            await session.page.waitForTimeout(500);
           }
         }
       } else {
-        // ── No ComfyUI: research-only recording with GitHub evidence ──
-        console.log(`  [mode] Research-only (no ComfyUI instance available)`);
+        console.log(`  [mode] Research-only (no target URL)`);
         await session.step("Scrolling through issue details");
         for (let scroll = 0; scroll < 3; scroll++) {
           await session.page.mouse.wheel(0, 400);
           await session.page.waitForTimeout(1000);
         }
         await session.screenshot("02-github-details");
         for (let i = 0; i < research.testScenarios.length; i++) {
-          const scenario = research.testScenarios[i];
-          console.log(`  [scenario ${i + 1}/${research.testScenarios.length}] ${scenario.name} (planned)`);
-          if (narration) {
-            await session.narrate(`scenario-${i + 1}-intro`, `Scenario ${i + 1}: ${scenario.name}`);
-            for (let j = 0; j < Math.min(scenario.steps.length, 5); j++) {
-              await session.narrate(`scenario-${i + 1}-step-${j + 1}`, `Step ${j + 1}: ${scenario.steps[j].slice(0, 80)}`);
-            }
-            await session.screenshot(`scenario-${String(i + 1).padStart(2, "0")}-plan`);
-          } else {
-            const result = await runScenarioResearchOnly(session, scenario, i);
-            allLogs.push(...result.log);
-          }
+          const result = await runScenarioResearchOnly(session, research.testScenarios[i]!, i);
+          allLogs.push(...result.log);
         }
       }
-      if (narration) await session.narrate("outro", "QA Session complete");
-      else await session.step("QA Session complete");
-      await session.status(comfyUrl ? "Agent-driven QA finished" : "Research-only QA finished");
-      await session.page.waitForTimeout(1500);
+      await session.step("QA Session complete");
+      await session.status("QA finished");
+      await session.page.waitForTimeout(1000);
       await session.screenshot("99-final");
       screenshots = session.screenshots;
     } finally {
-      const demoStartMs = session.getDemoStartMs();
       await session.stop();
-      if (bootstrappedInstance) {
-        await bootstrappedInstance.stop();
-      }
+      if (bootstrappedInstance) await bootstrappedInstance.stop();
       const webm = path.join(outputDir, `qa-${parsed.number}.webm`);
       if (fs.existsSync(webm)) videoPath = webm;
+    }
-      // Post-mix narration onto the recorded video
-      if (narration && videoPath) {
-        try {
-          const offsetMs = Math.max(0, demoStartMs - ffmpegStartMs);
-          const finalPath = path.join(outputDir, `qa-${parsed.number}-narrated.mp4`);
-          await postMix(videoPath, narration.trackPath, narration.metaPath, offsetMs, finalPath);
+    // [3c] Generate narration using REAL step timings measured during recording
+    if (videoPath) {
+      const recordingStart = introStartMs;
+      const toVideoMs = (absMs: number) => Math.max(0, absMs - recordingStart);
+      const narrationSegments: NarrationSegment[] = [
+        {
+          id: "intro",
+          text: `Welcome to comfy QA. Reviewing ${targetType} ${parsed.number}: ${target.title.slice(0, 100)}`,
+          startMs: 0,
+        },
+        {
+          id: "github",
+          text: `First, the GitHub ${targetType} page for context.`,
+          startMs: toVideoMs(githubDoneMs),
+        },
+        {
+          id: "analysis",
+          text: `Severity ${research.severity}. Affected area: ${research.affectedArea}.`,
+          startMs: toVideoMs(analysisDoneMs),
+        },
+        ...research.testScenarios.flatMap((s, i): NarrationSegment[] => {
+          const scenStepTimings = stepTimings[i] ?? [];
+          const scenStart = scenarioStartMs[i] ?? analysisDoneMs;
+          // Accumulate step start times within the scenario
+          let stepCursor = toVideoMs(scenStart);
+          return [
+            {
+              id: `scenario-${i + 1}-intro`,
+              text: `Scenario ${i + 1}: ${s.name}. ${s.description.slice(0, 120)}`,
+              startMs: stepCursor,
+            },
+            ...s.steps.slice(0, 5).map((step, j) => {
+              const start = stepCursor;
+              stepCursor += scenStepTimings[j] ?? 2000;
+              return {
+                id: `scenario-${i + 1}-step-${j + 1}`,
+                text: `Step ${j + 1}: ${step.slice(0, 150)}`,
+                startMs: start,
+              };
+            }),
+          ];
+        }),
+        { id: "outro", text: `QA session complete. Report and video evidence saved.`, startMs: toVideoMs(Date.now()) },
+      ];
+      try {
+        console.log(`\n[3d/5] Generating narration from real timings…`);
+        const narration = await generateNarration(narrationSegments, outputDir);
+        if (narration) {
+          const finalPath = path.join(outputDir, `qa-${parsed.number}.mp4`);
+          await postMix(videoPath, narration.trackPath, narration.metaPath, 0, finalPath);
           videoPath = finalPath;
-        } catch (err) {
-          console.log(`  [post-mix] Failed: ${String(err).slice(0, 200)}`);
         }
+      } catch (err) {
+        console.log(`  [narration] Failed: ${String(err).slice(0, 200)}`);
       }
     }

package/src/agent/research.ts CHANGED

@@ -1,49 +1,5 @@
-import { $ } from "bun";
 import type { PRInfo, IssueInfo } from "../utils/github";
-const OPENROUTER_KEY = process.env.OPENROUTER_API_KEY ?? "";
-const OPENROUTER_MODEL = process.env.OPENROUTER_MODEL ?? "openai/gpt-4.5";
-async function callClaude(prompt: string): Promise<string> {
-  // Prefer OpenRouter
-  if (OPENROUTER_KEY) {
-    const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
-      method: "POST",
-      headers: { Authorization: `Bearer ${OPENROUTER_KEY}`, "content-type": "application/json" },
-      body: JSON.stringify({
-        model: OPENROUTER_MODEL,
-        messages: [{ role: "user", content: prompt }],
-        max_tokens: 4096,
-      }),
-    });
-    const json = (await res.json()) as any;
-    if (json.choices?.[0]?.message?.content) return json.choices[0].message.content;
-    console.log(`  ⚠ OpenRouter: ${json.error?.message?.slice(0, 80) ?? "empty response"}`);
-  }
-  // Fallback: Anthropic SDK
-  const apiKey = process.env.ANTHROPIC_API_KEY_QA ?? process.env.ANTHROPIC_API_KEY;
-  if (apiKey) {
-    const Anthropic = (await import("@anthropic-ai/sdk")).default;
-    const client = new Anthropic({ apiKey });
-    const response = await client.messages.create({
-      model: "claude-opus-4-6",
-      max_tokens: 4096,
-      messages: [{ role: "user", content: prompt }],
-    });
-    return response.content[0].type === "text" ? response.content[0].text : "";
-  }
-  // Last resort: claude CLI
-  const proc = Bun.spawn(["claude", "--print", "--model", "claude-opus-4-6"], {
-    stdin: new TextEncoder().encode(prompt),
-    stdout: "pipe",
-    stderr: "pipe",
-  });
-  const output = await new Response(proc.stdout).text();
-  await proc.exited;
-  return output;
-}
+import { callLLM as callClaude } from "../utils/llm";
 /** Extract JSON from Claude response, handling code blocks and markdown wrapping */
 function parseResearchJSON(text: string): ResearchResult {

package/src/recorder/narration.ts CHANGED

@@ -15,6 +15,9 @@ const GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta";
 export interface NarrationSegment {
   id: string;
   text: string;
+  /** When this segment should start in the video (ms from recording start).
+   *  If omitted, segments are concatenated sequentially. */
+  startMs?: number;
 }
 export interface NarrationResult {
@@ -166,9 +169,13 @@ export async function generateNarration(
   const totalDurationMs = meta.reduce((sum, m) => sum + m.durationMs, 0);
-  // Save meta for subtitle generation
+  // Save meta for subtitle generation — include startMs for timed post-mix
   const metaPath = path.join(narrationDir, "meta.json");
-  fs.writeFileSync(metaPath, JSON.stringify({ segments: meta, totalDurationMs }, null, 2));
+  const segmentsWithStart = meta.map((m, i) => ({
+    ...m,
+    startMs: segments[i]?.startMs ?? undefined,
+  }));
+  fs.writeFileSync(metaPath, JSON.stringify({ segments: segmentsWithStart, totalDurationMs }, null, 2));
   console.log(`  [narration] Track: ${trackPath} (${(totalDurationMs / 1000).toFixed(1)}s)`);

package/src/recorder/post-mix.ts CHANGED

@@ -1,72 +1,61 @@
 /**
- * Post-mix narration audio onto the recorded video and burn subtitles.
- * Uses ffmpeg adelay filter for sync — single offset = (demo_start - ffmpeg_start) ms.
+ * Post-mix narration audio onto the recorded video.
  *
- * Inspired by snomiao/playwright-multi-tab — see docs/making-the-demo-video.md
+ * Two modes:
+ *  - Timed: each narration segment placed at its measured video timestamp (startMs).
+ *  - Sequential (fallback): concatenated track delayed by offsetMs.
+ *
+ * Subtitles are embedded as mov_text soft stream (visible in VLC, browsers, Gemini)
+ * AND written as a VTT sidecar for the web player <track> element.
  */
 import { $ } from "bun";
 import * as fs from "fs";
 import * as path from "path";
+interface MetaSegment {
+  id: string;
+  text: string;
+  durationMs: number;
+  startMs?: number;
+}
 interface Meta {
-  segments: { id: string; text: string; durationMs: number }[];
+  segments: MetaSegment[];
   totalDurationMs: number;
 }
-/** Format ms as SRT timestamp HH:MM:SS,mmm */
 function srtTime(ms: number): string {
   const h = Math.floor(ms / 3600000);
   const m = Math.floor((ms % 3600000) / 60000);
   const s = Math.floor((ms % 60000) / 1000);
-  const msr = ms % 1000;
-  return `${String(h).padStart(2, "0")}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")},${String(msr).padStart(3, "0")}`;
+  const r = ms % 1000;
+  return `${String(h).padStart(2,"0")}:${String(m).padStart(2,"0")}:${String(s).padStart(2,"0")},${String(r).padStart(3,"0")}`;
 }
-/** Format ms as WebVTT timestamp HH:MM:SS.mmm */
 function vttTime(ms: number): string {
   return srtTime(ms).replace(",", ".");
 }
-/** Generate SRT subtitle file from meta + initial offset */
-function generateSrt(meta: Meta, offsetMs: number, outPath: string): void {
-  const lines: string[] = [];
-  let cursor = offsetMs;
-  meta.segments.forEach((seg, i) => {
-    const start = cursor;
-    const end = cursor + seg.durationMs;
-    lines.push(String(i + 1));
-    lines.push(`${srtTime(start)} --> ${srtTime(end)}`);
-    lines.push(seg.text);
-    lines.push("");
-    cursor = end;
-  });
-  fs.writeFileSync(outPath, lines.join("\n"));
-}
+/** Write narration.srt + narration.vtt to outDir. Returns srtPath. */
+function writeSubtitleFiles(meta: Meta, fallbackOffsetMs: number, outDir: string): string {
+  const srtLines: string[] = [];
+  const vttLines: string[] = ["WEBVTT", ""];
+  let cursor = fallbackOffsetMs;
-/** Generate WebVTT subtitle file from meta + initial offset (browser-native, no libass) */
-export function generateVtt(meta: Meta, offsetMs: number, outPath: string): void {
-  const lines: string[] = ["WEBVTT", ""];
-  let cursor = offsetMs;
   meta.segments.forEach((seg, i) => {
-    const start = cursor;
-    const end = cursor + seg.durationMs;
-    lines.push(String(i + 1));
-    lines.push(`${vttTime(start)} --> ${vttTime(end)}`);
-    lines.push(seg.text);
-    lines.push("");
+    const start = seg.startMs ?? cursor;
+    const end = start + seg.durationMs;
     cursor = end;
+    srtLines.push(String(i + 1), `${srtTime(start)} --> ${srtTime(end)}`, seg.text, "");
+    vttLines.push(String(i + 1), `${vttTime(start)} --> ${vttTime(end)}`, seg.text, "");
   });
-  fs.writeFileSync(outPath, lines.join("\n"));
+  const srtPath = path.join(outDir, "narration.srt");
+  fs.writeFileSync(srtPath, srtLines.join("\n"));
+  fs.writeFileSync(path.join(outDir, "narration.vtt"), vttLines.join("\n"));
+  return srtPath;
 }
-/**
- * Mix audio + subtitles onto video.
- * @param videoPath path to silent recorded video (webm/mp4)
- * @param trackPath narration_track.wav from generateNarration
- * @param metaPath meta.json from generateNarration
- * @param offsetMs delay to apply to audio (when narration starts in video timeline)
- * @param outPath output video path
- */
 export async function postMix(
   videoPath: string,
   trackPath: string,
@@ -75,24 +64,55 @@ export async function postMix(
   outPath: string
 ): Promise<void> {
   const meta: Meta = JSON.parse(fs.readFileSync(metaPath, "utf-8"));
+  const outDir = path.dirname(outPath);
-  // Step 1: overlay audio with adelay
-  const tmpDir = path.dirname(outPath);
-  const audioMixed = path.join(tmpDir, "_audio-mixed.mp4");
-  const adelay = `${offsetMs}|${offsetMs}`;
+  const timedMode = meta.segments.length > 0 && meta.segments.every((s) => s.startMs != null);
+  console.log(`  [post-mix] Mode: ${timedMode ? "timed" : "sequential"} | ${meta.segments.length} segments`);
+  // Write subtitle files first (SRT embedded into mp4, VTT served as sidecar)
+  const srtPath = writeSubtitleFiles(meta, offsetMs, outDir);
-  console.log(`  [post-mix] Overlaying audio (adelay=${offsetMs}ms)…`);
-  await $`ffmpeg -y -i ${videoPath} -i ${trackPath} -filter_complex ${`[1:a]adelay=${adelay}[aout]`} -map 0:v -map [aout] -c:v libx264 -preset fast -pix_fmt yuv420p -c:a aac -shortest ${audioMixed}`.quiet();
+  if (timedMode) {
+    const narrationDir = path.dirname(trackPath);
+    const segWavs = meta.segments.map((s) => path.join(narrationDir, `${s.id}.wav`));
+    const missing = segWavs.filter((p) => !fs.existsSync(p));
-  // Step 2: generate SRT + WebVTT (browser-native, no libass needed)
-  const srtPath = path.join(tmpDir, "narration.srt");
-  generateSrt(meta, offsetMs, srtPath);
-  const vttPath = path.join(tmpDir, "narration.vtt");
-  generateVtt(meta, offsetMs, vttPath);
-  console.log(`  [post-mix] Subtitles → ${srtPath} + ${vttPath}`);
+    if (missing.length === 0) {
+      // Build per-segment adelay filter
+      const audioInputs = segWavs.flatMap((p) => ["-i", p]);
+      const n = meta.segments.length;
+      const delays = meta.segments.map((s, i) =>
+        `[${i + 1}:a]adelay=${s.startMs}|${s.startMs}[a${i}]`
+      ).join(";");
+      const mixIn = meta.segments.map((_, i) => `[a${i}]`).join("");
+      const audioFilter = `${delays};${mixIn}amix=inputs=${n}:normalize=0[aout]`;
+      const srtInputIdx = n + 1;
-  // Step 3: rename audio-mixed to final output (subtitles served as sidecar .vtt)
-  fs.renameSync(audioMixed, outPath);
+      console.log(`  [post-mix] Timed mix + subtitle embed…`);
+      await $`ffmpeg -y -i ${videoPath} ${audioInputs} -i ${srtPath} \
+        -filter_complex ${audioFilter} \
+        -map 0:v -map [aout] -map ${String(srtInputIdx)}:s \
+        -c:v libx264 -preset fast -pix_fmt yuv420p \
+        -c:a aac -b:a 128k \
+        -c:s mov_text -metadata:s:s:0 language=eng \
+        -shortest ${outPath}`.quiet();
+      console.log(`  [post-mix] ✓ ${outPath}`);
+      return;
+    }
+    console.log(`  [post-mix] Missing ${missing.length} WAVs — falling back to sequential`);
+  }
+  // Sequential mode
+  const adelay = `${offsetMs}|${offsetMs}`;
+  console.log(`  [post-mix] Sequential mix + subtitle embed…`);
+  await $`ffmpeg -y -i ${videoPath} -i ${trackPath} -i ${srtPath} \
+    -filter_complex ${`[1:a]adelay=${adelay}[aout]`} \
+    -map 0:v -map [aout] -map 2:s \
+    -c:v libx264 -preset fast -pix_fmt yuv420p \
+    -c:a aac -b:a 128k \
+    -c:s mov_text -metadata:s:s:0 language=eng \
+    -shortest ${outPath}`.quiet();
-  console.log(`  [post-mix] Final video → ${outPath} (subtitles: ${vttPath})`);
+  console.log(`  [post-mix] ✓ ${outPath}`);
 }

package/src/utils/llm.ts ADDED

@@ -0,0 +1,41 @@
+const OPENROUTER_KEY = process.env.OPENROUTER_API_KEY ?? "";
+const OPENROUTER_MODEL = process.env.OPENROUTER_MODEL ?? "openai/gpt-4.5";
+export async function callLLM(prompt: string): Promise<string> {
+  if (OPENROUTER_KEY) {
+    const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
+      method: "POST",
+      headers: { Authorization: `Bearer ${OPENROUTER_KEY}`, "content-type": "application/json" },
+      body: JSON.stringify({
+        model: OPENROUTER_MODEL,
+        messages: [{ role: "user", content: prompt }],
+        max_tokens: 4096,
+      }),
+    });
+    const json = (await res.json()) as any;
+    if (json.choices?.[0]?.message?.content) return json.choices[0].message.content;
+    console.log(`  ⚠ OpenRouter: ${json.error?.message?.slice(0, 80) ?? "empty response"}`);
+  }
+  const apiKey = process.env.ANTHROPIC_API_KEY_QA ?? process.env.ANTHROPIC_API_KEY;
+  if (apiKey) {
+    const Anthropic = (await import("@anthropic-ai/sdk")).default;
+    const client = new Anthropic({ apiKey });
+    const response = await client.messages.create({
+      model: "claude-opus-4-6",
+      max_tokens: 4096,
+      messages: [{ role: "user", content: prompt }],
+    });
+    const block = response.content[0];
+    return block?.type === "text" ? block.text : "";
+  }
+  const proc = Bun.spawn(["claude", "--print", "--model", "claude-opus-4-6"], {
+    stdin: new TextEncoder().encode(prompt),
+    stdout: "pipe",
+    stderr: "pipe",
+  });
+  const output = await new Response(proc.stdout).text();
+  await proc.exited;
+  return output;
+}