npm - @koda-sl/baker-cli - Versions diffs - 0.70.0 → 0.71.2 - Mend

@koda-sl/baker-cli 0.70.0 → 0.71.2

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (11)

package/README.md +11 -3
package/canvas/video-overlay-composition/index.html +65 -191
package/canvas/video-overlay-composition/meta.json +4 -15
package/dist/{chunk-XFDZVKLF.js → chunk-JIDZ37KG.js} +67 -3
package/dist/chunk-JIDZ37KG.js.map +1 -0
package/dist/cli.js +391 -87
package/dist/cli.js.map +1 -1
package/dist/engine/index.d.ts +14 -0
package/dist/engine/index.js +1 -1
package/package.json +1 -1
package/dist/chunk-XFDZVKLF.js.map +0 -1

package/dist/cli.js (changed)

@@ -9,7 +9,7 @@ import {
   defaultRegistry,
   generateCatalog,
   validateCanvasDeep
-} from "./chunk-XFDZVKLF.js";
+} from "./chunk-JIDZ37KG.js";
 // src/cli.ts
 import { defineCommand as defineCommand138, runMain } from "citty";
@@ -147,9 +147,9 @@ async function handleResponse(response) {
     throw new ApiError("INTERNAL_ERROR", "Failed to parse API response as JSON");
   }
 }
-async function apiGet(path6, params) {
+async function apiGet(path7, params) {
   const env = getEnv();
-  const url = new URL(path6, env.BAKER_API_URL);
+  const url = new URL(path7, env.BAKER_API_URL);
   if (params) {
     const clean = sanitizeParams(params);
     for (const [key, value] of Object.entries(clean)) {
@@ -174,12 +174,12 @@ async function apiGet(path6, params) {
   }
   return handleResponse(response);
 }
-async function apiPost(path6, body, opts) {
+async function apiPost(path7, body, opts) {
   const env = getEnv();
   const timeoutMs = opts?.timeoutMs ?? 6e4;
   let response;
   try {
-    response = await fetchWithRateLimitRetry(new URL(path6, env.BAKER_API_URL).toString(), {
+    response = await fetchWithRateLimitRetry(new URL(path7, env.BAKER_API_URL).toString(), {
       method: "POST",
       headers: {
         Authorization: `Bearer ${env.BAKER_API_KEY}`,
@@ -918,31 +918,31 @@ function cachePath(category, key) {
   return join(dir, `${hashKey(key)}.json`);
 }
 function cacheGet(category, key) {
-  const path6 = cachePath(category, key);
-  if (!existsSync(path6)) {
+  const path7 = cachePath(category, key);
+  if (!existsSync(path7)) {
     return null;
   }
   try {
-    const raw = readFileSync(path6, "utf-8");
+    const raw = readFileSync(path7, "utf-8");
     const entry = JSON.parse(raw);
     if (entry.expiresAt < Date.now()) {
-      rmSync(path6, { force: true });
+      rmSync(path7, { force: true });
       return null;
     }
     return entry;
   } catch {
-    rmSync(path6, { force: true });
+    rmSync(path7, { force: true });
     return null;
   }
 }
 function cacheSet(category, key, data, ttlMs, fields) {
-  const path6 = cachePath(category, key);
+  const path7 = cachePath(category, key);
   const entry = {
     expiresAt: Date.now() + ttlMs,
     data,
     fields
   };
-  writeFileSync(path6, JSON.stringify(entry), "utf-8");
+  writeFileSync(path7, JSON.stringify(entry), "utf-8");
 }
 var HOUR = 60 * 60 * 1e3;
 var MINUTE = 60 * 1e3;
@@ -7714,6 +7714,24 @@ async function probeDuration(filePath) {
 import { readFile as readFile2 } from "fs/promises";
 import path2 from "path";
 import { defineCommand as defineCommand74 } from "citty";
+// src/commands/canvas/placeholders.ts
+function unsuppliedPlaceholderAssets(canvas) {
+  const nodes = canvas?.nodes;
+  if (!Array.isArray(nodes)) return [];
+  const out = [];
+  for (const n of nodes) {
+    const node = n;
+    if (node?.type !== "ingest") continue;
+    const params = node.params;
+    if (params?.source === "path" && typeof params.path === "string" && params.path.includes("[TODO")) {
+      out.push({ node: String(node.id ?? "?"), placeholder: params.path });
+    }
+  }
+  return out;
+}
+// src/commands/canvas/run.ts
 var runCommand = defineCommand74({
   meta: { name: "run", description: "Validate and execute a canvas JSON file." },
   args: {
@@ -7735,6 +7753,25 @@ var runCommand = defineCommand74({
 `);
       process.exit(2);
     }
+    const pending = unsuppliedPlaceholderAssets(parsed);
+    if (pending.length > 0) {
+      process.stderr.write(
+        `${JSON.stringify(
+          {
+            ok: false,
+            error: {
+              code: "unsupplied_assets",
+              message: "This canvas still has placeholder asset slots \u2014 supply a real image (or video) at each before running. Each `el_*` ingest is a [TODO] you fill with the real logo/subject/product, then re-run.",
+              assets: pending
+            }
+          },
+          null,
+          2
+        )}
+`
+      );
+      process.exit(2);
+    }
     const engine = createEngineFromEnv({
       cacheDir: args["cache-dir"] ? String(args["cache-dir"]) : void 0,
       outputsDir: args["outputs-dir"] ? String(args["outputs-dir"]) : void 0,
@@ -8192,7 +8229,7 @@ var scaffoldStaticAdCommand = defineCommand75({
 // src/commands/canvas/scaffold-video.ts
 import { cp, mkdir, readFile as readFile4, writeFile as writeFile2 } from "fs/promises";
-import path4 from "path";
+import path5 from "path";
 import { defineCommand as defineCommand76 } from "citty";
 // src/engine/scaffold/video.ts
@@ -8200,7 +8237,19 @@ import { z as z3 } from "zod";
 var FIXED_TTS_MODEL = "elevenlabs/eleven_v3";
 var FIXED_SFX_MODEL = "elevenlabs/eleven_text_to_sound_v2";
 var FIXED_MUSIC_MODEL = "elevenlabs/music-v1";
+var FIXED_LIPSYNC_MODEL = "fal/veed-lipsync";
 var MUSIC_BED_GAIN_DB = -12;
+var NARRATOR_SPEAKERS = /* @__PURE__ */ new Set([
+  "voiceover",
+  "voice_over",
+  "narrator",
+  "narration",
+  "vo",
+  "announcer",
+  "off_screen",
+  "offscreen",
+  "off-screen"
+]);
 var SHARED_ASPECT_RATIOS = /* @__PURE__ */ new Set(["1:1", "16:9", "9:16", "4:3", "3:4", "21:9"]);
 var EDGES = ["start", "end"];
 function snapToSeedance(durationS) {
@@ -8216,11 +8265,39 @@ function snapToSeedance(durationS) {
   }
   return best;
 }
+function ceilToSeedance(durationS) {
+  const max = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
+  if (!Number.isFinite(durationS) || durationS <= 0) return SEEDANCE_DURATIONS[0];
+  for (const d of SEEDANCE_DURATIONS) if (d >= durationS) return d;
+  return max;
+}
+function sceneDurationS(scene) {
+  const raw = scene.duration_s ?? (scene.end_s != null && scene.start_s != null ? scene.end_s - scene.start_s : 5);
+  const max = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
+  return Math.min(Math.max(raw, 0.5), max);
+}
+function trimArgs(durationS) {
+  return [
+    "-i",
+    "{{in.clip}}",
+    "-t",
+    durationS.toFixed(3),
+    "-an",
+    "-c:v",
+    "libx264",
+    "-pix_fmt",
+    "yuv420p",
+    "{{out.video}}"
+  ];
+}
 var FrameAsset = z3.object({ url: z3.string().optional() }).loose().optional();
 var DialogueLine = z3.object({
   speaker: z3.string().optional(),
   line: z3.string().optional(),
+  // Absolute seconds on the source timeline (the deconstruct emits both).
   start_s: z3.number().optional(),
+  end_s: z3.number().optional(),
+  delivery: z3.string().optional(),
   voice_description: z3.string().optional()
 }).loose();
 var Sfx = z3.object({
@@ -8260,7 +8337,12 @@ var VideoBlueprint = z3.object({
       identified_track: z3.object({ title: z3.string().optional(), artist: z3.string().optional() }).loose().nullish()
     }).loose().optional(),
     cast: z3.array(z3.object({ id: z3.string().optional(), description: z3.string().optional() }).loose()).optional(),
-    voiceover: z3.object({ voice_description: z3.string().optional() }).loose().optional()
+    voiceover: z3.object({
+      // on_camera | mixed → mouths are on screen (lip-sync candidates);
+      // voiceover | none → narration over the picture (no lip-sync).
+      mode: z3.string().optional(),
+      voice_description: z3.string().optional()
+    }).loose().optional()
   }).loose().optional(),
   scenes: z3.array(Scene).min(1)
 }).loose();
@@ -8392,7 +8474,17 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor) {
   ].join("\n");
   const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
   return [
-    `Render the ${EDGE} frame of scene ${sceneIndex + 1} as a single still image. This prompt is self-contained and edit-per-frame: change the FRAME DESCRIPTION below to alter ONLY this frame. EXCLUDE all overlay text, captions, stickers, and watermarks \u2014 they are composited on top later.`,
+    `Render the ${EDGE} frame of scene ${sceneIndex + 1} as a single still image. This prompt is self-contained and edit-per-frame: change the FRAME DESCRIPTION below to alter ONLY this frame.`,
+    "",
+    "CRITICAL \u2014 RENDER A CLEAN PLATE WITH ZERO TEXT OR GRAPHICS:",
+    "This frame is a background plate. ALL words, captions, headlines, lower-third bars,",
+    "news tickers/crawls, chyrons, on-screen logos/wordmarks, station bugs, watermarks,",
+    "subtitles, UI and numbers are added afterwards as a separate HTML layer. Render NONE",
+    "of them \u2014 no legible text anywhere in the image, not even in the background, on the",
+    "news desk, on screens, or as part of a 'broadcast look'. If a reference image (a logo,",
+    "a desk, a studio) contains any text or graphics, DO NOT reproduce that text \u2014 render",
+    "the subject/scene only, with blank surfaces where text would be. Imperfect/garbled",
+    "letterforms are the worst outcome; leave those areas clean.",
     "",
     "REFERENCE IMAGES (in the order provided):",
     legend,
@@ -8400,7 +8492,7 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor) {
     "FRAME DESCRIPTION (this frame's editable prompt):",
     description,
     "",
-    "Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared cast identity, palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above.",
+    "Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared cast identity, palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above. Again: NO rendered text or graphic overlays \u2014 clean plate only.",
     "",
     "GLOBAL STYLE REFERENCE (shared across frames; not this frame's content):",
     "{{target_blueprint}}"
@@ -8451,7 +8543,7 @@ function buildSeedancePrompt(scene, sceneIndex, present) {
   if (transcript) parts.push(`Transcript: ${transcript}`);
   return parts.join("\n");
 }
-function buildSceneVisuals(blueprint, slots, opts, nodes) {
+function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
   const ar = aspectRatioParam(blueprint);
   const reuse = opts.frames === "reuse";
   const clipRefs = [];
@@ -8473,10 +8565,11 @@ function buildSceneVisuals(blueprint, slots, opts, nodes) {
       ctx,
       nodes
     );
+    const dur = sceneDurationS(scene);
     const clipParams = {
       model: opts.videoModel,
       prompt: buildSeedancePrompt(scene, i, slotsForScene(slots, i)),
-      duration: snapToSeedance(scene.duration_s ?? 5)
+      duration: ceilToSeedance(dur)
     };
     if (ar) clipParams.aspect_ratio = ar;
     nodes.push({
@@ -8485,7 +8578,29 @@ function buildSceneVisuals(blueprint, slots, opts, nodes) {
       inputs: { first_frame: firstFrame, last_frame: lastFrame },
       params: clipParams
     });
-    clipRefs.push(`$ref:s${i}_clip.video`);
+    let base = `$ref:s${i}_clip.video`;
+    const onCam = (sceneTurns.get(i) ?? []).filter((t) => t.onCamera);
+    const solo = onCam.length === 1 ? onCam[0] : void 0;
+    if (solo) {
+      nodes.push({
+        id: `s${i}_lipsync`,
+        type: "video_lipsync",
+        inputs: { video: base, audio: solo.audioRef },
+        params: { model: FIXED_LIPSYNC_MODEL }
+      });
+      base = `$ref:s${i}_lipsync.video`;
+    }
+    if (ceilToSeedance(dur) === dur) {
+      clipRefs.push(base);
+    } else {
+      nodes.push({
+        id: `s${i}_trim`,
+        type: "ffmpeg",
+        inputs: { clip: base },
+        params: { args: trimArgs(dur), outputs: { video: { kind: "video", ext: "mp4" } } }
+      });
+      clipRefs.push(`$ref:s${i}_trim.video`);
+    }
   });
   return clipRefs;
 }
@@ -8498,8 +8613,21 @@ function musicBedPrompt(blueprint, musicPrompt) {
 Reference vibe: the original used "${title}"${by} (identified via AudD). Match its mood, tempo, and energy with ORIGINAL music \u2014 do not reproduce the track.`;
 }
-function buildAudio(blueprint, nodes) {
+function onCameraDialogue(blueprint) {
+  const mode = blueprint.global?.voiceover?.mode;
+  return mode !== "voiceover" && mode !== "none";
+}
+var castIdSet = (blueprint) => new Set((blueprint.global?.cast ?? []).map((c) => c.id).filter((id) => Boolean(id)));
+function isOnCameraSpeaker(speaker, casts, cameraOn) {
+  if (!cameraOn) return false;
+  if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
+  return casts.has(speaker);
+}
+function buildDialogue(blueprint, nodes) {
   const tracks = [];
+  const sceneTurns = /* @__PURE__ */ new Map();
+  const casts = castIdSet(blueprint);
+  const cameraOn = onCameraDialogue(blueprint);
   const voiceNodeBySpeaker = /* @__PURE__ */ new Map();
   const speakerDescription = (speaker) => {
     for (const scene of blueprint.scenes) {
@@ -8518,41 +8646,62 @@ function buildAudio(blueprint, nodes) {
     voiceNodeBySpeaker.set(speaker, id);
     return id;
   };
-  const scriptBySpeaker = /* @__PURE__ */ new Map();
-  const orderedSpeakers = [];
-  for (const scene of blueprint.scenes) {
-    for (const line of scene.dialogue ?? []) {
-      if (!line.line) continue;
-      const speaker = line.speaker ?? "voiceover";
-      const start = line.start_s ?? scene.start_s ?? 0;
-      const existing = scriptBySpeaker.get(speaker);
-      if (existing) {
-        existing.lines.push(line.line);
-        existing.start = Math.min(existing.start, start);
-      } else {
-        scriptBySpeaker.set(speaker, { lines: [line.line], start });
-        orderedSpeakers.push(speaker);
-      }
-    }
-  }
   const usedVoIds = /* @__PURE__ */ new Set();
-  orderedSpeakers.forEach((speaker, idx) => {
-    const script = scriptBySpeaker.get(speaker);
-    if (!script) return;
-    const voiceNode = ensureVoiceNode(speaker);
-    let id = sanitizeId2(`vo_${speaker}`, `vo_${idx}`);
-    while (usedVoIds.has(id)) id = `${id}_${idx}`;
-    usedVoIds.add(id);
-    nodes.push({
-      id,
-      type: "tts",
-      inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
-      // Join lines with a space: each line keeps its own terminal punctuation, so
-      // sentence boundaries (and the pauses they imply) survive into one read.
-      params: { model: FIXED_TTS_MODEL, text: script.lines.join(" "), voice: "{{voice_ref}}" }
+  blueprint.scenes.forEach((scene, sceneIndex) => {
+    const lines = (scene.dialogue ?? []).filter((l) => Boolean(l.line?.trim())).slice().sort((a, b) => (a.start_s ?? 0) - (b.start_s ?? 0));
+    if (lines.length === 0) return;
+    const groups = [];
+    for (const line of lines) {
+      const speaker = line.speaker ?? "voiceover";
+      const last = groups[groups.length - 1];
+      if (last && last.speaker === speaker) last.lines.push(line);
+      else groups.push({ speaker, lines: [line] });
+    }
+    const list = [];
+    groups.forEach((group, gi) => {
+      const first = group.lines[0];
+      const last = group.lines[group.lines.length - 1];
+      if (!first || !last) return;
+      const start = first.start_s ?? scene.start_s ?? 0;
+      const end = last.end_s ?? last.start_s ?? scene.end_s ?? start;
+      const voiceNode = ensureVoiceNode(group.speaker);
+      let id = sanitizeId2(`vo_s${sceneIndex}_${group.speaker}`, `vo_${sceneIndex}_${gi}`);
+      if (usedVoIds.has(id)) {
+        let n = 2;
+        while (usedVoIds.has(`${id}_${n}`)) n++;
+        id = `${id}_${n}`;
+      }
+      usedVoIds.add(id);
+      nodes.push({
+        id,
+        type: "tts",
+        inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
+        // Lines join with a space; each keeps its terminal punctuation so eleven_v3
+        // reads the sentence boundaries (and their pauses) within the one turn.
+        params: {
+          model: FIXED_TTS_MODEL,
+          text: group.lines.map((l) => l.line.trim()).join(" "),
+          voice: "{{voice_ref}}"
+        }
+      });
+      const turn = {
+        sceneIndex,
+        speaker: group.speaker,
+        onCamera: isOnCameraSpeaker(group.speaker, casts, cameraOn),
+        start_s: start,
+        end_s: end,
+        ttsId: id,
+        audioRef: `$ref:${id}.audio`
+      };
+      list.push(turn);
+      tracks.push({ slot: id, ref: turn.audioRef, start_s: start, end_s: end, kind: "vo" });
     });
-    tracks.push({ slot: id, ref: `$ref:${id}.audio`, start_s: script.start });
+    sceneTurns.set(sceneIndex, list);
   });
+  return { tracks, sceneTurns };
+}
+function buildSfxMusic(blueprint, nodes) {
+  const tracks = [];
   blueprint.scenes.forEach((scene, i) => {
     (scene.sfx ?? []).forEach((sfx, k) => {
       const text = sfx.sound_effect_prompt ?? sfx.description;
@@ -8561,7 +8710,12 @@ function buildAudio(blueprint, nodes) {
       const params = { model: FIXED_SFX_MODEL, text };
       if (typeof sfx.duration_s === "number") params.duration_seconds = Math.min(Math.max(sfx.duration_s, 0.5), 30);
       nodes.push({ id, type: "sound_effect", params });
-      tracks.push({ slot: `sfx_s${i}_${k}`, ref: `$ref:${id}.audio`, start_s: sfx.at_s ?? scene.start_s ?? 0 });
+      tracks.push({
+        slot: `sfx_s${i}_${k}`,
+        ref: `$ref:${id}.audio`,
+        start_s: sfx.at_s ?? scene.start_s ?? 0,
+        kind: "sfx"
+      });
     });
   });
   const musicPrompt = blueprint.global?.music?.music_prompt;
@@ -8573,10 +8727,90 @@ function buildAudio(blueprint, nodes) {
       type: "music",
       params: { model: FIXED_MUSIC_MODEL, prompt: musicBedPrompt(blueprint, musicPrompt), music_length_ms: musicMs }
     });
-    tracks.unshift({ slot: "music", ref: "$ref:music_bed.audio", start_s: 0, gain_db: MUSIC_BED_GAIN_DB });
+    tracks.push({ slot: "music", ref: "$ref:music_bed.audio", start_s: 0, gain_db: MUSIC_BED_GAIN_DB, kind: "music" });
   }
   return tracks;
 }
+var OverlayStyle = z3.object({ color_hex: z3.string().optional(), background: z3.string().optional(), size: z3.string().optional() }).loose();
+var Overlay = z3.object({
+  text: z3.string().optional(),
+  appears_at_s: z3.number().optional(),
+  duration_s: z3.number().optional(),
+  position: z3.string().optional(),
+  role: z3.string().optional(),
+  animation: z3.string().optional(),
+  animation_detail: z3.string().optional(),
+  style: OverlayStyle.optional()
+}).loose();
+var FloatingElement = z3.object({
+  kind: z3.string().optional(),
+  description: z3.string().optional(),
+  brand_name: z3.string().nullish(),
+  what_it_represents: z3.string().optional(),
+  appears_at_s: z3.number().optional(),
+  duration_s: z3.number().optional(),
+  position: z3.string().optional()
+}).loose();
+function escapeHtml(s) {
+  return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
+}
+function commentSafe(s) {
+  return escapeHtml(s).replace(/-{2,}/g, "\u2013");
+}
+var SUPPORTED_ANIMS = /* @__PURE__ */ new Set(["fade", "slide_up", "slide_down", "pop"]);
+function normalizeAnim(animation) {
+  if (!animation || animation === "none") return void 0;
+  const mapped = animation === "slide" ? "slide_up" : animation;
+  return SUPPORTED_ANIMS.has(mapped) ? mapped : void 0;
+}
+function positionClass(position) {
+  const p = (position ?? "bottom_center").toLowerCase().replace(/[^a-z]+/g, "-");
+  return `pos-${p}`;
+}
+function overlayElement(ov, sceneStart) {
+  if (!ov.text?.trim()) return "";
+  const at = ov.appears_at_s ?? sceneStart;
+  const dur = ov.duration_s ?? 2.5;
+  const role = ov.role ? ` data-role="${escapeHtml(ov.role)}"` : "";
+  const normAnim = normalizeAnim(ov.animation);
+  const anim = normAnim ? ` data-anim="${normAnim}"` : "";
+  const detail = ov.animation_detail ? ` data-anim-detail="${escapeHtml(ov.animation_detail)}"` : "";
+  return `<div class="ov ${positionClass(ov.position)}" data-start="${at}" data-dur="${dur}"${role}${anim}${detail}>${escapeHtml(ov.text.trim())}</div>`;
+}
+function floatingStub(fe, sceneStart) {
+  const at = fe.appears_at_s ?? sceneStart;
+  const dur = fe.duration_s ?? 2.5;
+  const kind = commentSafe(fe.kind ?? "element");
+  const label = commentSafe(fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element");
+  const slug = (fe.kind ?? "element").toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "") || "element";
+  return [
+    `<!-- ${kind}: ${label} @ ${at}s for ${dur}s (${positionClass(fe.position)}). Drop an image in this dir and uncomment:`,
+    `<img class="ov ${positionClass(fe.position)}" src="your-${slug}.png" data-start="${at}" data-dur="${dur}" alt="" /> -->`
+  ].join("\n");
+}
+function buildOverlayHtml(input) {
+  const blueprint = VideoBlueprint.parse(input);
+  const blocks = [
+    [
+      "<!-- \u2B07 OVERLAY LAYER \u2014 this is YOUR HTML to paint. The reference's overlays are",
+      "     seeded below as plain elements (text + position class + data-start/data-dur).",
+      "     Restyle freely in <style>, regroup, animate, swap a logo placeholder for a",
+      "     real <img> you drop in this dir. The runtime only shows/hides by timestamp;",
+      "     it makes NO styling decisions. Positions: edit the .pos-* classes or add your own. -->"
+    ].join("\n")
+  ];
+  for (const scene of blueprint.scenes) {
+    const sceneStart = scene.start_s ?? 0;
+    const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
+    const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
+    const parts = [
+      ...overlays.success ? overlays.data.map((ov) => overlayElement(ov, sceneStart)) : [],
+      ...floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []
+    ].filter(Boolean);
+    if (parts.length > 0) blocks.push(parts.join("\n"));
+  }
+  return blocks.join("\n\n");
+}
 function lastSceneEnd(blueprint) {
   let end = 0;
   for (const s of blueprint.scenes) end = Math.max(end, s.end_s ?? 0);
@@ -8608,7 +8842,8 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
       params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
     });
   });
-  const clipRefs = buildSceneVisuals(blueprint, slots, opts, nodes);
+  const { tracks: voTracks, sceneTurns } = buildDialogue(blueprint, nodes);
+  const clipRefs = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
   const concatInputs = {};
   clipRefs.forEach((ref, i) => {
     concatInputs[`c${i}`] = ref;
@@ -8628,12 +8863,12 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
       id: "overlaid",
       type: "hyperframe_render",
       inputs: { background: videoRef },
-      params: { composition: opts.overlayCompositionPath, overlays, floating_elements: floating }
+      params: { composition: opts.overlayCompositionPath }
     });
     videoRef = "$ref:overlaid.video";
     videoNode = "overlaid";
   }
-  const tracks = buildAudio(blueprint, nodes);
+  const tracks = [...voTracks, ...buildSfxMusic(blueprint, nodes)];
   if (tracks.length > 0) {
     const mixInputs = {};
     for (const t of tracks) mixInputs[t.slot] = t.ref;
@@ -8666,8 +8901,18 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
           "1:a:0",
           "-c:v",
           "copy",
+          // The raw mix is a quiet mono track (tts + ducked bed), which reads as
+          // "no sound" in casual players. Normalize integrated loudness to the
+          // social/broadcast target (-14 LUFS, -1.5 dBTP) and upmix to stereo so
+          // every rendered ad is loud and plays everywhere.
+          "-af",
+          "loudnorm=I=-14:TP=-1.5:LRA=11,aformat=channel_layouts=stereo",
           "-c:a",
           "aac",
+          "-b:a",
+          "192k",
+          "-ar",
+          "48000",
           "-shortest",
           "{{out.video}}"
         ],
@@ -8681,43 +8926,69 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
     metadata: {
       name: "video reproduction",
       description: VIDEO_GUIDE,
-      todo: buildVideoTodo(videoReport(input, elementsInput), overlays.length, floating.length, opts)
+      todo: buildVideoTodo(videoReport(input, elementsInput), overlays.length, floating.length, opts),
+      // The timing plan `baker canvas validate` checks before any billed render:
+      // sequenced voiceover turns (no overlap), audio ≈ video length, and which
+      // scenes must be lip-synced.
+      video: buildVideoMeta(blueprint, sceneTurns)
     },
     nodes,
     output: { node: videoNode, output: "video" }
   };
 }
+function buildVideoMeta(blueprint, sceneTurns) {
+  const vo_segments = [];
+  const talking_scenes = [];
+  for (const [scene, turns] of [...sceneTurns.entries()].sort((a, b) => a[0] - b[0])) {
+    for (const t of turns) {
+      vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
+    }
+    if (turns.filter((t) => t.onCamera).length === 1) {
+      talking_scenes.push({ scene, lipsync_node: `s${scene}_lipsync` });
+    }
+  }
+  return {
+    duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
+    vo_segments,
+    talking_scenes
+  };
+}
 var VIDEO_GUIDE = [
-  "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Each scene is two AI-generated boundary frames (start/end) animated into a clip, concatenated, overlaid with timed text, and mixed with voiceover + SFX + music. Edit it, supply the real assets, then run.",
+  "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192 optional lip-sync \u2192 concatenated. On-screen text is a separate HTML layer you paint; audio is sequenced voiceover + SFX + a ducked music bed, normalized stereo. It is a DRAFT: edit it, supply the real assets, then validate and run.",
   "",
   "WHAT TO DO NEXT:",
   "1. Edit each frame's prompt IN PLACE. Every `s<i>_start` / `s<i>_end` node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want.",
-  "2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent.",
-  "3. Confirm the `voice_select` casting (one per speaker). The voiceover is ONE continuous `tts` per speaker \u2014 punctuation, ALL-CAPS, and spacing are read verbatim by eleven_v3 for emphasis/pauses, so edit `params.text` to shape delivery.",
-  "4. Text overlays are composited on top (not baked into frames) by the `overlaid` node \u2014 edit the `overlays` array there. For on-brand type, drop `brand-bold.otf` / `brand-regular.otf` into the `video-overlay-composition/` dir (referenced via @font-face); otherwise a system font is used. You don't always need text \u2014 it's often cleaner to overlay it than bake it in.",
-  "5. `baker canvas validate` then `baker canvas run`. Running generates many billed image/video/audio assets \u2014 it is not free.",
+  "2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent. `baker canvas run` REFUSES to start until every `[TODO]` slot holds a real source \u2014 so this is mandatory, not optional.",
+  "3. Confirm the `voice_select` casting (one per speaker). Voiceover is SEQUENCED: each contiguous same-speaker turn is its own `tts` placed at its real start, so dialogue alternates instead of stacking. Edit a turn's `params.text` (punctuation / ALL-CAPS / line breaks are read verbatim by eleven_v3 for emphasis and pauses) to shape delivery; re-author the words to be TRUE for your brand.",
+  "4. Lip-sync: scenes with a single on-camera speaker route their clip through `video_lipsync` (~20 cr each) so the mouth matches the line. Two-speaker scenes are left un-synced \u2014 split them or pick a primary speaker if you want sync. Drop the node to skip.",
+  "5. Overlays are REAL HTML you paint. Open `video-overlay-composition/index.html`: the reference's overlays are seeded inside `#overlay-root` as plain elements (text + a `.pos-*` class + `data-start`/`data-dur`). Restyle the CSS freely \u2014 build lower-thirds, a ticker, whatever the look needs \u2014 and replace a logo placeholder with a real `<img>` you drop in that dir. The runtime only shows/hides by timestamp; it makes no styling decisions. Drop `brand-bold.otf` / `brand-regular.otf` there for on-brand type.",
+  "6. `baker canvas validate` (proves audio/lip-sync timing for free) then `baker canvas run` (generates many billed image/video/audio assets \u2014 not free).",
   "",
   "Tip: `prompt.json` is the deconstruction provenance + the demoted GLOBAL STYLE REFERENCE each frame reads for shared palette/cast cohesion. It is NOT the per-frame editing surface \u2014 the frame nodes are."
 ].join("\n");
 function buildVideoTodo(report, overlayCount, floatingCount, opts) {
   return {
-    edit_frames_in_place: "Each s<i>_start / s<i>_end node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is only a shared style reference.",
+    edit_frames_in_place: "Each s<i>_start / s<i>_end node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is only a shared style reference. Frames are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
     frames_mode: opts.frames ?? "generate",
+    assets_required: "MANDATORY: drop a real image at every el_* [TODO] ingest before running \u2014 `baker canvas run` refuses to start (and bills nothing) until each placeholder holds a real source.",
     recurring_elements_to_supply: report.elements,
+    text_strategy: "Decide per ad: text is either baked by the generated creative OR painted via the overlay HTML \u2014 not both. Default here is clean text-free frames + the HTML overlay layer (video-overlay-composition/index.html) as the single text source, which you fully control.",
+    timeline: "Automatic: each clip is generated at >= its scene length then trimmed back to the real scene duration, so the concatenated picture stays on the same timeline as the absolute-timed audio (this is what makes the lips line up). You don't manage it.",
     voices_to_confirm: report.dialogue.map((d) => ({
       scene: d.scene,
       speaker: d.speaker,
       voice_description: d.voice_description,
       line: d.line
     })),
-    voiceover_note: "One continuous tts per speaker; same voice locked via voice_select.voice_id. Use punctuation / ALL CAPS / line breaks in params.text for emphasis and pacing (read verbatim).",
+    voiceover_note: "Sequenced: one tts per contiguous same-speaker TURN, placed at its real start_s so turns alternate (no parallel monologues); same voice locked via voice_select.voice_id. Edit a turn's params.text (punctuation / ALL CAPS / line breaks read verbatim) to shape delivery.",
+    lip_sync_note: "Scenes with a single on-camera speaker route their clip through video_lipsync (~20 cr each) so the mouth matches the line. Two-speaker scenes are left un-synced (one track can't drive two faces) \u2014 split or pick a primary. `baker canvas validate` checks every talking scene is synced.",
     text_overlays: {
       count: overlayCount,
-      note: "Composited by the `overlaid` node, animated per the blueprint (fade/pop/slide/typewriter/karaoke). Edit the `overlays` array. Drop brand-*.otf into video-overlay-composition/ for on-brand type."
+      note: "Seeded as editable HTML inside `#overlay-root` in video-overlay-composition/index.html (text + a .pos-* class + data-start/data-dur). PAINT it: restyle the CSS, build lower-thirds/tickers, drop brand-*.otf for on-brand type. The runtime only shows/hides by timestamp."
     },
     floating_elements: {
       count: floatingCount,
-      note: floatingCount > 0 ? "Rendered as labeled placeholders. Replace with the real logo/sticker/cutout art (recurring logos are better handled as an el_* element baked into frames)." : "none detected"
+      note: floatingCount > 0 ? "Seeded as labeled placeholders in index.html \u2014 replace each with a real <img> you drop into video-overlay-composition/. Recurring logos are also handled well as an el_* element baked into frames." : "none detected"
     },
     sound_effects: { count: report.sfx_count },
     music: {
@@ -8769,8 +9040,24 @@ function videoReport(input, elementsInput) {
   };
 }
+// src/commands/canvas/composition-path.ts
+import { existsSync as existsSync3 } from "fs";
+import path4 from "path";
+function resolveShippedCanvasDir(name, startDir, exists = existsSync3, maxDepth = 8) { // Locate the `canvas/<name>` directory by searching startDir and its ancestors; `exists` (default existsSync3) and `maxDepth` (default 8) can be overridden by the caller.
+  const rel = path4.join("canvas", name); // relative path being searched for: canvas/<name>
+  let dir = startDir; // directory currently being probed; moves one level up per iteration
+  for (let i = 0; i < maxDepth; i++) { // probe startDir first, then its ancestors, at most maxDepth directories in total
+    const candidate = path4.join(dir, rel);
+    if (exists(path4.join(candidate, "meta.json"))) return candidate; // a candidate is accepted only if it contains meta.json
+    const parent = path4.dirname(dir);
+    if (parent === dir) break; // dirname returned its own input, so there is no higher directory to try
+    dir = parent;
+  }
+  return path4.resolve(startDir, "../../../", rel); // nothing matched: fall back to canvas/<name> three levels above startDir (returned without an existence check)
+}
 // src/commands/canvas/scaffold-video.ts
-var SHIPPED_COMPOSITION_DIR = path4.resolve(import.meta.dirname, "../../../canvas/video-overlay-composition");
+var SHIPPED_COMPOSITION_DIR = resolveShippedCanvasDir("video-overlay-composition", import.meta.dirname); // computed once when the module is evaluated; the search starts from this module's own directory (import.meta.dirname)
 function resolveModel2(kind, preferred) {
   const ids = Object.keys(MODEL_REGISTRY[kind]);
   return ids.includes(preferred) ? preferred : ids[0] ?? preferred;
@@ -8891,13 +9178,19 @@ var scaffoldVideoCommand = defineCommand76({
     "video-model": { type: "string", description: "Override the video_generate model id for clips" }
   },
   async run({ args }) {
-    const videoPath = path4.resolve(String(args.file));
-    const base = path4.basename(videoPath, path4.extname(videoPath));
-    const outPath = args.out ? path4.resolve(String(args.out)) : path4.join(path4.dirname(videoPath), `${base}.video.canvas.json`);
-    const outDir = path4.dirname(outPath);
-    const blueprintPath = path4.join(outDir, "prompt.json");
+    const videoPath = path5.resolve(String(args.file));
+    const base = path5.basename(videoPath, path5.extname(videoPath));
+    const outPath = args.out ? path5.resolve(String(args.out)) : path5.join(path5.dirname(videoPath), `${base}.video.canvas.json`);
+    const outDir = path5.dirname(outPath);
+    const blueprintPath = path5.join(outDir, "prompt.json");
     const frames = args.frames === "reuse" ? "reuse" : "generate";
     const maxScenes = args["max-scenes"] ? Number(args["max-scenes"]) : void 0;
+    if (Number.isFinite(maxScenes)) {
+      process.stderr.write(
+        `\u26A0\uFE0F  --max-scenes ${maxScenes} caps the deconstruct: any scenes beyond ${maxScenes} are MERGED away, reducing fidelity (fewer cuts, lost beats). Omit it to reproduce every scene.
+`
+      );
+    }
     const { deconstructModel, selectModel, imageModel, videoModel } = resolveModels2(args);
     const analysisCanvas = buildAnalysisCanvas(videoPath, deconstructModel, selectModel, {
       maxScenes: Number.isFinite(maxScenes) ? maxScenes : void 0,
@@ -8909,8 +9202,19 @@ var scaffoldVideoCommand = defineCommand76({
     const annotated = annotateBlueprintWithElements(blueprint, elements);
     await writeFile2(blueprintPath, `${JSON.stringify(annotated, null, 2)}
 `, "utf8");
-    const compositionDest = path4.join(outDir, "video-overlay-composition");
+    const compositionDest = path5.join(outDir, "video-overlay-composition");
     await cp(SHIPPED_COMPOSITION_DIR, compositionDest, { recursive: true });
+    const indexPath = path5.join(compositionDest, "index.html");
+    const overlayHtml = buildOverlayHtml(blueprint);
+    const indexHtml = await readFile4(indexPath, "utf8");
+    const injected = indexHtml.replace("<!--OVERLAYS-->", () => overlayHtml);
+    if (injected === indexHtml && overlayHtml.trim()) {
+      fail2(
+        "composition_marker_missing",
+        `video-overlay-composition/index.html is missing the <!--OVERLAYS--> marker \u2014 cannot inject the overlay layer`
+      );
+    }
+    await writeFile2(indexPath, injected, "utf8");
     const opts = {
       imageModel,
       videoModel,
@@ -8953,7 +9257,7 @@ var scaffoldVideoCommand = defineCommand76({
             run_estimated_credits: validation.estimatedCredits
           },
           checklist: {
-            edit_prompt: `Edit ${path4.basename(blueprintPath)} \u2014 the blueprint deconstructed from your video; rewrite it into the ad you want (cast, palette, copy, claims). Every scene frame reads it via target_blueprint.`,
+            edit_prompt: `Edit ${path5.basename(blueprintPath)} \u2014 the blueprint deconstructed from your video; rewrite it into the ad you want (cast, palette, copy, claims). Every scene frame reads it via target_blueprint.`,
             recurring_elements_to_supply: report.elements,
             voices_to_confirm: report.dialogue.map((d) => ({
               scene: d.scene,
@@ -8978,7 +9282,7 @@ var scaffoldVideoCommand = defineCommand76({
 // src/commands/canvas/validate.ts
 import { readFile as readFile5 } from "fs/promises";
-import path5 from "path";
+import path6 from "path";
 import { defineCommand as defineCommand77 } from "citty";
 var validateCommand = defineCommand77({
   meta: {
@@ -8987,7 +9291,7 @@ var validateCommand = defineCommand77({
   },
   args: { file: { type: "positional", required: true, description: "Path to canvas JSON" } },
   async run({ args }) {
-    const filePath = path5.resolve(String(args.file));
+    const filePath = path6.resolve(String(args.file));
     const raw = await readFile5(filePath, "utf8");
     let parsed;
     try {
@@ -9222,7 +9526,7 @@ Examples:
 });
 // src/commands/ga4/query.ts
-import { appendFileSync as appendFileSync2, existsSync as existsSync3, readFileSync as readFileSync6, writeFileSync as writeFileSync4 } from "fs";
+import { appendFileSync as appendFileSync2, existsSync as existsSync4, readFileSync as readFileSync6, writeFileSync as writeFileSync4 } from "fs";
 import { resolve as resolve2 } from "path";
 import { defineCommand as defineCommand81 } from "citty";
@@ -9296,7 +9600,7 @@ function writeRowsToFile2(filePath, rows, append) {
   const fields = extractFields2(rows);
   const ext = filePath.split(".").pop()?.toLowerCase();
   if (ext === "csv") {
-    if (!append || !existsSync3(filePath)) {
+    if (!append || !existsSync4(filePath)) {
       writeFileSync4(filePath, `${toCsvRow(fields)}
 `, "utf-8");
     }
@@ -9307,12 +9611,12 @@ function writeRowsToFile2(filePath, rows, append) {
     const lines = rows.map((row) => JSON.stringify(row));
     const content = `${lines.join("\n")}
 `;
-    if (append && existsSync3(filePath)) {
+    if (append && existsSync4(filePath)) {
       appendFileSync2(filePath, content, "utf-8");
     } else {
       writeFileSync4(filePath, content, "utf-8");
     }
-  } else if (append && existsSync3(filePath)) {
+  } else if (append && existsSync4(filePath)) {
     const existing = JSON.parse(readFileSync6(filePath, "utf-8"));
     writeFileSync4(filePath, JSON.stringify([...existing, ...rows], null, 2), "utf-8");
   } else {
@@ -9453,7 +9757,7 @@ Examples:
 import { defineCommand as defineCommand86 } from "citty";
 // src/commands/gsc/query.ts
-import { appendFileSync as appendFileSync3, existsSync as existsSync4, readFileSync as readFileSync7, writeFileSync as writeFileSync5 } from "fs";
+import { appendFileSync as appendFileSync3, existsSync as existsSync5, readFileSync as readFileSync7, writeFileSync as writeFileSync5 } from "fs";
 import { resolve as resolve3 } from "path";
 import { defineCommand as defineCommand83 } from "citty";
@@ -9582,7 +9886,7 @@ function writeRowsToFile3(filePath, rows, append) {
   const fields = extractFields3(rows);
   const ext = filePath.split(".").pop()?.toLowerCase();
   if (ext === "csv") {
-    if (!append || !existsSync4(filePath)) {
+    if (!append || !existsSync5(filePath)) {
       writeFileSync5(filePath, `${toCsvRow(fields)}
 `, "utf-8");
     }
@@ -9592,12 +9896,12 @@ function writeRowsToFile3(filePath, rows, append) {
   } else if (ext === "jsonl") {
     const content = `${rows.map((row) => JSON.stringify(row)).join("\n")}
 `;
-    if (append && existsSync4(filePath)) {
+    if (append && existsSync5(filePath)) {
       appendFileSync3(filePath, content, "utf-8");
     } else {
       writeFileSync5(filePath, content, "utf-8");
     }
-  } else if (append && existsSync4(filePath)) {
+  } else if (append && existsSync5(filePath)) {
     const existing = JSON.parse(readFileSync7(filePath, "utf-8"));
     writeFileSync5(filePath, JSON.stringify([...existing, ...rows], null, 2), "utf-8");
   } else {
@@ -9915,9 +10219,9 @@ async function readImageBuffer(pathOrUrl) {
   }
   return readFile6(pathOrUrl);
 }
-async function isDirectory(path6) {
+async function isDirectory(path7) {
   try {
-    const s = await stat2(path6);
+    const s = await stat2(path7);
     return s.isDirectory();
   } catch {
     return false;