npm - @koda-sl/baker-cli - Versions diffs - 0.82.0 → 0.91.0 - Mend

@koda-sl/baker-cli 0.82.0 → 0.91.0

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md +25 -9
package/canvas/video-overlay-composition/index.html +31 -5
package/dist/{chunk-KIL2ZJST.js → chunk-LMVDA3EZ.js} +151 -17
package/dist/chunk-LMVDA3EZ.js.map +1 -0
package/dist/cli.js +1258 -281
package/dist/cli.js.map +1 -1
package/dist/engine/index.js +1 -1
package/package.json +1 -1
package/dist/chunk-KIL2ZJST.js.map +0 -1

package/dist/cli.js CHANGED Viewed

@@ -9,7 +9,7 @@ import {
   defaultRegistry,
   generateCatalog,
   validateCanvasDeep
-} from "./chunk-KIL2ZJST.js";
+} from "./chunk-LMVDA3EZ.js";
 // src/cli.ts
 import { defineCommand as defineCommand141, runMain } from "citty";
@@ -8274,10 +8274,100 @@ var scaffoldStaticAdCommand = defineCommand75({
 });
 // src/commands/canvas/scaffold-video.ts
-import { cp, mkdir, readFile as readFile4, writeFile as writeFile2 } from "fs/promises";
+import { cp, mkdir, readFile as readFile5, writeFile as writeFile2 } from "fs/promises";
 import path5 from "path";
 import { defineCommand as defineCommand76 } from "citty";
+// src/engine/nodes/local/lib/sceneDetect.ts
+import { execFile as execFile2 } from "child_process";
+import { mkdtemp, readdir as readdir2, readFile as readFile4, rm } from "fs/promises";
+import { tmpdir } from "os";
+import { join as join2 } from "path";
+import { promisify as promisify2 } from "util";
+var execFileAsync2 = promisify2(execFile2);
+var PYSCENEDETECT_THRESHOLD = 18;
+var PYSCENEDETECT_MIN_SCENE_LEN_S = 0.25;
+var PYSCENEDETECT_RECHECK_THRESHOLD = 27;
+var PYSCENEDETECT_RECHECK_MIN_SCENE_LEN_S = 0.6;
+function isLikelyOverSegmented(cuts, opts = {}) {
+  const minCuts = opts.minCuts ?? 6;
+  const maxMedianGap = opts.medianGapS ?? 2;
+  const sorted = [...cuts].filter((c) => Number.isFinite(c) && c > 0).sort((a, b) => a - b);
+  if (sorted.length < minCuts) return false;
+  const gaps = [];
+  let prev = 0;
+  for (const c of sorted) {
+    gaps.push(c - prev);
+    prev = c;
+  }
+  gaps.sort((a, b) => a - b);
+  const mid = Math.floor(gaps.length / 2);
+  const median = gaps.length % 2 ? gaps[mid] : (gaps[mid - 1] + gaps[mid]) / 2;
+  return median < maxMedianGap;
+}
+function timecodeToSeconds(tc) {
+  const m = tc.trim().match(/^(\d+):(\d{1,2}):(\d{1,2}(?:\.\d+)?)$/);
+  if (!m) return null;
+  const h = Number.parseInt(m[1] ?? "", 10);
+  const min = Number.parseInt(m[2] ?? "", 10);
+  const s = Number.parseFloat(m[3] ?? "");
+  if (!Number.isFinite(h) || !Number.isFinite(min) || !Number.isFinite(s)) return null;
+  return h * 3600 + min * 60 + s;
+}
+function parsePySceneDetectCsvCuts(csv) {
+  const firstLine = csv.split(/\r?\n/, 1)[0] ?? "";
+  if (!/^\s*Timecode List:/i.test(firstLine)) return [];
+  const cuts = [];
+  for (const cell of firstLine.split(",").slice(1)) {
+    const t = timecodeToSeconds(cell);
+    if (t !== null && t > 0) cuts.push(Math.round(t * 1e3) / 1e3);
+  }
+  return [...new Set(cuts)].sort((a, b) => a - b);
+}
+async function runSceneDetectOnce(filePath, threshold, minSceneLenS, timeoutMs) {
+  const outDir = await mkdtemp(join2(tmpdir(), "baker-scenedetect-"));
+  try {
+    await execFileAsync2(
+      "scenedetect",
+      [
+        "--input",
+        filePath,
+        "--output",
+        outDir,
+        "detect-content",
+        "--threshold",
+        String(threshold),
+        "--min-scene-len",
+        String(minSceneLenS),
+        "list-scenes",
+        "--quiet"
+      ],
+      { encoding: "utf-8", maxBuffer: 32 * 1024 * 1024, timeout: timeoutMs }
+    );
+    const csvName = (await readdir2(outDir)).find((f) => f.toLowerCase().endsWith(".csv"));
+    if (!csvName) return [];
+    return parsePySceneDetectCsvCuts(await readFile4(join2(outDir, csvName), "utf-8"));
+  } finally {
+    await rm(outDir, { recursive: true, force: true });
+  }
+}
+async function detectSceneCutsPySceneDetect(filePath, opts = {}) {
+  const pinned = opts.threshold !== void 0;
+  const threshold = opts.threshold ?? PYSCENEDETECT_THRESHOLD;
+  const minSceneLenS = opts.minSceneLenS ?? PYSCENEDETECT_MIN_SCENE_LEN_S;
+  const timeoutMs = opts.timeout_ms ?? 12e4;
+  const cuts = await runSceneDetectOnce(filePath, threshold, minSceneLenS, timeoutMs);
+  if (!pinned && isLikelyOverSegmented(cuts)) {
+    return await runSceneDetectOnce(
+      filePath,
+      PYSCENEDETECT_RECHECK_THRESHOLD,
+      PYSCENEDETECT_RECHECK_MIN_SCENE_LEN_S,
+      timeoutMs
+    );
+  }
+  return cuts;
+}
 // src/engine/scaffold/video.ts
 import { z as z3 } from "zod";
@@ -8380,7 +8470,7 @@ var FIXED_TTS_MODEL = "elevenlabs/eleven_v3";
 var FIXED_SFX_MODEL = "elevenlabs/eleven_text_to_sound_v2";
 var FIXED_MUSIC_MODEL = "elevenlabs/music-v1";
 var FIXED_VOICE_CONVERT_MODEL = "elevenlabs/eleven_multilingual_sts_v2";
-var MUSIC_BED_GAIN_DB = -12;
+var MUSIC_BED_GAIN_DB = -20;
 var AMBIENT_BED_GAIN_DB = -20;
 var TRANSITION_DEFAULT_S = 0.4;
 var XFADE_BY_TYPE = {
@@ -8432,10 +8522,78 @@ function sceneDurationS(scene) {
   const max = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
   return Math.min(Math.max(raw, 0.5), max);
 }
-function trimArgs(durationS) {
+function canvasDims(ar) {
+  switch (ar) {
+    case "1:1":
+      return { w: 1080, h: 1080 };
+    case "16:9":
+      return { w: 1920, h: 1080 };
+    case "4:3":
+      return { w: 1440, h: 1080 };
+    case "3:4":
+      return { w: 1080, h: 1440 };
+    case "21:9":
+      return { w: 1920, h: 822 };
+    default:
+      return { w: 1080, h: 1920 };
+  }
+}
+function fillPanel(label, w, h, out) {
+  return `[${label}]scale=${w}:${h}:force_original_aspect_ratio=increase,crop=${w}:${h},setsar=1,fps=30[${out}]`;
+}
+function splitStackArgs(count, axis, dims) {
+  const pw = axis === "horizontal" ? Math.round(dims.w / count) : dims.w;
+  const ph = axis === "vertical" ? Math.round(dims.h / count) : dims.h;
+  const inputs = [];
+  const filt = [];
+  let labels = "";
+  for (let i = 0; i < count; i++) {
+    inputs.push("-i", `{{in.c${i}}}`);
+    filt.push(fillPanel(`${i}:v`, pw, ph, `p${i}`));
+    labels += `[p${i}]`;
+  }
+  const stack = axis === "vertical" ? "vstack" : "hstack";
+  filt.push(`${labels}${stack}=inputs=${count}[v]`);
+  return [...inputs, "-filter_complex", filt.join(";"), "-map", "[v]", "{{out.video}}"];
+}
+function overlayXY(position, marginPx) {
+  const p = (position ?? "bottom_right").toLowerCase();
+  const x = p.includes("left") ? `${marginPx}` : p.includes("right") ? `W-w-${marginPx}` : "(W-w)/2";
+  const y = p.includes("top") ? `${marginPx}` : p.includes("bottom") ? `H-h-${marginPx}` : "(H-h)/2";
+  return { x, y };
+}
+function pipOverlayArgs(dims, position, insetWpct) {
+  const iw = Math.round(dims.w * insetWpct);
+  const margin = Math.round(dims.w * 0.04);
+  const { x, y } = overlayXY(position, margin);
+  const filt = `${fillPanel("0:v", dims.w, dims.h, "bg")};[1:v]scale=${iw}:-2,setsar=1,fps=30[fg];[bg][fg]overlay=x=${x}:y=${y}:format=auto[v]`;
+  return ["-i", "{{in.c0}}", "-i", "{{in.c1}}", "-filter_complex", filt, "-map", "[v]", "{{out.video}}"];
+}
+var FLASH_HOLD_MAX_S = 2;
+function stillHoldArgs(durationS, dims) {
+  return [
+    "-loop",
+    "1",
+    "-i",
+    "{{in.frame}}",
+    "-t",
+    durationS.toFixed(3),
+    "-r",
+    "30",
+    "-vf",
+    `scale=${dims.w}:${dims.h}:force_original_aspect_ratio=increase,crop=${dims.w}:${dims.h},setsar=1,format=yuv420p`,
+    "-c:v",
+    "libx264",
+    "-pix_fmt",
+    "yuv420p",
+    "{{out.video}}"
+  ];
+}
+function trimArgs(durationS, offsetS = 0) {
   return [
     "-i",
     "{{in.clip}}",
+    ...offsetS > 0 ? ["-ss", offsetS.toFixed(3)] : [],
     "-t",
     durationS.toFixed(3),
     "-an",
@@ -8462,6 +8620,25 @@ var Sfx = z3.object({
   sound_effect_prompt: z3.string().optional(),
   description: z3.string().optional()
 }).loose();
+var CompositionRegion = z3.object({
+  // full | top | bottom | left | right | inset
+  panel: z3.string().optional(),
+  // 9-grid anchor for an `inset` presenter box.
+  position: z3.string().optional(),
+  is_presenter: z3.boolean().optional(),
+  // The cast id shown/speaking in this region (routes lip-sync + element refs).
+  cast_ref: z3.string().optional(),
+  summary: z3.string().optional(),
+  frame_prompt: z3.string().optional(),
+  motion_prompt: z3.string().optional()
+}).loose();
+var SceneComposition = z3.object({
+  // full_frame (default) | split_screen | pip | keyed_overlay
+  layout: z3.string().optional(),
+  // split_screen only: vertical (top/bottom) | horizontal (left/right).
+  split_axis: z3.string().optional(),
+  regions: z3.array(CompositionRegion).optional()
+}).loose();
 var CameraMotion = z3.object({ movement: z3.string().optional(), detail: z3.string().optional() }).loose();
 var TranscriptWord = z3.object({ text: z3.string().optional() }).loose();
 var Scene = z3.object({
@@ -8470,6 +8647,10 @@ var Scene = z3.object({
   duration_s: z3.number().optional(),
   summary: z3.string().optional(),
   action_detail: z3.string().optional(),
+  // The scene's spatial layout. Absent/full_frame ⇒ one uncut shot (default path).
+  // A layered layout (split_screen/pip/keyed_overlay) with regions ⇒ the scaffold
+  // builds one clip per region and stacks/overlays them into the scene picture.
+  composition: SceneComposition.optional(),
   // The capture "look" for this scene — selected from the ad-native shoot-mode
   // grammar (see lib/shoot-modes.ts). When absent the scaffold auto-derives a
   // UGC/product mode; a human can override per scene by setting this.
@@ -8495,7 +8676,12 @@ var Scene = z3.object({
   floating_elements: z3.array(z3.unknown()).optional(),
   transcript_slice: z3.array(TranscriptWord).optional(),
   start_frame_asset: FrameAsset,
-  end_frame_asset: FrameAsset
+  end_frame_asset: FrameAsset,
+  // DECON-supplied: true when this scene is a length-split CONTINUATION of the
+  // previous one (the SAME physical shot, broken up only because it exceeded the
+  // clip ceiling). The scaffold then shares the splice keyframe — this scene's
+  // start frame IS the previous scene's end frame — so the join is seamless.
+  continues_previous: z3.boolean().optional()
 }).loose();
 var VideoBlueprint = z3.object({
   source: z3.object({ aspect_ratio: z3.string().optional(), duration_s: z3.number().optional() }).loose().optional(),
@@ -8600,6 +8786,40 @@ function annotateBlueprintWithElements(blueprintInput, elementsInput) {
   clone.reference_elements = summary;
   return clone;
 }
+var SELECT_SCENE_FIELDS = [
+  "index",
+  "start_s",
+  "end_s",
+  "duration_s",
+  "summary",
+  "narrative_role",
+  "action_detail",
+  "start_frame_prompt",
+  "end_frame_prompt"
+];
+var SELECT_GLOBAL_FIELDS = ["cast", "branding", "voiceover"];
+function slimBlueprintForSelection(blueprintInput) {
+  if (!blueprintInput || typeof blueprintInput !== "object" || Array.isArray(blueprintInput)) return blueprintInput;
+  const bp = blueprintInput;
+  const out = {};
+  for (const k of ["version", "source"]) if (k in bp) out[k] = bp[k];
+  if (bp.global && typeof bp.global === "object" && !Array.isArray(bp.global)) {
+    const g = bp.global;
+    const slimG = {};
+    for (const k of SELECT_GLOBAL_FIELDS) if (k in g) slimG[k] = g[k];
+    out.global = slimG;
+  }
+  if (Array.isArray(bp.scenes)) {
+    out.scenes = bp.scenes.map((s) => {
+      if (!s || typeof s !== "object" || Array.isArray(s)) return s;
+      const sr = s;
+      const slim = {};
+      for (const k of SELECT_SCENE_FIELDS) if (k in sr) slim[k] = sr[k];
+      return slim;
+    });
+  }
+  return out;
+}
 function roleForType2(type) {
   switch (type.toLowerCase()) {
     case "logo":
@@ -8633,7 +8853,16 @@ function todoPath2(el, label) {
   return `[TODO: drop one real source image for ${label} (${el.type})${desc}${expr} \u2014 reused across every frame it appears in${fresh}${same}]`;
 }
 function buildElementSlots(elements) {
-  const usedIds = /* @__PURE__ */ new Set(["prompt", "spine", "overlaid", "audio_mix", "final", "music_bed"]);
+  const usedIds = /* @__PURE__ */ new Set([
+    "prompt",
+    "spine",
+    "overlaid",
+    "captions",
+    "captions_transcript",
+    "audio_mix",
+    "final",
+    "music_bed"
+  ]);
   const slots = [];
   assignElementLabels2(elements).forEach(({ el, label }, i) => {
     let id = sanitizeId2(`el_${label}`, `el_${i}`);
@@ -8646,6 +8875,7 @@ function buildElementSlots(elements) {
       type: el.type,
       description: el.description,
       sameAs: el.same_as ?? void 0,
+      castId: el.cast_id ?? void 0,
       presence: presenceOf(el)
     });
   });
@@ -8684,7 +8914,7 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
   const legend = [
     ...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
     ...hasAnchor ? [
-      "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions of THIS frame. IGNORE its overlay text, captions, and any brand that is being swapped."
+      "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions. IGNORE its text, its logo, its brand name, and its colors entirely \u2014 it is a DIFFERENT brand's footage, here only to anchor layout/pose, never identity or palette."
     ] : []
   ].join("\n");
   const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
@@ -8703,6 +8933,9 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
     "a studio) contains any text or graphics, DO NOT reproduce them \u2014 render the subject/scene",
     "only, leaving the regions where overlays will sit clean. Imperfect/garbled letterforms or",
     "stray icons are the worst outcome; leave those areas blank.",
+    "A SCREEN/UI surface \u2014 an app, website, chat, dashboard, or phone display \u2014 is NEVER",
+    "rendered here: leave any phone/screen OFF or blank-screened. The real interface is",
+    "composited later as a screenshot or a brand HTML block, never AI-generated.",
     "",
     "FRAMING \u2014 ONE UNCUT FRAME:",
     "Render ONE single uncut photographic frame: NO split screen, NO panels, NO dividing line,",
@@ -8730,41 +8963,71 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
     "REFERENCE IMAGES (in the order provided):",
     legend,
     "",
-    "Identity comes from the reference images, not from this prose \u2014 render each person,",
-    "product, and set to MATCH its reference image, and describe only pose, expression, action,",
-    "and camera in the FRAME DESCRIPTION below.",
-    "",
+    // RECAST is the whole point of a transform: the dropped el_* images define who/
+    // what is on screen, NOT the source footage and NOT the prose. Without this, the
+    // model reproduces the original ad's people (a proven failure mode).
+    ...present.length > 0 ? [
+      "IDENTITY & AESTHETIC \u2014 RECAST (this is a transform, not a copy):",
+      "Identity comes from the reference image, never from the source footage or this prose. Render every",
+      "person, animal, product, and set to MATCH its labeled reference image above \u2014 that image is the ONLY",
+      "source of their identity, wardrobe, styling, and look. This is a complete recast: do NOT reproduce,",
+      "trace, or resemble any individual, animal, product, or set from the source ad. Where the FRAME",
+      "DESCRIPTION below names an appearance detail (hair, outfit, color, age, breed, brand of an object),",
+      "IGNORE that wording \u2014 the reference image is the truth; use the description ONLY for pose, expression,",
+      "action, framing, lighting, and palette.",
+      ""
+    ] : [
+      "Identity comes from the reference image, never from prose \u2014 render the subject to MATCH it and",
+      "describe only pose, expression, action, framing, and lighting in the FRAME DESCRIPTION below.",
+      ""
+    ],
     "FRAME DESCRIPTION (this frame's editable prompt):",
     description,
     "",
-    "Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
+    "Render exactly what the FRAME DESCRIPTION and the SHARED AD SPEC specify \u2014 this is the authoritative ad: its cast identity (via the reference images), palette, brand, and intent are law. Keep every recurring element identical to its reference image across all frames. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
     "",
-    "GLOBAL STYLE REFERENCE (shared across frames; not this frame's content):",
+    "SHARED AD SPEC (authoritative \u2014 the ad blueprint this frame belongs to; align cast/palette/brand/type with it):",
     "{{target_blueprint}}"
   ].join("\n");
 }
+function ingestFrameRef(url, edge, ctx, nodes) {
+  const cached2 = ctx.ingestCache?.get(url);
+  if (cached2) return cached2;
+  const tag = ctx.tag ?? "";
+  const refId = `s${ctx.sceneIndex}${tag}_${edge}_ref`;
+  nodes.push({ id: refId, type: "ingest", params: { source: "url", url, expect: "image" } });
+  const ref = `$ref:${refId}.asset`;
+  ctx.ingestCache?.set(url, ref);
+  return ref;
+}
 function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
-  const refId = `s${ctx.sceneIndex}_${edge}_ref`;
-  if (url) nodes.push({ id: refId, type: "ingest", params: { source: "url", url, expect: "image" } });
-  if (ctx.reuse && url) return `$ref:${refId}.asset`;
-  const reference = [...present.map((s) => s.ref), ...url ? [`$ref:${refId}.asset`] : []];
+  const tag = ctx.tag ?? "";
+  if (ctx.reuse && url) return ingestFrameRef(url, edge, ctx, nodes);
+  const hasPersonOrAnimal = present.some((s) => {
+    const t = s.type.toLowerCase();
+    return t === "person" || t === "animal";
+  });
+  const useOriginalAnchor = Boolean(url) && !hasPersonOrAnimal;
+  const hasOriginal = useOriginalAnchor;
+  const originalRef = useOriginalAnchor && url ? ingestFrameRef(url, edge, ctx, nodes) : void 0;
+  const reference = [...present.map((s) => s.ref), ...originalRef ? [originalRef] : []];
   const genParams = {
     model: ctx.imageModel,
     image_size: "2K",
-    prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, Boolean(url), ctx.shootMode)
+    prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, hasOriginal, ctx.shootMode)
   };
   if (ctx.ar) genParams.aspect_ratio = ctx.ar;
-  const genNode = {
-    id: `s${ctx.sceneIndex}_${edge}`,
+  const genId = `s${ctx.sceneIndex}${tag}_${edge}`;
+  nodes.push({
+    id: genId,
     type: "image_generate",
     // `params.prompt` is this frame's authoritative, edit-per-frame description.
-    // `target_blueprint` is kept only as a demoted shared style reference (global
-    // cast/palette/typography), so editing one frame never touches another.
+    // `target_blueprint` is the shared ad spec (cast identity, palette, brand, type)
+    // the frame must stay consistent with — editing one frame never touches another.
     inputs: { target_blueprint: "$ref:prompt.asset", ...reference.length > 0 ? { reference } : {} },
     params: genParams
-  };
-  nodes.push(genNode);
-  return `$ref:s${ctx.sceneIndex}_${edge}.images#0`;
+  });
+  return `$ref:${genId}.images#0`;
 }
 function seedanceAudioLine(scene, mode, audio, nativeLine) {
   const ambient = scene.ambient?.trim() || diegeticFor(mode);
@@ -8810,10 +9073,11 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
   );
   return parts.join("\n");
 }
-function audioExtractArgs(durationS) {
+function audioExtractArgs(durationS, offsetS = 0) {
   return [
     "-i",
     "{{in.clip}}",
+    ...offsetS > 0.05 ? ["-ss", offsetS.toFixed(3)] : [],
     "-t",
     durationS.toFixed(3),
     "-vn",
@@ -8841,27 +9105,21 @@ function sceneShootMode(scene, present, nativeTurn, cameraOn, casts) {
     hasProduct: present.some((s) => s.type.toLowerCase() === "product")
   });
 }
-function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks) {
+function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks, nativeSegments, clipRef = `$ref:s${i}_clip.video`) {
   if (nativeTurn) {
-    const extractLen = Math.min(Math.max(lengths.dur, lengths.speech), lengths.genDur);
+    const speechWindow = Math.max(0.5, nativeTurn.end_s - nativeTurn.start_s);
+    const extractLen = Math.min(speechWindow, lengths.genDur);
     nodes.push({
       id: `s${i}_voextract`,
       type: "ffmpeg",
-      inputs: { clip: `$ref:s${i}_clip.video` },
+      inputs: { clip: clipRef },
       params: { args: audioExtractArgs(extractLen), outputs: { audio: { kind: "audio", ext: "mp3" } } }
     });
-    nodes.push({
-      id: `s${i}_voconv`,
-      type: "audio_voice_convert",
-      inputs: { audio: `$ref:s${i}_voextract.audio`, voice_ref: `$ref:${nativeTurn.voiceNode}.voice_id` },
-      params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
-    });
-    voTracks.push({
-      slot: `s${i}_voconv`,
-      ref: `$ref:s${i}_voconv.audio`,
+    nativeSegments.push({
+      voiceNode: nativeTurn.voiceNode,
+      ref: `$ref:s${i}_voextract.audio`,
       start_s: nativeTurn.start_s,
-      end_s: nativeTurn.start_s + extractLen,
-      kind: "vo"
+      end_s: nativeTurn.start_s + extractLen
     });
   } else if (ambientBroll) {
     const ambientStart = scene.start_s ?? 0;
@@ -8881,85 +9139,260 @@ function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes
     });
   }
 }
-function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
-  const ar = aspectRatioParam(blueprint);
-  const reuse = opts.frames === "reuse";
-  const clips = [];
-  const voTracks = [];
-  const lastIndex = blueprint.scenes.length - 1;
-  const cameraOn = onCameraDialogue(blueprint);
-  const casts = castIdSet(blueprint);
-  blueprint.scenes.forEach((scene, i) => {
-    const nativeTurn = (sceneTurns.get(i) ?? []).find((t) => t.native);
-    const present = slotsForScene(slots, i);
-    const mode = sceneShootMode(scene, present, nativeTurn, cameraOn, casts);
-    const ambientBroll = Boolean(opts.ambient) && !nativeTurn && mode !== "ugc_selfie";
-    const ctx = { sceneIndex: i, ar, reuse, imageModel: opts.imageModel, shootMode: mode };
-    const firstFrame = buildFrameRef(
-      "start",
-      scene.start_frame_asset?.url,
-      scene.start_frame_prompt,
-      slotsForFrame(slots, i, "start"),
-      ctx,
-      nodes
-    );
-    const lastFrame = buildFrameRef(
-      "end",
-      scene.end_frame_asset?.url,
-      scene.end_frame_prompt,
-      slotsForFrame(slots, i, "end"),
-      ctx,
-      nodes
-    );
-    const dur = sceneDurationS(scene);
-    let out = sceneOutTransition(scene, i === lastIndex);
-    let trimTarget = dur + (out?.dur ?? 0);
-    if (out && ceilToSeedance(trimTarget) < trimTarget) {
-      out = null;
-      trimTarget = dur;
-    }
-    const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
-    const genDur = ceilToSeedance(Math.max(trimTarget, speech));
-    const clipParams = {
-      model: opts.videoModel,
-      prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
-      duration: genDur,
-      // Native talking scene → Seedance generates the spoken audio + lip-sync;
-      // an opt-in ambient b-roll beat generates diegetic ambient only; otherwise the
-      // clip is silent and audio comes from the tts/music timeline.
-      generate_audio: Boolean(nativeTurn) || ambientBroll
-    };
-    if (ar) clipParams.aspect_ratio = ar;
+function buildPerSpeakerVoiceConversion(segments, totalMs, nodes) {
+  const bySpeaker = /* @__PURE__ */ new Map();
+  for (const seg of segments) {
+    const arr = bySpeaker.get(seg.voiceNode) ?? [];
+    arr.push(seg);
+    bySpeaker.set(seg.voiceNode, arr);
+  }
+  const tracks = [];
+  for (const [voiceNode, segs] of bySpeaker) {
+    const trackId = `${voiceNode}_track`;
+    const convId = `${voiceNode}_conv`;
+    const mixInputs = {};
+    segs.forEach((s, k) => {
+      mixInputs[`seg${k}`] = s.ref;
+    });
     nodes.push({
-      id: `s${i}_clip`,
-      type: "video_generate",
-      inputs: { first_frame: firstFrame, last_frame: lastFrame },
-      params: clipParams
+      id: trackId,
+      type: "audio_timeline",
+      inputs: mixInputs,
+      params: {
+        tracks: segs.map((s, k) => ({ slot: `seg${k}`, start_s: s.start_s })),
+        total_ms: totalMs
+      }
     });
-    emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, { dur, speech, genDur }, nodes, voTracks);
-    const base = `$ref:s${i}_clip.video`;
-    if (genDur === trimTarget) {
-      clips.push({ ref: base, scene_s: dur, out });
-    } else {
-      nodes.push({
-        id: `s${i}_trim`,
-        type: "ffmpeg",
-        inputs: { clip: base },
-        params: { args: trimArgs(trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
-      });
-      clips.push({ ref: `$ref:s${i}_trim.video`, scene_s: dur, out });
+    nodes.push({
+      id: convId,
+      type: "audio_voice_convert",
+      inputs: { audio: `$ref:${trackId}.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
+      params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
+    });
+    tracks.push({ slot: convId, ref: `$ref:${convId}.audio`, start_s: 0, kind: "vo" });
+  }
+  return tracks;
+}
+function emitSceneClip(i, scene, present, mode, nativeTurn, ambientBroll, frames, lengths, out, opts, nodes, tag = "") {
+  const clipParams = {
+    model: opts.videoModel,
+    prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
+    duration: lengths.genDur,
+    // Native talking scene → Seedance generates the spoken audio + lip-sync; an opt-in
+    // ambient b-roll beat generates diegetic ambient only; otherwise the clip is silent.
+    generate_audio: Boolean(nativeTurn) || ambientBroll
+  };
+  if (opts.ar) clipParams.aspect_ratio = opts.ar;
+  nodes.push({
+    id: `s${i}${tag}_clip`,
+    type: "video_generate",
+    inputs: { first_frame: frames.first, ...frames.last ? { last_frame: frames.last } : {} },
+    params: clipParams
+  });
+  const base = `$ref:s${i}${tag}_clip.video`;
+  if (lengths.genDur === lengths.trimTarget) return { ref: base, scene_s: lengths.dur, out };
+  nodes.push({
+    id: `s${i}${tag}_clip_trim`,
+    type: "ffmpeg",
+    inputs: { clip: base },
+    params: { args: trimArgs(lengths.trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
+  });
+  return { ref: `$ref:s${i}${tag}_clip_trim.video`, scene_s: lengths.dur, out };
+}
+var COMPOSITE_LAYOUTS = /* @__PURE__ */ new Set(["split_screen", "pip", "keyed_overlay"]);
+var UI_SURFACE_RE = /\b(?:app|ui|web ?site|web ?page|website|browser|chat|interface|mock-?up|in[- ]?app|dashboard|app screen|phone screen|screen[- ]?(?:recording|capture|grab|share))\b/i;
+function regionIsUiSurface(r) {
+  return UI_SURFACE_RE.test(`${r.panel ?? ""} ${r.summary ?? ""} ${r.frame_prompt ?? ""}`);
+}
+function isUiOnlyComposite(regions) {
+  const ui = regions.filter(regionIsUiSurface).length;
+  return ui >= 1 && regions.length - ui <= 1;
+}
+function layeredComposition(scene) {
+  const comp = scene.composition;
+  const layout = (comp?.layout ?? "").toLowerCase();
+  if (!COMPOSITE_LAYOUTS.has(layout)) return null;
+  const regions = (comp?.regions ?? []).filter((r) => Boolean(r) && typeof r === "object");
+  if (regions.length < 2) return null;
+  if (isUiOnlyComposite(regions)) return null;
+  return { layout, regions, comp: comp ?? {} };
+}
+function splitAxisOf(comp, regions) {
+  const panels = regions.map((r) => (r.panel ?? "").toLowerCase());
+  if (panels.some((p) => p === "top" || p === "bottom")) return "vertical";
+  if (panels.some((p) => p === "left" || p === "right")) return "horizontal";
+  return (comp.split_axis ?? "").toLowerCase() === "horizontal" ? "horizontal" : "vertical";
+}
+function orderSplitRefs(regions, regionRefs, axis) {
+  const rank = (panel) => {
+    const p = (panel ?? "").toLowerCase();
+    if (axis === "vertical") return p === "top" ? 0 : p === "bottom" ? 2 : 1;
+    return p === "left" ? 0 : p === "right" ? 2 : 1;
+  };
+  return regionRefs.map((ref, k) => ({ ref, k, rank: rank(regions[k]?.panel) })).sort((a, b) => a.rank - b.rank || a.k - b.k).map((x) => x.ref);
+}
+function presenterIndexOf(regions, hasNative) {
+  const flagged = regions.findIndex((r) => r.is_presenter);
+  if (flagged >= 0) return flagged;
+  return hasNative ? 0 : -1;
+}
+function slotsForRegion(present, isPresenter) {
+  return present.filter((s) => {
+    const t = s.type.toLowerCase();
+    const person = t === "person" || t === "animal";
+    return isPresenter ? person : !person;
+  });
+}
+function buildCompositeScene(layout, regions, comp, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes) {
+  const dims = canvasDims(opts.ar);
+  const presIdx = presenterIndexOf(regions, Boolean(nativeTurn));
+  const regionRefs = [];
+  let presenterPosition;
+  regions.forEach((region, r) => {
+    const isPresenter = r === presIdx;
+    const tag = `_r${r}`;
+    const regionSlots = slotsForRegion(present, isPresenter);
+    const ctx = {
+      sceneIndex: i,
+      ar: opts.ar,
+      reuse: opts.reuse,
+      imageModel: opts.imageModel,
+      shootMode: mode,
+      tag
+    };
+    const startPrompt = region.frame_prompt ?? scene.start_frame_prompt;
+    const endPrompt = region.frame_prompt ?? scene.end_frame_prompt;
+    const first = buildFrameRef("start", void 0, startPrompt, regionSlots, ctx, nodes);
+    const last = buildFrameRef("end", void 0, endPrompt, regionSlots, ctx, nodes);
+    const regionNative = isPresenter ? nativeTurn : void 0;
+    const regionScene = {
+      ...scene,
+      summary: region.summary ?? scene.summary,
+      motion_prompt: region.motion_prompt ?? scene.motion_prompt,
+      dialogue: isPresenter ? scene.dialogue : []
+    };
+    const clip = emitSceneClip(
+      i,
+      regionScene,
+      regionSlots,
+      mode,
+      regionNative,
+      false,
+      { first, last },
+      lengths,
+      null,
+      { ar: opts.ar, videoModel: opts.videoModel },
+      nodes,
+      tag
+    );
+    regionRefs.push(clip.ref);
+    if (isPresenter) presenterPosition = region.position;
+  });
+  const compInputs = {};
+  let args;
+  if (layout === "split_screen") {
+    const axis = splitAxisOf(comp, regions);
+    orderSplitRefs(regions, regionRefs, axis).forEach((ref, k) => {
+      compInputs[`c${k}`] = ref;
+    });
+    args = splitStackArgs(regionRefs.length, axis, dims);
+  } else {
+    const bgIdx = regions.findIndex((_, k) => k !== presIdx);
+    const bgRef = regionRefs[bgIdx >= 0 ? bgIdx : 0];
+    let presRef = regionRefs[presIdx >= 0 ? presIdx : 1];
+    if (layout === "keyed_overlay" && presIdx >= 0) {
+      const keyId = `s${i}_key`;
+      nodes.push({ id: keyId, type: "video_background_remove", inputs: { video: presRef }, params: {} });
+      presRef = `$ref:${keyId}.video`;
+    }
+    compInputs.c0 = bgRef;
+    compInputs.c1 = presRef;
+    args = pipOverlayArgs(dims, presenterPosition, layout === "keyed_overlay" ? 0.5 : 0.34);
+  }
+  nodes.push({
+    id: `s${i}_composite`,
+    type: "ffmpeg",
+    inputs: compInputs,
+    params: { args, outputs: { video: { kind: "video", ext: "mp4" } } }
+  });
+  const presenterClipRef = presIdx >= 0 ? `$ref:s${i}_r${presIdx}_clip.video` : void 0;
+  return { clip: { ref: `$ref:s${i}_composite.video`, scene_s: lengths.dur, out }, presenterClipRef };
+}
+function sceneTiming(scene, isLast, nativeTurn) {
+  const dur = sceneDurationS(scene);
+  let out = sceneOutTransition(scene, isLast);
+  let trimTarget = dur + (out?.dur ?? 0);
+  if (out && ceilToSeedance(trimTarget) < trimTarget) {
+    out = null;
+    trimTarget = dur;
+  }
+  const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
+  const genDur = ceilToSeedance(Math.max(trimTarget, speech));
+  return { dur, out, trimTarget, genDur, speech };
+}
+function emitCompositeScene(composite, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes, voTracks, nativeSegments, clips) {
+  const built = buildCompositeScene(
+    composite.layout,
+    composite.regions,
+    composite.comp,
+    scene,
+    i,
+    present,
+    mode,
+    nativeTurn,
+    { dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur },
+    out,
+    opts,
+    nodes
+  );
+  emitSceneNativeAudio(
+    i,
+    scene,
+    nativeTurn,
+    false,
+    { dur: lengths.dur, speech: lengths.speech, genDur: lengths.genDur },
+    nodes,
+    voTracks,
+    nativeSegments,
+    built.presenterClipRef
+  );
+  clips.push(built.clip);
+}
+function emitFlashHold(i, scene, slots, ctx, lengths, out, ar, nodes, clips) {
+  const frame = buildFrameRef(
+    "start",
+    scene.start_frame_asset?.url,
+    scene.start_frame_prompt,
+    slotsForFrame(slots, i, "start"),
+    ctx,
+    nodes
+  );
+  nodes.push({
+    id: `s${i}_clip`,
+    type: "ffmpeg",
+    inputs: { frame },
+    params: {
+      args: stillHoldArgs(lengths.trimTarget, canvasDims(ar)),
+      outputs: { video: { kind: "video", ext: "mp4" } }
     }
   });
-  return { clips, voTracks };
+  clips.push({ ref: `$ref:s${i}_clip.video`, scene_s: lengths.dur, out });
+}
+function musicArcDigest(blueprint) {
+  const roles = blueprint.scenes.map((s) => s.narrative_role).filter((r) => Boolean(r));
+  const arc = roles.length > 0 ? roles.join(" \u2192 ") : "";
+  return arc ? `
+Emotional arc across scenes: ${arc}. Shape the bed's energy to this arc, swelling on the payoff. Purely instrumental \u2014 no vocals, no singing, no spoken words.` : "";
 }
 function musicBedPrompt(blueprint, musicPrompt) {
+  const digest = musicArcDigest(blueprint);
   const track2 = blueprint.global?.music?.identified_track;
   const title = track2?.title?.trim();
-  if (!title) return musicPrompt;
-  const by = track2?.artist?.trim() ? ` by ${track2.artist.trim()}` : "";
-  return `${musicPrompt}
+  const vibe = title ? `
-Reference vibe: the original used "${title}"${by} (identified via AudD). Match its mood, tempo, and energy with ORIGINAL music \u2014 do not reproduce the track.`;
+Reference vibe: the original used "${title}"${track2?.artist?.trim() ? ` by ${track2.artist.trim()}` : ""} (identified via AudD). Match its mood, tempo, and energy with ORIGINAL music \u2014 do not reproduce the track.` : "";
+  return `${musicPrompt}${digest}${vibe}`;
 }
 function onCameraDialogue(blueprint) {
   const mode = blueprint.global?.voiceover?.mode;
@@ -8998,92 +9431,483 @@ function isOnCameraSpeaker(speaker, casts, cameraOn) {
   if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
   return casts.has(speaker);
 }
-function buildDialogue(blueprint, nodes) {
-  const tracks = [];
-  const sceneTurns = /* @__PURE__ */ new Map();
+function makePresenterPresent(slots, canonical) { // Builds a predicate (speaker, sceneIndex) -> boolean from the person-type slots.
+  const personSlots = slots.filter((s) => s.type.toLowerCase() === "person"); // case-insensitive match on slot.type
+  const bySpeaker = /* @__PURE__ */ new Map(); // canonical(castId) -> slot.presence
+  for (const slot of personSlots) if (slot.castId) bySpeaker.set(canonical(slot.castId), slot.presence);
+  const solePerson = personSlots.length === 1 ? personSlots[0].presence : null; // fallback used only when exactly one person slot exists
+  return (speaker, sceneIndex) => {
+    const presence = bySpeaker.get(speaker) ?? solePerson;
+    if (!presence) return true; // no presence information -> treat the speaker as present
+    return presence.has(sceneIndex); // presumably presence is a Set of scene indices - TODO confirm against the slot builder
+  };
+}
+var PAUSE_GAP_S = 0.6; // buildPhrases starts a new phrase when a line begins more than this many seconds after the current phrase's end
+var PHRASE_MAX_S = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1]; // last entry of SEEDANCE_DURATIONS; buildPhrases caps a phrase's coverage span at this value
+function collapseVoiceover(blueprint) { // Returns a speaker-name mapper: narrator names map to the single on-camera presenter when exactly one exists, otherwise identity.
   const casts = castIdSet(blueprint);
   const cameraOn = onCameraDialogue(blueprint);
-  const voiceNodeBySpeaker = /* @__PURE__ */ new Map();
-  const speakerDescription = (speaker) => {
-    for (const scene of blueprint.scenes) {
-      for (const line of scene.dialogue ?? []) {
-        if ((line.speaker ?? "voiceover") === speaker && line.voice_description) return line.voice_description;
-      }
+  const presenters = /* @__PURE__ */ new Set(); // distinct on-camera speakers across all scenes
+  for (const scene of blueprint.scenes)
+    for (const l of scene.dialogue ?? []) {
+      const sp = l.speaker ?? "voiceover"; // a line without a speaker counts as voiceover
+      if (isOnCameraSpeaker(sp, casts, cameraOn)) presenters.add(sp);
     }
+  if (presenters.size !== 1) return (s) => s; // zero or several presenters: leave speaker names untouched
+  const presenter = [...presenters][0];
+  return (speaker) => NARRATOR_SPEAKERS.has(speaker.toLowerCase()) ? presenter : speaker; // narrator names collapse onto the sole presenter
+}
+function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent) { // Flattens non-empty dialogue lines (composite scenes excluded), sorts them by start time and merges runs into phrases.
+  const casts = castIdSet(blueprint);
+  const cameraOn = onCameraDialogue(blueprint);
+  const sceneEndS = (i) => blueprint.scenes[i]?.end_s ?? blueprint.scenes[i]?.start_s ?? 0; // scene end, falling back to scene start, then 0
+  const multiSpeaker = /* @__PURE__ */ new Set(); // indices of scenes with two or more distinct on-camera speakers
+  blueprint.scenes.forEach((scene, i) => {
+    const onCam = new Set(
+      (scene.dialogue ?? []).map((l) => l.speaker ?? "voiceover").filter((sp) => isOnCameraSpeaker(sp, casts, cameraOn))
+    );
+    if (onCam.size >= 2) multiSpeaker.add(i);
+  });
+  const lines = blueprint.scenes.flatMap(
+    (scene, sceneIndex) => compositeScenes.has(sceneIndex) ? [] : (scene.dialogue ?? []).filter((l) => Boolean(l.line?.trim())).map((l) => {
+      const raw = l.speaker ?? "voiceover";
+      const sp = canonical(raw);
+      const text = l.line.trim();
+      const start = l.start_s ?? scene.start_s ?? 0; // line start falls back to the scene start, then 0
+      return {
+        sceneIndex,
+        speaker: sp,
+        // Shown = a cast member speaking AND their element is actually on screen
+        // here (not a cutaway). A b-roll cutaway mid-phrase fails this and gets
+        // its own clip while the phrase voice plays under it.
+        shown: isOnCameraSpeaker(raw, casts, cameraOn) && !multiSpeaker.has(sceneIndex) && presenterPresent(sp, sceneIndex),
+        start,
+        // Real speech end. When the deconstruct gives no end_s, estimate it from
+        // the words — NOT the scene end (which would fabricate continuity across
+        // a long silent b-roll gap and wrongly merge two separate phrases).
+        end: l.end_s ?? start + estSpeechS(text),
+        text
+      };
+    })
+  ).sort((a, b) => a.start - b.start); // chronological order across all scenes
+  const phrases = [];
+  let cur = null; // the phrase currently being accumulated, or null
+  const flush = () => { // pushes the accumulated phrase (if any) onto phrases and resets cur
+    if (!cur) return;
+    const shownScenes = [...cur.shown].sort((a, b) => a - b);
+    phrases.push({
+      speaker: cur.speaker,
+      start_s: cur.start,
+      end_s: cur.end,
+      text: cur.texts.join(" "),
+      firstScene: cur.firstScene,
+      shownScenes,
+      presenterShown: shownScenes.length > 0
+    });
+    cur = null;
+  };
+  for (const ln of lines) {
+    const lineCover = ln.shown ? Math.max(ln.end, sceneEndS(ln.sceneIndex)) : ln.end; // a shown line covers at least through its scene's end
+    const lineClipStart = ln.shown ? Math.min(ln.start, blueprint.scenes[ln.sceneIndex]?.start_s ?? ln.start) : ln.start; // and at least from its scene's start
+    const breakRun = !cur || cur.speaker !== ln.speaker || ln.start - cur.end > PAUSE_GAP_S || // Cap by SCENE COVERAGE span, not line end — a presenter run whose sliced scenes span
+    // more than one Seedance clip splits into the next take here (at this scene's
+    // boundary, never mid-scene), so no segment ever reads past the generated clip.
+    Math.max(cur.coverEnd, lineCover) - Math.min(cur.clipStart, lineClipStart) > PHRASE_MAX_S;
+    if (breakRun || !cur) { // start a new phrase (the extra !cur check is redundant with breakRun)
+      flush();
+      cur = {
+        speaker: ln.speaker,
+        firstScene: ln.sceneIndex,
+        start: ln.start,
+        end: ln.end,
+        coverEnd: lineCover,
+        clipStart: lineClipStart,
+        texts: [ln.text],
+        shown: /* @__PURE__ */ new Set()
+      };
+    } else { // extend the current phrase with this line
+      cur.texts.push(ln.text);
+      cur.end = Math.max(cur.end, ln.end);
+      cur.coverEnd = Math.max(cur.coverEnd, lineCover);
+      cur.clipStart = Math.min(cur.clipStart, lineClipStart);
+    }
+    if (ln.shown) cur.shown.add(ln.sceneIndex); // remember every scene in which this phrase's speaker is shown
+  }
+  flush(); // emit the trailing phrase
+  return phrases;
+}
+function makeVoiceFactory(blueprint, canonical, nodes) { // Returns a memoized function speaker -> voice_select node id, pushing the node onto nodes on first use.
+  const bySpeaker = /* @__PURE__ */ new Map(); // speaker -> node id already created
+  const describe = (speaker) => { // first voice_description on a line of this canonical speaker, else cast / global voiceover description, else a generic label
+    for (const scene of blueprint.scenes)
+      for (const line of scene.dialogue ?? [])
+        if (canonical(line.speaker ?? "voiceover") === speaker && line.voice_description) return line.voice_description;
     const cast = blueprint.global?.cast?.find((c) => c.id === speaker);
     return cast?.description ?? blueprint.global?.voiceover?.voice_description ?? `${speaker} voice`;
   };
-  const ensureVoiceNode = (speaker) => {
-    const existing = voiceNodeBySpeaker.get(speaker);
+  return (speaker) => {
+    const existing = bySpeaker.get(speaker);
     if (existing) return existing;
-    const id = sanitizeId2(`voice_${speaker}`, `voice_${voiceNodeBySpeaker.size}`);
-    const description = speakerDescription(speaker);
-    const traits = parseVoiceTraits(description);
-    nodes.push({ id, type: "voice_select", params: { description, ...traits } });
-    voiceNodeBySpeaker.set(speaker, id);
+    const id = sanitizeId2(`voice_${speaker}`, `voice_${bySpeaker.size}`); // sanitizeId2 is defined elsewhere; second argument presumably a fallback id - TODO confirm
+    const description = describe(speaker);
+    nodes.push({ id, type: "voice_select", params: { description, ...parseVoiceTraits(description) } }); // traits parsed from the description are spread into params
+    bySpeaker.set(speaker, id);
     return id;
   };
+}
+function emitPhraseClip(phrase, voiceNode, env, nodes, out) { // Emits one video_generate clip for a shown-presenter phrase, plus audio extract and voice-convert nodes, and records per-scene slices of that clip.
+  const anchor = phrase.shownScenes[0]; // first shown scene anchors the node ids, prompt and start frame
+  const anchorScene = env.blueprint.scenes[anchor];
+  if (!anchorScene) return; // nothing is emitted when the anchor index has no scene
+  const present = slotsForScene(env.slots, anchor);
+  const nativeTurn = { // the whole phrase expressed as a single native turn on the anchor scene
+    sceneIndex: anchor,
+    speaker: phrase.speaker,
+    start_s: phrase.start_s,
+    end_s: phrase.end_s,
+    text: phrase.text,
+    voiceNode,
+    native: true
+  };
+  const mode = sceneShootMode(anchorScene, present, nativeTurn, env.cameraOn, env.casts);
+  const ctx = {
+    sceneIndex: anchor,
+    ar: env.ar,
+    reuse: env.reuse,
+    imageModel: env.opts.imageModel,
+    shootMode: mode,
+    ingestCache: env.ingestCache
+  };
+  const first = buildFrameRef(
+    "start",
+    anchorScene.start_frame_asset?.url,
+    anchorScene.start_frame_prompt,
+    slotsForFrame(env.slots, anchor, "start"),
+    ctx,
+    nodes
+  );
+  const lastShown = phrase.shownScenes[phrase.shownScenes.length - 1] ?? anchor; // the end frame comes from the last shown scene
+  const lastScene = env.blueprint.scenes[lastShown] ?? anchorScene;
+  const last = buildFrameRef(
+    "end",
+    lastScene.end_frame_asset?.url,
+    lastScene.end_frame_prompt,
+    slotsForFrame(env.slots, lastShown, "end"),
+    ctx,
+    nodes
+  );
+  const clipStart = phrase.shownScenes.reduce( // earliest of the phrase start and the shown scenes' starts
+    (m, s) => Math.min(m, env.blueprint.scenes[s]?.start_s ?? phrase.start_s),
+    phrase.start_s
+  );
+  const coverEnd = phrase.shownScenes.reduce((m, s) => Math.max(m, env.blueprint.scenes[s]?.end_s ?? 0), phrase.end_s); // latest of the phrase end and the shown scenes' ends
+  const phraseLen = Math.max(0.5, coverEnd - clipStart); // covered span, floored at 0.5
+  const genDur = ceilToSeedance(phraseLen); // helper defined elsewhere; presumably rounds up to a supported clip duration - TODO confirm
+  const clipParams = {
+    model: env.opts.videoModel,
+    prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text),
+    duration: genDur,
+    generate_audio: true
+  };
+  if (env.ar) clipParams.aspect_ratio = env.ar; // aspect ratio is only set when one is known
+  nodes.push({
+    id: `s${anchor}_clip`,
+    type: "video_generate",
+    inputs: { first_frame: first, last_frame: last },
+    params: clipParams
+  });
+  const clipRef = `$ref:s${anchor}_clip.video`;
+  const speechOffset = Math.max(0, phrase.start_s - clipStart); // where the speech begins inside the generated clip
+  const extractLen = Math.min(Math.max(0.5, phrase.end_s - phrase.start_s), Math.max(0.5, genDur - speechOffset)); // phrase length capped by what remains of the clip after the offset
+  nodes.push({
+    id: `s${anchor}_voextract`,
+    type: "ffmpeg",
+    inputs: { clip: clipRef },
+    params: { args: audioExtractArgs(extractLen, speechOffset), outputs: { audio: { kind: "audio", ext: "mp3" } } }
+  });
+  const convId = `s${anchor}_conv`;
+  nodes.push({ // converts the extracted audio using the speaker's selected voice
+    id: convId,
+    type: "audio_voice_convert",
+    inputs: { audio: `$ref:s${anchor}_voextract.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
+    params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
+  });
+  out.voTracks.push({
+    slot: convId,
+    ref: `$ref:${convId}.audio`,
+    start_s: phrase.start_s,
+    end_s: phrase.end_s,
+    kind: "vo"
+  });
+  out.voSegments.push({
+    slot: convId,
+    start_s: phrase.start_s,
+    end_s: phrase.end_s,
+    scene: anchor,
+    speaker: phrase.speaker
+  });
+  out.talkingScenes.push({
+    scene: anchor,
+    voice_convert_node: convId,
+    scene_s: Math.round(phraseLen * 100) / 100, // rounded to two decimals
+    est_speech_s: Math.round(estSpeechS(phrase.text) * 100) / 100
+  });
+  for (const s of phrase.shownScenes) { // register every shown scene as a slice of this one clip
+    const sc = env.blueprint.scenes[s];
+    if (!sc) continue;
+    const rawOffset = (sc.start_s ?? clipStart) - clipStart; // scene start relative to the clip start
+    out.sceneSlice.set(s, {
+      clipRef,
+      // Snap a sub-frame offset (line-start vs scene-start drift) to 0 so a single-scene
+      // phrase hits the whole-clip fast path instead of a needless re-encode + tiny shift.
+      offset: rawOffset < 0.05 ? 0 : rawOffset,
+      len: sceneDurationS(sc),
+      clipDur: genDur
+    });
+  }
+}
+function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) { // Emits a tts node for the phrase and records its voiceover track and segment on out.
+  let id = sanitizeId2(`vo_ph${idx}_${phrase.speaker}`, `vo_ph${idx}`);
+  while (used.has(id)) id = `${id}_x`; // keep appending a suffix until the id is unused
+  used.add(id);
+  nodes.push({
+    id,
+    type: "tts",
+    inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
+    params: { model: FIXED_TTS_MODEL, text: phrase.text, voice: "{{voice_ref}}" }
+  });
+  out.voTracks.push({ slot: id, ref: `$ref:${id}.audio`, start_s: phrase.start_s, end_s: phrase.end_s, kind: "vo" });
+  out.voSegments.push({
+    slot: id,
+    start_s: phrase.start_s,
+    end_s: phrase.end_s,
+    scene: phrase.firstScene, // the segment is attributed to the phrase's first scene
+    speaker: phrase.speaker
+  });
+}
+function emitCompositeInTimeline(composite, scene, i, isLast, env, canonical, ensureVoiceNode, usedVoIds, nodes, out) { // Emits a composite scene; a single on-camera speaker becomes a native turn, several speakers get per-speaker TTS.
+  const present = slotsForScene(env.slots, i);
+  const onCam = (scene.dialogue ?? []).filter( // non-empty lines spoken by on-camera speakers
+    (l) => Boolean(l.line?.trim()) && isOnCameraSpeaker(l.speaker ?? "voiceover", env.casts, env.cameraOn)
+  );
+  const distinctSpeakers = new Set(onCam.map((l) => canonical(l.speaker ?? "voiceover")));
+  let nativeTurn; // stays undefined unless exactly one on-camera speaker talks in this scene
+  if (onCam.length > 0 && distinctSpeakers.size === 1) {
+    const speaker = canonical(onCam[0]?.speaker ?? "voiceover");
+    const voiceNode = ensureVoiceNode(speaker);
+    const start = onCam[0]?.start_s ?? scene.start_s ?? 0; // first line start, else scene start, else 0
+    const end = onCam[onCam.length - 1]?.end_s ?? scene.end_s ?? start; // last line end, else scene end, else start
+    const text = onCam.map((l) => l.line.trim()).join(" ");
+    nativeTurn = { sceneIndex: i, speaker, start_s: start, end_s: end, text, voiceNode, native: true };
+    out.talkingScenes.push({
+      scene: i,
+      voice_convert_node: `${voiceNode}_conv`, // NOTE(review): id is derived from the voice node here, while emitPhraseClip uses a scene-based id - confirm this matches the node actually emitted
+      scene_s: Math.round(sceneDurationS(scene) * 100) / 100,
+      est_speech_s: Math.round(estSpeechS(text) * 100) / 100
+    });
+  }
+  const mode = sceneShootMode(scene, present, nativeTurn, env.cameraOn, env.casts);
+  const lengths = sceneTiming(scene, isLast, nativeTurn);
+  emitCompositeScene(
+    composite,
+    scene,
+    i,
+    present,
+    mode,
+    nativeTurn,
+    lengths,
+    lengths.out,
+    { ar: env.ar, reuse: env.reuse, imageModel: env.opts.imageModel, videoModel: env.opts.videoModel },
+    nodes,
+    out.voTracks,
+    out.nativeSegments,
+    out.clips
+  );
+  if (!nativeTurn && distinctSpeakers.size >= 2) { // several on-camera speakers: voice them through TTS instead
+    emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out);
+  }
+}
+function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out) { // Aggregates the given lines per canonical speaker and emits one TTS phrase per speaker.
+  const bySpeaker = /* @__PURE__ */ new Map(); // speaker -> { lines, start, end }
+  for (const l of onCam) {
+    const speaker = canonical(l.speaker ?? "voiceover");
+    const text = l.line.trim();
+    const start = l.start_s ?? scene.start_s ?? 0;
+    const end = l.end_s ?? start + estSpeechS(text); // a missing end_s is estimated from the text
+    const cur = bySpeaker.get(speaker);
+    if (cur) { // widen the speaker's existing window
+      cur.lines.push(text);
+      cur.start = Math.min(cur.start, start);
+      cur.end = Math.max(cur.end, end);
+    } else {
+      bySpeaker.set(speaker, { lines: [text], start, end });
+    }
+  }
+  for (const [speaker, agg] of bySpeaker) {
+    const voiceNode = ensureVoiceNode(speaker);
+    emitPhraseTts(
+      {
+        speaker,
+        start_s: agg.start,
+        end_s: agg.end,
+        text: agg.lines.join(" "),
+        firstScene: i,
+        shownScenes: [], // no shown scenes: this phrase is voiced by TTS only
+        presenterShown: false
+      },
+      voiceNode,
+      i, // the scene index doubles as the id index passed to emitPhraseTts
+      usedVoIds,
+      nodes,
+      out
+    );
+  }
+}
+function emitBrollScene(scene, i, isLast, env, nodes, out, prevEndFrame) { // Emits a scene with no native turn; returns its end-frame reference, or undefined when a still hold was emitted.
+  const present = slotsForScene(env.slots, i);
+  const mode = sceneShootMode(scene, present, void 0, env.cameraOn, env.casts);
+  const ambientBroll = Boolean(env.opts.ambient) && mode !== "ugc_selfie"; // ambient audio only when opts.ambient is truthy and the mode is not ugc_selfie
+  const lengths = sceneTiming(scene, isLast, void 0);
+  const ctx = {
+    sceneIndex: i,
+    ar: env.ar,
+    reuse: env.reuse,
+    imageModel: env.opts.imageModel,
+    shootMode: mode,
+    ingestCache: env.ingestCache
+  };
+  if (!ambientBroll && lengths.dur <= FLASH_HOLD_MAX_S) { // short non-ambient scenes are emitted as a held still
+    emitFlashHold(i, scene, env.slots, ctx, lengths, lengths.out, env.ar, nodes, out.clips);
+    return void 0;
+  }
+  const first = scene.continues_previous && prevEndFrame ? prevEndFrame : buildFrameRef( // reuse the previous end frame when the scene continues the previous one
+    "start",
+    scene.start_frame_asset?.url,
+    scene.start_frame_prompt,
+    slotsForFrame(env.slots, i, "start"),
+    ctx,
+    nodes
+  );
+  const last = buildFrameRef(
+    "end",
+    scene.end_frame_asset?.url,
+    scene.end_frame_prompt,
+    slotsForFrame(env.slots, i, "end"),
+    ctx,
+    nodes
+  );
+  const clip = emitSceneClip(
+    i,
+    scene,
+    present,
+    mode,
+    void 0,
+    ambientBroll,
+    { first, last },
+    { dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur },
+    lengths.out,
+    { ar: env.ar, videoModel: env.opts.videoModel },
+    nodes
+  );
+  if (ambientBroll) { // ambient scenes also emit native audio, with speech length 0
+    emitSceneNativeAudio(
+      i,
+      scene,
+      void 0,
+      true,
+      { dur: lengths.dur, speech: 0, genDur: lengths.genDur },
+      nodes,
+      out.voTracks,
+      out.nativeSegments
+    );
+  }
+  out.clips.push(clip);
+  return last; // handed back so the caller can pass it as the next scene's prevEndFrame
+}
+function buildTimeline(blueprint, slots, opts, nodes) { // Builds phrases, emits their clips or TTS, then walks the scenes in order to assemble clips; returns clips, voTracks, vo_segments and talking_scenes.
+  const reuse = opts.frames === "reuse";
+  const compositeScenes = /* @__PURE__ */ new Set(); // scene indices with a layered composition; left empty when frames are reused
+  if (!reuse) {
+    blueprint.scenes.forEach((s, i) => {
+      if (layeredComposition(s)) compositeScenes.add(i);
+    });
+  }
+  const canonical = collapseVoiceover(blueprint);
+  const ensureVoiceNode = makeVoiceFactory(blueprint, canonical, nodes);
+  const env = { // read-only context shared by the emit helpers
+    blueprint,
+    slots,
+    opts,
+    ar: aspectRatioParam(blueprint),
+    reuse,
+    cameraOn: onCameraDialogue(blueprint),
+    casts: castIdSet(blueprint),
+    ingestCache: /* @__PURE__ */ new Map()
+  };
+  const out = { // accumulators filled by the emit helpers
+    clips: [],
+    voTracks: [],
+    voSegments: [],
+    talkingScenes: [],
+    nativeSegments: [],
+    sceneSlice: /* @__PURE__ */ new Map()
+  };
+  const presenterPresent = makePresenterPresent(slots, canonical);
+  const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent);
   const usedVoIds = /* @__PURE__ */ new Set();
-  blueprint.scenes.forEach((scene, sceneIndex) => {
-    const lines = (scene.dialogue ?? []).filter((l) => Boolean(l.line?.trim())).slice().sort((a, b) => (a.start_s ?? 0) - (b.start_s ?? 0));
-    if (lines.length === 0) return;
-    const groups = [];
-    for (const line of lines) {
-      const speaker = line.speaker ?? "voiceover";
-      const last = groups[groups.length - 1];
-      if (last && last.speaker === speaker) last.lines.push(line);
-      else groups.push({ speaker, lines: [line] });
-    }
-    const shells = groups.map((group) => {
-      const first = group.lines[0];
-      const last = group.lines[group.lines.length - 1];
-      if (!first || !last) return void 0;
-      return {
-        group,
-        start: first.start_s ?? scene.start_s ?? 0,
-        end: last.end_s ?? last.start_s ?? scene.end_s ?? first.start_s ?? scene.start_s ?? 0,
-        onCamera: isOnCameraSpeaker(group.speaker, casts, cameraOn)
-      };
-    }).filter((s) => Boolean(s));
-    const onCamCount = shells.filter((s) => s.onCamera).length;
-    const list = [];
-    shells.forEach((shell, gi) => {
-      const { group, start, end, onCamera } = shell;
-      const voiceNode = ensureVoiceNode(group.speaker);
-      const text = group.lines.map((l) => l.line.trim()).join(" ");
-      const native = onCamera && onCamCount === 1;
-      const turn = {
-        sceneIndex,
-        speaker: group.speaker,
-        start_s: start,
-        end_s: end,
-        text,
-        voiceNode,
-        native
-      };
-      if (!native) {
-        let id = sanitizeId2(`vo_s${sceneIndex}_${group.speaker}`, `vo_${sceneIndex}_${gi}`);
-        if (usedVoIds.has(id)) {
-          let n = 2;
-          while (usedVoIds.has(`${id}_${n}`)) n++;
-          id = `${id}_${n}`;
-        }
-        usedVoIds.add(id);
+  const claimed = /* @__PURE__ */ new Set(); // scenes already served by an earlier phrase clip
+  phrases.forEach((phrase, k) => {
+    const voiceNode = ensureVoiceNode(phrase.speaker);
+    const available = phrase.shownScenes.filter((s) => !claimed.has(s));
+    if (phrase.presenterShown && available.length > 0) { // shown presenter with unclaimed scenes: generate a talking clip
+      for (const s of available) claimed.add(s);
+      emitPhraseClip({ ...phrase, shownScenes: available }, voiceNode, env, nodes, out);
+    } else { // otherwise the phrase is voiced by TTS only
+      emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out);
+    }
+  });
+  const lastIndex = blueprint.scenes.length - 1;
+  let prevEndFrame; // end frame of the previous b-roll scene; reset after composite and sliced scenes
+  blueprint.scenes.forEach((scene, i) => {
+    const composite = compositeScenes.has(i) ? layeredComposition(scene) : null;
+    if (composite) {
+      emitCompositeInTimeline(
+        composite,
+        scene,
+        i,
+        i === lastIndex,
+        env,
+        canonical,
+        ensureVoiceNode,
+        usedVoIds,
+        nodes,
+        out
+      );
+      prevEndFrame = void 0;
+      return;
+    }
+    const slice = out.sceneSlice.get(i); // set by emitPhraseClip when this scene is covered by a phrase clip
+    if (slice) {
+      const whole = slice.offset === 0 && Math.abs(slice.len - slice.clipDur) <= 0.05; // the scene spans the entire clip, within 0.05
+      if (whole) {
+        out.clips.push({ ref: slice.clipRef, scene_s: slice.len, out: null }); // reference the generated clip directly
+      } else {
         nodes.push({
-          id,
-          type: "tts",
-          inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
-          params: { model: FIXED_TTS_MODEL, text, voice: "{{voice_ref}}" }
+          id: `s${i}_seg`,
+          type: "ffmpeg",
+          inputs: { clip: slice.clipRef },
+          params: { args: trimArgs(slice.len, slice.offset), outputs: { video: { kind: "video", ext: "mp4" } } }
         });
-        turn.ttsId = id;
-        const audioRef = `$ref:${id}.audio`;
-        tracks.push({ slot: id, ref: audioRef, start_s: start, end_s: end, kind: "vo" });
+        out.clips.push({ ref: `$ref:s${i}_seg.video`, scene_s: slice.len, out: null }); // reference the trimmed segment instead
       }
-      list.push(turn);
-    });
-    sceneTurns.set(sceneIndex, list);
+      prevEndFrame = void 0;
+      return;
+    }
+    prevEndFrame = emitBrollScene(scene, i, i === lastIndex, env, nodes, out, prevEndFrame); // everything else is emitted as b-roll
   });
-  return { tracks, sceneTurns };
+  const totalMs = Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3); // source duration, else last scene end, in milliseconds
+  out.voTracks.push(...buildPerSpeakerVoiceConversion(out.nativeSegments, totalMs, nodes)); // append the tracks returned for the collected native segments
+  return { clips: out.clips, voTracks: out.voTracks, vo_segments: out.voSegments, talking_scenes: out.talkingScenes };
 }
 function buildSfxMusic(blueprint, nodes) {
   const tracks = [];
@@ -9106,13 +9930,21 @@ function buildSfxMusic(blueprint, nodes) {
   const musicPrompt = blueprint.global?.music?.music_prompt;
   if (musicPrompt) {
     const total = blueprint.source?.duration_s ?? lastSceneEnd(blueprint);
-    const startAt = Math.min(Math.max(blueprint.global?.music?.starts_at_s ?? 0, 0), Math.max(total - 0.5, 0));
+    const hookEnd = blueprint.scenes[0]?.end_s ?? 0;
+    const startAt = Math.min(Math.max(blueprint.global?.music?.starts_at_s ?? 0, hookEnd), Math.max(total - 0.5, 0));
     const totalMs = Math.round((total - startAt) * 1e3);
     const musicMs = Math.min(Math.max(totalMs, 3e3), ELEVENLABS_MAX_MUSIC_LENGTH_MS);
     nodes.push({
       id: "music_bed",
       type: "music",
-      params: { model: FIXED_MUSIC_MODEL, prompt: musicBedPrompt(blueprint, musicPrompt), music_length_ms: musicMs }
+      // force_instrumental: the model is vocal-capable; without this it can SING the
+      // mood (and feeding it the script made it sing the ad). The voice owns the words.
+      params: {
+        model: FIXED_MUSIC_MODEL,
+        prompt: musicBedPrompt(blueprint, musicPrompt),
+        music_length_ms: musicMs,
+        force_instrumental: true
+      }
     });
     tracks.push({
       slot: "music",
@@ -9156,22 +9988,63 @@ function normalizeAnim(animation) {
   const mapped = animation === "slide" ? "slide_up" : animation;
   return SUPPORTED_ANIMS.has(mapped) ? mapped : void 0;
 }
+var FACE_ZONE_POSITIONS = /* @__PURE__ */ new Set([ // normalized position names that positionClass remaps to bottom-center
+  "center",
+  "centre",
+  "mid-center",
+  "mid-centre",
+  "middle-center",
+  "center-center",
+  "mid",
+  "middle"
+]);
 function positionClass(position) { // Maps an overlay position to a pos-* CSS class name.
   const p = (position ?? "bottom_center").toLowerCase().replace(/[^a-z]+/g, "-"); // lower-case, with each run of non-letters collapsed to one hyphen
-  return `pos-${p}`;
+  const safe = FACE_ZONE_POSITIONS.has(p) ? "bottom-center" : p; // positions listed in FACE_ZONE_POSITIONS are moved to bottom-center
+  return `pos-${safe}`;
 }
-function overlayElement(ov, sceneStart) {
+function collectCaptions(blueprint) { // Collects every overlay with non-empty text as { text, at, end, ov }, sorted by start time.
+  return blueprint.scenes.flatMap((scene) => {
+    const sceneStart = scene.start_s ?? 0;
+    const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []); // a scene whose overlays fail to parse contributes nothing
+    return overlays.success ? overlays.data.filter((ov) => Boolean(ov.text?.trim())).map((ov) => {
+      const at = ov.appears_at_s ?? sceneStart; // start defaults to the scene start
+      return { text: ov.text.trim(), at, end: at + (ov.duration_s ?? 2.5), ov }; // duration defaults to 2.5
+    }) : [];
+  }).sort((a, b) => a.at - b.at);
+}
+function mergeCaptions(blueprint) { // Merges captions with identical text whose time windows touch or nearly touch into single entries.
+  const byText = /* @__PURE__ */ new Map(); // text -> entries in start order (collectCaptions returns them sorted)
+  for (const e of collectCaptions(blueprint)) {
+    const arr = byText.get(e.text);
+    if (arr) arr.push(e);
+    else byText.set(e.text, [e]);
+  }
+  const merged = [];
+  for (const arr of byText.values()) {
+    let cur = null; // the merged entry currently being extended for this text
+    for (const e of arr) {
+      if (cur && e.at <= cur.end + 0.35) cur.end = Math.max(cur.end, e.end); // starts within 0.35 of the current end: extend it
+      else {
+        cur = { ...e }; // copy, so the collected entry itself is not mutated
+        merged.push(cur);
+      }
+    }
+  }
+  return merged.sort((a, b) => a.at - b.at); // back to chronological order across texts
+}
+function overlayElement(ov, at, dur) { // Renders one overlay as a div string; the new signature takes start and duration from the caller instead of computing them here.
   if (!ov.text?.trim()) return ""; // overlays without text render nothing
-  const at = ov.appears_at_s ?? sceneStart;
-  const dur = ov.duration_s ?? 2.5;
   const role = ov.role ? ` data-role="${escapeHtml(ov.role)}"` : "";
   const normAnim = normalizeAnim(ov.animation); // undefined when the animation is not supported
   const anim = normAnim ? ` data-anim="${normAnim}"` : "";
   const detail = ov.animation_detail ? ` data-anim-detail="${escapeHtml(ov.animation_detail)}"` : "";
   return `<div class="ov ${positionClass(ov.position)}" data-start="${at}" data-dur="${dur}"${role}${anim}${detail}>${escapeHtml(ov.text.trim())}</div>`;
 }
+var RICH_OVERLAY_RE = /notif|tweet|\bx post\b|post\b|comment|message|chat|bubble|card|review|rating|stat|counter|toast|popup/; // keywords sourceHint tests against the lower-cased kind/description/what_it_represents; NOTE(review): the x-post alternative is already covered by the shorter post alternative, and most keywords match as substrings
 function sourceHint(fe) {
   const desc = fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element";
+  const haystack = `${fe.kind ?? ""} ${fe.description ?? ""} ${fe.what_it_represents ?? ""}`.toLowerCase();
   switch ((fe.kind ?? "").toLowerCase()) {
     case "logo":
       return "baker images logo <domain> (or baker images library)";
@@ -9181,6 +10054,9 @@ function sourceHint(fe) {
     case "product_cutout":
       return `baker images library "${desc}" (the client's own product)`;
     default:
+      if (RICH_OVERLAY_RE.test(haystack)) {
+        return `npx hyperframes add <social-card/notification block> for "${desc}" (animated overlay, not a static icon \u2014 see references/hyperframes/catalog.md)`;
+      }
       return `baker images icon "${desc}"`;
   }
 }
@@ -9196,6 +10072,26 @@ function floatingStub(fe, sceneStart) {
     `<img class="ov ${positionClass(fe.position)}" src="your-${slug}.png" data-start="${at}" data-dur="${dur}" alt="" /> -->`
   ].join("\n");
 }
+function uiPipStub(scene) { // Returns an HTML comment stub describing a phone-UI picture-in-picture for the scene, or an empty string when the scene does not qualify.
+  const comp = scene.composition;
+  const layout = (comp?.layout ?? "").toLowerCase();
+  if (!COMPOSITE_LAYOUTS.has(layout)) return ""; // only layouts listed in COMPOSITE_LAYOUTS qualify
+  const regions = (comp?.regions ?? []).filter((r) => Boolean(r) && typeof r === "object"); // keep object regions only
+  if (regions.length < 2 || !isUiOnlyComposite(regions)) return ""; // needs at least two regions that pass isUiOnlyComposite
+  const ui = regions.find(regionIsUiSurface);
+  const at = scene.start_s ?? 0;
+  const dur = Math.max(0.5, Math.round(((scene.end_s ?? at + 2.5) - at) * 100) / 100); // scene length rounded to two decimals, floored at 0.5; 2.5 when end_s is missing
+  const label = commentSafe(ui?.summary || ui?.frame_prompt || ui?.panel || "the app screen");
+  return [
+    `<!-- PHONE UI @ ${at}s for ${dur}s \u2014 the app/site screen this scene shows: ${label}.`,
+    "     Build it as a REAL surface, NEVER AI: capture the live page \u2014",
+    "       baker images screenshot https://<brand-domain>/<path>  (image-library skill)",
+    "     \u2014 OR hand-build a brand-accurate HTML screen; then frame it in a phone mockup:",
+    "       npx hyperframes add phone-scroll   (writes compositions/phone-scroll.html)",
+    "     drop the screenshot as screenshot.png in this dir and nest it as a PIP clip:",
+    `     <div data-composition-src="compositions/phone-scroll.html" data-start="${at}" data-duration="${dur}" data-track-index="2" data-width="1080" data-height="1920"></div> -->`
+  ].join("\n");
+}
 function buildOverlayHtml(input) {
   const blueprint = VideoBlueprint.parse(input);
   const blocks = [
@@ -9215,14 +10111,14 @@ function buildOverlayHtml(input) {
       "     Positions: edit the .pos-* classes or add your own. -->"
     ].join("\n")
   ];
+  const ovParts = mergeCaptions(blueprint).map((e) => overlayElement(e.ov, e.at, Math.round((e.end - e.at) * 1e3) / 1e3)).filter(Boolean);
+  if (ovParts.length > 0) blocks.push(ovParts.join("\n"));
   for (const scene of blueprint.scenes) {
     const sceneStart = scene.start_s ?? 0;
-    const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
     const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
-    const parts = [
-      ...overlays.success ? overlays.data.map((ov) => overlayElement(ov, sceneStart)) : [],
-      ...floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []
-    ].filter(Boolean);
+    const parts = (floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []).filter(Boolean);
+    const pip = uiPipStub(scene);
+    if (pip) parts.push(pip);
     if (parts.length > 0) blocks.push(parts.join("\n"));
   }
   return blocks.join("\n\n");
@@ -9255,15 +10151,15 @@ function xfadeSpineArgs(clips) {
   let cur = "c0";
   let accLen = clipInputLen(clips[0]);
   for (let k = 0; k < n - 1; k++) {
-    const join3 = clips[k].out;
+    const join4 = clips[k].out;
     const next = `c${k + 1}`;
     const out = k === n - 2 ? "v" : `j${k + 1}`;
-    if (join3) {
-      const offset = Math.max(0, accLen - join3.dur);
+    if (join4) {
+      const offset = Math.max(0, accLen - join4.dur);
       filt.push(
-        `[${cur}][${next}]xfade=transition=${join3.xfade}:duration=${join3.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
+        `[${cur}][${next}]xfade=transition=${join4.xfade}:duration=${join4.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
       );
-      accLen = accLen - join3.dur + clipInputLen(clips[k + 1]);
+      accLen = accLen - join4.dur + clipInputLen(clips[k + 1]);
     } else {
       filt.push(`[${cur}][${next}]concat=n=2:v=1[${out}]`);
       accLen += clipInputLen(clips[k + 1]);
@@ -9304,15 +10200,14 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
       params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
     });
   });
-  if (opts.actorSheets) applyActorSheets(slots, nodes);
-  const { tracks: ttsTracks, sceneTurns } = buildDialogue(blueprint, nodes);
-  const { clips, voTracks: nativeVoTracks } = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
-  const voTracks = [...ttsTracks, ...nativeVoTracks];
+  applyActorSheets(slots, nodes);
+  const { clips, voTracks, vo_segments, talking_scenes } = buildTimeline(blueprint, slots, opts, nodes);
   let videoRef = buildSpine(clips, nodes);
   let videoNode = "spine";
   const overlays = blueprint.scenes.flatMap((s) => s.overlays ?? []);
   const floating = blueprint.scenes.flatMap((s) => s.floating_elements ?? []);
-  if (overlays.length > 0 || floating.length > 0) {
+  const hasUiPip = blueprint.scenes.some((s) => uiPipStub(s) !== "");
+  if (overlays.length > 0 || floating.length > 0 || hasUiPip) {
     nodes.push({
       id: "overlaid",
       type: "hyperframe_render",
@@ -9322,10 +10217,28 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
     videoRef = "$ref:overlaid.video";
     videoNode = "overlaid";
   }
+  if (opts.captionsCompositionPath && opts.transcriptPath) {
+    nodes.push({
+      id: "captions_transcript",
+      type: "ingest",
+      params: { source: "path", path: opts.transcriptPath, expect: "json" }
+    });
+    nodes.push({
+      id: "captions",
+      type: "hyperframe_render",
+      inputs: { background: videoRef, transcript: "$ref:captions_transcript.asset" },
+      params: { composition: opts.captionsCompositionPath }
+    });
+    videoRef = "$ref:captions.video";
+    videoNode = "captions";
+  }
   const tracks = [...voTracks, ...buildSfxMusic(blueprint, nodes)];
   if (tracks.length > 0) {
     const mixInputs = {};
     for (const t of tracks) mixInputs[t.slot] = t.ref;
+    const musicTrack = tracks.find((t) => t.kind === "music");
+    const voiceSlots = tracks.filter((t) => t.kind === "vo").map((t) => t.slot);
+    const duck = musicTrack && voiceSlots.length > 0 ? { duck: { track: musicTrack.slot, against: voiceSlots } } : {};
     nodes.push({
       id: "audio_mix",
       type: "audio_timeline",
@@ -9336,7 +10249,8 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
           start_s: t.start_s,
           ...t.gain_db !== void 0 ? { gain_db: t.gain_db } : {}
         })),
-        total_ms: Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3)
+        total_ms: Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3),
+        ...duck
       }
     });
     nodes.push({
@@ -9384,45 +10298,31 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
       // The timing plan `baker canvas validate` checks before any billed render:
       // sequenced voiceover turns (no overlap), audio ≈ video length, and which
       // scenes must be lip-synced.
-      video: buildVideoMeta(blueprint, sceneTurns)
+      video: buildVideoMeta(blueprint, { vo_segments, talking_scenes })
     },
     nodes,
     output: { node: videoNode, output: "video" }
   };
 }
-function buildVideoMeta(blueprint, sceneTurns) {
-  const vo_segments = [];
-  const talking_scenes = [];
-  for (const [scene, turns] of [...sceneTurns.entries()].sort((a, b) => a[0] - b[0])) {
-    for (const t of turns) {
-      if (t.ttsId) vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
-    }
-    const nativeTurn = turns.find((t) => t.native);
-    if (nativeTurn) {
-      const sceneObj = blueprint.scenes[scene];
-      talking_scenes.push({
-        scene,
-        voice_convert_node: `s${scene}_voconv`,
-        scene_s: sceneObj ? Math.round(sceneDurationS(sceneObj) * 100) / 100 : void 0,
-        est_speech_s: Math.round(estSpeechS(nativeTurn.text) * 100) / 100
-      });
-    }
-  }
+function buildVideoMeta(blueprint, meta) { // Assembles the `video` metadata object from the blueprint plus caller-supplied `meta.vo_segments` / `meta.talking_scenes` (no longer derived here from per-scene turns).
   return {
     duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint), // falls back to lastSceneEnd(blueprint) only when the source duration is null/undefined
-    vo_segments,
-    talking_scenes,
-    motion_board: buildMotionBoard(blueprint, sceneTurns)
+    vo_segments: [...meta.vo_segments].sort((a, b) => a.start_s - b.start_s), // sorted ascending by start_s on a copy, so the caller's array is not reordered
+    talking_scenes: meta.talking_scenes, // passed through as given (same array reference, not copied)
+    motion_board: buildMotionBoard(blueprint) // built from the blueprint alone
   };
 }
-function buildMotionBoard(blueprint, sceneTurns) {
+function sceneSpokenText(scene) { // Joins the scene's trimmed, non-empty dialogue lines with single spaces; returns null when the scene has no dialogue or every line is empty.
+  return (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l)).join(" ") || null; // `|| null` turns the empty-string join result into null
+}
+function buildMotionBoard(blueprint) {
   const round = (n) => Math.round(n * 100) / 100;
   let cursor = 0;
   return blueprint.scenes.map((scene, i) => {
     const start_s = scene.start_s ?? cursor;
     const end_s = scene.end_s ?? start_s + sceneDurationS(scene);
     cursor = end_s;
-    const spoken = (sceneTurns.get(i) ?? []).map((t) => t.text?.trim()).filter((l) => Boolean(l)).join(" ") || null;
+    const spoken = sceneSpokenText(scene);
     const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
     const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
     const graphics = [
@@ -9445,19 +10345,21 @@ function buildMotionBoard(blueprint, sceneTurns) {
       scene: i,
       role: resolveSceneRole(scene, i, blueprint.scenes.length),
       window_s: [round(start_s), round(end_s)],
-      storyboard_frames: [`s${i}_start`, `s${i}_end`],
+      // A continuation b-roll scene shares the previous scene's end frame as its start
+      // (no own `s<i>_start` node), so point the storyboard at that shared keyframe.
+      storyboard_frames: [scene.continues_previous && i > 0 ? `s${i - 1}_end` : `s${i}_start`],
       spoken,
       graphics
     };
   });
 }
 var VIDEO_GUIDE = [
-  "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192 concatenated. Talking heads are voiced NATIVELY by Seedance (lips+voice generated together) then re-voiced to one brand voice; off-camera narration is sequenced tts. On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voiceover + SFX + a ducked music bed, normalized stereo. It is a DRAFT: edit it, supply the real assets, then validate and run.",
+  "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video, built like an editing timeline. The VOICE is cut at PAUSES, not at visual cuts: each continuous-speech PHRASE is ONE Seedance clip (native lip-sync + audio) re-voiced to one brand voice, so a sentence never breaks mid-word across a cut. Each scene's PICTURE is independent: a scene that SHOWS the speaker slices its window out of the phrase clip; a b-roll cutaway gets its own silent clip (or a still hold for a sub-2s flash) laid over the continuing voice; a pure-voiceover stretch is one ElevenLabs tts read. Every clip gets a CLEAN-PLATE start AND end keyframe (no baked text), RECAST to your dropped reference assets \u2014 Seedance interpolates real in-shot motion between them. Each frame grounds ONLY on its own extracted frame + el_* slots (never another generated frame), so all frames render in PARALLEL (no cross-frame cascade). A SPLIT-SCREEN / PICTURE-IN-PICTURE / KEYED-PRESENTER scene is reproduced as one clip PER REGION, stacked or overlaid (see `metadata.todo.composition`). On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voice + SFX + a ducked music bed, normalized stereo. It is a STARTING POINT, not a locked render: add, delete, reorder, split, merge, or re-time scenes freely (a b-roll cutaway INSIDE a phrase lands at an approximate beat \u2014 nudge it) \u2014 see `metadata.todo.full_flexibility`.",
   "",
   "WHAT TO DO NEXT:",
   "0. RE-CRAFT THE SCRIPT FIRST (don't clone). This reference already won in-market, but copying a video is much harder than a static: the hook is targeting and may not transfer, and the message must become TRUE for our brand. Work the `metadata.todo.script_recraft` checklist \u2014 for each scene judge its role (hook/body/CTA), decide keep/cut/reorder/replace, and re-author every line for OUR customer's pain + OUR offer. See `references/script-craft.md` (hook/body/CTA framework) and the `meta-ads-playbook` skill. Most of the work lives here.",
-  "1. Edit each frame's prompt IN PLACE. Every `s<i>_start` / `s<i>_end` node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want.",
-  "1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. The boundary frames ARE your storyboard; `metadata.video.motion_board` lays out each scene's frames, time window, spoken line, and the graphics scheduled in it. Lock the frames + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
+  "1. Edit each frame's prompt IN PLACE. Every `s<i>_start` keyframe node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want. The frame is RECAST to the el_* reference images you drop (the source ad's people are never reused), so describe pose/action/framing here and let the references carry identity.",
+  "1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. Each scene's keyframe IS your storyboard; `metadata.video.motion_board` lays out each scene's frame, time window, spoken line, and the graphics scheduled in it. Lock the keyframes + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
   "2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent. `baker canvas run` REFUSES to start until every `[TODO]` slot holds a real source \u2014 so this is mandatory, not optional.",
   "3. Talking heads are NATIVE: a scene with one on-camera speaker is voiced by Seedance itself (the line is in the clip's prompt + `generate_audio`), so lips and voice are generated together \u2014 no separate tts, no post-hoc lip-sync. Edit the line in the scene's `s<i>_clip` prompt to re-author the words TRUE for your brand. Off-camera narration scenes still use a sequenced `tts` per turn.",
   "4. Voice consistency: every native talking clip's audio is re-voiced to ONE brand voice via `audio_voice_convert` (timing preserved \u2192 lips stay matched). Confirm the `voice_select` casting (one per speaker) \u2014 its `voice_id` is the brand voice; set its gender/language so the voice matches the creator.",
@@ -9468,11 +10370,11 @@ var VIDEO_GUIDE = [
   "- Iterate cheaply, do NOT brute-force takes: credits are scarce. Strong inputs (locked reference + a precise move-by-move prompt) make 1\u20132 takes land. When a take misses, change the SMALLEST thing and re-run \u2014 the cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
   "- Keep clips SHORT (trust the scene-length trim \u2014 don't pad) and LOCK THE CAMERA: unmotivated AI drift is the top tell.",
   "- One generation = one speaking voice \u2014 Seedance can't voice two on-camera speakers in one clip; split a two-speaker beat into separate scenes (or one stays silent). Lip-sync follows the line verbatim; emotion in [brackets].",
-  "- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it). Match-cut continuous action by setting scene N+1's start frame = scene N's end frame (costs no extra gens).",
+  "- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it).",
   "- Nail the pack shot (final product hero \u2014 motivated move, matched light). Geometry drifting? drop a high-angle SCHEMATIC still as an extra reference (a map beats a paragraph). Before/after (dry\u2192wet) = two separate locked references, not words mid-scene.",
   "- The hook reads SOUND-OFF: scene 1's frame + overlay carry it in ~1s \u2014 don't bury the hook in the spoken line (meta-ads-playbook \xA739/\xA748).",
   "",
-  "Tip: `prompt.json` is the deconstruction provenance + the demoted GLOBAL STYLE REFERENCE each frame reads for shared palette/cast cohesion. It is NOT the per-frame editing surface \u2014 the frame nodes are.",
+  "Tip: `prompt.json` is the deconstruction provenance + the authoritative SHARED AD SPEC each frame reads for cast identity, palette, brand, and type cohesion. The per-frame editing surface is the frame node's own FRAME DESCRIPTION.",
   "Production craft: references/video-craft.md. Hook/message/script-structure layer: references/script-craft.md + the meta-ads-playbook skill."
 ].join("\n");
 function inferNarrativeRole(index, total) {
@@ -9514,14 +10416,16 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
   const hookSceneIndex = findHookSceneIndex(blueprint);
   const h = hookSceneIndex;
   return {
+    full_flexibility: "THIS CANVAS IS A STARTING POINT, NOT A LOCKED RENDER. It mirrors the reference's structure so you have a faithful scaffold \u2014 but you have FULL EDITING FREEDOM and should use it. You can: ADD a scene (new s<i>_start/_end + s<i>_clip + wire it into `spine`), DELETE a scene (drop its nodes + its `spine` input), REORDER scenes, SPLIT one beat into two or MERGE two into one, change any frame prompt or motion brief, swap an element reference, re-time or rewrite any overlay/voice, or change a scene's LAYOUT (make a full-frame beat a split-screen/PIP, or flatten a composite to one shot \u2014 see `composition`). Re-craft for OUR brand and OUR best ad; the reference is inspiration, not a spec to trace. The content-addressed cache re-bills only what you actually change, so iterate freely. `baker canvas validate` re-checks timing/lip-sync after any edit.",
+    composition: "Some scenes are COMPOSITED, not single shots \u2014 `prompt.json`'s scene.composition.layout tells you which: `split_screen` (panels each showing different footage \u2014 e.g. b-roll on top, presenter on the bottom), `pip` (a presenter boxed in a corner over full-frame background), or `keyed_overlay` (a green-screen/cut-out presenter over background). Each is reproduced as ONE generated clip PER REGION (`s<i>_r0_*`, `s<i>_r1_*`, \u2026) stacked (vstack/hstack) or overlaid by an `s<i>_composite` ffmpeg node; a keyed presenter runs through `s<i>_key` (video_background_remove) for a transparent cut-out first. Edit each region's own keyframe prompt + motion brief independently. The presenter region (is_presenter) carries the lip-synced voice. To CHANGE a layout, edit composition in prompt.json and re-scaffold, or hand-edit the s<i>_composite ffmpeg args (splitStackArgs/pipOverlayArgs patterns). A clean full-frame talking head is simpler than a composite \u2014 flatten when the brand's version doesn't need the split.",
     recraft_the_script_first: `VIDEO IS INSPIRATION, NOT A CLONE. Before rendering, re-craft the script (hook \u2192 body \u2192 CTA) for OUR brand \u2014 the reference already won in-market, but its hook is targeting and may not transfer.${h >= 0 ? " The HOOK is the #1 decision (see the `hook` todo);" : ""} ${h >= 0 ? "then work" : "Work"} the per-scene \`script_recraft\` checklist. References: references/hook-craft.md (the hook), references/script-craft.md (body/CTA) + the meta-ads-playbook skill.`,
     ...h >= 0 ? {
       hook: `THE HOOK IS THE HIGHEST-LEVERAGE BEAT \u2014 the first frame + first 3\u20134s decide whether the ad is watched at all, and the hook is TARGETING. But highest-leverage does NOT mean always rewrite: this hook already won, so MOST OF THE TIME you KEEP it and build on top (swap only the specifics). REBUILD is the exception \u2014 only when it doesn't transfer (a claim we lack or a different funnel/awareness stage), and then by reaching for its deeper INNER MECHANIC and delivering that truthfully, not inventing a new opener from nothing. For scene ${h}: DIAGNOSE it (device + mechanic + what stage it targets), DECIDE keep/adapt/rebuild, then hold the opener to the criteria \u2014 ${HOOK_OPENER_CRITERIA}. The hook lives across s${h}_start (the scroll-stopping first frame), the scene-${h} overlay text, the s${h}_clip line, an optional ~0.5s micro-hook, and the ramp into the body. Full diagnose\u2192decide\u2192(keep/adapt/rebuild) discipline + the proven hook-type menu: references/hook-craft.md (+ meta-ads-playbook \xA710/\xA717/\xA739).`
     } : {},
     script_recraft: buildScriptRecraft(blueprint),
-    edit_frames_in_place: "Each s<i>_start / s<i>_end node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is only a shared style reference. Frames are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
+    edit_frames_in_place: "Each s<i>_start keyframe node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is the authoritative shared ad spec (cast identity, palette, brand). Frames are RECAST to the el_* reference images (the source ad's cast is never reused) and are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
     frames_mode: opts.frames ?? "generate",
-    review_storyboard_before_clips: "STORYBOARD FIRST. The per-scene boundary frames (s<i>_start / s<i>_end) ARE your storyboard \u2014 align the LOOK on them before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's frames, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
+    review_storyboard_before_clips: "STORYBOARD FIRST. Each scene's keyframe (s<i>_start) IS your storyboard \u2014 align the LOOK on it before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's keyframe, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
     motion_board: "`metadata.video.motion_board` maps which overlay/graphic fires on which spoken beat (per scene: window_s, spoken line, each graphic's at_s). Review it so every graphic lands on the word it punctuates \u2014 don't let an overlay drift off its beat. Adjust an overlay's data-start in video-overlay-composition/index.html (or appears_at_s in prompt.json) to retime it.",
     assets_required: "MANDATORY: drop a real image at every el_* [TODO] ingest before running \u2014 `baker canvas run` refuses to start (and bills nothing) until each placeholder holds a real source.",
     sourcing: 'Resolve each el_* [TODO]: (1) the client\'s OWN assets first \u2014 `baker images library "<subject>"` (e.g. the company owner as the actor) and `baker images logo <domain>` for the brand mark; (2) otherwise search a FRESH real reference \u2014 `baker images find "<subject>" --sources pinterest` (Pinterest is photo-real/candid, best for people & sets) or generate one with the image model. el_creator_* and el_location are [SOURCE FRESH]: cast a DIFFERENT person/animal/set than the original ad \u2014 never the source\'s individual.',
@@ -9534,18 +10438,17 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
       voice_description: d.voice_description,
       line: d.line
     })),
-    talking_head_note: "NATIVE: a single-on-camera-speaker scene is voiced by Seedance itself (line in s<i>_clip prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Edit the line in the clip's prompt to re-author it.",
-    voice_note: "Every native talking clip's audio is re-voiced to ONE brand voice via audio_voice_convert (eleven_multilingual_sts_v2), timing preserved so lips stay matched. voice_select.voice_id is that brand voice \u2014 set its gender/language to match the creator. Off-camera narration uses a sequenced tts per turn.",
-    native_timing: "Seedance paces the spoken line to fill the clip, so each native talking clip is generated long enough for the estimated speech and its audio is kept full-length (not hard-trimmed to the visual scene) \u2014 the line is never cut mid-word; the voice may continue a beat past the visual cut (natural VO continuity). `metadata.video.talking_scenes` carries each scene's scene_s vs est_speech_s. If a rendered line still sounds clipped, the line is simply longer than the scene: shorten the line or lengthen the scene in the deconstruct.",
+    talking_head_note: "PHRASE-NATIVE: a continuous-speech phrase where the speaker is shown is ONE Seedance clip (the full phrase quoted in s<anchor>_clip's prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Scenes that show the speaker slice their window out of that clip (s<i>_seg); edit the phrase line in the s<anchor>_clip prompt to re-author it. A pure-voiceover phrase (speaker never shown) is one ElevenLabs tts read instead.",
+    voice_note: "ONE voice per person: a single voice_select is reused across all that person's phrases (on-camera AND off \u2014 the deconstruct's `voiceover` label folds into the sole presenter). Each presenter phrase's native audio is re-voiced to that brand voice via audio_voice_convert (eleven_multilingual_sts_v2, one convert per phrase, timing preserved so lips stay matched). Set voice_select.voice_id's gender/language to match the creator.",
+    native_timing: "The voice is cut at PAUSES, not at visual cuts, so a sentence spanning a cut stays one continuous read (no mid-word break). The clip is generated long enough for the estimated speech; if a line runs longer than its phrase window the voice continues a beat into the following pause (natural VO continuity). `metadata.video.talking_scenes` carries each phrase's scene_s vs est_speech_s. CAVEAT: a b-roll cutaway INSIDE a phrase lands at an approximate (proportional) time \u2014 Seedance exposes no word timing \u2014 so if a cutaway is off its beat, nudge the scene boundary (it's a starting point).",
     craft: {
       note: "Production-craft principles that raise every clip's realism. Full rationale: references/video-craft.md (production craft); references/script-craft.md + meta-ads-playbook for the hook/message layer.",
       principles: [
         "Iterate cheaply \u2014 do NOT brute-force takes. Credits are scarce. Strong inputs (a locked reference + a precise move-by-move prompt) make 1\u20132 takes land; bad input = bad output. When a take misses, change the SMALLEST thing and re-run \u2014 the content-addressed cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
         "Keep clips SHORT \u2014 the longer a shot holds, the more the eye catches the AI tell. Trust the scene-length trim; don't pad.",
-        "LOCK THE CAMERA \u2014 a first/last-frame clip holds the framing the two frames define; only move when a move is specified. Unmotivated camera drift is the top realism tell.",
+        "LOCK THE CAMERA \u2014 Seedance animates forward from the single keyframe; only move when the motion brief specifies a move. Unmotivated camera drift is the top realism tell.",
         "One generation = one speaking voice. Seedance can't voice two on-camera speakers in one clip \u2014 split a two-speaker beat into separate scenes (or one stays silent). Fragment long dialogue at a natural pause and stitch via the shared boundary frame. Lip-sync follows the line verbatim; delivery cues go in [brackets].",
         "Preview cheap \u2192 finalize high-res (credit discipline). Never prompt the aspect ratio in prose ('make it vertical' \u2192 stretch artifacts) \u2014 the pipeline sets aspect_ratio as a param.",
-        "Match-cut continuous action across a cut by reusing the boundary frame: scene N+1's start frame = scene N's end frame (a composition choice, costs no extra gens).",
         "Nail the PACK SHOT \u2014 the final product hero sells (motivated camera move, light matched to the rest of the ad).",
         "Geometry/objects drifting frame to frame? A MAP beats a paragraph \u2014 drop a high-angle schematic still (marked positions) as an extra el_*/reference rather than adding more words.",
         "Before/after (dry\u2192wet, skeptic\u2192believer) = two SEPARATE locked reference images \u2014 don't ask words to transform a subject mid-scene (cheaper and more reliable than re-rolling clips).",
@@ -9639,6 +10542,7 @@ function resolveShippedCanvasDir(name, startDir, exists = existsSync3, maxDepth
 // src/commands/canvas/scaffold-video.ts
 var SHIPPED_COMPOSITION_DIR = resolveShippedCanvasDir("video-overlay-composition", import.meta.dirname);
+var SHIPPED_CAPTIONS_DIR = resolveShippedCanvasDir("tiktok-captions-composition", import.meta.dirname);
 function resolveModel2(kind, preferred) {
   const ids = Object.keys(MODEL_REGISTRY[kind]);
   return ids.includes(preferred) ? preferred : ids[0] ?? preferred;
@@ -9659,10 +10563,10 @@ DROP one-off background extras and incidental props \u2014 but the shared set/lo
 ONE PERSON, MULTIPLE LOOKS: if a single individual plays MULTIPLE personas or wardrobes (e.g. a creator as a skeptic in one outfit and a believer in another, or a before/after), emit ONE element PER look (so each outfit gets its own real reference image) and link the extra looks to the first via "same_as" \u2014 they are the SAME person, only the wardrobe/state differs, so the face must stay identical.
-For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of EVERY scene the element is visually present in \u2014 read each scene's action_detail, its start_frame/end_frame subjects, its dialogue speaker, and its floating_elements. Be generous: if the same person narrates throughout, list every scene index. Output ONLY the JSON object.`;
+For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of ONLY the scenes where the element is ACTUALLY VISIBLE ON SCREEN \u2014 judged from that scene's start_frame_prompt / end_frame_prompt subjects and its action_detail, NOT from who is merely speaking. A narrator heard over b-roll is NOT present in that b-roll scene; a dog-running cutaway does NOT contain the couch creator just because she talks across it. Do NOT pad the list \u2014 an element wrongly listed in a scene makes the reproduction render the wrong subject there (e.g. the creator appearing in a pure-dog b-roll). When in doubt, leave a scene OUT. Output ONLY the JSON object.`;
 async function loadAssetText2(ref, label) {
   const r = ref;
-  if (typeof r?.path === "string") return readFile4(r.path, "utf8");
+  if (typeof r?.path === "string") return readFile5(r.path, "utf8");
   if (typeof r?.url === "string") {
     const res = await fetch(r.url);
     if (!res.ok) throw new Error(`failed to fetch ${label} (${res.status})`);
@@ -9670,6 +10574,24 @@ async function loadAssetText2(ref, label) {
   }
   throw new Error(`${label}: output had no readable path or url`);
 }
+async function loadTranscriptBestEffort(ref) { // Reads the deconstruct transcript text via loadAssetText2; resolves to undefined instead of failing.
+  if (!ref) return void 0; // no transcript output reference was provided
+  try {
+    return await loadAssetText2(ref, "deconstruct transcript");
+  } catch { // deliberate best-effort: any load error is discarded and reported to the caller as "no transcript"
+    return void 0;
+  }
+}
+async function stageCaptions(outDir, transcript) { // Writes transcript.json and copies the shipped captions composition into outDir; resolves to { compositionPath, transcriptPath }, or {} when there is nothing to caption.
+  const text = transcript?.trim();
+  if (!text || text === "[]") return {}; // missing, blank, or the literal empty-array text "[]": stage nothing
+  const transcriptPath = path5.join(outDir, "transcript.json");
+  await writeFile2(transcriptPath, `${text}
+`, "utf8"); // the trimmed text is written followed by a single newline
+  const compositionPath = path5.join(outDir, "tiktok-captions-composition");
+  await cp(SHIPPED_CAPTIONS_DIR, compositionPath, { recursive: true }); // recursive copy of the whole shipped directory
+  return { compositionPath, transcriptPath };
+}
 function parseElements2(raw) {
   const parsed = JSON.parse(raw);
   if (Array.isArray(parsed)) return parsed;
@@ -9678,6 +10600,31 @@ function parseElements2(raw) {
   }
   return [];
 }
+async function detectShotCutsBestEffort(videoPath, threshold) { // Best-effort shot-cut detection: resolves to the detected cut list, or [] on any failure (errors are logged to stderr, never rethrown).
+  try {
+    const cuts = await detectSceneCutsPySceneDetect(videoPath, threshold ? { threshold } : {}); // a falsy threshold (undefined, 0, NaN) passes an empty options object instead of an override
+    if (cuts.length > 0) {
+      process.stderr.write(`Detected ${cuts.length} shot cut(s) via PySceneDetect: ${cuts.join(", ")}s
+`);
+    } else {
+      process.stderr.write("PySceneDetect ran but found no hard cuts; using LLM scene boundaries.\n");
+    }
+    return cuts; // may be an empty array when the detector ran but found nothing
+  } catch (e) {
+    const msg = e instanceof Error ? e.message : String(e);
+    const code = e?.code;
+    const missing = code === "ENOENT" || /ENOENT|not found|command not found/i.test(msg); // treated as "tool not installed" when the error code or message says the executable could not be found
+    if (missing) {
+      process.stderr.write( // louder warning with install instructions for the missing-tool case
+        "WARNING: `scenedetect` (PySceneDetect) is NOT installed \u2014 falling back to LLM-only scene boundaries, which under-segments (coarse 9-15s scenes instead of the real 1-4s cuts). Install it (`pipx install scenedetect[opencv]` or `pip install scenedetect[opencv]`) for accurate shot-cut detection.\n"
+      );
+    } else {
+      process.stderr.write(`Shot-cut detection skipped (${msg}); using LLM boundaries.
+`);
+    }
+    return []; // every failure path degrades to "no cuts" so the caller can continue
+  }
+}
 function fail2(code, message) {
   process.stderr.write(`${JSON.stringify({ ok: false, error: { code, message } }, null, 2)}
 `);
@@ -9699,53 +10646,78 @@ function resolveModels2(args) {
     videoModel: pick("video-model", "video_generate", "bytedance/seedance-2.0")
   };
 }
-function buildAnalysisCanvas(videoPath, deconstructModel, selectModel, opts) {
+function buildDeconstructCanvas(videoPath, deconstructModel, opts) { // Builds the deconstruct-only canvas: an ingest node for the video path feeding one video_deconstruct node, whose `analysis` output is the canvas output.
   const deconstructParams = { model: deconstructModel, mode: "full" };
   if (typeof opts.maxScenes === "number") deconstructParams.max_scenes = opts.maxScenes; // optional params are only set when the caller supplied them
   if (opts.language) deconstructParams.language = opts.language;
   if (opts.focus) deconstructParams.focus = opts.focus;
+  if (opts.shotCuts && opts.shotCuts.length > 0) deconstructParams.shot_cuts = opts.shotCuts; // an empty or missing cut list is omitted entirely
+  deconstructParams.max_clip_s = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1]; // always set to the last entry of SEEDANCE_DURATIONS; presumably the longest supported clip length (assumes the list is ascending; TODO confirm)
   return {
     schema: "baker-canvas/1",
     metadata: { name: "video deconstruct pass" },
     nodes: [
       { id: "src", type: "ingest", params: { source: "path", path: videoPath, expect: "video" } },
-      { id: "deconstruct", type: "video_deconstruct", inputs: { video: "$ref:src.asset" }, params: deconstructParams },
+      { id: "deconstruct", type: "video_deconstruct", inputs: { video: "$ref:src.asset" }, params: deconstructParams }
+    ],
+    output: { node: "deconstruct", output: "analysis" }
+  };
+}
+function buildSelectCanvas(selectModel, slimmedBlueprintJson) { // Builds the standalone element-selection canvas: a single text_generate node with no inputs; the blueprint JSON is inlined into the prompt text.
+  return {
+    schema: "baker-canvas/1",
+    metadata: { name: "element selection pass" },
+    nodes: [
       {
         id: "select",
         type: "text_generate",
-        inputs: { blueprint: "$ref:deconstruct.analysis" },
         params: {
           model: selectModel,
           max_tokens: 6e3,
           temperature: 0,
           response_format: "json_object",
           system: SELECT_SYSTEM2,
-          prompt: SELECT_PROMPT2
+          prompt: SELECT_PROMPT2.replace("{{blueprint}}", () => slimmedBlueprintJson) // function replacer inserts the JSON verbatim (a `$&`-style sequence in it is not expanded); only the first "{{blueprint}}" is replaced; assumes SELECT_PROMPT2 contains the placeholder, otherwise this is a no-op (TODO confirm)
         }
       }
     ],
     output: { node: "select", output: "text" }
   };
 }
-async function runAnalysisPasses(canvas) {
+async function runAnalysisPasses(deconstructCanvas, selectModel) { // Runs two separate engine passes (deconstruct, then element selection on a slimmed blueprint) and resolves to { blueprint, elements, transcript, creditsSpent }.
   const engine = createEngineFromEnv({ log: (line) => process.stderr.write(`${line}
 `) });
-  let outputsByNode;
-  let creditsSpent;
+  let credits = 0;
+  let sawCredits = false; // distinguishes "no run reported credits" from a reported total of 0
+  const addCredits = (stats) => {
+    const c = stats?.total_credits;
+    if (typeof c === "number") { // non-numeric or missing totals are ignored
+      credits += c;
+      sawCredits = true;
+    }
+  };
+  let blueprint;
+  let transcript;
   try {
-    const result = await engine.run(canvas, {});
-    outputsByNode = result.outputs_by_node;
-    creditsSpent = result.stats?.total_credits;
+    const r1 = await engine.run(deconstructCanvas, {});
+    addCredits(r1.stats);
+    blueprint = JSON.parse(await loadAssetText2(r1.outputs_by_node.deconstruct?.analysis, "deconstruct output")); // a malformed analysis throws SyntaxError, handled below as "read_outputs"
+    transcript = await loadTranscriptBestEffort(r1.outputs_by_node.deconstruct?.transcript); // optional: stays undefined when absent or unreadable
   } catch (e) {
     if (e instanceof ValidationError) return fail2("validation", JSON.stringify(e.issues));
+    if (e instanceof SyntaxError) return fail2("read_outputs", e.message);
     return fail2("deconstruct", e instanceof Error ? e.message : String(e));
   }
+  const slimJson = JSON.stringify(slimBlueprintForSelection(blueprint)); // the selection pass receives a reduced blueprint inlined into its prompt
   try {
-    const blueprint = JSON.parse(await loadAssetText2(outputsByNode.deconstruct?.analysis, "deconstruct output"));
-    const elements = parseElements2(await loadAssetText2(outputsByNode.select?.text, "selection output"));
-    return { blueprint, elements, creditsSpent };
+    const r2 = await engine.run(buildSelectCanvas(selectModel, slimJson), {});
+    addCredits(r2.stats);
+    const elements = parseElements2(await loadAssetText2(r2.outputs_by_node.select?.text, "selection output"));
+    return { blueprint, elements, transcript, creditsSpent: sawCredits ? credits : void 0 }; // creditsSpent is the sum over both passes, or undefined if neither reported a numeric total
   } catch (e) {
-    return fail2("read_outputs", e instanceof Error ? e.message : String(e));
+    if (e instanceof ValidationError) return fail2("validation", JSON.stringify(e.issues));
+    if (e instanceof SyntaxError) return fail2("read_outputs", e.message);
+    return fail2("deconstruct", e instanceof Error ? e.message : String(e)); // NOTE(review): failures of the SELECT pass are reported under the "deconstruct" code here; looks copied from the first handler, confirm whether a distinct code was intended
   }
 }
 var scaffoldVideoCommand = defineCommand76({
@@ -9761,11 +10733,11 @@ var scaffoldVideoCommand = defineCommand76({
       type: "boolean",
       description: "Give silent b-roll scenes native diegetic ambient mixed deep under the music bed (off by default)"
     },
-    "actor-sheets": {
-      type: "boolean",
-      description: "Lock a recast person/animal that recurs across \u22652 scenes to ONE turnaround sheet grounding every frame"
-    },
     "max-scenes": { type: "string", description: "Cap the number of scenes the deconstruct emits" },
+    "shot-threshold": {
+      type: "string",
+      description: "PySceneDetect content threshold. Default is adaptive (18, auto re-checked at 27 when a continuous shot looks over-segmented); pinning a value disables the re-check. Lower = more/softer cuts, higher = fewer."
+    },
     language: { type: "string", description: "Transcript/dialogue language hint (e.g. fr, en)" },
     focus: { type: "string", description: "Known provenance/emphasis to ground the deconstruct" },
     "deconstruct-model": { type: "string", description: "Override the video_deconstruct model id" },
@@ -9788,12 +10760,15 @@ var scaffoldVideoCommand = defineCommand76({
       );
     }
     const { deconstructModel, selectModel, imageModel, videoModel } = resolveModels2(args);
-    const analysisCanvas = buildAnalysisCanvas(videoPath, deconstructModel, selectModel, {
+    const shotThreshold = args["shot-threshold"] ? Number(args["shot-threshold"]) : void 0;
+    const shotCuts = await detectShotCutsBestEffort(videoPath, shotThreshold);
+    const deconstructCanvas = buildDeconstructCanvas(videoPath, deconstructModel, {
       maxScenes: Number.isFinite(maxScenes) ? maxScenes : void 0,
       language: args.language ? String(args.language) : void 0,
-      focus: args.focus ? String(args.focus) : void 0
+      focus: args.focus ? String(args.focus) : void 0,
+      shotCuts
     });
-    const { blueprint, elements, creditsSpent } = await runAnalysisPasses(analysisCanvas);
+    const { blueprint, elements, transcript, creditsSpent } = await runAnalysisPasses(deconstructCanvas, selectModel);
     await mkdir(outDir, { recursive: true });
     const annotated = annotateBlueprintWithElements(blueprint, elements);
     await writeFile2(blueprintPath, `${JSON.stringify(annotated, null, 2)}
@@ -9802,7 +10777,7 @@ var scaffoldVideoCommand = defineCommand76({
     await cp(SHIPPED_COMPOSITION_DIR, compositionDest, { recursive: true });
     const indexPath = path5.join(compositionDest, "index.html");
     const overlayHtml = buildOverlayHtml(blueprint);
-    const indexHtml = await readFile4(indexPath, "utf8");
+    const indexHtml = await readFile5(indexPath, "utf8");
     const injected = indexHtml.replace("<!--OVERLAYS-->", () => overlayHtml);
     if (injected === indexHtml && overlayHtml.trim()) {
       fail2(
@@ -9811,14 +10786,16 @@ var scaffoldVideoCommand = defineCommand76({
       );
     }
     await writeFile2(indexPath, injected, "utf8");
+    const captions = await stageCaptions(outDir, transcript);
     const opts = {
       imageModel,
       videoModel,
       overlayCompositionPath: compositionDest,
+      captionsCompositionPath: captions.compositionPath,
+      transcriptPath: captions.transcriptPath,
       blueprintPath,
       frames,
-      ambient: Boolean(args.ambient),
-      actorSheets: Boolean(args["actor-sheets"])
+      ambient: Boolean(args.ambient)
     };
     let canvas;
     let report;
@@ -9851,7 +10828,7 @@ var scaffoldVideoCommand = defineCommand76({
           stats: {
             scene_count: report.scene_count,
             total_nodes: canvas.nodes.length,
-            deconstruct_credits_spent: creditsSpent,
+            analysis_credits_spent: creditsSpent,
             run_estimated_credits: validation.estimatedCredits
           },
           checklist: {
@@ -9879,7 +10856,7 @@ var scaffoldVideoCommand = defineCommand76({
 });
 // src/commands/canvas/validate.ts
-import { readFile as readFile5 } from "fs/promises";
+import { readFile as readFile6 } from "fs/promises";
 import path6 from "path";
 import { defineCommand as defineCommand77 } from "citty";
 var validateCommand = defineCommand77({
@@ -9890,7 +10867,7 @@ var validateCommand = defineCommand77({
   args: { file: { type: "positional", required: true, description: "Path to canvas JSON" } },
   async run({ args }) {
     const filePath = path6.resolve(String(args.file));
-    const raw = await readFile5(filePath, "utf8");
+    const raw = await readFile6(filePath, "utf8");
     let parsed;
     try {
       parsed = JSON.parse(raw);
@@ -10779,8 +11756,8 @@ function cropSprite(input, region) {
 // src/lib/image/io.ts
 import { randomBytes } from "crypto";
-import { glob as fsGlob, readFile as readFile6, rename, stat as stat2, writeFile as writeFile3 } from "fs/promises";
-import { dirname, extname, join as join2, resolve as resolve4 } from "path";
+import { glob as fsGlob, readFile as readFile7, rename, stat as stat2, writeFile as writeFile3 } from "fs/promises";
+import { dirname, extname, join as join3, resolve as resolve4 } from "path";
 var REMOTE_RE = /^https?:\/\//i;
 var GLOB_RE = /[*?[\]{}]/;
 function isRemoteUrl(value) {
@@ -10815,7 +11792,7 @@ async function readImageBuffer(pathOrUrl) {
     }
     return Buffer.from(await response.arrayBuffer());
   }
-  return readFile6(pathOrUrl);
+  return readFile7(pathOrUrl);
 }
 async function isDirectory(path7) {
   try {
@@ -10830,14 +11807,14 @@ async function resolveOutputPath(inputPath, outputArg, options) {
   if (!outputArg) return base;
   if (options.multipleInputs || await isDirectory(outputArg)) {
     const filename = base.split("/").pop() ?? "out.png";
-    return join2(outputArg, filename);
+    return join3(outputArg, filename);
   }
   return outputArg;
 }
 async function atomicWrite(targetPath, data) {
   const absolute = resolve4(targetPath);
   const dir = dirname(absolute);
-  const tmp = join2(dir, `.baker-image-${randomBytes(8).toString("hex")}.tmp`);
+  const tmp = join3(dir, `.baker-image-${randomBytes(8).toString("hex")}.tmp`);
   await writeFile3(tmp, data);
   await rename(tmp, absolute);
 }
@@ -11179,7 +12156,7 @@ var findCommand = defineCommand91({
 });
 // src/commands/images/generate.ts
-import { readFile as readFile7 } from "fs/promises";
+import { readFile as readFile8 } from "fs/promises";
 import { defineCommand as defineCommand92 } from "citty";
 import sharp2 from "sharp";
 var GENERATE_TIMEOUT_MS = 18e4;
@@ -11262,7 +12239,7 @@ async function resolveReferences(spec) {
     }
     let raw;
     try {
-      raw = await readFile7(entry);
+      raw = await readFile8(entry);
     } catch {
       throw new ApiError("VALIDATION_ERROR", `Reference file not found: ${entry}`);
     }
@@ -12983,7 +13960,7 @@ var stockCommand = defineCommand105({
 });
 // src/commands/images/upload.ts
-import { readFile as readFile8 } from "fs/promises";
+import { readFile as readFile9 } from "fs/promises";
 import { extname as extname2 } from "path";
 import { defineCommand as defineCommand106 } from "citty";
 var MIME_MAP = {
@@ -13123,7 +14100,7 @@ async function uploadLocal(target, args) {
     });
     return;
   }
-  const fileBuffer = await readFile8(target);
+  const fileBuffer = await readFile9(target);
   const base64 = fileBuffer.toString("base64");
   const body = { base64, contentType };
   if (args.source) body.source = args.source;
@@ -15088,7 +16065,7 @@ var searchCommand3 = defineCommand135({
 });
 // src/commands/videos/upload.ts
-import { readFile as readFile9, stat as stat3 } from "fs/promises";
+import { readFile as readFile10, stat as stat3 } from "fs/promises";
 import { extname as extname3 } from "path";
 import { defineCommand as defineCommand136 } from "citty";
 var MIME_MAP2 = {
@@ -15153,7 +16130,7 @@ var uploadCommand2 = defineCommand136({
         return;
       }
       const { uploadUrl, videoId } = await apiPost("/api/videos/upload", {});
-      const fileBuffer = await readFile9(filePath);
+      const fileBuffer = await readFile10(filePath);
       const uploadResponse = await fetch(uploadUrl, {
         method: "PUT",
         headers: { "Content-Type": contentType },