npm - @koda-sl/baker-cli - Versions diffs - 0.74.0 → 0.79.0 - Mend

@koda-sl/baker-cli 0.74.0 → 0.79.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/README.md +34 -8
package/canvas/end-card-composition/index.html +66 -0
package/canvas/end-card-composition/meta.json +19 -0
package/canvas/feature-reveal-composition/index.html +83 -0
package/canvas/feature-reveal-composition/meta.json +18 -0
package/canvas/lower-third-composition/index.html +75 -0
package/canvas/lower-third-composition/meta.json +18 -0
package/canvas/stat-counter-composition/index.html +73 -0
package/canvas/stat-counter-composition/meta.json +20 -0
package/canvas/title-card-composition/index.html +90 -0
package/canvas/title-card-composition/meta.json +20 -0
package/dist/{chunk-JIDZ37KG.js → chunk-CCO34ACK.js} +507 -307
package/dist/chunk-CCO34ACK.js.map +1 -0
package/dist/cli.js +624 -109
package/dist/cli.js.map +1 -1
package/dist/engine/index.d.ts +6 -0
package/dist/engine/index.js +1 -1
package/package.json +1 -1
package/dist/chunk-JIDZ37KG.js.map +0 -1

package/dist/cli.js CHANGED Viewed

@@ -9,7 +9,7 @@ import {
   defaultRegistry,
   generateCatalog,
   validateCanvasDeep
-} from "./chunk-JIDZ37KG.js";
+} from "./chunk-CCO34ACK.js";
 // src/cli.ts
 import { defineCommand as defineCommand141, runMain } from "citty";
@@ -8280,11 +8280,121 @@ import { defineCommand as defineCommand76 } from "citty";
 // src/engine/scaffold/video.ts
 import { z as z3 } from "zod";
+// src/engine/scaffold/lib/shoot-modes.ts
+var SHOOT_MODES = [ // Every recognized shoot-mode identifier; isShootMode() tests membership in this list.
+  "ugc_selfie",
+  "ugc_broll",
+  "studio_product",
+  "lifestyle_cinematic",
+  "screen_ui"
+];
+var SHOOT_MODE_SPECS = { // Per-mode prompt spec, keyed by the same identifiers as SHOOT_MODES.
+  ugc_selfie: {
+    label: "UGC selfie / talking-head", // Human-readable name; not read by any function visible in this chunk.
+    allowsDoF: false, // Not read by any function visible in this chunk; the depth-of-field rule itself is spelled out in the capture text.
+    capture: [ // Newline-joined CAPTURE paragraph returned by captureBlockFor().
+      "CAPTURE \u2014 AUTHENTIC PHONE UGC (front camera):",
+      "Shot on a modern phone front camera \u2014 natural lens, real skin texture and pores,",
+      "catchlights, mixed indoor white balance, faint sensor grain, slight handheld imperfection.",
+      "NO shallow depth of field or background blur \u2014 native phone footage is flat front-to-back;",
+      "blur reads as 'produced', not filmed-on-a-phone. Keep the whole frame in focus."
+    ].join("\n"),
+    motion: "Lock the camera at arm's-length selfie distance; only natural handheld micro-movement. Move the camera only if a move is named above.", // Camera-motion sentence returned by seedanceMotionFor().
+    diegetic: "a quiet room tone with soft fabric and breath under the speaker's own voice" // Default ambience returned by diegeticFor() when a scene sets no ambient of its own.
+  },
+  ugc_broll: {
+    label: "UGC b-roll / handheld",
+    allowsDoF: false,
+    capture: [
+      "CAPTURE \u2014 AUTHENTIC PHONE UGC (rear camera, candid):",
+      "Shot on a modern phone rear camera, handheld and candid \u2014 natural lens, real materials and",
+      "textures, real hands in frame where natural, mixed natural white balance, faint sensor grain.",
+      "NO shallow depth of field or background blur \u2014 native phone footage is flat front-to-back;",
+      "keep the whole frame in focus."
+    ].join("\n"),
+    motion: "Handheld, candid framing; keep any move small and motivated. Move the camera only if a move is named above.",
+    diegetic: "the real ambient of the setting \u2014 handling sounds, footsteps, and room or outdoor tone"
+  },
+  studio_product: {
+    label: "Studio / product (pack shot)",
+    allowsDoF: true,
+    capture: [
+      "CAPTURE \u2014 CONTROLLED PRODUCT / STUDIO:",
+      "Photographed on a controlled set \u2014 a clean seamless or one styled surface, a soft key with",
+      "gentle fill and a subtle rim, true-to-life color. Shallow depth of field IS allowed to isolate",
+      "the hero, with crisp specular highlights on the product's real materials.",
+      "Still a real photograph, not CGI \u2014 no plastic or waxy surfaces, no over-render; real material",
+      "texture and weight."
+    ].join("\n"),
+    motion: "Lock off, or a slow motivated push-in / settle onto the product; otherwise hold.",
+    diegetic: "minimal \u2014 soft product-handling sounds over a quiet room tone"
+  },
+  lifestyle_cinematic: {
+    label: "Lifestyle / cinematic",
+    allowsDoF: true,
+    capture: [
+      "CAPTURE \u2014 LIFESTYLE / CINEMATIC:",
+      "A real camera in a real location \u2014 natural motivated light, true color, a gentle filmic grade,",
+      "fine grain. A shallow depth of field is allowed when motivated by the moment.",
+      "Photographic, not rendered \u2014 real skin and material texture, no airbrushing, no glossy 3D look."
+    ].join("\n"),
+    motion: "A slow, motivated camera move (gentle push-in, drift, or settle) is allowed; otherwise hold.",
+    diegetic: "the location's natural ambience \u2014 wind, traffic, water, or room tone as the setting implies"
+  },
+  screen_ui: {
+    label: "Screen / UI / demo",
+    allowsDoF: true,
+    capture: [
+      "CAPTURE \u2014 SCREEN / UI CAPTURE:",
+      "A clean screen or app capture \u2014 crisp pixels, true on-screen color, optionally framed inside a",
+      "real device held in a real hand. No human-skin realism is needed; the screen content is the subject.",
+      "Do not bake invented UI copy into the plate beyond what the reference shows \u2014 editable text lives",
+      "on the overlay layer."
+    ].join("\n"),
+    motion: "Hold on the screen; allow a slow push-in or a UI scroll only if a move is named above.",
+    diegetic: "soft UI taps and device handling over a quiet room tone"
+  }
+};
+function isShootMode(value) { // Guard: true only when `value` is a string listed in SHOOT_MODES.
+  return typeof value === "string" && SHOOT_MODES.includes(value); // Exact, case-sensitive match; non-strings are rejected before the lookup.
+}
+function captureBlockFor(mode) { // Returns the multi-line CAPTURE paragraph for `mode`.
+  return SHOOT_MODE_SPECS[mode].capture; // Throws a TypeError if `mode` is not a key of SHOOT_MODE_SPECS; callers are expected to pass a validated mode.
+}
+function seedanceMotionFor(mode) { // Returns the camera-motion sentence for `mode`.
+  return SHOOT_MODE_SPECS[mode].motion; // Throws a TypeError if `mode` is not a key of SHOOT_MODE_SPECS.
+}
+function diegeticFor(mode) { // Returns the default diegetic-ambience phrase for `mode`.
+  return SHOOT_MODE_SPECS[mode].diegetic; // Throws a TypeError if `mode` is not a key of SHOOT_MODE_SPECS.
+}
+function deriveShootMode(opts) { // Picks a shoot mode from opts { explicit, talking, hasPerson, hasProduct }; checks run in priority order, first hit wins.
+  if (isShootMode(opts.explicit)) return opts.explicit; // A recognized explicit mode always wins; an unrecognized or missing value silently falls through to the heuristics.
+  if (opts.talking) return "ugc_selfie"; // Talking scenes default to the selfie look.
+  if (opts.hasProduct && !opts.hasPerson) return "studio_product"; // Product with no person/animal present -> pack shot.
+  return "ugc_broll"; // Fallback; "lifestyle_cinematic" and "screen_ui" are only reachable via opts.explicit.
+}
+// src/engine/scaffold/video.ts
 var FIXED_TTS_MODEL = "elevenlabs/eleven_v3"; // Model id for "tts" nodes.
 var FIXED_SFX_MODEL = "elevenlabs/eleven_text_to_sound_v2";
 var FIXED_MUSIC_MODEL = "elevenlabs/music-v1";
-var FIXED_LIPSYNC_MODEL = "fal/veed-lipsync";
+var FIXED_VOICE_CONVERT_MODEL = "elevenlabs/eleven_multilingual_sts_v2"; // Model id passed to "audio_voice_convert" nodes by emitSceneNativeAudio().
 var MUSIC_BED_GAIN_DB = -12;
+var AMBIENT_BED_GAIN_DB = -20; // gain_db applied to ambient tracks extracted from b-roll clips.
+var TRANSITION_DEFAULT_S = 0.4; // Duration in seconds given to every non-cut transition by sceneOutTransition().
+var XFADE_BY_TYPE = { // Maps a lowercased transition_out.type to the xfade name used at the scene boundary; types not listed stay hard cuts.
+  fade: "fade",
+  dissolve: "dissolve",
+  whip: "smoothleft",
+  swipe: "wipeleft",
+  zoom: "zoomin"
+};
+var WORDS_PER_SECOND = 2.5; // Speaking rate assumed by estSpeechS().
+function estSpeechS(text) { // Estimates how many seconds `text` takes to speak at WORDS_PER_SECOND; `text` must be a string.
+  const words = text.trim().split(/\s+/).filter(Boolean).length; // filter(Boolean) drops the single "" produced by an empty or whitespace-only string, so that case counts 0 words.
+  return words / WORDS_PER_SECOND;
+}
 var NARRATOR_SPEAKERS = /* @__PURE__ */ new Set([
   "voiceover",
   "voice_over",
@@ -8360,10 +8470,25 @@ var Scene = z3.object({
   duration_s: z3.number().optional(),
   summary: z3.string().optional(),
   action_detail: z3.string().optional(),
+  // The capture "look" for this scene — selected from the ad-native shoot-mode
+  // grammar (see lib/shoot-modes.ts). When absent the scaffold auto-derives a
+  // UGC/product mode; a human can override per scene by setting this.
+  shoot_mode: z3.string().optional(),
+  // Diegetic ambient the clip's native audio should carry (no music). When
+  // absent the scene falls back to its shoot mode's default ambience.
+  ambient: z3.string().optional(),
   camera_motion: CameraMotion.optional(),
   start_frame_prompt: z3.string().optional(),
   end_frame_prompt: z3.string().optional(),
   motion_prompt: z3.string().optional(),
+  // The scene's role in the ad's persuasion arc (DECON-supplied); drives the
+  // script re-craft checklist. Inferred from position when absent.
+  narrative_role: z3.string().optional(),
+  // How this scene cuts to the next (DECON-supplied). A recognized non-cut type
+  // (fade/whip/zoom/dissolve/swipe) is reproduced as an ffmpeg xfade at the
+  // boundary; cut/match_cut/none/other stay hard cuts. The last scene's value is
+  // ignored (nothing follows it).
+  transition_out: z3.object({ type: z3.string().optional(), description: z3.string().optional() }).loose().optional(),
   dialogue: z3.array(DialogueLine).optional(),
   sfx: z3.array(Sfx).optional(),
   overlays: z3.array(z3.unknown()).optional(),
@@ -8378,6 +8503,10 @@ var VideoBlueprint = z3.object({
     music: z3.object({
       present: z3.boolean().optional(),
       music_prompt: z3.string().optional(),
+      // Absolute second the music enters in the reference (the bed often
+      // kicks in mid-ad, after the hook). We start the regenerated track here
+      // instead of at 0 so the timing matches.
+      starts_at_s: z3.number().optional(),
       // Populated by the deconstruct when AudD (Shazam-style) recognizes the
       // reference track. We never reuse it — only style the regenerated bed.
       identified_track: z3.object({ title: z3.string().optional(), artist: z3.string().optional() }).loose().nullish()
@@ -8401,6 +8530,11 @@ var RecurringElement = z3.object({
   expression: z3.string().nullable().optional(),
   // When the element maps to a global cast entry, its stable id (for annotation).
   cast_id: z3.string().nullable().optional(),
+  // The label of another element that is the SAME individual as this one, shown
+  // in a DIFFERENT wardrobe/persona/state (e.g. one creator playing skeptic in a
+  // pink shirt and believer in a white shirt). Each look gets its own reference
+  // slot, but the face/identity must stay identical across them.
+  same_as: z3.string().nullable().optional(),
   // Scenes the element appears in. Either a bare list of scene indices (both
   // edges) or per-{scene,edge} entries. Both forms are accepted and merged.
   scenes: z3.array(z3.number()).optional(),
@@ -8476,15 +8610,27 @@ function roleForType2(type) {
       return "the showcased product; keep this exact product identity consistent across every frame. Ignore any caption text printed on this reference.";
     case "person":
     case "animal":
-      return "a recurring hero subject; keep this exact identity (face, hair, wardrobe, markings) consistent across EVERY frame of the video. Ignore any caption text printed on this reference.";
+      return "a recurring cast member; render the SAME individual as this reference image and keep them consistent across EVERY frame \u2014 their appearance comes from this reference, never from prose. Ignore any caption text printed on this reference.";
+    case "location":
+      return "the fixed set/location; keep the room, background, and layout identical to this reference across EVERY frame \u2014 do not re-invent the environment. Ignore any caption text printed on this reference.";
     default:
       return "a recurring identity element; reproduce it faithfully and keep it consistent across every frame. Ignore any caption text printed on it.";
   }
 }
+function roleForSlot(slot) { // Returns the legend role sentence for a reference slot; reads slot.sameAs, slot.description and slot.type.
+  if (slot.sameAs) { // Alternate look of another element: keep the identity, vary only wardrobe/styling/expression.
+    const what = slot.description ? ` (${slot.description})` : ""; // Optional parenthetical naming this particular look.
+    return `the SAME individual as ${slot.sameAs}, shown in a DIFFERENT wardrobe/persona/state${what} \u2014 keep the FACE and identity IDENTICAL to the ${slot.sameAs} references; change ONLY wardrobe, styling, and expression. Ignore any caption text printed on this reference.`;
+  }
+  return roleForType2(slot.type); // No same_as link: fall back to the generic per-type role text.
+}
 function todoPath2(el, label) { // Builds the "[TODO: drop one real source image ...]" placeholder string for element `el`, displayed under `label`.
   const desc = el.description ? ` \u2014 ${el.description}` : ""; // Optional description suffix.
   const expr = el.expression ? `, with a ${el.expression} expression` : ""; // Optional expression suffix.
-  return `[TODO: drop one real source image for ${label} (${el.type})${desc}${expr} \u2014 reused across every frame it appears in]`;
+  const t = el.type.toLowerCase(); // Requires el.type to be a string.
+  const fresh = t === "person" || t === "animal" || t === "location" ? " [SOURCE FRESH \u2014 a DIFFERENT person/animal/set than the original ad; do not reuse the source's individual]" : ""; // Extra instruction for person/animal/location elements only.
+  const same = el.same_as ? ` [SAME INDIVIDUAL as ${el.same_as} \u2014 a different wardrobe/look of the same person; reuse that cast person, change only the outfit]` : ""; // Extra instruction when the element is another look of an existing one.
+  return `[TODO: drop one real source image for ${label} (${el.type})${desc}${expr} \u2014 reused across every frame it appears in${fresh}${same}]`;
 }
 function buildElementSlots(elements) {
   const usedIds = /* @__PURE__ */ new Set(["prompt", "spine", "overlaid", "audio_mix", "final", "music_bed"]);
@@ -8499,6 +8645,7 @@ function buildElementSlots(elements) {
       label,
       type: el.type,
       description: el.description,
+      sameAs: el.same_as ?? void 0,
       presence: presenceOf(el)
     });
   });
@@ -8507,38 +8654,90 @@ function buildElementSlots(elements) {
 function slotsForFrame(slots, sceneIndex, edge) { // Returns the slots whose presence map lists `edge` for scene `sceneIndex`.
   return slots.filter((s) => s.presence.get(sceneIndex)?.has(edge)); // presence maps scene index -> collection of edges; a scene with no entry yields undefined and is filtered out.
 }
+var ACTOR_SHEET_MODEL = "google/gemini-3-pro-image-preview"; // Model id used for every "image_reference_sheet" node.
+function applyActorSheets(slots, nodes) { // For each person/animal slot present in 2+ scenes, pushes a reference-sheet node and repoints slot.ref at its output; mutates both arguments and returns nothing.
+  for (const slot of slots) {
+    const t = slot.type.toLowerCase();
+    if (t !== "person" && t !== "animal") continue; // Only living subjects get a sheet.
+    if (slot.presence.size < 2) continue; // presence is keyed by scene index, so size counts scenes; single-scene subjects keep their original ref.
+    const sheetId = `${slot.id}_sheet`;
+    nodes.push({
+      id: sheetId,
+      type: "image_reference_sheet",
+      // The lone dropped ingest is the source; the sheet fans it into a turnaround.
+      inputs: { references: [slot.ref] }, // Captures the slot's current ref before it is overwritten below.
+      params: {
+        model: ACTOR_SHEET_MODEL,
+        subject_description: slot.description ?? `the ${slot.type}`, // Falls back to a generic phrase when the slot has no description.
+        subject_type: t === "person" ? "person" : "character", // Animals are submitted as "character".
+        image_size: "2K"
+      }
+    });
+    slot.ref = `$ref:${sheetId}.sheet`; // Later consumers of slot.ref now receive the sheet output instead of the raw ingest.
+  }
+}
 function slotsForScene(slots, sceneIndex) { // Returns the slots that have any presence entry for scene `sceneIndex`, regardless of edge.
   return slots.filter((s) => s.presence.has(sceneIndex));
 }
-function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor) {
+function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mode) {
   const EDGE = edge.toUpperCase();
   const legend = [
-    ...present.map((s) => `- ${s.label} \u2014 ${roleForType2(s.type)}`),
+    ...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
     ...hasAnchor ? [
       "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions of THIS frame. IGNORE its overlay text, captions, and any brand that is being swapped."
     ] : []
   ].join("\n");
   const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
+  const isHookFrame = sceneIndex === 0 && edge === "start";
   return [
     `Render the ${EDGE} frame of scene ${sceneIndex + 1} as a single still image. This prompt is self-contained and edit-per-frame: change the FRAME DESCRIPTION below to alter ONLY this frame.`,
     "",
-    "CRITICAL \u2014 RENDER A CLEAN PLATE WITH ZERO TEXT OR GRAPHICS:",
-    "This frame is a background plate. ALL words, captions, headlines, lower-third bars,",
-    "news tickers/crawls, chyrons, on-screen logos/wordmarks, station bugs, watermarks,",
-    "subtitles, UI and numbers are added afterwards as a separate HTML layer. Render NONE",
-    "of them \u2014 no legible text anywhere in the image, not even in the background, on the",
-    "news desk, on screens, or as part of a 'broadcast look'. If a reference image (a logo,",
-    "a desk, a studio) contains any text or graphics, DO NOT reproduce that text \u2014 render",
-    "the subject/scene only, with blank surfaces where text would be. Imperfect/garbled",
-    "letterforms are the worst outcome; leave those areas clean.",
+    "CRITICAL \u2014 RENDER A CLEAN PLATE WITH ZERO TEXT OR GRAPHIC OVERLAYS:",
+    "This frame is a background plate. Every overlay element is composited afterwards as a",
+    "separate hyperframe HTML overlay layer \u2014 NOT painted into this image. Render NONE of:",
+    "words, captions, subtitles, headlines, lower-third bars, news tickers/crawls, chyrons,",
+    "station bugs, watermarks, numbers, prices; and NONE of the graphic overlays layered on",
+    "the picture either \u2014 icons, stickers, emojis, badges, rating/trust seals, progress bars,",
+    "UI chrome/buttons, and arrows. No legible text anywhere, not even in the background, on a",
+    "desk, on screens, or as part of a 'broadcast look'. If a reference image (a logo, a desk,",
+    "a studio) contains any text or graphics, DO NOT reproduce them \u2014 render the subject/scene",
+    "only, leaving the regions where overlays will sit clean. Imperfect/garbled letterforms or",
+    "stray icons are the worst outcome; leave those areas blank.",
+    "",
+    "FRAMING \u2014 ONE UNCUT FRAME:",
+    "Render ONE single uncut photographic frame: NO split screen, NO panels, NO dividing line,",
+    "NO collage, NO before/after. Avoid the AI look \u2014 no waxy/plastic skin, no airbrushing, no",
+    "over-smoothing, no over-saturation, no glossy 3D render. Every descriptor needs a technical",
+    'anchor (a named lens / focal length / color grade) \u2014 no empty adjectives like "cinematic",',
+    `"beautiful", "high quality"; they waste tokens and don't move the model.`,
+    "",
+    // The capture aesthetic + depth-of-field rule are SHOOT-MODE specific: a UGC
+    // selfie is flat phone footage; a pack shot is a controlled studio frame. Only
+    // this block varies by mode — the clean-plate and framing rules above are universal.
+    captureBlockFor(mode),
+    "",
+    // Moderation-safe phrasing — Seedance routes around the real-person filter but
+    // prompts still hit provider moderation; age-blind, role-based descriptions trip
+    // it far less. (The client's own brand assets come from the references, not here.)
+    "Describe any person by role, wardrobe, and build \u2014 never by name and never by age",
+    "(no child/kid/teen/young/elderly); do not invent brand logos or marks.",
+    ...isHookFrame ? [
+      "",
+      "HOOK FRAME (scene 1 opens the ad): the feed plays muted, so this frame must read",
+      "INSTANTLY SOUND-OFF \u2014 one clear subject, legible at a glance in ~1 second, no clutter."
+    ] : [],
     "",
     "REFERENCE IMAGES (in the order provided):",
     legend,
     "",
+    "Identity comes from the reference images, not from this prose \u2014 render each person,",
+    "product, and set to MATCH its reference image, and describe only pose, expression, action,",
+    "and camera in the FRAME DESCRIPTION below.",
+    "",
     "FRAME DESCRIPTION (this frame's editable prompt):",
     description,
     "",
-    "Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared cast identity, palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above. Again: NO rendered text or graphic overlays \u2014 clean plate only.",
+    "Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
     "",
     "GLOBAL STYLE REFERENCE (shared across frames; not this frame's content):",
     "{{target_blueprint}}"
@@ -8552,7 +8751,7 @@ function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
   const genParams = {
     model: ctx.imageModel,
     image_size: "2K",
-    prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, Boolean(url))
+    prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, Boolean(url), ctx.shootMode)
   };
   if (ctx.ar) genParams.aspect_ratio = ctx.ar;
   const genNode = {
@@ -8567,7 +8766,17 @@ function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
   nodes.push(genNode);
   return `$ref:s${ctx.sceneIndex}_${edge}.images#0`;
 }
-function buildSeedancePrompt(scene, sceneIndex, present) {
+function seedanceAudioLine(scene, mode, audio, nativeLine) { // Returns the "Audio: ..." prompt line for a clip, or null when the clip generates no audio.
+  const ambient = scene.ambient?.trim() || diegeticFor(mode); // A missing, empty or whitespace-only scene.ambient falls back to the shoot mode's default ambience.
+  if (nativeLine) { // Native talking scene: the speaker's voice over the ambience, never music.
+    return `Audio: diegetic only \u2014 the speaker's own voice over ${ambient}; no music, no song, no soundtrack (the music bed is a separate track).`;
+  }
+  if (audio) { // Audio requested without a spoken line: ambience only, no dialogue and no music.
+    return `Audio: diegetic ambient only \u2014 ${ambient}; no spoken dialogue, no music, no song, no soundtrack (voice and music are separate tracks).`;
+  }
+  return null; // Silent clip: no audio line is added to the prompt.
+}
+function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine) {
   const parts = [];
   const summary = scene.summary?.trim();
   parts.push(summary ? `Scene ${sceneIndex + 1}: ${summary}` : `Scene ${sceneIndex + 1}`);
@@ -8583,18 +8792,109 @@ function buildSeedancePrompt(scene, sceneIndex, present) {
       `Keep these consistent with their references: ${present.map((s) => `${s.label} (${s.description ?? s.type})`).join("; ")}`
     );
   }
-  const lines = (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l));
-  if (lines.length > 0) parts.push(`Spoken: ${lines.map((l) => `"${l}"`).join(" ")}`);
+  if (nativeLine) {
+    parts.push(
+      `The person speaks to camera. Lip-sync follows the dialogue verbatim; put delivery/emotion cues in [brackets]. Dialogue: "${nativeLine}"`
+    );
+  } else {
+    const lines = (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l));
+    if (lines.length > 0)
+      parts.push(`Spoken context (do not render as audio): ${lines.map((l) => `"${l}"`).join(" ")}`);
+  }
   const transcript = (scene.transcript_slice ?? []).map((w) => w.text?.trim()).filter(Boolean).join(" ").trim();
   if (transcript) parts.push(`Transcript: ${transcript}`);
+  const audioLine = seedanceAudioLine(scene, mode, audio, nativeLine);
+  if (audioLine) parts.push(audioLine);
+  parts.push(
+    `Direction: describe MOTION ONLY \u2014 the frames carry the content; keep it short. ${seedanceMotionFor(mode)} Spell choreography move-by-move (not 'she dances' but the actual beats: head nod, shoulder roll, knee dip). One short continuous beat. Real physical weight on any impact (no weightless AI motion). Describe any person by role and wardrobe, never by name or age.`
+  );
   return parts.join("\n");
 }
+function audioExtractArgs(durationS) { // Builds the ffmpeg argument list that extracts the first `durationS` seconds of a clip's audio as MP3; `durationS` must be a number.
+  return [
+    "-i",
+    "{{in.clip}}", // Input placeholder; presumably substituted by the node runner with the node's `clip` input (verify against the runner).
+    "-t",
+    durationS.toFixed(3), // Duration limit, formatted to millisecond precision.
+    "-vn", // Drop the video stream.
+    "-acodec",
+    "libmp3lame",
+    "-q:a",
+    "2", // LAME VBR quality setting.
+    "{{out.audio}}" // Output placeholder matching the node's declared `audio` output.
+  ];
+}
+function sceneOutTransition(scene, isLast) { // Returns { xfade, dur } for the transition out of `scene`, or null when the boundary is a hard cut.
+  if (isLast) return null; // Nothing follows the last scene, so its transition_out is ignored.
+  const type = scene.transition_out?.type?.toLowerCase(); // Matching is case-insensitive; undefined when no type is set.
+  const xfade = type ? XFADE_BY_TYPE[type] : void 0; // NOTE(review): plain-object lookup also resolves inherited keys (e.g. "constructor", "__proto__"), which would return a non-string truthy value; consider an own-property check.
+  return xfade ? { xfade, dur: TRANSITION_DEFAULT_S } : null; // Types missing from XFADE_BY_TYPE (cut, match_cut, none, ...) yield null.
+}
+function sceneShootMode(scene, present, nativeTurn, cameraOn, casts) { // Resolves the shoot mode for one scene from its explicit shoot_mode, its dialogue, and the element slots in `present`.
+  const talking = Boolean(nativeTurn) || cameraOn && (scene.dialogue ?? []).some( // && binds tighter than ||: talking if there is a native turn, or if on-camera dialogue is enabled and some line qualifies below.
+    (d) => d.line?.trim() && isOnCameraSpeaker(d.speaker ?? "voiceover", casts, cameraOn) // Counts only non-empty lines from an on-camera speaker; a missing speaker is treated as "voiceover".
+  );
+  return deriveShootMode({
+    explicit: scene.shoot_mode, // Per-scene override, honored only when it is a recognized mode.
+    talking,
+    hasPerson: present.some((s) => s.type.toLowerCase() === "person" || s.type.toLowerCase() === "animal"), // Animals count as a "person" for this purpose.
+    hasProduct: present.some((s) => s.type.toLowerCase() === "product")
+  });
+}
+function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks) { // Appends scene i's audio-extraction nodes to `nodes` and the matching timeline entry to `voTracks`; emits nothing when there is neither a native turn nor ambient b-roll. `lengths` carries { dur, speech, genDur } in seconds.
+  if (nativeTurn) { // Native talking scene: pull the spoken audio out of the generated clip, then voice-convert it.
+    const extractLen = Math.min(Math.max(lengths.dur, lengths.speech), lengths.genDur); // Longer of scene duration and estimated speech, capped at the generated clip length.
+    nodes.push({
+      id: `s${i}_voextract`,
+      type: "ffmpeg",
+      inputs: { clip: `$ref:s${i}_clip.video` },
+      params: { args: audioExtractArgs(extractLen), outputs: { audio: { kind: "audio", ext: "mp3" } } }
+    });
+    nodes.push({
+      id: `s${i}_voconv`,
+      type: "audio_voice_convert",
+      inputs: { audio: `$ref:s${i}_voextract.audio`, voice_ref: `$ref:${nativeTurn.voiceNode}.voice_id` }, // Converts the extracted speech to the voice chosen by the turn's voice node.
+      params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
+    });
+    voTracks.push({
+      slot: `s${i}_voconv`,
+      ref: `$ref:s${i}_voconv.audio`,
+      start_s: nativeTurn.start_s,
+      end_s: nativeTurn.start_s + extractLen, // Track length equals the extracted audio length.
+      kind: "vo"
+    });
+  } else if (ambientBroll) { // Ambient b-roll: keep the clip's own audio as a quiet ambient track, with no voice conversion.
+    const ambientStart = scene.start_s ?? 0; // A scene without start_s is placed at 0.
+    nodes.push({
+      id: `s${i}_ambient`,
+      type: "ffmpeg",
+      inputs: { clip: `$ref:s${i}_clip.video` },
+      params: { args: audioExtractArgs(lengths.dur), outputs: { audio: { kind: "audio", ext: "mp3" } } } // Extracts exactly the scene duration.
+    });
+    voTracks.push({
+      slot: `s${i}_ambient`,
+      ref: `$ref:s${i}_ambient.audio`,
+      start_s: ambientStart,
+      end_s: ambientStart + lengths.dur,
+      gain_db: AMBIENT_BED_GAIN_DB, // Only ambient tracks carry an explicit gain; "vo" tracks above set none.
+      kind: "ambient"
+    });
+  }
+}
 function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
   const ar = aspectRatioParam(blueprint);
   const reuse = opts.frames === "reuse";
-  const clipRefs = [];
+  const clips = [];
+  const voTracks = [];
+  const lastIndex = blueprint.scenes.length - 1;
+  const cameraOn = onCameraDialogue(blueprint);
+  const casts = castIdSet(blueprint);
   blueprint.scenes.forEach((scene, i) => {
-    const ctx = { sceneIndex: i, ar, reuse, imageModel: opts.imageModel };
+    const nativeTurn = (sceneTurns.get(i) ?? []).find((t) => t.native);
+    const present = slotsForScene(slots, i);
+    const mode = sceneShootMode(scene, present, nativeTurn, cameraOn, casts);
+    const ambientBroll = Boolean(opts.ambient) && !nativeTurn && mode !== "ugc_selfie";
+    const ctx = { sceneIndex: i, ar, reuse, imageModel: opts.imageModel, shootMode: mode };
     const firstFrame = buildFrameRef(
       "start",
       scene.start_frame_asset?.url,
@@ -8612,10 +8912,22 @@ function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
       nodes
     );
     const dur = sceneDurationS(scene);
+    let out = sceneOutTransition(scene, i === lastIndex);
+    let trimTarget = dur + (out?.dur ?? 0);
+    if (out && ceilToSeedance(trimTarget) < trimTarget) {
+      out = null;
+      trimTarget = dur;
+    }
+    const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
+    const genDur = ceilToSeedance(Math.max(trimTarget, speech));
     const clipParams = {
       model: opts.videoModel,
-      prompt: buildSeedancePrompt(scene, i, slotsForScene(slots, i)),
-      duration: ceilToSeedance(dur)
+      prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
+      duration: genDur,
+      // Native talking scene → Seedance generates the spoken audio + lip-sync;
+      // an opt-in ambient b-roll beat generates diegetic ambient only; otherwise the
+      // clip is silent and audio comes from the tts/music timeline.
+      generate_audio: Boolean(nativeTurn) || ambientBroll
     };
     if (ar) clipParams.aspect_ratio = ar;
     nodes.push({
@@ -8624,31 +8936,21 @@ function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
       inputs: { first_frame: firstFrame, last_frame: lastFrame },
       params: clipParams
     });
-    let base = `$ref:s${i}_clip.video`;
-    const onCam = (sceneTurns.get(i) ?? []).filter((t) => t.onCamera);
-    const solo = onCam.length === 1 ? onCam[0] : void 0;
-    if (solo) {
-      nodes.push({
-        id: `s${i}_lipsync`,
-        type: "video_lipsync",
-        inputs: { video: base, audio: solo.audioRef },
-        params: { model: FIXED_LIPSYNC_MODEL }
-      });
-      base = `$ref:s${i}_lipsync.video`;
-    }
-    if (ceilToSeedance(dur) === dur) {
-      clipRefs.push(base);
+    emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, { dur, speech, genDur }, nodes, voTracks);
+    const base = `$ref:s${i}_clip.video`;
+    if (genDur === trimTarget) {
+      clips.push({ ref: base, scene_s: dur, out });
     } else {
       nodes.push({
         id: `s${i}_trim`,
         type: "ffmpeg",
         inputs: { clip: base },
-        params: { args: trimArgs(dur), outputs: { video: { kind: "video", ext: "mp4" } } }
+        params: { args: trimArgs(trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
       });
-      clipRefs.push(`$ref:s${i}_trim.video`);
+      clips.push({ ref: `$ref:s${i}_trim.video`, scene_s: dur, out });
     }
   });
-  return clipRefs;
+  return { clips, voTracks };
 }
 function musicBedPrompt(blueprint, musicPrompt) {
   const track2 = blueprint.global?.music?.identified_track;
@@ -8664,6 +8966,33 @@ function onCameraDialogue(blueprint) {
   return mode !== "voiceover" && mode !== "none";
 }
 var castIdSet = (blueprint) => new Set((blueprint.global?.cast ?? []).map((c) => c.id).filter((id) => Boolean(id))); // Set of the truthy ids in blueprint.global.cast; an empty Set when there is no cast list.
+var LANGUAGE_WORDS = [ // Ordered [pattern, language] pairs; parseVoiceTraits() tests them against lowercased text and keeps the first pair that matches.
+  [/\b(french|fran[çc]ais|francaise)\b/, "french"], // NOTE(review): "française" (ç plus trailing e) is not matched: "fran[çc]ais" needs a word boundary after the s, and "francaise" covers only the unaccented spelling.
+  [/\b(spanish|espa[ñn]ol|castilian)\b/, "spanish"],
+  [/\benglish\b/, "english"],
+  [/\b(german|deutsch)\b/, "german"],
+  [/\b(italian|italiano)\b/, "italian"],
+  [/\b(portuguese|portugu[êe]s|brazilian)\b/, "portuguese"], // "brazilian" is mapped to portuguese.
+  [/\b(dutch|nederlands)\b/, "dutch"],
+  [/\b(arabic)\b/, "arabic"],
+  [/\b(japanese)\b/, "japanese"],
+  [/\b(korean)\b/, "korean"],
+  [/\b(hindi)\b/, "hindi"],
+  [/\b(polish)\b/, "polish"]
+];
+function parseVoiceTraits(description) { // Extracts optional { gender, language } hints from a free-text voice description; keys are omitted when nothing matches. `description` must be a string.
+  const d = description.toLowerCase(); // All patterns below are lowercase, so matching is case-insensitive.
+  const out = {};
+  if (/\b(female|woman|women|girl|lady)\b/.test(d)) out.gender = "female"; // Female terms are tested first, so text containing both kinds of term resolves to "female".
+  else if (/\b(male|man|men|guy|boy)\b/.test(d)) out.gender = "male";
+  for (const [re, name] of LANGUAGE_WORDS) {
+    if (re.test(d)) {
+      out.language = name;
+      break; // First match in LANGUAGE_WORDS order wins, not the first language mentioned in the text.
+    }
+  }
+  return out;
+}
 function isOnCameraSpeaker(speaker, casts, cameraOn) {
   if (!cameraOn) return false;
   if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
@@ -8688,7 +9017,9 @@ function buildDialogue(blueprint, nodes) {
     const existing = voiceNodeBySpeaker.get(speaker);
     if (existing) return existing;
     const id = sanitizeId2(`voice_${speaker}`, `voice_${voiceNodeBySpeaker.size}`);
-    nodes.push({ id, type: "voice_select", params: { description: speakerDescription(speaker) } });
+    const description = speakerDescription(speaker);
+    const traits = parseVoiceTraits(description);
+    nodes.push({ id, type: "voice_select", params: { description, ...traits } });
     voiceNodeBySpeaker.set(speaker, id);
     return id;
   };
@@ -8703,44 +9034,52 @@ function buildDialogue(blueprint, nodes) {
       if (last && last.speaker === speaker) last.lines.push(line);
       else groups.push({ speaker, lines: [line] });
     }
-    const list = [];
-    groups.forEach((group, gi) => {
+    const shells = groups.map((group) => {
       const first = group.lines[0];
       const last = group.lines[group.lines.length - 1];
-      if (!first || !last) return;
-      const start = first.start_s ?? scene.start_s ?? 0;
-      const end = last.end_s ?? last.start_s ?? scene.end_s ?? start;
+      if (!first || !last) return void 0;
+      return {
+        group,
+        start: first.start_s ?? scene.start_s ?? 0,
+        end: last.end_s ?? last.start_s ?? scene.end_s ?? first.start_s ?? scene.start_s ?? 0,
+        onCamera: isOnCameraSpeaker(group.speaker, casts, cameraOn)
+      };
+    }).filter((s) => Boolean(s));
+    const onCamCount = shells.filter((s) => s.onCamera).length;
+    const list = [];
+    shells.forEach((shell, gi) => {
+      const { group, start, end, onCamera } = shell;
       const voiceNode = ensureVoiceNode(group.speaker);
-      let id = sanitizeId2(`vo_s${sceneIndex}_${group.speaker}`, `vo_${sceneIndex}_${gi}`);
-      if (usedVoIds.has(id)) {
-        let n = 2;
-        while (usedVoIds.has(`${id}_${n}`)) n++;
-        id = `${id}_${n}`;
-      }
-      usedVoIds.add(id);
-      nodes.push({
-        id,
-        type: "tts",
-        inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
-        // Lines join with a space; each keeps its terminal punctuation so eleven_v3
-        // reads the sentence boundaries (and their pauses) within the one turn.
-        params: {
-          model: FIXED_TTS_MODEL,
-          text: group.lines.map((l) => l.line.trim()).join(" "),
-          voice: "{{voice_ref}}"
-        }
-      });
+      const text = group.lines.map((l) => l.line.trim()).join(" ");
+      const native = onCamera && onCamCount === 1;
       const turn = {
         sceneIndex,
         speaker: group.speaker,
-        onCamera: isOnCameraSpeaker(group.speaker, casts, cameraOn),
         start_s: start,
         end_s: end,
-        ttsId: id,
-        audioRef: `$ref:${id}.audio`
+        text,
+        voiceNode,
+        native
       };
+      if (!native) {
+        let id = sanitizeId2(`vo_s${sceneIndex}_${group.speaker}`, `vo_${sceneIndex}_${gi}`);
+        if (usedVoIds.has(id)) {
+          let n = 2;
+          while (usedVoIds.has(`${id}_${n}`)) n++;
+          id = `${id}_${n}`;
+        }
+        usedVoIds.add(id);
+        nodes.push({
+          id,
+          type: "tts",
+          inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
+          params: { model: FIXED_TTS_MODEL, text, voice: "{{voice_ref}}" }
+        });
+        turn.ttsId = id;
+        const audioRef = `$ref:${id}.audio`;
+        tracks.push({ slot: id, ref: audioRef, start_s: start, end_s: end, kind: "vo" });
+      }
       list.push(turn);
-      tracks.push({ slot: id, ref: turn.audioRef, start_s: start, end_s: end, kind: "vo" });
     });
     sceneTurns.set(sceneIndex, list);
   });
@@ -8766,14 +9105,22 @@ function buildSfxMusic(blueprint, nodes) {
   });
   const musicPrompt = blueprint.global?.music?.music_prompt;
   if (musicPrompt) {
-    const totalMs = Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3);
+    const total = blueprint.source?.duration_s ?? lastSceneEnd(blueprint);
+    const startAt = Math.min(Math.max(blueprint.global?.music?.starts_at_s ?? 0, 0), Math.max(total - 0.5, 0));
+    const totalMs = Math.round((total - startAt) * 1e3);
     const musicMs = Math.min(Math.max(totalMs, 3e3), ELEVENLABS_MAX_MUSIC_LENGTH_MS);
     nodes.push({
       id: "music_bed",
       type: "music",
       params: { model: FIXED_MUSIC_MODEL, prompt: musicBedPrompt(blueprint, musicPrompt), music_length_ms: musicMs }
     });
-    tracks.push({ slot: "music", ref: "$ref:music_bed.audio", start_s: 0, gain_db: MUSIC_BED_GAIN_DB, kind: "music" });
+    tracks.push({
+      slot: "music",
+      ref: "$ref:music_bed.audio",
+      start_s: startAt,
+      gain_db: MUSIC_BED_GAIN_DB,
+      kind: "music"
+    });
   }
   return tracks;
 }
@@ -8823,14 +9170,29 @@ function overlayElement(ov, sceneStart) {
   const detail = ov.animation_detail ? ` data-anim-detail="${escapeHtml(ov.animation_detail)}"` : "";
   return `<div class="ov ${positionClass(ov.position)}" data-start="${at}" data-dur="${dur}"${role}${anim}${detail}>${escapeHtml(ov.text.trim())}</div>`;
 }
+function sourceHint(fe) { // Returns the suggested `baker images ...` command text for sourcing a floating element, chosen by fe.kind.
+  const desc = fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element"; // first non-empty label field, falling back to the literal "element"
+  switch ((fe.kind ?? "").toLowerCase()) { // kind is compared case-insensitively; a missing kind lands in `default`
+    case "logo":
+      return "baker images logo <domain> (or baker images library)"; // fixed text with a literal <domain> placeholder; desc is not interpolated here
+    case "emoji":
+    case "sticker":
+      return `baker images sticker "${desc}" (or baker images gif)`;
+    case "product_cutout":
+      return `baker images library "${desc}" (the client's own product)`;
+    default:
+      return `baker images icon "${desc}"`; // any other (or empty) kind is treated as an icon search
+  }
+}
 function floatingStub(fe, sceneStart) {
   const at = fe.appears_at_s ?? sceneStart;
   const dur = fe.duration_s ?? 2.5;
   const kind = commentSafe(fe.kind ?? "element");
   const label = commentSafe(fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element");
+  const hint = commentSafe(sourceHint(fe));
   const slug = (fe.kind ?? "element").toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "") || "element";
   return [
-    `<!-- ${kind}: ${label} @ ${at}s for ${dur}s (${positionClass(fe.position)}). Drop an image in this dir and uncomment:`,
+    `<!-- ${kind}: ${label} @ ${at}s for ${dur}s (${positionClass(fe.position)}). Source a real asset: ${hint} \u2014 drop it in this dir and uncomment:`,
     `<img class="ov ${positionClass(fe.position)}" src="your-${slug}.png" data-start="${at}" data-dur="${dur}" alt="" /> -->`
   ].join("\n");
 }
@@ -8871,6 +9233,52 @@ function concatArgs(count) {
   }
   return [...inputs, "-filter_complex", `${labels}concat=n=${count}:v=1:a=0[v]`, "-map", "[v]", "{{out.video}}"];
 }
+function clipInputLen(c) { // Length of one clip input: its scene_s plus the duration of its outgoing transition (presumably seconds — confirm against the clip producer).
+  return c.scene_s + (c.out?.dur ?? 0); // a clip with no `out` transition contributes only scene_s
+}
+function xfadeSpineArgs(clips) { // Builds ffmpeg args that join clips pairwise: an xfade where the earlier clip has an `out` transition, a two-input concat otherwise.
+  const n = clips.length;
+  const inputs = [];
+  const filt = [];
+  for (let i = 0; i < n; i++) {
+    inputs.push("-i", `{{in.c${i}}}`); // one input placeholder per clip, named c<i>
+    filt.push(`[${i}:v]format=yuv420p,fps=30,setsar=1,settb=AVTB[c${i}]`); // every input gets the same format/fps/SAR/timebase chain before being joined
+  }
+  let cur = "c0"; // label of the chain built so far
+  let accLen = clipInputLen(clips[0]); // running length of that chain; dereferences clips[0], so clips must be non-empty
+  for (let k = 0; k < n - 1; k++) {
+    const join3 = clips[k].out; // the transition is described on the EARLIER clip of each pair
+    const next = `c${k + 1}`;
+    const out = k === n - 2 ? "v" : `j${k + 1}`; // the last join writes to [v], the label mapped in the returned args
+    if (join3) {
+      const offset = Math.max(0, accLen - join3.dur); // fade starts join3.dur before the end of the accumulated chain, clamped at 0
+      filt.push(
+        `[${cur}][${next}]xfade=transition=${join3.xfade}:duration=${join3.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
+      );
+      accLen = accLen - join3.dur + clipInputLen(clips[k + 1]); // the overlapped span is counted once
+    } else {
+      filt.push(`[${cur}][${next}]concat=n=2:v=1[${out}]`);
+      accLen += clipInputLen(clips[k + 1]); // hard cut: lengths simply add
+    }
+    cur = out;
+  }
+  return [...inputs, "-filter_complex", filt.join(";"), "-map", "[v]", "{{out.video}}"]; // NOTE(review): with a single clip no [v] label is produced; the visible caller only uses this path when clips.length > 1
+}
+function buildSpine(clips, nodes) { // Appends the "spine" ffmpeg node that joins all clips to `nodes` and returns the ref string for its video output.
+  const inputs = {};
+  clips.forEach((c, i) => {
+    inputs[`c${i}`] = c.ref; // input key c<i> matches the {{in.c<i>}} placeholder used by xfadeSpineArgs
+  });
+  const hasTransition = clips.length > 1 && clips.some((c) => c.out); // xfade path only when there is more than one clip and at least one carries an `out` transition
+  const args = hasTransition ? xfadeSpineArgs(clips) : concatArgs(clips.length); // otherwise fall back to the plain concat arg builder
+  nodes.push({ // mutates the caller's nodes array
+    id: "spine",
+    type: "ffmpeg",
+    inputs,
+    params: { args, outputs: { video: { kind: "video", ext: "mp4" } } }
+  });
+  return "$ref:spine.video";
+}
 function scaffoldVideoCanvas(input, elementsInput, opts) {
   const blueprint = VideoBlueprint.parse(input);
   const elements = RecurringElements.parse(elementsInput);
@@ -8888,19 +9296,11 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
       params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
     });
   });
-  const { tracks: voTracks, sceneTurns } = buildDialogue(blueprint, nodes);
-  const clipRefs = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
-  const concatInputs = {};
-  clipRefs.forEach((ref, i) => {
-    concatInputs[`c${i}`] = ref;
-  });
-  nodes.push({
-    id: "spine",
-    type: "ffmpeg",
-    inputs: concatInputs,
-    params: { args: concatArgs(clipRefs.length), outputs: { video: { kind: "video", ext: "mp4" } } }
-  });
-  let videoRef = "$ref:spine.video";
+  if (opts.actorSheets) applyActorSheets(slots, nodes);
+  const { tracks: ttsTracks, sceneTurns } = buildDialogue(blueprint, nodes);
+  const { clips, voTracks: nativeVoTracks } = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
+  const voTracks = [...ttsTracks, ...nativeVoTracks];
+  let videoRef = buildSpine(clips, nodes);
   let videoNode = "spine";
   const overlays = blueprint.scenes.flatMap((s) => s.overlays ?? []);
   const floating = blueprint.scenes.flatMap((s) => s.floating_elements ?? []);
@@ -8972,7 +9372,7 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
     metadata: {
       name: "video reproduction",
       description: VIDEO_GUIDE,
-      todo: buildVideoTodo(videoReport(input, elementsInput), overlays.length, floating.length, opts),
+      todo: buildVideoTodo(videoReport(input, elementsInput), overlays.length, floating.length, opts, blueprint),
       // The timing plan `baker canvas validate` checks before any billed render:
       // sequenced voiceover turns (no overlap), audio ≈ video length, and which
       // scenes must be lip-synced.
@@ -8987,60 +9387,162 @@ function buildVideoMeta(blueprint, sceneTurns) {
   const talking_scenes = [];
   for (const [scene, turns] of [...sceneTurns.entries()].sort((a, b) => a[0] - b[0])) {
     for (const t of turns) {
-      vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
-    }
-    if (turns.filter((t) => t.onCamera).length === 1) {
-      talking_scenes.push({ scene, lipsync_node: `s${scene}_lipsync` });
+      if (t.ttsId) vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
+    }
+    const nativeTurn = turns.find((t) => t.native);
+    if (nativeTurn) {
+      const sceneObj = blueprint.scenes[scene];
+      talking_scenes.push({
+        scene,
+        voice_convert_node: `s${scene}_voconv`,
+        scene_s: sceneObj ? Math.round(sceneDurationS(sceneObj) * 100) / 100 : void 0,
+        est_speech_s: Math.round(estSpeechS(nativeTurn.text) * 100) / 100
+      });
     }
   }
   return {
     duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
     vo_segments,
-    talking_scenes
+    talking_scenes,
+    motion_board: buildMotionBoard(blueprint, sceneTurns)
   };
 }
+function buildMotionBoard(blueprint, sceneTurns) { // Builds one entry per scene: role, time window, storyboard frame ids, spoken text and the graphics scheduled in it.
+  const round = (n) => Math.round(n * 100) / 100; // round to 2 decimal places
+  let cursor = 0; // end of the previous scene; used as the start when a scene has no start_s
+  return blueprint.scenes.map((scene, i) => {
+    const start_s = scene.start_s ?? cursor;
+    const end_s = scene.end_s ?? start_s + sceneDurationS(scene); // without end_s, derive it from the sceneDurationS helper (defined elsewhere)
+    cursor = end_s;
+    const spoken = (sceneTurns.get(i) ?? []).map((t) => t.text?.trim()).filter((l) => Boolean(l)).join(" ") || null; // non-empty turn texts joined by a space; null when the scene has none
+    const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []); // safeParse: a failed parse yields no graphics below instead of throwing
+    const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
+    const graphics = [
+      ...(overlays.success ? overlays.data : []).filter((ov) => ov.text?.trim()).map((ov) => ({ // overlays with blank text are dropped
+        kind: "text",
+        at_s: round(ov.appears_at_s ?? start_s), // defaults to the scene start
+        dur_s: round(ov.duration_s ?? 2.5), // 2.5 is the fallback duration when none is given
+        position: ov.position ?? "bottom_center",
+        text: ov.text?.trim()
+      })),
+      ...(floats.success ? floats.data : []).map((fe) => ({
+        kind: "graphic",
+        at_s: round(fe.appears_at_s ?? start_s),
+        dur_s: round(fe.duration_s ?? 2.5),
+        position: fe.position ?? "bottom_center",
+        label: fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element" // first non-empty label field, else "element"
+      }))
+    ].sort((a, b) => a.at_s - b.at_s); // ascending by appearance time
+    return {
+      scene: i,
+      role: scene.narrative_role?.trim() || inferNarrativeRole(i, blueprint.scenes.length), // explicit role wins; otherwise inferred from position
+      window_s: [round(start_s), round(end_s)],
+      storyboard_frames: [`s${i}_start`, `s${i}_end`],
+      spoken,
+      graphics
+    };
+  });
+}
 var VIDEO_GUIDE = [
-  "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192 optional lip-sync \u2192 concatenated. On-screen text is a separate HTML layer you paint; audio is sequenced voiceover + SFX + a ducked music bed, normalized stereo. It is a DRAFT: edit it, supply the real assets, then validate and run.",
+  "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192 concatenated. Talking heads are voiced NATIVELY by Seedance (lips+voice generated together) then re-voiced to one brand voice; off-camera narration is sequenced tts. On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voiceover + SFX + a ducked music bed, normalized stereo. It is a DRAFT: edit it, supply the real assets, then validate and run.",
   "",
   "WHAT TO DO NEXT:",
+  "0. RE-CRAFT THE SCRIPT FIRST (don't clone). This reference already won in-market, but copying a video is much harder than a static: the hook is targeting and may not transfer, and the message must become TRUE for our brand. Work the `metadata.todo.script_recraft` checklist \u2014 for each scene judge its role (hook/body/CTA), decide keep/cut/reorder/replace, and re-author every line for OUR customer's pain + OUR offer. See `references/script-craft.md` (hook/body/CTA framework) and the `meta-ads-playbook` skill. Most of the work lives here.",
   "1. Edit each frame's prompt IN PLACE. Every `s<i>_start` / `s<i>_end` node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want.",
+  "1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. The boundary frames ARE your storyboard; `metadata.video.motion_board` lays out each scene's frames, time window, spoken line, and the graphics scheduled in it. Lock the frames + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
   "2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent. `baker canvas run` REFUSES to start until every `[TODO]` slot holds a real source \u2014 so this is mandatory, not optional.",
-  "3. Confirm the `voice_select` casting (one per speaker). Voiceover is SEQUENCED: each contiguous same-speaker turn is its own `tts` placed at its real start, so dialogue alternates instead of stacking. Edit a turn's `params.text` (punctuation / ALL-CAPS / line breaks are read verbatim by eleven_v3 for emphasis and pauses) to shape delivery; re-author the words to be TRUE for your brand.",
-  "4. Lip-sync: scenes with a single on-camera speaker route their clip through `video_lipsync` (~20 cr each) so the mouth matches the line. Two-speaker scenes are left un-synced \u2014 split them or pick a primary speaker if you want sync. Drop the node to skip.",
-  "5. Overlays are REAL HTML you paint. Open `video-overlay-composition/index.html`: the reference's overlays are seeded inside `#overlay-root` as plain elements (text + a `.pos-*` class + `data-start`/`data-dur`). Restyle the CSS freely \u2014 build lower-thirds, a ticker, whatever the look needs \u2014 and replace a logo placeholder with a real `<img>` you drop in that dir. The runtime only shows/hides by timestamp; it makes no styling decisions. Drop `brand-bold.otf` / `brand-regular.otf` there for on-brand type.",
-  "6. `baker canvas validate` (proves audio/lip-sync timing for free) then `baker canvas run` (generates many billed image/video/audio assets \u2014 not free).",
+  "3. Talking heads are NATIVE: a scene with one on-camera speaker is voiced by Seedance itself (the line is in the clip's prompt + `generate_audio`), so lips and voice are generated together \u2014 no separate tts, no post-hoc lip-sync. Edit the line in the scene's `s<i>_clip` prompt to re-author the words TRUE for your brand. Off-camera narration scenes still use a sequenced `tts` per turn.",
+  "4. Voice consistency: every native talking clip's audio is re-voiced to ONE brand voice via `audio_voice_convert` (timing preserved \u2192 lips stay matched). Confirm the `voice_select` casting (one per speaker) \u2014 its `voice_id` is the brand voice; set its gender/language so the voice matches the creator.",
+  "5. Overlays are REAL HTML you paint. Open `video-overlay-composition/index.html`: the reference's overlays are seeded inside `#overlay-root` as plain elements (text + a `.pos-*` class + `data-start`/`data-dur`). Restyle the CSS freely \u2014 build lower-thirds, a ticker, whatever the look needs \u2014 and replace a logo placeholder with a real `<img>` you source (`baker images icon/sticker/gif/logo`) and drop in that dir. The runtime only shows/hides by timestamp; it makes no styling decisions. Drop `brand-bold.otf` / `brand-regular.otf` there for on-brand type.",
+  "6. `baker canvas validate` (proves native-audio + timing for free) then `baker canvas run` (generates many billed image/video/audio assets \u2014 not free).",
   "",
-  "Tip: `prompt.json` is the deconstruction provenance + the demoted GLOBAL STYLE REFERENCE each frame reads for shared palette/cast cohesion. It is NOT the per-frame editing surface \u2014 the frame nodes are."
+  "CRAFT \u2014 raises every clip's realism (one-liners; full rationale in references/video-craft.md):",
+  "- Iterate cheaply, do NOT brute-force takes: credits are scarce. Strong inputs (locked reference + a precise move-by-move prompt) make 1\u20132 takes land. When a take misses, change the SMALLEST thing and re-run \u2014 the cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
+  "- Keep clips SHORT (trust the scene-length trim \u2014 don't pad) and LOCK THE CAMERA: unmotivated AI drift is the top tell.",
+  "- One generation = one speaking voice \u2014 Seedance can't voice two on-camera speakers in one clip; split a two-speaker beat into separate scenes (or one stays silent). Lip-sync follows the line verbatim; emotion in [brackets].",
+  "- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it). Match-cut continuous action by setting scene N+1's start frame = scene N's end frame (costs no extra gens).",
+  "- Nail the pack shot (final product hero \u2014 motivated move, matched light). Geometry drifting? drop a high-angle SCHEMATIC still as an extra reference (a map beats a paragraph). Before/after (dry\u2192wet) = two separate locked references, not words mid-scene.",
+  "- The hook reads SOUND-OFF: scene 1's frame + overlay carry it in ~1s \u2014 don't bury the hook in the spoken line (meta-ads-playbook \xA739/\xA748).",
+  "",
+  "Tip: `prompt.json` is the deconstruction provenance + the demoted GLOBAL STYLE REFERENCE each frame reads for shared palette/cast cohesion. It is NOT the per-frame editing surface \u2014 the frame nodes are.",
+  "Production craft: references/video-craft.md. Hook/message/script-structure layer: references/script-craft.md + the meta-ads-playbook skill."
 ].join("\n");
-function buildVideoTodo(report, overlayCount, floatingCount, opts) {
+function inferNarrativeRole(index, total) { // Positional fallback role for a scene: "hook", "cta" or "body".
+  if (index === 0 && total > 1) return "hook"; // the first scene is the hook only when there is more than one scene
+  if (index === total - 1) return "cta"; // last scene; a single-scene video therefore resolves to "cta"
+  return "body";
+}
+function buildScriptRecraft(blueprint) { // Builds the per-scene script checklist: scene index, role, the reference's original line and a [RECRAFT] placeholder instruction.
+  const total = blueprint.scenes.length;
+  return blueprint.scenes.map((scene, i) => {
+    const role = scene.narrative_role?.trim() || inferNarrativeRole(i, total); // explicit non-blank role wins; otherwise inferred from position
+    const original = (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l)).join(" "); // non-empty dialogue lines joined by a space
+    return {
+      scene: i,
+      role,
+      original_line: original || null, // null when the scene has no dialogue text
+      recraft: `[RECRAFT: rewrite this ${role} for OUR brand \u2014 true claims only; do NOT render the reference's words. See references/script-craft.md + meta-ads-playbook.]`
+    };
+  });
+}
+function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
   return {
+    recraft_the_script_first: "VIDEO IS INSPIRATION, NOT A CLONE. Before rendering, re-craft the script (hook \u2192 body \u2192 CTA) for OUR brand \u2014 the reference already won in-market, but its hook is targeting and may not transfer. Work the per-scene `script_recraft` checklist below; see references/script-craft.md + the meta-ads-playbook skill.",
+    script_recraft: buildScriptRecraft(blueprint),
     edit_frames_in_place: "Each s<i>_start / s<i>_end node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is only a shared style reference. Frames are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
     frames_mode: opts.frames ?? "generate",
+    review_storyboard_before_clips: "STORYBOARD FIRST. The per-scene boundary frames (s<i>_start / s<i>_end) ARE your storyboard \u2014 align the LOOK on them before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's frames, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
+    motion_board: "`metadata.video.motion_board` maps which overlay/graphic fires on which spoken beat (per scene: window_s, spoken line, each graphic's at_s). Review it so every graphic lands on the word it punctuates \u2014 don't let an overlay drift off its beat. Adjust an overlay's data-start in video-overlay-composition/index.html (or appears_at_s in prompt.json) to retime it.",
     assets_required: "MANDATORY: drop a real image at every el_* [TODO] ingest before running \u2014 `baker canvas run` refuses to start (and bills nothing) until each placeholder holds a real source.",
+    sourcing: 'Resolve each el_* [TODO]: (1) the client\'s OWN assets first \u2014 `baker images library "<subject>"` (e.g. the company owner as the actor) and `baker images logo <domain>` for the brand mark; (2) otherwise search a FRESH real reference \u2014 `baker images find "<subject>" --sources pinterest` (Pinterest is photo-real/candid, best for people & sets) or generate one with the image model. el_creator_* and el_location are [SOURCE FRESH]: cast a DIFFERENT person/animal/set than the original ad \u2014 never the source\'s individual.',
     recurring_elements_to_supply: report.elements,
     text_strategy: "Decide per ad: text is either baked by the generated creative OR painted via the overlay HTML \u2014 not both. Default here is clean text-free frames + the HTML overlay layer (video-overlay-composition/index.html) as the single text source, which you fully control.",
-    timeline: "Automatic: each clip is generated at >= its scene length then trimmed back to the real scene duration, so the concatenated picture stays on the same timeline as the absolute-timed audio (this is what makes the lips line up). You don't manage it.",
+    timeline: "Automatic: each clip is generated at >= its scene length then trimmed back to the real scene duration, so the concatenated picture stays on the same timeline as the audio. You don't manage it.",
     voices_to_confirm: report.dialogue.map((d) => ({
       scene: d.scene,
       speaker: d.speaker,
       voice_description: d.voice_description,
       line: d.line
     })),
-    voiceover_note: "Sequenced: one tts per contiguous same-speaker TURN, placed at its real start_s so turns alternate (no parallel monologues); same voice locked via voice_select.voice_id. Edit a turn's params.text (punctuation / ALL CAPS / line breaks read verbatim) to shape delivery.",
-    lip_sync_note: "Scenes with a single on-camera speaker route their clip through video_lipsync (~20 cr each) so the mouth matches the line. Two-speaker scenes are left un-synced (one track can't drive two faces) \u2014 split or pick a primary. `baker canvas validate` checks every talking scene is synced.",
+    talking_head_note: "NATIVE: a single-on-camera-speaker scene is voiced by Seedance itself (line in s<i>_clip prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Edit the line in the clip's prompt to re-author it.",
+    voice_note: "Every native talking clip's audio is re-voiced to ONE brand voice via audio_voice_convert (eleven_multilingual_sts_v2), timing preserved so lips stay matched. voice_select.voice_id is that brand voice \u2014 set its gender/language to match the creator. Off-camera narration uses a sequenced tts per turn.",
+    native_timing: "Seedance paces the spoken line to fill the clip, so each native talking clip is generated long enough for the estimated speech and its audio is kept full-length (not hard-trimmed to the visual scene) \u2014 the line is never cut mid-word; the voice may continue a beat past the visual cut (natural VO continuity). `metadata.video.talking_scenes` carries each scene's scene_s vs est_speech_s. If a rendered line still sounds clipped, the line is simply longer than the scene: shorten the line or lengthen the scene in the deconstruct.",
+    craft: {
+      note: "Production-craft principles that raise every clip's realism. Full rationale: references/video-craft.md (production craft); references/script-craft.md + meta-ads-playbook for the hook/message layer.",
+      principles: [
+        "Iterate cheaply \u2014 do NOT brute-force takes. Credits are scarce. Strong inputs (a locked reference + a precise move-by-move prompt) make 1\u20132 takes land; bad input = bad output. When a take misses, change the SMALLEST thing and re-run \u2014 the content-addressed cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
+        "Keep clips SHORT \u2014 the longer a shot holds, the more the eye catches the AI tell. Trust the scene-length trim; don't pad.",
+        "LOCK THE CAMERA \u2014 a first/last-frame clip holds the framing the two frames define; only move when a move is specified. Unmotivated camera drift is the top realism tell.",
+        "One generation = one speaking voice. Seedance can't voice two on-camera speakers in one clip \u2014 split a two-speaker beat into separate scenes (or one stays silent). Fragment long dialogue at a natural pause and stitch via the shared boundary frame. Lip-sync follows the line verbatim; delivery cues go in [brackets].",
+        "Preview cheap \u2192 finalize high-res (credit discipline). Never prompt the aspect ratio in prose ('make it vertical' \u2192 stretch artifacts) \u2014 the pipeline sets aspect_ratio as a param.",
+        "Match-cut continuous action across a cut by reusing the boundary frame: scene N+1's start frame = scene N's end frame (a composition choice, costs no extra gens).",
+        "Nail the PACK SHOT \u2014 the final product hero sells (motivated camera move, light matched to the rest of the ad).",
+        "Geometry/objects drifting frame to frame? A MAP beats a paragraph \u2014 drop a high-angle schematic still (marked positions) as an extra el_*/reference rather than adding more words.",
+        "Before/after (dry\u2192wet, skeptic\u2192believer) = two SEPARATE locked reference images \u2014 don't ask words to transform a subject mid-scene (cheaper and more reliable than re-rolling clips).",
+        "The hook is VISUAL-FIRST: the feed plays muted, so scene 1's opening frame + its overlay text must read sound-off in ~1 second \u2014 don't bury the hook in the spoken line alone (meta-ads-playbook \xA739 visual-hooks-beat-audio, \xA748 the 1-second/feed-native rule)."
+      ]
+    },
+    transitions: "Scene-to-scene cuts the deconstruct flagged as fade/whip/zoom/dissolve/swipe are reproduced as an ffmpeg xfade at the boundary (everything else stays a hard cut). The overlap is consumed from extra generated footage, so the picture stays exactly on the audio timeline. To change a transition, edit the scene's `transition_out.type` in prompt.json and re-scaffold, or hand-edit the `spine` node's ffmpeg args.",
     text_overlays: {
       count: overlayCount,
       note: "Seeded as editable HTML inside `#overlay-root` in video-overlay-composition/index.html (text + a .pos-* class + data-start/data-dur). PAINT it: restyle the CSS, build lower-thirds/tickers, drop brand-*.otf for on-brand type. The runtime only shows/hides by timestamp."
     },
     floating_elements: {
       count: floatingCount,
-      note: floatingCount > 0 ? "Seeded as labeled placeholders in index.html \u2014 replace each with a real <img> you drop into video-overlay-composition/. Recurring logos are also handled well as an el_* element baked into frames." : "none detected"
+      note: floatingCount > 0 ? "Seeded as commented <img> stubs in index.html (each names the `baker images icon/sticker/gif/logo` command to source it) \u2014 source the asset, drop it in video-overlay-composition/, uncomment the <img>." : "none detected by the deconstruct \u2014 see `completeness_check`, the reference may still have icons/stickers it missed."
+    },
+    sound_effects: {
+      count: report.sfx_count,
+      note: report.sfx_count > 0 ? "Seeded as `sound_effect` nodes on `audio_mix` at their timestamps \u2014 edit the prompt or retime." : "none detected by the deconstruct \u2014 see `completeness_check`, the reference may still have sound cues it missed."
     },
-    sound_effects: { count: report.sfx_count },
     music: {
       present: report.has_music,
-      note: report.has_music ? "Original bed regenerated from the deconstruct prompt (styled after the AudD-identified track when available); ducked under the voices." : "no music bed scaffolded"
+      note: report.has_music ? "Original bed regenerated from the deconstruct prompt (styled after the AudD-identified track when available); ducked under the voices." : "no music bed scaffolded \u2014 if the reference has music, see `completeness_check`."
     },
+    // ALWAYS-ON safety net: the scaffold can only seed what the deconstruct
+    // cataloged, and it under-detects on-image graphics + sound cues. Never trust
+    // "none detected" — re-watch the reference and fill the gaps with the right tool.
+    completeness_check: 'The scaffold mirrors the deconstruct\'s catalog, which UNDER-DETECTS \u2014 never trust a 0 count. Re-watch the reference frame-by-frame and add anything missing: (1) ON-IMAGE GRAPHICS not in floating_elements (dollar/coin icons, emojis, checkmarks, rating stars, price tags, arrows, progress bars, app UI) \u2192 source each with `baker images icon "<desc>"` / `baker images sticker` / `baker images gif` / `baker images logo <domain>` and add it as an <img class="ov pos-* " data-start data-dur> in video-overlay-composition/index.html (NEVER bake graphics into the frame plates). (2) SOUND CUES not in sound_effects (cha-ching/coin, whoosh, ding, pop, notification, keyboard) \u2192 add a `sound_effect` node (eleven_text_to_sound) and wire it onto `audio_mix` at its timestamp. (3) RECURRING people/animals/products/logos/sets with no el_* slot \u2192 add an `ingest` [TODO] slot and reference it from the frames they appear in. (4) Burned-in captions/text not in text_overlays \u2192 add an <img>-free <div class="ov"> in index.html. (5) ONE person playing MULTIPLE personas/wardrobes (skeptic vs believer, before vs after, two outfits) collapsed into a single el_* slot \u2192 split into one el_* slot PER look, each linked as the SAME individual via `same_as` so every outfit has its own reference image but the face/identity stays identical.',
     scenes_clamped_to_15s: report.clamped_scenes,
     run_warning: "`baker canvas run` generates many billed image/video/audio assets \u2014 validate first, it is not free."
   };
@@ -9118,10 +9620,13 @@ List ONLY the elements worth keeping consistent across frames \u2014 the ones a
 - a showcased product, package, card, or device the ad sells or demonstrates -> type "product"
 - the advertiser brand logo/wordmark (from global.branding) -> type "logo"
 - a recurring trust/rating/certification badge -> type "badge"
+- the dominant recording set/location the scenes share (e.g. the same living room, car interior, kitchen) -> type "location"
-DROP one-off background extras, incidental props, and generic scenery. A person in global.cast is almost always recurring. Keep at most ~8.
+DROP one-off background extras and incidental props \u2014 but the shared set/location is NOT generic scenery: pin it as ONE "location" element so the room stays identical across scenes. A person in global.cast is almost always recurring. Keep at most ~8.
-For each kept element return: { "type": one of person|animal|product|logo|badge, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset (for a person/animal repeat the exact look from global.cast verbatim), "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "scenes": the 0-based indices of EVERY scene the element is visually present in \u2014 read each scene's action_detail, its start_frame/end_frame subjects, its dialogue speaker, and its floating_elements. Be generous: if the same person narrates throughout, list every scene index. Output ONLY the JSON object.`;
+ONE PERSON, MULTIPLE LOOKS: if a single individual plays MULTIPLE personas or wardrobes (e.g. a creator as a skeptic in one outfit and a believer in another, or a before/after), emit ONE element PER look (so each outfit gets its own real reference image) and link the extra looks to the first via "same_as" \u2014 they are the SAME person, only the wardrobe/state differs, so the face must stay identical.
+For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of EVERY scene the element is visually present in \u2014 read each scene's action_detail, its start_frame/end_frame subjects, its dialogue speaker, and its floating_elements. Be generous: if the same person narrates throughout, list every scene index. Output ONLY the JSON object.`;
 async function loadAssetText2(ref, label) {
   const r = ref;
   if (typeof r?.path === "string") return readFile4(r.path, "utf8");
@@ -9215,6 +9720,14 @@ var scaffoldVideoCommand = defineCommand76({
     file: { type: "positional", required: true, description: "Path to the reference video" },
     out: { type: "string", description: "Output canvas path (default <video-dir>/<name>.video.canvas.json)" },
     frames: { type: "string", description: '"generate" (default, anchored regen) or "reuse" (wire real frames in)' },
+    ambient: {
+      type: "boolean",
+      description: "Give silent b-roll scenes native diegetic ambient mixed deep under the music bed (off by default)"
+    },
+    "actor-sheets": {
+      type: "boolean",
+      description: "Lock a recast person/animal that recurs across \u22652 scenes to ONE turnaround sheet grounding every frame"
+    },
     "max-scenes": { type: "string", description: "Cap the number of scenes the deconstruct emits" },
     language: { type: "string", description: "Transcript/dialogue language hint (e.g. fr, en)" },
     focus: { type: "string", description: "Known provenance/emphasis to ground the deconstruct" },
@@ -9266,7 +9779,9 @@ var scaffoldVideoCommand = defineCommand76({
       videoModel,
       overlayCompositionPath: compositionDest,
       blueprintPath,
-      frames
+      frames,
+      ambient: Boolean(args.ambient),
+      actorSheets: Boolean(args["actor-sheets"])
     };
     let canvas;
     let report;
@@ -10557,7 +11072,7 @@ registerSchema({
     query: { type: "string", description: "Search query", required: true },
     sources: {
       type: "string",
-      description: "Comma-separated providers: library,magnific,google,iconify,giphy (brandfetch lives at `baker images logo`)",
+      description: "Comma-separated providers: library,magnific,google,iconify,giphy,pinterest (brandfetch lives at `baker images logo`)",
       required: false
     },
     limit: { type: "number", description: "Max results per group", required: false, default: 20 },