@koda-sl/baker-cli 0.82.0 → 0.91.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  defaultRegistry,
10
10
  generateCatalog,
11
11
  validateCanvasDeep
12
- } from "./chunk-KIL2ZJST.js";
12
+ } from "./chunk-LMVDA3EZ.js";
13
13
 
14
14
  // src/cli.ts
15
15
  import { defineCommand as defineCommand141, runMain } from "citty";
@@ -8274,10 +8274,100 @@ var scaffoldStaticAdCommand = defineCommand75({
8274
8274
  });
8275
8275
 
8276
8276
  // src/commands/canvas/scaffold-video.ts
8277
- import { cp, mkdir, readFile as readFile4, writeFile as writeFile2 } from "fs/promises";
8277
+ import { cp, mkdir, readFile as readFile5, writeFile as writeFile2 } from "fs/promises";
8278
8278
  import path5 from "path";
8279
8279
  import { defineCommand as defineCommand76 } from "citty";
8280
8280
 
8281
+ // src/engine/nodes/local/lib/sceneDetect.ts
8282
+ import { execFile as execFile2 } from "child_process";
8283
+ import { mkdtemp, readdir as readdir2, readFile as readFile4, rm } from "fs/promises";
8284
+ import { tmpdir } from "os";
8285
+ import { join as join2 } from "path";
8286
+ import { promisify as promisify2 } from "util";
8287
+ var execFileAsync2 = promisify2(execFile2);
8288
+ var PYSCENEDETECT_THRESHOLD = 18;
8289
+ var PYSCENEDETECT_MIN_SCENE_LEN_S = 0.25;
8290
+ var PYSCENEDETECT_RECHECK_THRESHOLD = 27;
8291
+ var PYSCENEDETECT_RECHECK_MIN_SCENE_LEN_S = 0.6;
8292
/**
 * Heuristic check for an over-segmented cut list.
 *
 * Keeps only finite, strictly positive cut times, measures the gap before each
 * cut (the first gap is measured from 0), and reports whether the median gap is
 * below the allowed ceiling.
 *
 * @param {Iterable<number>} cuts - Cut timestamps; order does not matter.
 * @param {{minCuts?: number, medianGapS?: number}} [opts] - `minCuts` (default 6)
 *   is the fewest valid cuts needed before the check applies; `medianGapS`
 *   (default 2) is the exclusive upper bound on the median gap.
 * @returns {boolean} true when there are enough cuts and their median gap is
 *   below `medianGapS`.
 */
function isLikelyOverSegmented(cuts, opts = {}) {
  const requiredCuts = opts.minCuts ?? 6;
  const gapCeiling = opts.medianGapS ?? 2;

  const times = [...cuts].filter((t) => Number.isFinite(t) && t > 0);
  times.sort((x, y) => x - y);
  if (times.length < requiredCuts) {
    return false;
  }

  // Gap before each cut; the very first one is measured from time zero.
  const gaps = times.map((t, idx) => t - (idx === 0 ? 0 : times[idx - 1]));
  gaps.sort((x, y) => x - y);

  const half = Math.floor(gaps.length / 2);
  const isOdd = gaps.length % 2 === 1;
  const medianGap = isOdd ? gaps[half] : (gaps[half - 1] + gaps[half]) / 2;
  return medianGap < gapCeiling;
}
8308
/**
 * Convert an `H:MM:SS[.fff]` timecode into seconds.
 *
 * Surrounding whitespace is ignored. Hours may have any number of digits;
 * minutes and whole seconds are one or two digits; a fractional part on the
 * seconds field is optional.
 *
 * @param {string} tc - Timecode text.
 * @returns {number|null} Seconds, or null when `tc` is not a string or does not
 *   match the expected shape.
 */
function timecodeToSeconds(tc) {
  // A non-string cell is simply "not a timecode"; report it the same way as any
  // other unparseable value instead of throwing on `.trim()`.
  if (typeof tc !== "string") return null;

  const match = /^(\d+):(\d{1,2}):(\d{1,2}(?:\.\d+)?)$/.exec(tc.trim());
  if (!match) return null;

  const hours = Number.parseInt(match[1] ?? "", 10);
  const minutes = Number.parseInt(match[2] ?? "", 10);
  const seconds = Number.parseFloat(match[3] ?? "");
  if (![hours, minutes, seconds].every((part) => Number.isFinite(part))) return null;

  return hours * 3600 + minutes * 60 + seconds;
}
8317
/**
 * Extract cut times from the first line of a scene-list CSV.
 *
 * Only the first line is read, and only when it begins (case-insensitively,
 * leading whitespace allowed) with `Timecode List:`. Every following
 * comma-separated cell is parsed as a timecode. Unparseable and non-positive
 * values are dropped. The rest are rounded to milliseconds, de-duplicated and
 * returned in ascending order.
 *
 * @param {string} csv - Raw CSV text.
 * @returns {number[]} Sorted unique cut times in seconds (possibly empty).
 */
function parsePySceneDetectCsvCuts(csv) {
  const header = csv.split(/\r?\n/, 1)[0] ?? "";
  if (!/^\s*Timecode List:/i.test(header)) {
    return [];
  }

  const unique = new Set();
  // The first cell is the "Timecode List:" label itself, so skip it.
  header
    .split(",")
    .slice(1)
    .forEach((cell) => {
      const secs = timecodeToSeconds(cell);
      if (secs !== null && secs > 0) {
        unique.add(Math.round(secs * 1e3) / 1e3);
      }
    });

  return Array.from(unique).sort((x, y) => x - y);
}
8327
/**
 * Run the `scenedetect` CLI once (content detector + `list-scenes`) and return
 * the cut times parsed from the CSV it writes.
 *
 * Output goes to a fresh temporary directory that is always removed afterwards,
 * whether the run succeeds or throws.
 *
 * @param {string} filePath - Video file handed to `--input`.
 * @param {number} threshold - Value for `detect-content --threshold`.
 * @param {number} minSceneLenS - Minimum scene length, in seconds.
 * @param {number} timeoutMs - Timeout passed to the child process.
 * @returns {Promise<number[]>} Cut times in seconds; empty when no `.csv` file
 *   was produced.
 */
async function runSceneDetectOnce(filePath, threshold, minSceneLenS, timeoutMs) {
  const outDir = await mkdtemp(join2(tmpdir(), "baker-scenedetect-"));
  try {
    await execFileAsync2(
      "scenedetect",
      [
        "--input",
        filePath,
        "--output",
        outDir,
        "detect-content",
        "--threshold",
        String(threshold),
        "--min-scene-len",
        // Always send the explicit seconds form ("0.25s", "1s"). A bare integer
        // such as "1" is read by PySceneDetect as a frame count, which would turn
        // a whole-second minimum into a single frame.
        `${minSceneLenS}s`,
        "list-scenes",
        "--quiet"
      ],
      { encoding: "utf-8", maxBuffer: 32 * 1024 * 1024, timeout: timeoutMs }
    );
    const csvName = (await readdir2(outDir)).find((f) => f.toLowerCase().endsWith(".csv"));
    if (!csvName) return [];
    return parsePySceneDetectCsvCuts(await readFile4(join2(outDir, csvName), "utf-8"));
  } finally {
    // Best-effort cleanup of the scratch directory on every exit path.
    await rm(outDir, { recursive: true, force: true });
  }
}
8354
/**
 * Detect scene cuts, with one automatic stricter retry.
 *
 * The first pass uses the caller's `threshold` / `minSceneLenS`, or the module
 * defaults. If the caller did not supply a threshold and the result looks
 * over-segmented, a second pass runs with the recheck threshold and recheck
 * minimum scene length, and that second result is returned instead.
 *
 * @param {string} filePath - Video file to analyse.
 * @param {{threshold?: number, minSceneLenS?: number, timeout_ms?: number}} [opts]
 *   `timeout_ms` defaults to 120000 and applies to each pass separately.
 * @returns {Promise<number[]>} Cut times in seconds.
 */
async function detectSceneCutsPySceneDetect(filePath, opts = {}) {
  const thresholdPinned = opts.threshold !== void 0;
  const perRunTimeout = opts.timeout_ms ?? 12e4;

  const firstPass = await runSceneDetectOnce(
    filePath,
    opts.threshold ?? PYSCENEDETECT_THRESHOLD,
    opts.minSceneLenS ?? PYSCENEDETECT_MIN_SCENE_LEN_S,
    perRunTimeout
  );

  // An explicitly chosen threshold is respected as-is: no second guess.
  if (thresholdPinned || !isLikelyOverSegmented(firstPass)) {
    return firstPass;
  }

  return await runSceneDetectOnce(
    filePath,
    PYSCENEDETECT_RECHECK_THRESHOLD,
    PYSCENEDETECT_RECHECK_MIN_SCENE_LEN_S,
    perRunTimeout
  );
}
8370
+
8281
8371
  // src/engine/scaffold/video.ts
8282
8372
  import { z as z3 } from "zod";
8283
8373
 
@@ -8380,7 +8470,7 @@ var FIXED_TTS_MODEL = "elevenlabs/eleven_v3";
8380
8470
  var FIXED_SFX_MODEL = "elevenlabs/eleven_text_to_sound_v2";
8381
8471
  var FIXED_MUSIC_MODEL = "elevenlabs/music-v1";
8382
8472
  var FIXED_VOICE_CONVERT_MODEL = "elevenlabs/eleven_multilingual_sts_v2";
8383
- var MUSIC_BED_GAIN_DB = -12;
8473
+ var MUSIC_BED_GAIN_DB = -20;
8384
8474
  var AMBIENT_BED_GAIN_DB = -20;
8385
8475
  var TRANSITION_DEFAULT_S = 0.4;
8386
8476
  var XFADE_BY_TYPE = {
@@ -8432,10 +8522,78 @@ function sceneDurationS(scene) {
8432
8522
  const max = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
8433
8523
  return Math.min(Math.max(raw, 0.5), max);
8434
8524
  }
8435
- function trimArgs(durationS) {
8525
/**
 * Map an aspect-ratio string to output canvas dimensions in pixels.
 *
 * Recognised ratios are 1:1, 16:9, 4:3, 3:4 and 21:9. Anything else, including
 * a missing value, yields the 1080x1920 portrait canvas.
 *
 * @param {string} [ar] - Aspect ratio such as "16:9".
 * @returns {{w: number, h: number}} A fresh dimensions object.
 */
function canvasDims(ar) {
  const known = new Map([
    ["1:1", [1080, 1080]],
    ["16:9", [1920, 1080]],
    ["4:3", [1440, 1080]],
    ["3:4", [1080, 1440]],
    ["21:9", [1920, 822]]
  ]);
  const [w, h] = known.get(ar) ?? [1080, 1920];
  return { w, h };
}
8541
/**
 * Build one filter-graph chain that scales an input up to cover a `w`x`h` box,
 * crops it to exactly that box, then sets square pixels and 30 fps.
 *
 * @param {string} label - Input pad label, without brackets (e.g. "0:v").
 * @param {number} w - Target width.
 * @param {number} h - Target height.
 * @param {string} out - Output pad label, without brackets.
 * @returns {string} The chain, e.g. `[0:v]scale=...,crop=...,setsar=1,fps=30[p0]`.
 */
function fillPanel(label, w, h, out) {
  const steps = [
    `scale=${w}:${h}:force_original_aspect_ratio=increase`,
    `crop=${w}:${h}`,
    "setsar=1",
    "fps=30"
  ];
  return `[${label}]${steps.join(",")}[${out}]`;
}
8544
/**
 * Build ffmpeg arguments that tile `count` inputs into one picture.
 *
 * The canvas is divided evenly (rounded) along the split axis. Each input
 * `{{in.c<i>}}` is filled into its panel and the panels are joined with
 * `vstack` (vertical axis) or `hstack` (any other axis value).
 *
 * @param {number} count - Number of panels / inputs.
 * @param {string} axis - "vertical" or "horizontal".
 * @param {{w: number, h: number}} dims - Full canvas size.
 * @returns {string[]} Argument list ending in the `{{out.video}}` placeholder.
 */
function splitStackArgs(count, axis, dims) {
  const panelW = axis === "horizontal" ? Math.round(dims.w / count) : dims.w;
  const panelH = axis === "vertical" ? Math.round(dims.h / count) : dims.h;
  const stackFilter = axis === "vertical" ? "vstack" : "hstack";

  const inputArgs = [];
  const chains = [];
  const padLabels = [];
  let idx = 0;
  while (idx < count) {
    inputArgs.push("-i", `{{in.c${idx}}}`);
    chains.push(fillPanel(`${idx}:v`, panelW, panelH, `p${idx}`));
    padLabels.push(`[p${idx}]`);
    idx += 1;
  }
  chains.push(`${padLabels.join("")}${stackFilter}=inputs=${count}[v]`);

  return [...inputArgs, "-filter_complex", chains.join(";"), "-map", "[v]", "{{out.video}}"];
}
8559
/**
 * Turn an anchor name into ffmpeg overlay `x` / `y` expressions.
 *
 * The name is matched case-insensitively by substring: "left"/"right" decide
 * `x`, "top"/"bottom" decide `y`. An axis with no match is centred. A missing
 * position means "bottom_right".
 *
 * @param {string} [position] - Anchor such as "top_left" or "bottom".
 * @param {number} marginPx - Distance from the chosen edge, in pixels.
 * @returns {{x: string, y: string}} Overlay position expressions.
 */
function overlayXY(position, marginPx) {
  const anchor = (position ?? "bottom_right").toLowerCase();

  let x = "(W-w)/2";
  if (anchor.includes("left")) {
    x = `${marginPx}`;
  } else if (anchor.includes("right")) {
    x = `W-w-${marginPx}`;
  }

  let y = "(H-h)/2";
  if (anchor.includes("top")) {
    y = `${marginPx}`;
  } else if (anchor.includes("bottom")) {
    y = `H-h-${marginPx}`;
  }

  return { x, y };
}
8565
/**
 * Build ffmpeg arguments for a picture-in-picture composite.
 *
 * Input `{{in.c0}}` fills the whole canvas as the background. Input
 * `{{in.c1}}` is scaled to a fraction of the canvas width and overlaid at the
 * requested anchor, with a margin of 4% of the canvas width.
 *
 * @param {{w: number, h: number}} dims - Canvas size.
 * @param {string} [position] - Anchor for the inset (see `overlayXY`).
 * @param {number} insetWpct - Inset width as a fraction of canvas width.
 * @returns {string[]} Argument list ending in the `{{out.video}}` placeholder.
 */
function pipOverlayArgs(dims, position, insetWpct) {
  const insetWidth = Math.round(dims.w * insetWpct);
  const edgeMargin = Math.round(dims.w * 0.04);
  const anchor = overlayXY(position, edgeMargin);

  const graph = [
    fillPanel("0:v", dims.w, dims.h, "bg"),
    `[1:v]scale=${insetWidth}:-2,setsar=1,fps=30[fg]`,
    `[bg][fg]overlay=x=${anchor.x}:y=${anchor.y}:format=auto[v]`
  ].join(";");

  return ["-i", "{{in.c0}}", "-i", "{{in.c1}}", "-filter_complex", graph, "-map", "[v]", "{{out.video}}"];
}
8572
+ var FLASH_HOLD_MAX_S = 2;
8573
/**
 * Build ffmpeg arguments that turn a single still (`{{in.frame}}`) into a
 * fixed-length 30 fps H.264 clip, cover-scaled and cropped to the canvas.
 *
 * @param {number} durationS - Clip length in seconds (emitted with 3 decimals).
 * @param {{w: number, h: number}} dims - Canvas size.
 * @returns {string[]} Argument list ending in the `{{out.video}}` placeholder.
 */
function stillHoldArgs(durationS, dims) {
  const length = durationS.toFixed(3);
  const box = `${dims.w}:${dims.h}`;
  const videoFilter = [
    `scale=${box}:force_original_aspect_ratio=increase`,
    `crop=${box}`,
    "setsar=1",
    "format=yuv420p"
  ].join(",");

  const loopedInput = ["-loop", "1", "-i", "{{in.frame}}"];
  const timing = ["-t", length, "-r", "30"];
  const encoding = ["-vf", videoFilter, "-c:v", "libx264", "-pix_fmt", "yuv420p"];
  return [...loopedInput, ...timing, ...encoding, "{{out.video}}"];
}
8592
+ function trimArgs(durationS, offsetS = 0) {
8436
8593
  return [
8437
8594
  "-i",
8438
8595
  "{{in.clip}}",
8596
+ ...offsetS > 0 ? ["-ss", offsetS.toFixed(3)] : [],
8439
8597
  "-t",
8440
8598
  durationS.toFixed(3),
8441
8599
  "-an",
@@ -8462,6 +8620,25 @@ var Sfx = z3.object({
8462
8620
  sound_effect_prompt: z3.string().optional(),
8463
8621
  description: z3.string().optional()
8464
8622
  }).loose();
8623
+ var CompositionRegion = z3.object({
8624
+ // full | top | bottom | left | right | inset
8625
+ panel: z3.string().optional(),
8626
+ // 9-grid anchor for an `inset` presenter box.
8627
+ position: z3.string().optional(),
8628
+ is_presenter: z3.boolean().optional(),
8629
+ // The cast id shown/speaking in this region (routes lip-sync + element refs).
8630
+ cast_ref: z3.string().optional(),
8631
+ summary: z3.string().optional(),
8632
+ frame_prompt: z3.string().optional(),
8633
+ motion_prompt: z3.string().optional()
8634
+ }).loose();
8635
+ var SceneComposition = z3.object({
8636
+ // full_frame (default) | split_screen | pip | keyed_overlay
8637
+ layout: z3.string().optional(),
8638
+ // split_screen only: vertical (top/bottom) | horizontal (left/right).
8639
+ split_axis: z3.string().optional(),
8640
+ regions: z3.array(CompositionRegion).optional()
8641
+ }).loose();
8465
8642
  var CameraMotion = z3.object({ movement: z3.string().optional(), detail: z3.string().optional() }).loose();
8466
8643
  var TranscriptWord = z3.object({ text: z3.string().optional() }).loose();
8467
8644
  var Scene = z3.object({
@@ -8470,6 +8647,10 @@ var Scene = z3.object({
8470
8647
  duration_s: z3.number().optional(),
8471
8648
  summary: z3.string().optional(),
8472
8649
  action_detail: z3.string().optional(),
8650
+ // The scene's spatial layout. Absent/full_frame ⇒ one uncut shot (default path).
8651
+ // A layered layout (split_screen/pip/keyed_overlay) with regions ⇒ the scaffold
8652
+ // builds one clip per region and stacks/overlays them into the scene picture.
8653
+ composition: SceneComposition.optional(),
8473
8654
  // The capture "look" for this scene — selected from the ad-native shoot-mode
8474
8655
  // grammar (see lib/shoot-modes.ts). When absent the scaffold auto-derives a
8475
8656
  // UGC/product mode; a human can override per scene by setting this.
@@ -8495,7 +8676,12 @@ var Scene = z3.object({
8495
8676
  floating_elements: z3.array(z3.unknown()).optional(),
8496
8677
  transcript_slice: z3.array(TranscriptWord).optional(),
8497
8678
  start_frame_asset: FrameAsset,
8498
- end_frame_asset: FrameAsset
8679
+ end_frame_asset: FrameAsset,
8680
+ // DECON-supplied: true when this scene is a length-split CONTINUATION of the
8681
+ // previous one (the SAME physical shot, broken up only because it exceeded the
8682
+ // clip ceiling). The scaffold then shares the splice keyframe — this scene's
8683
+ // start frame IS the previous scene's end frame — so the join is seamless.
8684
+ continues_previous: z3.boolean().optional()
8499
8685
  }).loose();
8500
8686
  var VideoBlueprint = z3.object({
8501
8687
  source: z3.object({ aspect_ratio: z3.string().optional(), duration_s: z3.number().optional() }).loose().optional(),
@@ -8600,6 +8786,40 @@ function annotateBlueprintWithElements(blueprintInput, elementsInput) {
8600
8786
  clone.reference_elements = summary;
8601
8787
  return clone;
8602
8788
  }
8789
// Per-scene keys kept by slimBlueprintForSelection.
var SELECT_SCENE_FIELDS = [
  "index",
  "start_s",
  "end_s",
  "duration_s",
  "summary",
  "narrative_role",
  "action_detail",
  "start_frame_prompt",
  "end_frame_prompt"
];
// Keys of `global` kept by slimBlueprintForSelection.
var SELECT_GLOBAL_FIELDS = ["cast", "branding", "voiceover"];
/**
 * Produce a reduced copy of a blueprint holding only the whitelisted fields.
 *
 * Keeps top-level `version` and `source`, the whitelisted keys of `global`
 * (when it is a plain object), and for every object scene only the whitelisted
 * scene keys. Scenes that are not plain objects are passed through unchanged.
 * Values are copied by reference, not cloned.
 *
 * @param {unknown} blueprintInput - Blueprint-like value.
 * @returns {unknown} The slim copy, or the input itself when it is not a
 *   non-array object.
 */
function slimBlueprintForSelection(blueprintInput) {
  const isRecord = (value) => Boolean(value) && typeof value === "object" && !Array.isArray(value);
  const pick = (from, keys) => {
    const picked = {};
    keys.forEach((key) => {
      if (key in from) picked[key] = from[key];
    });
    return picked;
  };

  if (!isRecord(blueprintInput)) return blueprintInput;

  const slimmed = pick(blueprintInput, ["version", "source"]);
  if (isRecord(blueprintInput.global)) {
    slimmed.global = pick(blueprintInput.global, SELECT_GLOBAL_FIELDS);
  }
  if (Array.isArray(blueprintInput.scenes)) {
    slimmed.scenes = blueprintInput.scenes.map((scene) =>
      isRecord(scene) ? pick(scene, SELECT_SCENE_FIELDS) : scene
    );
  }
  return slimmed;
}
8603
8823
  function roleForType2(type) {
8604
8824
  switch (type.toLowerCase()) {
8605
8825
  case "logo":
@@ -8633,7 +8853,16 @@ function todoPath2(el, label) {
8633
8853
  return `[TODO: drop one real source image for ${label} (${el.type})${desc}${expr} \u2014 reused across every frame it appears in${fresh}${same}]`;
8634
8854
  }
8635
8855
  function buildElementSlots(elements) {
8636
- const usedIds = /* @__PURE__ */ new Set(["prompt", "spine", "overlaid", "audio_mix", "final", "music_bed"]);
8856
+ const usedIds = /* @__PURE__ */ new Set([
8857
+ "prompt",
8858
+ "spine",
8859
+ "overlaid",
8860
+ "captions",
8861
+ "captions_transcript",
8862
+ "audio_mix",
8863
+ "final",
8864
+ "music_bed"
8865
+ ]);
8637
8866
  const slots = [];
8638
8867
  assignElementLabels2(elements).forEach(({ el, label }, i) => {
8639
8868
  let id = sanitizeId2(`el_${label}`, `el_${i}`);
@@ -8646,6 +8875,7 @@ function buildElementSlots(elements) {
8646
8875
  type: el.type,
8647
8876
  description: el.description,
8648
8877
  sameAs: el.same_as ?? void 0,
8878
+ castId: el.cast_id ?? void 0,
8649
8879
  presence: presenceOf(el)
8650
8880
  });
8651
8881
  });
@@ -8684,7 +8914,7 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
8684
8914
  const legend = [
8685
8915
  ...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
8686
8916
  ...hasAnchor ? [
8687
- "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions of THIS frame. IGNORE its overlay text, captions, and any brand that is being swapped."
8917
+ "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions. IGNORE its text, its logo, its brand name, and its colors entirely \u2014 it is a DIFFERENT brand's footage, here only to anchor layout/pose, never identity or palette."
8688
8918
  ] : []
8689
8919
  ].join("\n");
8690
8920
  const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
@@ -8703,6 +8933,9 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
8703
8933
  "a studio) contains any text or graphics, DO NOT reproduce them \u2014 render the subject/scene",
8704
8934
  "only, leaving the regions where overlays will sit clean. Imperfect/garbled letterforms or",
8705
8935
  "stray icons are the worst outcome; leave those areas blank.",
8936
+ "A SCREEN/UI surface \u2014 an app, website, chat, dashboard, or phone display \u2014 is NEVER",
8937
+ "rendered here: leave any phone/screen OFF or blank-screened. The real interface is",
8938
+ "composited later as a screenshot or a brand HTML block, never AI-generated.",
8706
8939
  "",
8707
8940
  "FRAMING \u2014 ONE UNCUT FRAME:",
8708
8941
  "Render ONE single uncut photographic frame: NO split screen, NO panels, NO dividing line,",
@@ -8730,41 +8963,71 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
8730
8963
  "REFERENCE IMAGES (in the order provided):",
8731
8964
  legend,
8732
8965
  "",
8733
- "Identity comes from the reference images, not from this prose \u2014 render each person,",
8734
- "product, and set to MATCH its reference image, and describe only pose, expression, action,",
8735
- "and camera in the FRAME DESCRIPTION below.",
8736
- "",
8966
+ // RECAST is the whole point of a transform: the dropped el_* images define who/
8967
+ // what is on screen, NOT the source footage and NOT the prose. Without this, the
8968
+ // model reproduces the original ad's people (a proven failure mode).
8969
+ ...present.length > 0 ? [
8970
+ "IDENTITY & AESTHETIC \u2014 RECAST (this is a transform, not a copy):",
8971
+ "Identity comes from the reference image, never from the source footage or this prose. Render every",
8972
+ "person, animal, product, and set to MATCH its labeled reference image above \u2014 that image is the ONLY",
8973
+ "source of their identity, wardrobe, styling, and look. This is a complete recast: do NOT reproduce,",
8974
+ "trace, or resemble any individual, animal, product, or set from the source ad. Where the FRAME",
8975
+ "DESCRIPTION below names an appearance detail (hair, outfit, color, age, breed, brand of an object),",
8976
+ "IGNORE that wording \u2014 the reference image is the truth; use the description ONLY for pose, expression,",
8977
+ "action, framing, lighting, and palette.",
8978
+ ""
8979
+ ] : [
8980
+ "Identity comes from the reference image, never from prose \u2014 render the subject to MATCH it and",
8981
+ "describe only pose, expression, action, framing, and lighting in the FRAME DESCRIPTION below.",
8982
+ ""
8983
+ ],
8737
8984
  "FRAME DESCRIPTION (this frame's editable prompt):",
8738
8985
  description,
8739
8986
  "",
8740
- "Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
8987
+ "Render exactly what the FRAME DESCRIPTION and the SHARED AD SPEC specify \u2014 this is the authoritative ad: its cast identity (via the reference images), palette, brand, and intent are law. Keep every recurring element identical to its reference image across all frames. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
8741
8988
  "",
8742
- "GLOBAL STYLE REFERENCE (shared across frames; not this frame's content):",
8989
+ "SHARED AD SPEC (authoritative \u2014 the ad blueprint this frame belongs to; align cast/palette/brand/type with it):",
8743
8990
  "{{target_blueprint}}"
8744
8991
  ].join("\n");
8745
8992
  }
8993
/**
 * Get a `$ref` to an ingested image for `url`, adding the ingest node on first use.
 *
 * When `ctx.ingestCache` already holds a reference for this URL it is returned
 * and no node is added. Otherwise an `ingest` node with id
 * `s<sceneIndex><tag>_<edge>_ref` is pushed onto `nodes`, and its asset
 * reference is stored in the cache (when one exists) and returned.
 *
 * @param {string} url - Image URL to ingest.
 * @param {string} edge - Frame edge used in the node id (e.g. "start", "end").
 * @param {{sceneIndex: number, tag?: string, ingestCache?: Map<string, string>}} ctx
 * @param {object[]} nodes - Node list, appended to in place.
 * @returns {string} Reference of the form `$ref:<id>.asset`.
 */
function ingestFrameRef(url, edge, ctx, nodes) {
  const cache = ctx.ingestCache;
  const known = cache?.get(url);
  if (known) {
    return known;
  }

  const nodeId = `s${ctx.sceneIndex}${ctx.tag ?? ""}_${edge}_ref`;
  nodes.push({
    id: nodeId,
    type: "ingest",
    params: { source: "url", url, expect: "image" }
  });

  const assetRef = `$ref:${nodeId}.asset`;
  cache?.set(url, assetRef);
  return assetRef;
}
8746
9003
  function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
8747
- const refId = `s${ctx.sceneIndex}_${edge}_ref`;
8748
- if (url) nodes.push({ id: refId, type: "ingest", params: { source: "url", url, expect: "image" } });
8749
- if (ctx.reuse && url) return `$ref:${refId}.asset`;
8750
- const reference = [...present.map((s) => s.ref), ...url ? [`$ref:${refId}.asset`] : []];
9004
+ const tag = ctx.tag ?? "";
9005
+ if (ctx.reuse && url) return ingestFrameRef(url, edge, ctx, nodes);
9006
+ const hasPersonOrAnimal = present.some((s) => {
9007
+ const t = s.type.toLowerCase();
9008
+ return t === "person" || t === "animal";
9009
+ });
9010
+ const useOriginalAnchor = Boolean(url) && !hasPersonOrAnimal;
9011
+ const hasOriginal = useOriginalAnchor;
9012
+ const originalRef = useOriginalAnchor && url ? ingestFrameRef(url, edge, ctx, nodes) : void 0;
9013
+ const reference = [...present.map((s) => s.ref), ...originalRef ? [originalRef] : []];
8751
9014
  const genParams = {
8752
9015
  model: ctx.imageModel,
8753
9016
  image_size: "2K",
8754
- prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, Boolean(url), ctx.shootMode)
9017
+ prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, hasOriginal, ctx.shootMode)
8755
9018
  };
8756
9019
  if (ctx.ar) genParams.aspect_ratio = ctx.ar;
8757
- const genNode = {
8758
- id: `s${ctx.sceneIndex}_${edge}`,
9020
+ const genId = `s${ctx.sceneIndex}${tag}_${edge}`;
9021
+ nodes.push({
9022
+ id: genId,
8759
9023
  type: "image_generate",
8760
9024
  // `params.prompt` is this frame's authoritative, edit-per-frame description.
8761
- // `target_blueprint` is kept only as a demoted shared style reference (global
8762
- // cast/palette/typography), so editing one frame never touches another.
9025
+ // `target_blueprint` is the shared ad spec (cast identity, palette, brand, type)
9026
+ // the frame must stay consistent with — editing one frame never touches another.
8763
9027
  inputs: { target_blueprint: "$ref:prompt.asset", ...reference.length > 0 ? { reference } : {} },
8764
9028
  params: genParams
8765
- };
8766
- nodes.push(genNode);
8767
- return `$ref:s${ctx.sceneIndex}_${edge}.images#0`;
9029
+ });
9030
+ return `$ref:${genId}.images#0`;
8768
9031
  }
8769
9032
  function seedanceAudioLine(scene, mode, audio, nativeLine) {
8770
9033
  const ambient = scene.ambient?.trim() || diegeticFor(mode);
@@ -8810,10 +9073,11 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
8810
9073
  );
8811
9074
  return parts.join("\n");
8812
9075
  }
8813
- function audioExtractArgs(durationS) {
9076
+ function audioExtractArgs(durationS, offsetS = 0) {
8814
9077
  return [
8815
9078
  "-i",
8816
9079
  "{{in.clip}}",
9080
+ ...offsetS > 0.05 ? ["-ss", offsetS.toFixed(3)] : [],
8817
9081
  "-t",
8818
9082
  durationS.toFixed(3),
8819
9083
  "-vn",
@@ -8841,27 +9105,21 @@ function sceneShootMode(scene, present, nativeTurn, cameraOn, casts) {
8841
9105
  hasProduct: present.some((s) => s.type.toLowerCase() === "product")
8842
9106
  });
8843
9107
  }
8844
- function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks) {
9108
+ function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks, nativeSegments, clipRef = `$ref:s${i}_clip.video`) {
8845
9109
  if (nativeTurn) {
8846
- const extractLen = Math.min(Math.max(lengths.dur, lengths.speech), lengths.genDur);
9110
+ const speechWindow = Math.max(0.5, nativeTurn.end_s - nativeTurn.start_s);
9111
+ const extractLen = Math.min(speechWindow, lengths.genDur);
8847
9112
  nodes.push({
8848
9113
  id: `s${i}_voextract`,
8849
9114
  type: "ffmpeg",
8850
- inputs: { clip: `$ref:s${i}_clip.video` },
9115
+ inputs: { clip: clipRef },
8851
9116
  params: { args: audioExtractArgs(extractLen), outputs: { audio: { kind: "audio", ext: "mp3" } } }
8852
9117
  });
8853
- nodes.push({
8854
- id: `s${i}_voconv`,
8855
- type: "audio_voice_convert",
8856
- inputs: { audio: `$ref:s${i}_voextract.audio`, voice_ref: `$ref:${nativeTurn.voiceNode}.voice_id` },
8857
- params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
8858
- });
8859
- voTracks.push({
8860
- slot: `s${i}_voconv`,
8861
- ref: `$ref:s${i}_voconv.audio`,
9118
+ nativeSegments.push({
9119
+ voiceNode: nativeTurn.voiceNode,
9120
+ ref: `$ref:s${i}_voextract.audio`,
8862
9121
  start_s: nativeTurn.start_s,
8863
- end_s: nativeTurn.start_s + extractLen,
8864
- kind: "vo"
9122
+ end_s: nativeTurn.start_s + extractLen
8865
9123
  });
8866
9124
  } else if (ambientBroll) {
8867
9125
  const ambientStart = scene.start_s ?? 0;
@@ -8881,85 +9139,260 @@ function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes
8881
9139
  });
8882
9140
  }
8883
9141
  }
8884
- function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
8885
- const ar = aspectRatioParam(blueprint);
8886
- const reuse = opts.frames === "reuse";
8887
- const clips = [];
8888
- const voTracks = [];
8889
- const lastIndex = blueprint.scenes.length - 1;
8890
- const cameraOn = onCameraDialogue(blueprint);
8891
- const casts = castIdSet(blueprint);
8892
- blueprint.scenes.forEach((scene, i) => {
8893
- const nativeTurn = (sceneTurns.get(i) ?? []).find((t) => t.native);
8894
- const present = slotsForScene(slots, i);
8895
- const mode = sceneShootMode(scene, present, nativeTurn, cameraOn, casts);
8896
- const ambientBroll = Boolean(opts.ambient) && !nativeTurn && mode !== "ugc_selfie";
8897
- const ctx = { sceneIndex: i, ar, reuse, imageModel: opts.imageModel, shootMode: mode };
8898
- const firstFrame = buildFrameRef(
8899
- "start",
8900
- scene.start_frame_asset?.url,
8901
- scene.start_frame_prompt,
8902
- slotsForFrame(slots, i, "start"),
8903
- ctx,
8904
- nodes
8905
- );
8906
- const lastFrame = buildFrameRef(
8907
- "end",
8908
- scene.end_frame_asset?.url,
8909
- scene.end_frame_prompt,
8910
- slotsForFrame(slots, i, "end"),
8911
- ctx,
8912
- nodes
8913
- );
8914
- const dur = sceneDurationS(scene);
8915
- let out = sceneOutTransition(scene, i === lastIndex);
8916
- let trimTarget = dur + (out?.dur ?? 0);
8917
- if (out && ceilToSeedance(trimTarget) < trimTarget) {
8918
- out = null;
8919
- trimTarget = dur;
8920
- }
8921
- const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
8922
- const genDur = ceilToSeedance(Math.max(trimTarget, speech));
8923
- const clipParams = {
8924
- model: opts.videoModel,
8925
- prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
8926
- duration: genDur,
8927
- // Native talking scene → Seedance generates the spoken audio + lip-sync;
8928
- // an opt-in ambient b-roll beat generates diegetic ambient only; otherwise the
8929
- // clip is silent and audio comes from the tts/music timeline.
8930
- generate_audio: Boolean(nativeTurn) || ambientBroll
8931
- };
8932
- if (ar) clipParams.aspect_ratio = ar;
9142
+ function buildPerSpeakerVoiceConversion(segments, totalMs, nodes) {
9143
+ const bySpeaker = /* @__PURE__ */ new Map();
9144
+ for (const seg of segments) {
9145
+ const arr = bySpeaker.get(seg.voiceNode) ?? [];
9146
+ arr.push(seg);
9147
+ bySpeaker.set(seg.voiceNode, arr);
9148
+ }
9149
+ const tracks = [];
9150
+ for (const [voiceNode, segs] of bySpeaker) {
9151
+ const trackId = `${voiceNode}_track`;
9152
+ const convId = `${voiceNode}_conv`;
9153
+ const mixInputs = {};
9154
+ segs.forEach((s, k) => {
9155
+ mixInputs[`seg${k}`] = s.ref;
9156
+ });
8933
9157
  nodes.push({
8934
- id: `s${i}_clip`,
8935
- type: "video_generate",
8936
- inputs: { first_frame: firstFrame, last_frame: lastFrame },
8937
- params: clipParams
9158
+ id: trackId,
9159
+ type: "audio_timeline",
9160
+ inputs: mixInputs,
9161
+ params: {
9162
+ tracks: segs.map((s, k) => ({ slot: `seg${k}`, start_s: s.start_s })),
9163
+ total_ms: totalMs
9164
+ }
8938
9165
  });
8939
- emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, { dur, speech, genDur }, nodes, voTracks);
8940
- const base = `$ref:s${i}_clip.video`;
8941
- if (genDur === trimTarget) {
8942
- clips.push({ ref: base, scene_s: dur, out });
8943
- } else {
8944
- nodes.push({
8945
- id: `s${i}_trim`,
8946
- type: "ffmpeg",
8947
- inputs: { clip: base },
8948
- params: { args: trimArgs(trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
8949
- });
8950
- clips.push({ ref: `$ref:s${i}_trim.video`, scene_s: dur, out });
9166
+ nodes.push({
9167
+ id: convId,
9168
+ type: "audio_voice_convert",
9169
+ inputs: { audio: `$ref:${trackId}.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
9170
+ params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
9171
+ });
9172
+ tracks.push({ slot: convId, ref: `$ref:${convId}.audio`, start_s: 0, kind: "vo" });
9173
+ }
9174
+ return tracks;
9175
+ }
9176
/**
 * Add the `video_generate` node for one scene (or one region of it) and, when
 * the generated length differs from the wanted length, a trailing trim node.
 *
 * @param {number} i - Scene index, used in node ids.
 * @param {object} scene - Scene passed to the prompt builder.
 * @param {object[]} present - Element slots present in the scene.
 * @param {string} mode - Shoot mode passed to the prompt builder.
 * @param {object|undefined} nativeTurn - Native dialogue turn, if any; its
 *   `text` is passed to the prompt builder.
 * @param {boolean} ambientBroll - Whether ambient audio is requested.
 * @param {{first: string, last?: string}} frames - Keyframe refs; `last_frame`
 *   is only wired when `frames.last` is truthy.
 * @param {{dur: number, genDur: number, trimTarget: number}} lengths
 * @param {unknown} out - Outgoing transition, echoed back on the result.
 * @param {{ar?: string, videoModel: string}} opts
 * @param {object[]} nodes - Node list, appended to in place.
 * @param {string} [tag=""] - Suffix inserted after the scene index in node ids.
 * @returns {{ref: string, scene_s: number, out: unknown}} Ref to the raw clip
 *   when no trim is needed, otherwise to the trimmed clip.
 */
function emitSceneClip(i, scene, present, mode, nativeTurn, ambientBroll, frames, lengths, out, opts, nodes, tag = "") {
  const clipId = `s${i}${tag}_clip`;

  const params = {
    model: opts.videoModel,
    prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
    duration: lengths.genDur,
    // Audio is generated for a native talking turn or an opted-in ambient
    // b-roll beat; every other clip is silent.
    generate_audio: Boolean(nativeTurn) || ambientBroll
  };
  if (opts.ar) {
    params.aspect_ratio = opts.ar;
  }

  const inputs = { first_frame: frames.first };
  if (frames.last) {
    inputs.last_frame = frames.last;
  }

  nodes.push({ id: clipId, type: "video_generate", inputs, params });

  const rawRef = `$ref:${clipId}.video`;
  const needsTrim = lengths.genDur !== lengths.trimTarget;
  if (!needsTrim) {
    return { ref: rawRef, scene_s: lengths.dur, out };
  }

  const trimId = `${clipId}_trim`;
  nodes.push({
    id: trimId,
    type: "ffmpeg",
    inputs: { clip: rawRef },
    params: { args: trimArgs(lengths.trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
  });
  return { ref: `$ref:${trimId}.video`, scene_s: lengths.dur, out };
}
9202
// Layout names that are built from several region clips.
var COMPOSITE_LAYOUTS = /* @__PURE__ */ new Set(["split_screen", "pip", "keyed_overlay"]);
// Words that mark a region as showing a screen / app / website surface.
var UI_SURFACE_RE = /\b(?:app|ui|web ?site|web ?page|website|browser|chat|interface|mock-?up|in[- ]?app|dashboard|app screen|phone screen|screen[- ]?(?:recording|capture|grab|share))\b/i;
/**
 * Does this region's panel, summary or frame prompt mention a UI surface?
 *
 * @param {{panel?: string, summary?: string, frame_prompt?: string}} r
 * @returns {boolean}
 */
function regionIsUiSurface(r) {
  const text = [r.panel ?? "", r.summary ?? "", r.frame_prompt ?? ""].join(" ");
  return UI_SURFACE_RE.test(text);
}
/**
 * True when at least one region is a UI surface and at most one is not.
 *
 * @param {object[]} regions
 * @returns {boolean}
 */
function isUiOnlyComposite(regions) {
  let uiCount = 0;
  for (const region of regions) {
    if (regionIsUiSurface(region)) uiCount += 1;
  }
  const otherCount = regions.length - uiCount;
  return uiCount >= 1 && otherCount <= 1;
}
/**
 * Decide whether a scene should be built as a layered composite.
 *
 * @param {{composition?: {layout?: string, regions?: unknown[]}}} scene
 * @returns {{layout: string, regions: object[], comp: object}|null} The
 *   lower-cased layout, the object regions and the composition itself; or null
 *   when the layout is not a composite layout, fewer than two object regions
 *   exist, or the regions form a UI-only composite.
 */
function layeredComposition(scene) {
  const comp = scene.composition;
  const layout = (comp?.layout ?? "").toLowerCase();
  if (!COMPOSITE_LAYOUTS.has(layout)) {
    return null;
  }

  const regions = (comp?.regions ?? []).filter((r) => Boolean(r) && typeof r === "object");
  if (regions.length < 2 || isUiOnlyComposite(regions)) {
    return null;
  }

  return { layout, regions, comp: comp ?? {} };
}
9220
/**
 * Work out which way a split-screen is divided.
 *
 * Panel names win: any "top"/"bottom" panel means vertical, otherwise any
 * "left"/"right" panel means horizontal. With no telling panel names the
 * declared `split_axis` is used, and anything other than "horizontal" counts
 * as vertical.
 *
 * @param {{split_axis?: string}} comp
 * @param {{panel?: string}[]} regions
 * @returns {"vertical"|"horizontal"}
 */
function splitAxisOf(comp, regions) {
  const names = new Set(regions.map((r) => (r.panel ?? "").toLowerCase()));
  if (names.has("top") || names.has("bottom")) {
    return "vertical";
  }
  if (names.has("left") || names.has("right")) {
    return "horizontal";
  }
  const declared = (comp.split_axis ?? "").toLowerCase();
  return declared === "horizontal" ? "horizontal" : "vertical";
}
9226
/**
 * Put region clip refs into stacking order for a split-screen.
 *
 * For a vertical axis "top" comes first and "bottom" last; for any other axis
 * "left" comes first and "right" last. Every other panel sits in between, and
 * ties keep their original order.
 *
 * @param {{panel?: string}[]} regions - Regions, index-aligned with `regionRefs`.
 * @param {string[]} regionRefs - Clip reference per region.
 * @param {string} axis - "vertical" or "horizontal".
 * @returns {string[]} The refs, reordered.
 */
function orderSplitRefs(regions, regionRefs, axis) {
  const [firstPanel, lastPanel] = axis === "vertical" ? ["top", "bottom"] : ["left", "right"];
  const rankOf = (panel) => {
    const name = (panel ?? "").toLowerCase();
    if (name === firstPanel) return 0;
    if (name === lastPanel) return 2;
    return 1;
  };

  const ranked = regionRefs.map((ref, k) => ({ ref, k, rank: rankOf(regions[k]?.panel) }));
  ranked.sort((a, b) => a.rank - b.rank || a.k - b.k);
  return ranked.map((entry) => entry.ref);
}
9234
/**
 * Find which region holds the presenter.
 *
 * @param {{is_presenter?: boolean}[]} regions
 * @param {boolean} hasNative - Whether the scene has a native dialogue turn.
 * @returns {number} Index of the first region flagged `is_presenter`; when none
 *   is flagged, 0 if `hasNative` is truthy, else -1.
 */
function presenterIndexOf(regions, hasNative) {
  for (let idx = 0; idx < regions.length; idx += 1) {
    if (regions[idx].is_presenter) {
      return idx;
    }
  }
  // No explicit flag: a native talking turn defaults to the first region.
  return hasNative ? 0 : -1;
}
9239
/**
 * Choose the element slots that belong in one region.
 *
 * A presenter region keeps only slots whose type is "person" or "animal"
 * (case-insensitive). Any other region keeps everything except those.
 *
 * @param {{type: string}[]} present - Slots present in the scene.
 * @param {boolean} isPresenter - Whether this is the presenter region.
 * @returns {{type: string}[]} The matching slots, in original order.
 */
function slotsForRegion(present, isPresenter) {
  const livingTypes = new Set(["person", "animal"]);
  const wantLiving = Boolean(isPresenter);
  return present.filter((slot) => livingTypes.has(slot.type.toLowerCase()) === wantLiving);
}
9246
/**
 * Emit the nodes for one composite scene: one clip per region, then a single
 * ffmpeg node (`s<i>_composite`) that combines the region clips.
 *
 * - `split_screen`: region clips are ordered by panel along the split axis and
 *   wired as inputs `c0..cN`; the args come from `splitStackArgs`.
 * - any other layout: `c0` is the first non-presenter region and `c1` the
 *   presenter region; the args come from `pipOverlayArgs`. For `keyed_overlay`
 *   with a presenter, the presenter clip is first routed through a
 *   `video_background_remove` node (`s<i>_key`).
 *
 * @param {string} layout Lower-cased layout name.
 * @param {object[]} regions Composition regions (at least two expected).
 * @param {object} comp Composition object (read for `split_axis`).
 * @param {object} scene Blueprint scene the regions belong to.
 * @param {number} i Scene index, used in node ids.
 * @param {object[]} present Slots present in the scene.
 * @param {string} mode Shoot mode forwarded to frame and clip builders.
 * @param {object|undefined} nativeTurn Native speaking turn, if any; only the
 *   presenter region receives it.
 * @param {{dur: number}} lengths Timing bundle forwarded to `emitSceneClip`.
 * @param {*} out Out-transition recorded on the returned clip.
 * @param {{ar: *, reuse: *, imageModel: *, videoModel: *}} opts Render options.
 * @param {object[]} nodes Node list, appended to in place.
 * @returns {{clip: {ref: string, scene_s: number, out: *}, presenterClipRef: (string|undefined)}}
 */
function buildCompositeScene(layout, regions, comp, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes) {
  const dims = canvasDims(opts.ar);
  const presenterIndex = presenterIndexOf(regions, Boolean(nativeTurn));
  const hasPresenter = presenterIndex >= 0;
  const regionRefs = [];
  let presenterPosition;

  regions.forEach((region, regionIndex) => {
    const isPresenter = regionIndex === presenterIndex;
    const tag = `_r${regionIndex}`;
    const regionSlots = slotsForRegion(present, isPresenter);
    const ctx = {
      sceneIndex: i,
      ar: opts.ar,
      reuse: opts.reuse,
      imageModel: opts.imageModel,
      shootMode: mode,
      tag
    };
    // A region-level frame prompt replaces both the scene's start and end prompts.
    const startPrompt = region.frame_prompt ?? scene.start_frame_prompt;
    const endPrompt = region.frame_prompt ?? scene.end_frame_prompt;
    const first = buildFrameRef("start", void 0, startPrompt, regionSlots, ctx, nodes);
    const last = buildFrameRef("end", void 0, endPrompt, regionSlots, ctx, nodes);
    // Only the presenter region keeps the scene's dialogue and native turn.
    const regionScene = {
      ...scene,
      summary: region.summary ?? scene.summary,
      motion_prompt: region.motion_prompt ?? scene.motion_prompt,
      dialogue: isPresenter ? scene.dialogue : []
    };
    const regionClip = emitSceneClip(
      i,
      regionScene,
      regionSlots,
      mode,
      isPresenter ? nativeTurn : void 0,
      false,
      { first, last },
      lengths,
      null,
      { ar: opts.ar, videoModel: opts.videoModel },
      nodes,
      tag
    );
    regionRefs.push(regionClip.ref);
    if (isPresenter) {
      presenterPosition = region.position;
    }
  });

  const compInputs = {};
  let args;
  if (layout === "split_screen") {
    const axis = splitAxisOf(comp, regions);
    const ordered = orderSplitRefs(regions, regionRefs, axis);
    for (const [slot, ref] of ordered.entries()) {
      compInputs[`c${slot}`] = ref;
    }
    args = splitStackArgs(regionRefs.length, axis, dims);
  } else {
    const keyed = layout === "keyed_overlay";
    const backgroundIndex = regions.findIndex((_, k) => k !== presenterIndex);
    const backgroundRef = regionRefs[backgroundIndex >= 0 ? backgroundIndex : 0];
    let presenterRef = regionRefs[hasPresenter ? presenterIndex : 1];
    if (keyed && hasPresenter) {
      const keyId = `s${i}_key`;
      nodes.push({ id: keyId, type: "video_background_remove", inputs: { video: presenterRef }, params: {} });
      presenterRef = `$ref:${keyId}.video`;
    }
    compInputs.c0 = backgroundRef;
    compInputs.c1 = presenterRef;
    // NOTE(review): the third argument is presumably the overlay scale — confirm against pipOverlayArgs.
    args = pipOverlayArgs(dims, presenterPosition, keyed ? 0.5 : 0.34);
  }

  nodes.push({
    id: `s${i}_composite`,
    type: "ffmpeg",
    inputs: compInputs,
    params: { args, outputs: { video: { kind: "video", ext: "mp4" } } }
  });

  // Points at the raw (un-keyed) presenter region clip.
  const presenterClipRef = hasPresenter ? `$ref:s${i}_r${presenterIndex}_clip.video` : void 0;
  return { clip: { ref: `$ref:s${i}_composite.video`, scene_s: lengths.dur, out }, presenterClipRef };
}
9321
/**
 * Compute the timing bundle for a scene.
 *
 * The trim target is the scene duration plus its out-transition. If
 * `ceilToSeedance` of that target comes back smaller than the target itself,
 * the transition is dropped and the target falls back to the bare duration.
 * The generated duration covers the larger of the trim target and the
 * estimated speech length of the native turn.
 *
 * @param {object} scene Blueprint scene.
 * @param {boolean} isLast Whether this is the final scene.
 * @param {{text: string}|undefined} nativeTurn Native speaking turn, if any.
 * @returns {{dur: number, out: *, trimTarget: number, genDur: number, speech: number}}
 */
function sceneTiming(scene, isLast, nativeTurn) {
  const dur = sceneDurationS(scene);
  const transition = sceneOutTransition(scene, isLast);
  const withTransition = dur + (transition?.dur ?? 0);
  const overflows = Boolean(transition) && ceilToSeedance(withTransition) < withTransition;
  const out = overflows ? null : transition;
  const trimTarget = overflows ? dur : withTransition;
  const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
  const genDur = ceilToSeedance(Math.max(trimTarget, speech));
  return { dur, out, trimTarget, genDur, speech };
}
9333
/**
 * Emit a composite scene: build its region clips and combining node, emit the
 * scene's native audio against the presenter clip, then record the composite
 * clip.
 *
 * @param {{layout: string, regions: object[], comp: object}} composite Result
 *   of `layeredComposition`.
 * @param {object} scene Blueprint scene.
 * @param {number} i Scene index.
 * @param {object[]} present Slots present in the scene.
 * @param {string} mode Shoot mode.
 * @param {object|undefined} nativeTurn Native speaking turn, if any.
 * @param {{dur: number, trimTarget: number, genDur: number, speech: number}} lengths
 *   Timing bundle (as produced by `sceneTiming`).
 * @param {*} out Out-transition for the clip.
 * @param {object} opts Render options forwarded to `buildCompositeScene`.
 * @param {object[]} nodes Node list, appended to in place.
 * @param {object[]} voTracks Voice tracks, forwarded to `emitSceneNativeAudio`.
 * @param {object[]} nativeSegments Native segments, forwarded likewise.
 * @param {object[]} clips Clip list; the composite clip is appended.
 * @returns {void}
 */
function emitCompositeScene(composite, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes, voTracks, nativeSegments, clips) {
  const { layout, regions, comp } = composite;
  const { dur, trimTarget, genDur, speech } = lengths;
  const { clip, presenterClipRef } = buildCompositeScene(
    layout,
    regions,
    comp,
    scene,
    i,
    present,
    mode,
    nativeTurn,
    { dur, trimTarget, genDur },
    out,
    opts,
    nodes
  );
  emitSceneNativeAudio(i, scene, nativeTurn, false, { dur, speech, genDur }, nodes, voTracks, nativeSegments, presenterClipRef);
  clips.push(clip);
}
9361
/**
 * Emit a scene as a held still: build the start frame, then an ffmpeg node
 * (`s<i>_clip`) that turns it into a video via `stillHoldArgs`.
 *
 * @param {number} i Scene index, used in the node id.
 * @param {object} scene Blueprint scene.
 * @param {object[]} slots All slots; narrowed with `slotsForFrame`.
 * @param {object} ctx Frame-building context.
 * @param {{dur: number, trimTarget: number}} lengths Timing bundle.
 * @param {*} out Out-transition recorded on the clip.
 * @param {*} ar Aspect ratio, passed to `canvasDims`.
 * @param {object[]} nodes Node list, appended to in place.
 * @param {object[]} clips Clip list, appended to in place.
 * @returns {void}
 */
function emitFlashHold(i, scene, slots, ctx, lengths, out, ar, nodes, clips) {
  const clipId = `s${i}_clip`;
  const frame = buildFrameRef(
    "start",
    scene.start_frame_asset?.url,
    scene.start_frame_prompt,
    slotsForFrame(slots, i, "start"),
    ctx,
    nodes
  );
  const holdArgs = stillHoldArgs(lengths.trimTarget, canvasDims(ar));
  nodes.push({
    id: clipId,
    type: "ffmpeg",
    inputs: { frame },
    params: {
      args: holdArgs,
      outputs: { video: { kind: "video", ext: "mp4" } }
    }
  });
  clips.push({ ref: `$ref:${clipId}.video`, scene_s: lengths.dur, out });
}
9381
/**
 * Build the "emotional arc" suffix for the music prompt from the scenes'
 * narrative roles.
 *
 * @param {{scenes: Array<{narrative_role?: string}>}} blueprint Video blueprint.
 * @returns {string} A paragraph (prefixed by a blank line) listing the truthy
 *   roles in scene order, or "" when no scene has a role.
 */
function musicArcDigest(blueprint) {
  const arc = blueprint.scenes
    .map((scene) => scene.narrative_role)
    .filter(Boolean)
    .join(" \u2192 ");
  if (!arc) {
    return "";
  }
  return `\n\nEmotional arc across scenes: ${arc}. Shape the bed's energy to this arc, swelling on the payoff. Purely instrumental \u2014 no vocals, no singing, no spoken words.`;
}
8955
9388
/**
 * Assemble the prompt for the music bed: the base prompt, then the emotional
 * arc digest, then — when a track title was identified — a "reference vibe"
 * paragraph naming the track (and artist, if present).
 *
 * @param {object} blueprint Video blueprint; reads
 *   `global.music.identified_track.{title,artist}`.
 * @param {string} musicPrompt Base music prompt.
 * @returns {string} The combined prompt.
 */
function musicBedPrompt(blueprint, musicPrompt) {
  const digest = musicArcDigest(blueprint);
  const track = blueprint.global?.music?.identified_track;
  const title = track?.title?.trim();
  let vibe = "";
  if (title) {
    const byline = track?.artist?.trim() ? ` by ${track.artist.trim()}` : "";
    vibe = `\n\nReference vibe: the original used "${title}"${byline} (identified via AudD). Match its mood, tempo, and energy with ORIGINAL music \u2014 do not reproduce the track.`;
  }
  return `${musicPrompt}${digest}${vibe}`;
}
8964
9397
  function onCameraDialogue(blueprint) {
8965
9398
  const mode = blueprint.global?.voiceover?.mode;
@@ -8998,92 +9431,483 @@ function isOnCameraSpeaker(speaker, casts, cameraOn) {
8998
9431
  if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
8999
9432
  return casts.has(speaker);
9000
9433
  }
9001
- function buildDialogue(blueprint, nodes) {
9002
- const tracks = [];
9003
- const sceneTurns = /* @__PURE__ */ new Map();
9434
/**
 * Build a predicate answering "is this speaker's person slot present in this
 * scene?".
 *
 * Presence sets are indexed by the canonical form of each person slot's
 * `castId`. A speaker with no slot of their own falls back to the only person
 * slot when there is exactly one. When no presence information is found at
 * all, the predicate answers `true`.
 *
 * @param {Array<{type: string, castId?: string, presence?: Set<number>}>} slots
 *   All slots; only those of type "person" (case-insensitive) are used.
 * @param {(speaker: string) => string} canonical Speaker-name normaliser.
 * @returns {(speaker: string, sceneIndex: number) => boolean}
 */
function makePresenterPresent(slots, canonical) {
  const people = slots.filter((slot) => slot.type.toLowerCase() === "person");
  const presenceBySpeaker = new Map(
    people.filter((slot) => slot.castId).map((slot) => [canonical(slot.castId), slot.presence])
  );
  const solePresence = people.length === 1 ? people[0].presence : null;
  return (speaker, sceneIndex) => {
    const presence = presenceBySpeaker.get(speaker) ?? solePresence;
    return presence ? presence.has(sceneIndex) : true;
  };
}
9445
// Silence (seconds) between two lines of the same speaker above which
// buildPhrases starts a new phrase instead of extending the current one.
var PAUSE_GAP_S = 0.6;
// Longest span (seconds) a single phrase may cover: the last entry of
// SEEDANCE_DURATIONS (presumably the longest supported clip — confirm the list is ascending).
var PHRASE_MAX_S = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
9447
/**
 * Build the speaker-name normaliser used across the timeline.
 *
 * When exactly one distinct on-camera speaker appears anywhere in the
 * blueprint, narrator-style speakers (members of `NARRATOR_SPEAKERS`,
 * compared lower-cased) are mapped onto that speaker. In every other case the
 * normaliser is the identity.
 *
 * @param {{scenes: Array<{dialogue?: Array<{speaker?: string}>}>}} blueprint
 * @returns {(speaker: string) => string}
 */
function collapseVoiceover(blueprint) {
  const casts = castIdSet(blueprint);
  const cameraOn = onCameraDialogue(blueprint);
  const onCameraSpeakers = new Set();
  for (const scene of blueprint.scenes) {
    for (const line of scene.dialogue ?? []) {
      const speaker = line.speaker ?? "voiceover";
      if (isOnCameraSpeaker(speaker, casts, cameraOn)) {
        onCameraSpeakers.add(speaker);
      }
    }
  }
  if (onCameraSpeakers.size !== 1) {
    return (speaker) => speaker;
  }
  const [presenter] = onCameraSpeakers;
  return (speaker) => (NARRATOR_SPEAKERS.has(speaker.toLowerCase()) ? presenter : speaker);
}
9460
/**
 * Group the blueprint's dialogue lines into phrases (runs of consecutive
 * lines by one speaker).
 *
 * Lines from composite scenes and blank lines are ignored. The remaining lines
 * are sorted by start time and merged into a run until the speaker changes,
 * the gap to the previous line exceeds `PAUSE_GAP_S`, or the span the run
 * would cover exceeds `PHRASE_MAX_S`.
 *
 * @param {object} blueprint Video blueprint.
 * @param {(speaker: string) => string} canonical Speaker-name normaliser.
 * @param {Set<number>} compositeScenes Scene indexes handled as composites.
 * @param {(speaker: string, sceneIndex: number) => boolean} presenterPresent
 *   Whether the speaker's element is on screen in a scene.
 * @returns {Array<{speaker: string, start_s: number, end_s: number, text: string,
 *   firstScene: number, shownScenes: number[], presenterShown: boolean}>}
 */
function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent) {
  const casts = castIdSet(blueprint);
  const cameraOn = onCameraDialogue(blueprint);
  const sceneEndS = (index) => blueprint.scenes[index]?.end_s ?? blueprint.scenes[index]?.start_s ?? 0;

  // Scenes where two or more distinct on-camera speakers talk; lines there are never "shown".
  const multiSpeaker = /* @__PURE__ */ new Set();
  blueprint.scenes.forEach((scene, index) => {
    const speakers = (scene.dialogue ?? []).map((line) => line.speaker ?? "voiceover");
    const onCamera = new Set(speakers.filter((speaker) => isOnCameraSpeaker(speaker, casts, cameraOn)));
    if (onCamera.size >= 2) multiSpeaker.add(index);
  });

  const toTimedLine = (line, scene, sceneIndex) => {
    const raw = line.speaker ?? "voiceover";
    const speaker = canonical(raw);
    const text = line.line.trim();
    const start = line.start_s ?? scene.start_s ?? 0;
    return {
      sceneIndex,
      speaker,
      // Shown = a cast member speaking AND their element is actually on screen
      // here (not a cutaway). A b-roll cutaway mid-phrase fails this and gets
      // its own clip while the phrase voice plays under it.
      shown: isOnCameraSpeaker(raw, casts, cameraOn) && !multiSpeaker.has(sceneIndex) && presenterPresent(speaker, sceneIndex),
      start,
      // Real speech end. Without an end_s, estimate it from the words — NOT the
      // scene end, which would fabricate continuity across a long silent b-roll
      // gap and wrongly merge two separate phrases.
      end: line.end_s ?? start + estSpeechS(text),
      text
    };
  };

  const lines = blueprint.scenes.flatMap((scene, sceneIndex) => {
    if (compositeScenes.has(sceneIndex)) return [];
    return (scene.dialogue ?? [])
      .filter((line) => Boolean(line.line?.trim()))
      .map((line) => toTimedLine(line, scene, sceneIndex));
  });
  lines.sort((a, b) => a.start - b.start);

  const phrases = [];
  let run = null;
  const flush = () => {
    if (!run) return;
    const shownScenes = [...run.shown].sort((a, b) => a - b);
    phrases.push({
      speaker: run.speaker,
      start_s: run.start,
      end_s: run.end,
      text: run.texts.join(" "),
      firstScene: run.firstScene,
      shownScenes,
      presenterShown: shownScenes.length > 0
    });
    run = null;
  };

  for (const line of lines) {
    // A shown line stretches coverage to its whole scene, since the clip must span it.
    let lineCover = line.end;
    let lineClipStart = line.start;
    if (line.shown) {
      lineCover = Math.max(line.end, sceneEndS(line.sceneIndex));
      lineClipStart = Math.min(line.start, blueprint.scenes[line.sceneIndex]?.start_s ?? line.start);
    }
    // Cap by SCENE COVERAGE span, not line end — a presenter run whose sliced scenes
    // span more than one Seedance clip splits into the next take here (at this scene's
    // boundary, never mid-scene), so no segment ever reads past the generated clip.
    const exceedsClip = () => Math.max(run.coverEnd, lineCover) - Math.min(run.clipStart, lineClipStart) > PHRASE_MAX_S;
    const startsNewRun = run === null || run.speaker !== line.speaker || line.start - run.end > PAUSE_GAP_S || exceedsClip();
    if (startsNewRun) {
      flush();
      run = {
        speaker: line.speaker,
        firstScene: line.sceneIndex,
        start: line.start,
        end: line.end,
        coverEnd: lineCover,
        clipStart: lineClipStart,
        texts: [line.text],
        shown: /* @__PURE__ */ new Set()
      };
    } else {
      run.texts.push(line.text);
      run.end = Math.max(run.end, line.end);
      run.coverEnd = Math.max(run.coverEnd, lineCover);
      run.clipStart = Math.min(run.clipStart, lineClipStart);
    }
    if (line.shown) run.shown.add(line.sceneIndex);
  }
  flush();
  return phrases;
}
9539
/**
 * Create a memoised "ensure voice node" function.
 *
 * The returned function emits one `voice_select` node per distinct speaker on
 * first use and returns that node's id on every call. The voice description
 * comes from, in order: the first dialogue line of that (canonical) speaker
 * with a `voice_description`, the matching cast entry's description, the
 * global voiceover description, or "<speaker> voice".
 *
 * @param {object} blueprint Video blueprint.
 * @param {(speaker: string) => string} canonical Speaker-name normaliser.
 * @param {object[]} nodes Node list, appended to in place.
 * @returns {(speaker: string) => string} Maps a speaker to its voice node id.
 */
function makeVoiceFactory(blueprint, canonical, nodes) {
  const nodeIdBySpeaker = /* @__PURE__ */ new Map();

  const describe = (speaker) => {
    for (const scene of blueprint.scenes) {
      for (const line of scene.dialogue ?? []) {
        const sameSpeaker = canonical(line.speaker ?? "voiceover") === speaker;
        if (sameSpeaker && line.voice_description) return line.voice_description;
      }
    }
    const castEntry = blueprint.global?.cast?.find((member) => member.id === speaker);
    return castEntry?.description ?? blueprint.global?.voiceover?.voice_description ?? `${speaker} voice`;
  };

  return (speaker) => {
    const known = nodeIdBySpeaker.get(speaker);
    if (known) return known;
    // Fallback id is numbered by how many voices already exist.
    const id = sanitizeId2(`voice_${speaker}`, `voice_${nodeIdBySpeaker.size}`);
    const description = describe(speaker);
    nodes.push({ id, type: "voice_select", params: { description, ...parseVoiceTraits(description) } });
    nodeIdBySpeaker.set(speaker, id);
    return id;
  };
}
9558
/**
 * Emit one generated talking clip that covers every shown scene of a phrase,
 * plus the audio chain that turns its native audio into a voice track.
 *
 * Nodes emitted, all keyed on the first shown scene (`anchor`):
 * `s<anchor>_clip` (video_generate), `s<anchor>_voextract` (ffmpeg audio
 * extract) and `s<anchor>_conv` (audio_voice_convert). The function also
 * appends to `out.voTracks`, `out.voSegments` and `out.talkingScenes`, and
 * records in `out.sceneSlice` which window of the clip each shown scene uses.
 * It does nothing when the anchor scene does not exist.
 *
 * @param {{speaker: string, start_s: number, end_s: number, text: string,
 *   shownScenes: number[]}} phrase Phrase to render.
 * @param {string} voiceNode Id of the speaker's voice_select node.
 * @param {object} env Timeline environment (blueprint, slots, opts, ar, reuse,
 *   cameraOn, casts, ingestCache).
 * @param {object[]} nodes Node list, appended to in place.
 * @param {object} out Timeline accumulators, mutated in place.
 * @returns {void}
 */
function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
  const { scenes } = env.blueprint;
  const anchor = phrase.shownScenes[0];
  const anchorScene = scenes[anchor];
  if (!anchorScene) return;

  const present = slotsForScene(env.slots, anchor);
  const nativeTurn = {
    sceneIndex: anchor,
    speaker: phrase.speaker,
    start_s: phrase.start_s,
    end_s: phrase.end_s,
    text: phrase.text,
    voiceNode,
    native: true
  };
  const mode = sceneShootMode(anchorScene, present, nativeTurn, env.cameraOn, env.casts);
  const ctx = {
    sceneIndex: anchor,
    ar: env.ar,
    reuse: env.reuse,
    imageModel: env.opts.imageModel,
    shootMode: mode,
    ingestCache: env.ingestCache
  };

  // First frame comes from the anchor scene, last frame from the last shown scene.
  const first = buildFrameRef(
    "start",
    anchorScene.start_frame_asset?.url,
    anchorScene.start_frame_prompt,
    slotsForFrame(env.slots, anchor, "start"),
    ctx,
    nodes
  );
  const lastShown = phrase.shownScenes[phrase.shownScenes.length - 1] ?? anchor;
  const lastScene = scenes[lastShown] ?? anchorScene;
  const last = buildFrameRef(
    "end",
    lastScene.end_frame_asset?.url,
    lastScene.end_frame_prompt,
    slotsForFrame(env.slots, lastShown, "end"),
    ctx,
    nodes
  );

  // The clip spans from the earliest shown-scene start (or the speech start)
  // to the latest shown-scene end (or the speech end).
  const clipStart = Math.min(phrase.start_s, ...phrase.shownScenes.map((s) => scenes[s]?.start_s ?? phrase.start_s));
  const coverEnd = Math.max(phrase.end_s, ...phrase.shownScenes.map((s) => scenes[s]?.end_s ?? 0));
  const phraseLen = Math.max(0.5, coverEnd - clipStart);
  const genDur = ceilToSeedance(phraseLen);

  const clipParams = {
    model: env.opts.videoModel,
    prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text),
    duration: genDur,
    generate_audio: true
  };
  if (env.ar) clipParams.aspect_ratio = env.ar;
  const clipId = `s${anchor}_clip`;
  nodes.push({
    id: clipId,
    type: "video_generate",
    inputs: { first_frame: first, last_frame: last },
    params: clipParams
  });
  const clipRef = `$ref:${clipId}.video`;

  // Extract only the spoken window, clamped so it never reads past the generated clip.
  const speechOffset = Math.max(0, phrase.start_s - clipStart);
  const spokenLen = Math.max(0.5, phrase.end_s - phrase.start_s);
  const remainingLen = Math.max(0.5, genDur - speechOffset);
  const extractLen = Math.min(spokenLen, remainingLen);
  const extractId = `s${anchor}_voextract`;
  nodes.push({
    id: extractId,
    type: "ffmpeg",
    inputs: { clip: clipRef },
    params: { args: audioExtractArgs(extractLen, speechOffset), outputs: { audio: { kind: "audio", ext: "mp3" } } }
  });

  const convId = `s${anchor}_conv`;
  nodes.push({
    id: convId,
    type: "audio_voice_convert",
    inputs: { audio: `$ref:${extractId}.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
    params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
  });

  out.voTracks.push({
    slot: convId,
    ref: `$ref:${convId}.audio`,
    start_s: phrase.start_s,
    end_s: phrase.end_s,
    kind: "vo"
  });
  out.voSegments.push({
    slot: convId,
    start_s: phrase.start_s,
    end_s: phrase.end_s,
    scene: anchor,
    speaker: phrase.speaker
  });
  out.talkingScenes.push({
    scene: anchor,
    voice_convert_node: convId,
    scene_s: Math.round(phraseLen * 100) / 100,
    est_speech_s: Math.round(estSpeechS(phrase.text) * 100) / 100
  });

  for (const sceneIndex of phrase.shownScenes) {
    const shownScene = scenes[sceneIndex];
    if (!shownScene) continue;
    const rawOffset = (shownScene.start_s ?? clipStart) - clipStart;
    out.sceneSlice.set(sceneIndex, {
      clipRef,
      // Snap a sub-frame offset (line-start vs scene-start drift) to 0 so a single-scene
      // phrase hits the whole-clip fast path instead of a needless re-encode + tiny shift.
      offset: rawOffset < 0.05 ? 0 : rawOffset,
      len: sceneDurationS(shownScene),
      clipDur: genDur
    });
  }
}
9669
/**
 * Emit a text-to-speech node for a phrase and register its audio as a voice
 * track and voice segment.
 *
 * The node id is derived from the phrase index and speaker; "_x" is appended
 * until the id is unused, and the final id is added to `used`.
 *
 * @param {{speaker: string, start_s: number, end_s: number, text: string,
 *   firstScene: number}} phrase Phrase to voice.
 * @param {string} voiceNode Id of the speaker's voice_select node.
 * @param {number} idx Index used in the node id.
 * @param {Set<string>} used Ids already taken; mutated in place.
 * @param {object[]} nodes Node list, appended to in place.
 * @param {{voTracks: object[], voSegments: object[]}} out Timeline accumulators.
 * @returns {void}
 */
function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
  const { speaker, start_s, end_s, text, firstScene } = phrase;
  let id = sanitizeId2(`vo_ph${idx}_${speaker}`, `vo_ph${idx}`);
  while (used.has(id)) {
    id = `${id}_x`;
  }
  used.add(id);
  nodes.push({
    id,
    type: "tts",
    inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
    params: { model: FIXED_TTS_MODEL, text, voice: "{{voice_ref}}" }
  });
  out.voTracks.push({ slot: id, ref: `$ref:${id}.audio`, start_s, end_s, kind: "vo" });
  out.voSegments.push({ slot: id, start_s, end_s, scene: firstScene, speaker });
}
9688
/**
 * Emit a composite scene as part of the timeline, deciding how its on-camera
 * dialogue is voiced.
 *
 * - Exactly one distinct on-camera speaker: the lines are joined into a single
 *   native turn for the presenter region and the scene is recorded in
 *   `out.talkingScenes`.
 * - Two or more distinct speakers: the composite is emitted without a native
 *   turn and each speaker is voiced via `emitCompositeMultiSpeakerVoice`.
 *
 * @param {{layout: string, regions: object[], comp: object}} composite Result
 *   of `layeredComposition`.
 * @param {object} scene Blueprint scene.
 * @param {number} i Scene index.
 * @param {boolean} isLast Whether this is the final scene.
 * @param {object} env Timeline environment.
 * @param {(speaker: string) => string} canonical Speaker-name normaliser.
 * @param {(speaker: string) => string} ensureVoiceNode Voice node factory.
 * @param {Set<string>} usedVoIds TTS node ids already taken.
 * @param {object[]} nodes Node list, appended to in place.
 * @param {object} out Timeline accumulators, mutated in place.
 * @returns {void}
 */
function emitCompositeInTimeline(composite, scene, i, isLast, env, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
  const present = slotsForScene(env.slots, i);
  const onCam = (scene.dialogue ?? []).filter(
    (line) => Boolean(line.line?.trim()) && isOnCameraSpeaker(line.speaker ?? "voiceover", env.casts, env.cameraOn)
  );
  const distinctSpeakers = new Set(onCam.map((line) => canonical(line.speaker ?? "voiceover")));

  let nativeTurn;
  if (onCam.length > 0 && distinctSpeakers.size === 1) {
    const firstLine = onCam[0];
    const lastLine = onCam[onCam.length - 1];
    const speaker = canonical(firstLine?.speaker ?? "voiceover");
    const voiceNode = ensureVoiceNode(speaker);
    // Lines are taken in dialogue order: first line's start, last line's end.
    const start = firstLine?.start_s ?? scene.start_s ?? 0;
    const end = lastLine?.end_s ?? scene.end_s ?? start;
    const text = onCam.map((line) => line.line.trim()).join(" ");
    nativeTurn = { sceneIndex: i, speaker, start_s: start, end_s: end, text, voiceNode, native: true };
    out.talkingScenes.push({
      scene: i,
      // NOTE(review): presumably matches the node id emitted by buildPerSpeakerVoiceConversion — confirm.
      voice_convert_node: `${voiceNode}_conv`,
      scene_s: Math.round(sceneDurationS(scene) * 100) / 100,
      est_speech_s: Math.round(estSpeechS(text) * 100) / 100
    });
  }

  const mode = sceneShootMode(scene, present, nativeTurn, env.cameraOn, env.casts);
  const lengths = sceneTiming(scene, isLast, nativeTurn);
  const renderOpts = {
    ar: env.ar,
    reuse: env.reuse,
    imageModel: env.opts.imageModel,
    videoModel: env.opts.videoModel
  };
  emitCompositeScene(
    composite,
    scene,
    i,
    present,
    mode,
    nativeTurn,
    lengths,
    lengths.out,
    renderOpts,
    nodes,
    out.voTracks,
    out.nativeSegments,
    out.clips
  );

  if (!nativeTurn && distinctSpeakers.size >= 2) {
    emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out);
  }
}
9730
/**
 * Voice a composite scene that has several on-camera speakers: the lines are
 * aggregated per canonical speaker (texts joined, earliest start, latest end)
 * and one TTS phrase is emitted per speaker, in first-appearance order.
 *
 * @param {Array<{speaker?: string, line: string, start_s?: number, end_s?: number}>} onCam
 *   On-camera dialogue lines with non-blank text.
 * @param {object} scene Blueprint scene (fallback for a missing line start).
 * @param {number} i Scene index; also used as the TTS id index.
 * @param {(speaker: string) => string} canonical Speaker-name normaliser.
 * @param {(speaker: string) => string} ensureVoiceNode Voice node factory.
 * @param {Set<string>} usedVoIds TTS node ids already taken.
 * @param {object[]} nodes Node list, appended to in place.
 * @param {object} out Timeline accumulators, mutated in place.
 * @returns {void}
 */
function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
  const perSpeaker = /* @__PURE__ */ new Map();
  for (const line of onCam) {
    const speaker = canonical(line.speaker ?? "voiceover");
    const text = line.line.trim();
    const start = line.start_s ?? scene.start_s ?? 0;
    // Without an explicit end, estimate it from the words.
    const end = line.end_s ?? start + estSpeechS(text);
    const agg = perSpeaker.get(speaker);
    if (!agg) {
      perSpeaker.set(speaker, { lines: [text], start, end });
      continue;
    }
    agg.lines.push(text);
    agg.start = Math.min(agg.start, start);
    agg.end = Math.max(agg.end, end);
  }

  perSpeaker.forEach((agg, speaker) => {
    const voiceNode = ensureVoiceNode(speaker);
    const phrase = {
      speaker,
      start_s: agg.start,
      end_s: agg.end,
      text: agg.lines.join(" "),
      firstScene: i,
      shownScenes: [],
      presenterShown: false
    };
    emitPhraseTts(phrase, voiceNode, i, usedVoIds, nodes, out);
  });
}
9766
/**
 * Emit a scene that has no phrase clip and is not a composite.
 *
 * When ambient audio is off (or the mode is "ugc_selfie") and the scene is no
 * longer than `FLASH_HOLD_MAX_S`, it is emitted as a held still. Otherwise a
 * clip is generated; with ambient audio on, its native audio is emitted too.
 *
 * @param {object} scene Blueprint scene.
 * @param {number} i Scene index.
 * @param {boolean} isLast Whether this is the final scene.
 * @param {object} env Timeline environment.
 * @param {object[]} nodes Node list, appended to in place.
 * @param {object} out Timeline accumulators, mutated in place.
 * @param {*} prevEndFrame End-frame ref of the previous scene, reused as this
 *   scene's first frame when `scene.continues_previous` is set.
 * @returns {*} This scene's end-frame ref, or `undefined` for a held still.
 */
function emitBrollScene(scene, i, isLast, env, nodes, out, prevEndFrame) {
  const present = slotsForScene(env.slots, i);
  const mode = sceneShootMode(scene, present, void 0, env.cameraOn, env.casts);
  const ambientBroll = Boolean(env.opts.ambient) && mode !== "ugc_selfie";
  const lengths = sceneTiming(scene, isLast, void 0);
  const { dur, trimTarget, genDur } = lengths;
  const ctx = {
    sceneIndex: i,
    ar: env.ar,
    reuse: env.reuse,
    imageModel: env.opts.imageModel,
    shootMode: mode,
    ingestCache: env.ingestCache
  };

  if (!ambientBroll && dur <= FLASH_HOLD_MAX_S) {
    emitFlashHold(i, scene, env.slots, ctx, lengths, lengths.out, env.ar, nodes, out.clips);
    return void 0;
  }

  let first;
  if (scene.continues_previous && prevEndFrame) {
    first = prevEndFrame;
  } else {
    first = buildFrameRef(
      "start",
      scene.start_frame_asset?.url,
      scene.start_frame_prompt,
      slotsForFrame(env.slots, i, "start"),
      ctx,
      nodes
    );
  }
  const last = buildFrameRef(
    "end",
    scene.end_frame_asset?.url,
    scene.end_frame_prompt,
    slotsForFrame(env.slots, i, "end"),
    ctx,
    nodes
  );

  const clip = emitSceneClip(
    i,
    scene,
    present,
    mode,
    void 0,
    ambientBroll,
    { first, last },
    { dur, trimTarget, genDur },
    lengths.out,
    { ar: env.ar, videoModel: env.opts.videoModel },
    nodes
  );
  if (ambientBroll) {
    emitSceneNativeAudio(i, scene, void 0, true, { dur, speech: 0, genDur }, nodes, out.voTracks, out.nativeSegments);
  }
  out.clips.push(clip);
  return last;
}
9827
/**
 * Build the whole video/voice timeline for a blueprint.
 *
 * 1. Mark composite scenes (skipped entirely when `opts.frames === "reuse"`).
 * 2. Group dialogue into phrases. A phrase with shown scenes that no earlier
 *    phrase has claimed becomes one generated talking clip; every other phrase
 *    becomes a TTS track.
 * 3. Walk the scenes in order: composites get their own pipeline, scenes
 *    covered by a phrase clip reuse or trim that clip, and the rest are
 *    emitted by `emitBrollScene`.
 * 4. Append the tracks returned by `buildPerSpeakerVoiceConversion`.
 *
 * @param {object} blueprint Video blueprint.
 * @param {object[]} slots Element slots.
 * @param {object} opts Build options (`frames`, `imageModel`, `videoModel`, `ambient`, ...).
 * @param {object[]} nodes Node list, appended to in place.
 * @returns {{clips: object[], voTracks: object[], vo_segments: object[], talking_scenes: object[]}}
 */
function buildTimeline(blueprint, slots, opts, nodes) {
  const reuse = opts.frames === "reuse";
  const compositeScenes = /* @__PURE__ */ new Set();
  if (!reuse) {
    blueprint.scenes.forEach((scene, index) => {
      if (layeredComposition(scene)) compositeScenes.add(index);
    });
  }

  const canonical = collapseVoiceover(blueprint);
  const ensureVoiceNode = makeVoiceFactory(blueprint, canonical, nodes);
  const env = {
    blueprint,
    slots,
    opts,
    ar: aspectRatioParam(blueprint),
    reuse,
    cameraOn: onCameraDialogue(blueprint),
    casts: castIdSet(blueprint),
    ingestCache: /* @__PURE__ */ new Map()
  };
  const out = {
    clips: [],
    voTracks: [],
    voSegments: [],
    talkingScenes: [],
    nativeSegments: [],
    sceneSlice: /* @__PURE__ */ new Map()
  };

  const presenterPresent = makePresenterPresent(slots, canonical);
  const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent);
  const usedVoIds = /* @__PURE__ */ new Set();
  // Scenes already covered by a phrase clip; a scene is claimed at most once.
  const claimed = /* @__PURE__ */ new Set();
  phrases.forEach((phrase, phraseIndex) => {
    const voiceNode = ensureVoiceNode(phrase.speaker);
    const available = phrase.shownScenes.filter((sceneIndex) => !claimed.has(sceneIndex));
    if (!phrase.presenterShown || available.length === 0) {
      emitPhraseTts(phrase, voiceNode, phraseIndex, usedVoIds, nodes, out);
      return;
    }
    for (const sceneIndex of available) claimed.add(sceneIndex);
    emitPhraseClip({ ...phrase, shownScenes: available }, voiceNode, env, nodes, out);
  });

  const lastIndex = blueprint.scenes.length - 1;
  // End frame of the previous b-roll scene; cleared after any other kind of scene.
  let prevEndFrame;
  blueprint.scenes.forEach((scene, i) => {
    const isLast = i === lastIndex;
    const composite = compositeScenes.has(i) ? layeredComposition(scene) : null;
    if (composite) {
      emitCompositeInTimeline(composite, scene, i, isLast, env, canonical, ensureVoiceNode, usedVoIds, nodes, out);
      prevEndFrame = void 0;
      return;
    }

    const slice = out.sceneSlice.get(i);
    if (!slice) {
      prevEndFrame = emitBrollScene(scene, i, isLast, env, nodes, out, prevEndFrame);
      return;
    }

    // Fast path: the scene uses the whole phrase clip, so no trim node is needed.
    const usesWholeClip = slice.offset === 0 && Math.abs(slice.len - slice.clipDur) <= 0.05;
    if (usesWholeClip) {
      out.clips.push({ ref: slice.clipRef, scene_s: slice.len, out: null });
    } else {
      const segId = `s${i}_seg`;
      nodes.push({
        id: segId,
        type: "ffmpeg",
        inputs: { clip: slice.clipRef },
        params: { args: trimArgs(slice.len, slice.offset), outputs: { video: { kind: "video", ext: "mp4" } } }
      });
      out.clips.push({ ref: `$ref:${segId}.video`, scene_s: slice.len, out: null });
    }
    prevEndFrame = void 0;
  });

  const totalMs = Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3);
  out.voTracks.push(...buildPerSpeakerVoiceConversion(out.nativeSegments, totalMs, nodes));
  return { clips: out.clips, voTracks: out.voTracks, vo_segments: out.voSegments, talking_scenes: out.talkingScenes };
}
9088
9912
  function buildSfxMusic(blueprint, nodes) {
9089
9913
  const tracks = [];
@@ -9106,13 +9930,21 @@ function buildSfxMusic(blueprint, nodes) {
9106
9930
  const musicPrompt = blueprint.global?.music?.music_prompt;
9107
9931
  if (musicPrompt) {
9108
9932
  const total = blueprint.source?.duration_s ?? lastSceneEnd(blueprint);
9109
- const startAt = Math.min(Math.max(blueprint.global?.music?.starts_at_s ?? 0, 0), Math.max(total - 0.5, 0));
9933
+ const hookEnd = blueprint.scenes[0]?.end_s ?? 0;
9934
+ const startAt = Math.min(Math.max(blueprint.global?.music?.starts_at_s ?? 0, hookEnd), Math.max(total - 0.5, 0));
9110
9935
  const totalMs = Math.round((total - startAt) * 1e3);
9111
9936
  const musicMs = Math.min(Math.max(totalMs, 3e3), ELEVENLABS_MAX_MUSIC_LENGTH_MS);
9112
9937
  nodes.push({
9113
9938
  id: "music_bed",
9114
9939
  type: "music",
9115
- params: { model: FIXED_MUSIC_MODEL, prompt: musicBedPrompt(blueprint, musicPrompt), music_length_ms: musicMs }
9940
+ // force_instrumental: the model is vocal-capable; without this it can SING the
9941
+ // mood (and feeding it the script made it sing the ad). The voice owns the words.
9942
+ params: {
9943
+ model: FIXED_MUSIC_MODEL,
9944
+ prompt: musicBedPrompt(blueprint, musicPrompt),
9945
+ music_length_ms: musicMs,
9946
+ force_instrumental: true
9947
+ }
9116
9948
  });
9117
9949
  tracks.push({
9118
9950
  slot: "music",
@@ -9156,22 +9988,63 @@ function normalizeAnim(animation) {
9156
9988
  const mapped = animation === "slide" ? "slide_up" : animation;
9157
9989
  return SUPPORTED_ANIMS.has(mapped) ? mapped : void 0;
9158
9990
  }
9991
// Normalised (lower-case, dash-separated) overlay positions that positionClass
// remaps to "bottom-center" — presumably because centred text would sit over
// the subject's face (confirm with the overlay stylesheet).
var FACE_ZONE_POSITIONS = /* @__PURE__ */ new Set(
  ["center", "centre", "mid-center", "mid-centre", "middle-center", "center-center", "mid", "middle"]
);
9159
10001
/**
 * Turn an overlay position into its CSS class name.
 *
 * The position (default "bottom_center") is lower-cased and every run of
 * non-letter characters becomes a single "-". A result listed in
 * `FACE_ZONE_POSITIONS` is replaced by "bottom-center".
 *
 * @param {string} [position] Overlay position from the blueprint.
 * @returns {string} A class name of the form "pos-<name>".
 */
function positionClass(position) {
  const normalized = (position ?? "bottom_center").toLowerCase().replace(/[^a-z]+/g, "-");
  if (FACE_ZONE_POSITIONS.has(normalized)) {
    return `pos-${"bottom-center"}`;
  }
  return `pos-${normalized}`;
}
9163
- function overlayElement(ov, sceneStart) {
10006
/**
 * Collect every text overlay in the blueprint as a timed caption, sorted by
 * appearance time.
 *
 * A scene's overlays are validated as a whole with `z3.array(Overlay)`; if
 * validation fails, that scene contributes no captions. Overlays with blank
 * text are skipped. A missing `appears_at_s` defaults to the scene start and a
 * missing `duration_s` to 2.5 seconds.
 *
 * @param {{scenes: object[]}} blueprint Video blueprint.
 * @returns {Array<{text: string, at: number, end: number, ov: object}>}
 */
function collectCaptions(blueprint) {
  const captions = blueprint.scenes.flatMap((scene) => {
    const sceneStart = scene.start_s ?? 0;
    const parsed = z3.array(Overlay).safeParse(scene.overlays ?? []);
    if (!parsed.success) {
      return [];
    }
    return parsed.data
      .filter((ov) => Boolean(ov.text?.trim()))
      .map((ov) => {
        const at = ov.appears_at_s ?? sceneStart;
        return { text: ov.text.trim(), at, end: at + (ov.duration_s ?? 2.5), ov };
      });
  });
  captions.sort((a, b) => a.at - b.at);
  return captions;
}
10016
/**
 * Merge repeated captions: occurrences of the same text that start no later
 * than 0.35 s after the running caption ends are folded into it by extending
 * its end time.
 *
 * @param {{scenes: object[]}} blueprint Video blueprint.
 * @returns {Array<{text: string, at: number, end: number, ov: object}>} Merged
 *   captions sorted by appearance time; each keeps the fields of its first
 *   occurrence apart from `end`.
 */
function mergeCaptions(blueprint) {
  // Group by text; each group stays in appearance order because collectCaptions is sorted.
  const groups = /* @__PURE__ */ new Map();
  for (const caption of collectCaptions(blueprint)) {
    const group = groups.get(caption.text);
    if (group) {
      group.push(caption);
    } else {
      groups.set(caption.text, [caption]);
    }
  }

  const merged = [];
  groups.forEach((group) => {
    let running = null;
    for (const caption of group) {
      const overlapsRunning = running && caption.at <= running.end + 0.35;
      if (overlapsRunning) {
        running.end = Math.max(running.end, caption.end);
      } else {
        // Copy so extending `end` never mutates the collected caption.
        running = { ...caption };
        merged.push(running);
      }
    }
  });
  return merged.sort((a, b) => a.at - b.at);
}
10036
/**
 * Render a text overlay as an HTML `<div class="ov ...">` element.
 *
 * @param {{text?: string, role?: string, animation?: string,
 *   animation_detail?: string, position?: string}} ov Overlay definition.
 * @param {number} at Value for the `data-start` attribute.
 * @param {number} dur Value for the `data-dur` attribute.
 * @returns {string} The element markup, or "" when the overlay text is blank.
 */
function overlayElement(ov, at, dur) {
  if (!ov.text?.trim()) {
    return "";
  }
  // Optional attributes each carry their own leading space and collapse to "" when absent.
  const roleAttr = ov.role ? ` data-role="${escapeHtml(ov.role)}"` : "";
  const animation = normalizeAnim(ov.animation);
  const animAttr = animation ? ` data-anim="${animation}"` : "";
  const detailAttr = ov.animation_detail ? ` data-anim-detail="${escapeHtml(ov.animation_detail)}"` : "";
  const cssClass = positionClass(ov.position);
  const content = escapeHtml(ov.text.trim());
  return `<div class="ov ${cssClass}" data-start="${at}" data-dur="${dur}"${roleAttr}${animAttr}${detailAttr}>${content}</div>`;
}
10044
// Keywords that mark a floating element as a "rich" overlay (notification,
// social post, chat bubble, stat card, ...). Matching is by substring, so e.g.
// "stat" also matches "static" and "status".
// NOTE(review): the `\bx post\b` alternative is already covered by `post\b`.
var RICH_OVERLAY_RE = /notif|tweet|\bx post\b|post\b|comment|message|chat|bubble|card|review|rating|stat|counter|toast|popup/;
9173
10045
  function sourceHint(fe) {
9174
10046
  const desc = fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element";
10047
+ const haystack = `${fe.kind ?? ""} ${fe.description ?? ""} ${fe.what_it_represents ?? ""}`.toLowerCase();
9175
10048
  switch ((fe.kind ?? "").toLowerCase()) {
9176
10049
  case "logo":
9177
10050
  return "baker images logo <domain> (or baker images library)";
@@ -9181,6 +10054,9 @@ function sourceHint(fe) {
9181
10054
  case "product_cutout":
9182
10055
  return `baker images library "${desc}" (the client's own product)`;
9183
10056
  default:
10057
+ if (RICH_OVERLAY_RE.test(haystack)) {
10058
+ return `npx hyperframes add <social-card/notification block> for "${desc}" (animated overlay, not a static icon \u2014 see references/hyperframes/catalog.md)`;
10059
+ }
9184
10060
  return `baker images icon "${desc}"`;
9185
10061
  }
9186
10062
  }
@@ -9196,6 +10072,26 @@ function floatingStub(fe, sceneStart) {
9196
10072
  `<img class="ov ${positionClass(fe.position)}" src="your-${slug}.png" data-start="${at}" data-dur="${dur}" alt="" /> -->`
9197
10073
  ].join("\n");
9198
10074
  }
10075
+ function uiPipStub(scene) {
10076
+ const comp = scene.composition;
10077
+ const layout = (comp?.layout ?? "").toLowerCase();
10078
+ if (!COMPOSITE_LAYOUTS.has(layout)) return "";
10079
+ const regions = (comp?.regions ?? []).filter((r) => Boolean(r) && typeof r === "object");
10080
+ if (regions.length < 2 || !isUiOnlyComposite(regions)) return "";
10081
+ const ui = regions.find(regionIsUiSurface);
10082
+ const at = scene.start_s ?? 0;
10083
+ const dur = Math.max(0.5, Math.round(((scene.end_s ?? at + 2.5) - at) * 100) / 100);
10084
+ const label = commentSafe(ui?.summary || ui?.frame_prompt || ui?.panel || "the app screen");
10085
+ return [
10086
+ `<!-- PHONE UI @ ${at}s for ${dur}s \u2014 the app/site screen this scene shows: ${label}.`,
10087
+ " Build it as a REAL surface, NEVER AI: capture the live page \u2014",
10088
+ " baker images screenshot https://<brand-domain>/<path> (image-library skill)",
10089
+ " \u2014 OR hand-build a brand-accurate HTML screen; then frame it in a phone mockup:",
10090
+ " npx hyperframes add phone-scroll (writes compositions/phone-scroll.html)",
10091
+ " drop the screenshot as screenshot.png in this dir and nest it as a PIP clip:",
10092
+ ` <div data-composition-src="compositions/phone-scroll.html" data-start="${at}" data-duration="${dur}" data-track-index="2" data-width="1080" data-height="1920"></div> -->`
10093
+ ].join("\n");
10094
+ }
9199
10095
  function buildOverlayHtml(input) {
9200
10096
  const blueprint = VideoBlueprint.parse(input);
9201
10097
  const blocks = [
@@ -9215,14 +10111,14 @@ function buildOverlayHtml(input) {
9215
10111
  " Positions: edit the .pos-* classes or add your own. -->"
9216
10112
  ].join("\n")
9217
10113
  ];
10114
+ const ovParts = mergeCaptions(blueprint).map((e) => overlayElement(e.ov, e.at, Math.round((e.end - e.at) * 1e3) / 1e3)).filter(Boolean);
10115
+ if (ovParts.length > 0) blocks.push(ovParts.join("\n"));
9218
10116
  for (const scene of blueprint.scenes) {
9219
10117
  const sceneStart = scene.start_s ?? 0;
9220
- const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
9221
10118
  const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
9222
- const parts = [
9223
- ...overlays.success ? overlays.data.map((ov) => overlayElement(ov, sceneStart)) : [],
9224
- ...floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []
9225
- ].filter(Boolean);
10119
+ const parts = (floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []).filter(Boolean);
10120
+ const pip = uiPipStub(scene);
10121
+ if (pip) parts.push(pip);
9226
10122
  if (parts.length > 0) blocks.push(parts.join("\n"));
9227
10123
  }
9228
10124
  return blocks.join("\n\n");
@@ -9255,15 +10151,15 @@ function xfadeSpineArgs(clips) {
9255
10151
  let cur = "c0";
9256
10152
  let accLen = clipInputLen(clips[0]);
9257
10153
  for (let k = 0; k < n - 1; k++) {
9258
- const join3 = clips[k].out;
10154
+ const join4 = clips[k].out;
9259
10155
  const next = `c${k + 1}`;
9260
10156
  const out = k === n - 2 ? "v" : `j${k + 1}`;
9261
- if (join3) {
9262
- const offset = Math.max(0, accLen - join3.dur);
10157
+ if (join4) {
10158
+ const offset = Math.max(0, accLen - join4.dur);
9263
10159
  filt.push(
9264
- `[${cur}][${next}]xfade=transition=${join3.xfade}:duration=${join3.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
10160
+ `[${cur}][${next}]xfade=transition=${join4.xfade}:duration=${join4.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
9265
10161
  );
9266
- accLen = accLen - join3.dur + clipInputLen(clips[k + 1]);
10162
+ accLen = accLen - join4.dur + clipInputLen(clips[k + 1]);
9267
10163
  } else {
9268
10164
  filt.push(`[${cur}][${next}]concat=n=2:v=1[${out}]`);
9269
10165
  accLen += clipInputLen(clips[k + 1]);
@@ -9304,15 +10200,14 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
9304
10200
  params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
9305
10201
  });
9306
10202
  });
9307
- if (opts.actorSheets) applyActorSheets(slots, nodes);
9308
- const { tracks: ttsTracks, sceneTurns } = buildDialogue(blueprint, nodes);
9309
- const { clips, voTracks: nativeVoTracks } = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
9310
- const voTracks = [...ttsTracks, ...nativeVoTracks];
10203
+ applyActorSheets(slots, nodes);
10204
+ const { clips, voTracks, vo_segments, talking_scenes } = buildTimeline(blueprint, slots, opts, nodes);
9311
10205
  let videoRef = buildSpine(clips, nodes);
9312
10206
  let videoNode = "spine";
9313
10207
  const overlays = blueprint.scenes.flatMap((s) => s.overlays ?? []);
9314
10208
  const floating = blueprint.scenes.flatMap((s) => s.floating_elements ?? []);
9315
- if (overlays.length > 0 || floating.length > 0) {
10209
+ const hasUiPip = blueprint.scenes.some((s) => uiPipStub(s) !== "");
10210
+ if (overlays.length > 0 || floating.length > 0 || hasUiPip) {
9316
10211
  nodes.push({
9317
10212
  id: "overlaid",
9318
10213
  type: "hyperframe_render",
@@ -9322,10 +10217,28 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
9322
10217
  videoRef = "$ref:overlaid.video";
9323
10218
  videoNode = "overlaid";
9324
10219
  }
10220
+ if (opts.captionsCompositionPath && opts.transcriptPath) {
10221
+ nodes.push({
10222
+ id: "captions_transcript",
10223
+ type: "ingest",
10224
+ params: { source: "path", path: opts.transcriptPath, expect: "json" }
10225
+ });
10226
+ nodes.push({
10227
+ id: "captions",
10228
+ type: "hyperframe_render",
10229
+ inputs: { background: videoRef, transcript: "$ref:captions_transcript.asset" },
10230
+ params: { composition: opts.captionsCompositionPath }
10231
+ });
10232
+ videoRef = "$ref:captions.video";
10233
+ videoNode = "captions";
10234
+ }
9325
10235
  const tracks = [...voTracks, ...buildSfxMusic(blueprint, nodes)];
9326
10236
  if (tracks.length > 0) {
9327
10237
  const mixInputs = {};
9328
10238
  for (const t of tracks) mixInputs[t.slot] = t.ref;
10239
+ const musicTrack = tracks.find((t) => t.kind === "music");
10240
+ const voiceSlots = tracks.filter((t) => t.kind === "vo").map((t) => t.slot);
10241
+ const duck = musicTrack && voiceSlots.length > 0 ? { duck: { track: musicTrack.slot, against: voiceSlots } } : {};
9329
10242
  nodes.push({
9330
10243
  id: "audio_mix",
9331
10244
  type: "audio_timeline",
@@ -9336,7 +10249,8 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
9336
10249
  start_s: t.start_s,
9337
10250
  ...t.gain_db !== void 0 ? { gain_db: t.gain_db } : {}
9338
10251
  })),
9339
- total_ms: Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3)
10252
+ total_ms: Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3),
10253
+ ...duck
9340
10254
  }
9341
10255
  });
9342
10256
  nodes.push({
@@ -9384,45 +10298,31 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
9384
10298
  // The timing plan `baker canvas validate` checks before any billed render:
9385
10299
  // sequenced voiceover turns (no overlap), audio ≈ video length, and which
9386
10300
  // scenes must be lip-synced.
9387
- video: buildVideoMeta(blueprint, sceneTurns)
10301
+ video: buildVideoMeta(blueprint, { vo_segments, talking_scenes })
9388
10302
  },
9389
10303
  nodes,
9390
10304
  output: { node: videoNode, output: "video" }
9391
10305
  };
9392
10306
  }
9393
- function buildVideoMeta(blueprint, sceneTurns) {
9394
- const vo_segments = [];
9395
- const talking_scenes = [];
9396
- for (const [scene, turns] of [...sceneTurns.entries()].sort((a, b) => a[0] - b[0])) {
9397
- for (const t of turns) {
9398
- if (t.ttsId) vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
9399
- }
9400
- const nativeTurn = turns.find((t) => t.native);
9401
- if (nativeTurn) {
9402
- const sceneObj = blueprint.scenes[scene];
9403
- talking_scenes.push({
9404
- scene,
9405
- voice_convert_node: `s${scene}_voconv`,
9406
- scene_s: sceneObj ? Math.round(sceneDurationS(sceneObj) * 100) / 100 : void 0,
9407
- est_speech_s: Math.round(estSpeechS(nativeTurn.text) * 100) / 100
9408
- });
9409
- }
9410
- }
10307
+ function buildVideoMeta(blueprint, meta) {
9411
10308
  return {
9412
10309
  duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
9413
- vo_segments,
9414
- talking_scenes,
9415
- motion_board: buildMotionBoard(blueprint, sceneTurns)
10310
+ vo_segments: [...meta.vo_segments].sort((a, b) => a.start_s - b.start_s),
10311
+ talking_scenes: meta.talking_scenes,
10312
+ motion_board: buildMotionBoard(blueprint)
9416
10313
  };
9417
10314
  }
9418
- function buildMotionBoard(blueprint, sceneTurns) {
10315
+ function sceneSpokenText(scene) {
10316
+ return (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l)).join(" ") || null;
10317
+ }
10318
+ function buildMotionBoard(blueprint) {
9419
10319
  const round = (n) => Math.round(n * 100) / 100;
9420
10320
  let cursor = 0;
9421
10321
  return blueprint.scenes.map((scene, i) => {
9422
10322
  const start_s = scene.start_s ?? cursor;
9423
10323
  const end_s = scene.end_s ?? start_s + sceneDurationS(scene);
9424
10324
  cursor = end_s;
9425
- const spoken = (sceneTurns.get(i) ?? []).map((t) => t.text?.trim()).filter((l) => Boolean(l)).join(" ") || null;
10325
+ const spoken = sceneSpokenText(scene);
9426
10326
  const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
9427
10327
  const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
9428
10328
  const graphics = [
@@ -9445,19 +10345,21 @@ function buildMotionBoard(blueprint, sceneTurns) {
9445
10345
  scene: i,
9446
10346
  role: resolveSceneRole(scene, i, blueprint.scenes.length),
9447
10347
  window_s: [round(start_s), round(end_s)],
9448
- storyboard_frames: [`s${i}_start`, `s${i}_end`],
10348
+ // A continuation b-roll scene shares the previous scene's end frame as its start
10349
+ // (no own `s<i>_start` node), so point the storyboard at that shared keyframe.
10350
+ storyboard_frames: [scene.continues_previous && i > 0 ? `s${i - 1}_end` : `s${i}_start`],
9449
10351
  spoken,
9450
10352
  graphics
9451
10353
  };
9452
10354
  });
9453
10355
  }
9454
10356
  var VIDEO_GUIDE = [
9455
- "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192 concatenated. Talking heads are voiced NATIVELY by Seedance (lips+voice generated together) then re-voiced to one brand voice; off-camera narration is sequenced tts. On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voiceover + SFX + a ducked music bed, normalized stereo. It is a DRAFT: edit it, supply the real assets, then validate and run.",
10357
+ "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video, built like an editing timeline. The VOICE is cut at PAUSES, not at visual cuts: each continuous-speech PHRASE is ONE Seedance clip (native lip-sync + audio) re-voiced to one brand voice, so a sentence never breaks mid-word across a cut. Each scene's PICTURE is independent: a scene that SHOWS the speaker slices its window out of the phrase clip; a b-roll cutaway gets its own silent clip (or a still hold for a sub-2s flash) laid over the continuing voice; a pure-voiceover stretch is one ElevenLabs tts read. Every clip gets a CLEAN-PLATE start AND end keyframe (no baked text), RECAST to your dropped reference assets \u2014 Seedance interpolates real in-shot motion between them. Each frame grounds ONLY on its own extracted frame + el_* slots (never another generated frame), so all frames render in PARALLEL (no cross-frame cascade). A SPLIT-SCREEN / PICTURE-IN-PICTURE / KEYED-PRESENTER scene is reproduced as one clip PER REGION, stacked or overlaid (see `metadata.todo.composition`). On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voice + SFX + a ducked music bed, normalized stereo. It is a STARTING POINT, not a locked render: add, delete, reorder, split, merge, or re-time scenes freely (a b-roll cutaway INSIDE a phrase lands at an approximate beat \u2014 nudge it) \u2014 see `metadata.todo.full_flexibility`.",
9456
10358
  "",
9457
10359
  "WHAT TO DO NEXT:",
9458
10360
  "0. RE-CRAFT THE SCRIPT FIRST (don't clone). This reference already won in-market, but copying a video is much harder than a static: the hook is targeting and may not transfer, and the message must become TRUE for our brand. Work the `metadata.todo.script_recraft` checklist \u2014 for each scene judge its role (hook/body/CTA), decide keep/cut/reorder/replace, and re-author every line for OUR customer's pain + OUR offer. See `references/script-craft.md` (hook/body/CTA framework) and the `meta-ads-playbook` skill. Most of the work lives here.",
9459
- "1. Edit each frame's prompt IN PLACE. Every `s<i>_start` / `s<i>_end` node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want.",
9460
- "1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. The boundary frames ARE your storyboard; `metadata.video.motion_board` lays out each scene's frames, time window, spoken line, and the graphics scheduled in it. Lock the frames + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
10361
+ "1. Edit each frame's prompt IN PLACE. Every `s<i>_start` keyframe node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want. The frame is RECAST to the el_* reference images you drop (the source ad's people are never reused), so describe pose/action/framing here and let the references carry identity.",
10362
+ "1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. Each scene's keyframe IS your storyboard; `metadata.video.motion_board` lays out each scene's frame, time window, spoken line, and the graphics scheduled in it. Lock the keyframes + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
9461
10363
  "2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent. `baker canvas run` REFUSES to start until every `[TODO]` slot holds a real source \u2014 so this is mandatory, not optional.",
9462
10364
  "3. Talking heads are NATIVE: a scene with one on-camera speaker is voiced by Seedance itself (the line is in the clip's prompt + `generate_audio`), so lips and voice are generated together \u2014 no separate tts, no post-hoc lip-sync. Edit the line in the scene's `s<i>_clip` prompt to re-author the words TRUE for your brand. Off-camera narration scenes still use a sequenced `tts` per turn.",
9463
10365
  "4. Voice consistency: every native talking clip's audio is re-voiced to ONE brand voice via `audio_voice_convert` (timing preserved \u2192 lips stay matched). Confirm the `voice_select` casting (one per speaker) \u2014 its `voice_id` is the brand voice; set its gender/language so the voice matches the creator.",
@@ -9468,11 +10370,11 @@ var VIDEO_GUIDE = [
9468
10370
  "- Iterate cheaply, do NOT brute-force takes: credits are scarce. Strong inputs (locked reference + a precise move-by-move prompt) make 1\u20132 takes land. When a take misses, change the SMALLEST thing and re-run \u2014 the cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
9469
10371
  "- Keep clips SHORT (trust the scene-length trim \u2014 don't pad) and LOCK THE CAMERA: unmotivated AI drift is the top tell.",
9470
10372
  "- One generation = one speaking voice \u2014 Seedance can't voice two on-camera speakers in one clip; split a two-speaker beat into separate scenes (or one stays silent). Lip-sync follows the line verbatim; emotion in [brackets].",
9471
- "- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it). Match-cut continuous action by setting scene N+1's start frame = scene N's end frame (costs no extra gens).",
10373
+ "- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it).",
9472
10374
  "- Nail the pack shot (final product hero \u2014 motivated move, matched light). Geometry drifting? drop a high-angle SCHEMATIC still as an extra reference (a map beats a paragraph). Before/after (dry\u2192wet) = two separate locked references, not words mid-scene.",
9473
10375
  "- The hook reads SOUND-OFF: scene 1's frame + overlay carry it in ~1s \u2014 don't bury the hook in the spoken line (meta-ads-playbook \xA739/\xA748).",
9474
10376
  "",
9475
- "Tip: `prompt.json` is the deconstruction provenance + the demoted GLOBAL STYLE REFERENCE each frame reads for shared palette/cast cohesion. It is NOT the per-frame editing surface \u2014 the frame nodes are.",
10377
+ "Tip: `prompt.json` is the deconstruction provenance + the authoritative SHARED AD SPEC each frame reads for cast identity, palette, brand, and type cohesion. The per-frame editing surface is the frame node's own FRAME DESCRIPTION.",
9476
10378
  "Production craft: references/video-craft.md. Hook/message/script-structure layer: references/script-craft.md + the meta-ads-playbook skill."
9477
10379
  ].join("\n");
9478
10380
  function inferNarrativeRole(index, total) {
@@ -9514,14 +10416,16 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
9514
10416
  const hookSceneIndex = findHookSceneIndex(blueprint);
9515
10417
  const h = hookSceneIndex;
9516
10418
  return {
10419
+ full_flexibility: "THIS CANVAS IS A STARTING POINT, NOT A LOCKED RENDER. It mirrors the reference's structure so you have a faithful scaffold \u2014 but you have FULL EDITING FREEDOM and should use it. You can: ADD a scene (new s<i>_start/_end + s<i>_clip + wire it into `spine`), DELETE a scene (drop its nodes + its `spine` input), REORDER scenes, SPLIT one beat into two or MERGE two into one, change any frame prompt or motion brief, swap an element reference, re-time or rewrite any overlay/voice, or change a scene's LAYOUT (make a full-frame beat a split-screen/PIP, or flatten a composite to one shot \u2014 see `composition`). Re-craft for OUR brand and OUR best ad; the reference is inspiration, not a spec to trace. The content-addressed cache re-bills only what you actually change, so iterate freely. `baker canvas validate` re-checks timing/lip-sync after any edit.",
10420
+ composition: "Some scenes are COMPOSITED, not single shots \u2014 `prompt.json`'s scene.composition.layout tells you which: `split_screen` (panels each showing different footage \u2014 e.g. b-roll on top, presenter on the bottom), `pip` (a presenter boxed in a corner over full-frame background), or `keyed_overlay` (a green-screen/cut-out presenter over background). Each is reproduced as ONE generated clip PER REGION (`s<i>_r0_*`, `s<i>_r1_*`, \u2026) stacked (vstack/hstack) or overlaid by an `s<i>_composite` ffmpeg node; a keyed presenter runs through `s<i>_key` (video_background_remove) for a transparent cut-out first. Edit each region's own keyframe prompt + motion brief independently. The presenter region (is_presenter) carries the lip-synced voice. To CHANGE a layout, edit composition in prompt.json and re-scaffold, or hand-edit the s<i>_composite ffmpeg args (splitStackArgs/pipOverlayArgs patterns). A clean full-frame talking head is simpler than a composite \u2014 flatten when the brand's version doesn't need the split.",
9517
10421
  recraft_the_script_first: `VIDEO IS INSPIRATION, NOT A CLONE. Before rendering, re-craft the script (hook \u2192 body \u2192 CTA) for OUR brand \u2014 the reference already won in-market, but its hook is targeting and may not transfer.${h >= 0 ? " The HOOK is the #1 decision (see the `hook` todo);" : ""} ${h >= 0 ? "then work" : "Work"} the per-scene \`script_recraft\` checklist. References: references/hook-craft.md (the hook), references/script-craft.md (body/CTA) + the meta-ads-playbook skill.`,
9518
10422
  ...h >= 0 ? {
9519
10423
  hook: `THE HOOK IS THE HIGHEST-LEVERAGE BEAT \u2014 the first frame + first 3\u20134s decide whether the ad is watched at all, and the hook is TARGETING. But highest-leverage does NOT mean always rewrite: this hook already won, so MOST OF THE TIME you KEEP it and build on top (swap only the specifics). REBUILD is the exception \u2014 only when it doesn't transfer (a claim we lack or a different funnel/awareness stage), and then by reaching for its deeper INNER MECHANIC and delivering that truthfully, not inventing a new opener from nothing. For scene ${h}: DIAGNOSE it (device + mechanic + what stage it targets), DECIDE keep/adapt/rebuild, then hold the opener to the criteria \u2014 ${HOOK_OPENER_CRITERIA}. The hook lives across s${h}_start (the scroll-stopping first frame), the scene-${h} overlay text, the s${h}_clip line, an optional ~0.5s micro-hook, and the ramp into the body. Full diagnose\u2192decide\u2192(keep/adapt/rebuild) discipline + the proven hook-type menu: references/hook-craft.md (+ meta-ads-playbook \xA710/\xA717/\xA739).`
9520
10424
  } : {},
9521
10425
  script_recraft: buildScriptRecraft(blueprint),
9522
- edit_frames_in_place: "Each s<i>_start / s<i>_end node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is only a shared style reference. Frames are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
10426
+ edit_frames_in_place: "Each s<i>_start keyframe node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is the authoritative shared ad spec (cast identity, palette, brand). Frames are RECAST to the el_* reference images (the source ad's cast is never reused) and are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
9523
10427
  frames_mode: opts.frames ?? "generate",
9524
- review_storyboard_before_clips: "STORYBOARD FIRST. The per-scene boundary frames (s<i>_start / s<i>_end) ARE your storyboard \u2014 align the LOOK on them before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's frames, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
10428
+ review_storyboard_before_clips: "STORYBOARD FIRST. Each scene's keyframe (s<i>_start) IS your storyboard \u2014 align the LOOK on it before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's keyframe, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
9525
10429
  motion_board: "`metadata.video.motion_board` maps which overlay/graphic fires on which spoken beat (per scene: window_s, spoken line, each graphic's at_s). Review it so every graphic lands on the word it punctuates \u2014 don't let an overlay drift off its beat. Adjust an overlay's data-start in video-overlay-composition/index.html (or appears_at_s in prompt.json) to retime it.",
9526
10430
  assets_required: "MANDATORY: drop a real image at every el_* [TODO] ingest before running \u2014 `baker canvas run` refuses to start (and bills nothing) until each placeholder holds a real source.",
9527
10431
  sourcing: 'Resolve each el_* [TODO]: (1) the client\'s OWN assets first \u2014 `baker images library "<subject>"` (e.g. the company owner as the actor) and `baker images logo <domain>` for the brand mark; (2) otherwise search a FRESH real reference \u2014 `baker images find "<subject>" --sources pinterest` (Pinterest is photo-real/candid, best for people & sets) or generate one with the image model. el_creator_* and el_location are [SOURCE FRESH]: cast a DIFFERENT person/animal/set than the original ad \u2014 never the source\'s individual.',
@@ -9534,18 +10438,17 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
9534
10438
  voice_description: d.voice_description,
9535
10439
  line: d.line
9536
10440
  })),
9537
- talking_head_note: "NATIVE: a single-on-camera-speaker scene is voiced by Seedance itself (line in s<i>_clip prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Edit the line in the clip's prompt to re-author it.",
9538
- voice_note: "Every native talking clip's audio is re-voiced to ONE brand voice via audio_voice_convert (eleven_multilingual_sts_v2), timing preserved so lips stay matched. voice_select.voice_id is that brand voice \u2014 set its gender/language to match the creator. Off-camera narration uses a sequenced tts per turn.",
9539
- native_timing: "Seedance paces the spoken line to fill the clip, so each native talking clip is generated long enough for the estimated speech and its audio is kept full-length (not hard-trimmed to the visual scene) \u2014 the line is never cut mid-word; the voice may continue a beat past the visual cut (natural VO continuity). `metadata.video.talking_scenes` carries each scene's scene_s vs est_speech_s. If a rendered line still sounds clipped, the line is simply longer than the scene: shorten the line or lengthen the scene in the deconstruct.",
10441
+ talking_head_note: "PHRASE-NATIVE: a continuous-speech phrase where the speaker is shown is ONE Seedance clip (the full phrase quoted in s<anchor>_clip's prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Scenes that show the speaker slice their window out of that clip (s<i>_seg); edit the phrase line in the s<anchor>_clip prompt to re-author it. A pure-voiceover phrase (speaker never shown) is one ElevenLabs tts read instead.",
10442
+ voice_note: "ONE voice per person: a single voice_select is reused across all that person's phrases (on-camera AND off \u2014 the deconstruct's `voiceover` label folds into the sole presenter). Each presenter phrase's native audio is re-voiced to that brand voice via audio_voice_convert (eleven_multilingual_sts_v2, one convert per phrase, timing preserved so lips stay matched). Set voice_select.voice_id's gender/language to match the creator.",
10443
+ native_timing: "The voice is cut at PAUSES, not at visual cuts, so a sentence spanning a cut stays one continuous read (no mid-word break). The clip is generated long enough for the estimated speech; if a line runs longer than its phrase window the voice continues a beat into the following pause (natural VO continuity). `metadata.video.talking_scenes` carries each phrase's scene_s vs est_speech_s. CAVEAT: a b-roll cutaway INSIDE a phrase lands at an approximate (proportional) time \u2014 Seedance exposes no word timing \u2014 so if a cutaway is off its beat, nudge the scene boundary (it's a starting point).",
9540
10444
  craft: {
9541
10445
  note: "Production-craft principles that raise every clip's realism. Full rationale: references/video-craft.md (production craft); references/script-craft.md + meta-ads-playbook for the hook/message layer.",
9542
10446
  principles: [
9543
10447
  "Iterate cheaply \u2014 do NOT brute-force takes. Credits are scarce. Strong inputs (a locked reference + a precise move-by-move prompt) make 1\u20132 takes land; bad input = bad output. When a take misses, change the SMALLEST thing and re-run \u2014 the content-addressed cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
9544
10448
  "Keep clips SHORT \u2014 the longer a shot holds, the more the eye catches the AI tell. Trust the scene-length trim; don't pad.",
9545
- "LOCK THE CAMERA \u2014 a first/last-frame clip holds the framing the two frames define; only move when a move is specified. Unmotivated camera drift is the top realism tell.",
10449
+ "LOCK THE CAMERA \u2014 Seedance animates forward from the single keyframe; only move when the motion brief specifies a move. Unmotivated camera drift is the top realism tell.",
9546
10450
  "One generation = one speaking voice. Seedance can't voice two on-camera speakers in one clip \u2014 split a two-speaker beat into separate scenes (or one stays silent). Fragment long dialogue at a natural pause and stitch via the shared boundary frame. Lip-sync follows the line verbatim; delivery cues go in [brackets].",
9547
10451
  "Preview cheap \u2192 finalize high-res (credit discipline). Never prompt the aspect ratio in prose ('make it vertical' \u2192 stretch artifacts) \u2014 the pipeline sets aspect_ratio as a param.",
9548
- "Match-cut continuous action across a cut by reusing the boundary frame: scene N+1's start frame = scene N's end frame (a composition choice, costs no extra gens).",
9549
10452
  "Nail the PACK SHOT \u2014 the final product hero sells (motivated camera move, light matched to the rest of the ad).",
9550
10453
  "Geometry/objects drifting frame to frame? A MAP beats a paragraph \u2014 drop a high-angle schematic still (marked positions) as an extra el_*/reference rather than adding more words.",
9551
10454
  "Before/after (dry\u2192wet, skeptic\u2192believer) = two SEPARATE locked reference images \u2014 don't ask words to transform a subject mid-scene (cheaper and more reliable than re-rolling clips).",
@@ -9639,6 +10542,7 @@ function resolveShippedCanvasDir(name, startDir, exists = existsSync3, maxDepth
9639
10542
 
9640
10543
  // src/commands/canvas/scaffold-video.ts
9641
10544
  var SHIPPED_COMPOSITION_DIR = resolveShippedCanvasDir("video-overlay-composition", import.meta.dirname);
10545
+ var SHIPPED_CAPTIONS_DIR = resolveShippedCanvasDir("tiktok-captions-composition", import.meta.dirname);
9642
10546
  function resolveModel2(kind, preferred) {
9643
10547
  const ids = Object.keys(MODEL_REGISTRY[kind]);
9644
10548
  return ids.includes(preferred) ? preferred : ids[0] ?? preferred;
@@ -9659,10 +10563,10 @@ DROP one-off background extras and incidental props \u2014 but the shared set/lo
9659
10563
 
9660
10564
  ONE PERSON, MULTIPLE LOOKS: if a single individual plays MULTIPLE personas or wardrobes (e.g. a creator as a skeptic in one outfit and a believer in another, or a before/after), emit ONE element PER look (so each outfit gets its own real reference image) and link the extra looks to the first via "same_as" \u2014 they are the SAME person, only the wardrobe/state differs, so the face must stay identical.
9661
10565
 
9662
- For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of EVERY scene the element is visually present in \u2014 read each scene's action_detail, its start_frame/end_frame subjects, its dialogue speaker, and its floating_elements. Be generous: if the same person narrates throughout, list every scene index. Output ONLY the JSON object.`;
10566
+ For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of ONLY the scenes where the element is ACTUALLY VISIBLE ON SCREEN \u2014 judged from that scene's start_frame_prompt / end_frame_prompt subjects and its action_detail, NOT from who is merely speaking. A narrator heard over b-roll is NOT present in that b-roll scene; a dog-running cutaway does NOT contain the couch creator just because she talks across it. Do NOT pad the list \u2014 an element wrongly listed in a scene makes the reproduction render the wrong subject there (e.g. the creator appearing in a pure-dog b-roll). When in doubt, leave a scene OUT. Output ONLY the JSON object.`;
9663
10567
  async function loadAssetText2(ref, label) {
9664
10568
  const r = ref;
9665
- if (typeof r?.path === "string") return readFile4(r.path, "utf8");
10569
+ if (typeof r?.path === "string") return readFile5(r.path, "utf8");
9666
10570
  if (typeof r?.url === "string") {
9667
10571
  const res = await fetch(r.url);
9668
10572
  if (!res.ok) throw new Error(`failed to fetch ${label} (${res.status})`);
@@ -9670,6 +10574,24 @@ async function loadAssetText2(ref, label) {
9670
10574
  }
9671
10575
  throw new Error(`${label}: output had no readable path or url`);
9672
10576
  }
10577
+ async function loadTranscriptBestEffort(ref) {
10578
+ if (!ref) return void 0;
10579
+ try {
10580
+ return await loadAssetText2(ref, "deconstruct transcript");
10581
+ } catch {
10582
+ return void 0;
10583
+ }
10584
+ }
10585
+ async function stageCaptions(outDir, transcript) {
10586
+ const text = transcript?.trim();
10587
+ if (!text || text === "[]") return {};
10588
+ const transcriptPath = path5.join(outDir, "transcript.json");
10589
+ await writeFile2(transcriptPath, `${text}
10590
+ `, "utf8");
10591
+ const compositionPath = path5.join(outDir, "tiktok-captions-composition");
10592
+ await cp(SHIPPED_CAPTIONS_DIR, compositionPath, { recursive: true });
10593
+ return { compositionPath, transcriptPath };
10594
+ }
9673
10595
  function parseElements2(raw) {
9674
10596
  const parsed = JSON.parse(raw);
9675
10597
  if (Array.isArray(parsed)) return parsed;
@@ -9678,6 +10600,31 @@ function parseElements2(raw) {
9678
10600
  }
9679
10601
  return [];
9680
10602
  }
10603
+ async function detectShotCutsBestEffort(videoPath, threshold) {
10604
+ try {
10605
+ const cuts = await detectSceneCutsPySceneDetect(videoPath, threshold ? { threshold } : {});
10606
+ if (cuts.length > 0) {
10607
+ process.stderr.write(`Detected ${cuts.length} shot cut(s) via PySceneDetect: ${cuts.join(", ")}s
10608
+ `);
10609
+ } else {
10610
+ process.stderr.write("PySceneDetect ran but found no hard cuts; using LLM scene boundaries.\n");
10611
+ }
10612
+ return cuts;
10613
+ } catch (e) {
10614
+ const msg = e instanceof Error ? e.message : String(e);
10615
+ const code = e?.code;
10616
+ const missing = code === "ENOENT" || /ENOENT|not found|command not found/i.test(msg);
10617
+ if (missing) {
10618
+ process.stderr.write(
10619
+ "WARNING: `scenedetect` (PySceneDetect) is NOT installed \u2014 falling back to LLM-only scene boundaries, which under-segments (coarse 9-15s scenes instead of the real 1-4s cuts). Install it (`pipx install scenedetect[opencv]` or `pip install scenedetect[opencv]`) for accurate shot-cut detection.\n"
10620
+ );
10621
+ } else {
10622
+ process.stderr.write(`Shot-cut detection skipped (${msg}); using LLM boundaries.
10623
+ `);
10624
+ }
10625
+ return [];
10626
+ }
10627
+ }
9681
10628
  function fail2(code, message) {
9682
10629
  process.stderr.write(`${JSON.stringify({ ok: false, error: { code, message } }, null, 2)}
9683
10630
  `);
@@ -9699,53 +10646,78 @@ function resolveModels2(args) {
9699
10646
  videoModel: pick("video-model", "video_generate", "bytedance/seedance-2.0")
9700
10647
  };
9701
10648
  }
9702
- function buildAnalysisCanvas(videoPath, deconstructModel, selectModel, opts) {
10649
+ function buildDeconstructCanvas(videoPath, deconstructModel, opts) {
9703
10650
  const deconstructParams = { model: deconstructModel, mode: "full" };
9704
10651
  if (typeof opts.maxScenes === "number") deconstructParams.max_scenes = opts.maxScenes;
9705
10652
  if (opts.language) deconstructParams.language = opts.language;
9706
10653
  if (opts.focus) deconstructParams.focus = opts.focus;
10654
+ if (opts.shotCuts && opts.shotCuts.length > 0) deconstructParams.shot_cuts = opts.shotCuts;
10655
+ deconstructParams.max_clip_s = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
9707
10656
  return {
9708
10657
  schema: "baker-canvas/1",
9709
10658
  metadata: { name: "video deconstruct pass" },
9710
10659
  nodes: [
9711
10660
  { id: "src", type: "ingest", params: { source: "path", path: videoPath, expect: "video" } },
9712
- { id: "deconstruct", type: "video_deconstruct", inputs: { video: "$ref:src.asset" }, params: deconstructParams },
10661
+ { id: "deconstruct", type: "video_deconstruct", inputs: { video: "$ref:src.asset" }, params: deconstructParams }
10662
+ ],
10663
+ output: { node: "deconstruct", output: "analysis" }
10664
+ };
10665
+ }
10666
+ function buildSelectCanvas(selectModel, slimmedBlueprintJson) {
10667
+ return {
10668
+ schema: "baker-canvas/1",
10669
+ metadata: { name: "element selection pass" },
10670
+ nodes: [
9713
10671
  {
9714
10672
  id: "select",
9715
10673
  type: "text_generate",
9716
- inputs: { blueprint: "$ref:deconstruct.analysis" },
9717
10674
  params: {
9718
10675
  model: selectModel,
9719
10676
  max_tokens: 6e3,
9720
10677
  temperature: 0,
9721
10678
  response_format: "json_object",
9722
10679
  system: SELECT_SYSTEM2,
9723
- prompt: SELECT_PROMPT2
10680
+ prompt: SELECT_PROMPT2.replace("{{blueprint}}", () => slimmedBlueprintJson)
9724
10681
  }
9725
10682
  }
9726
10683
  ],
9727
10684
  output: { node: "select", output: "text" }
9728
10685
  };
9729
10686
  }
9730
- async function runAnalysisPasses(canvas) {
10687
+ async function runAnalysisPasses(deconstructCanvas, selectModel) {
9731
10688
  const engine = createEngineFromEnv({ log: (line) => process.stderr.write(`${line}
9732
10689
  `) });
9733
- let outputsByNode;
9734
- let creditsSpent;
10690
+ let credits = 0;
10691
+ let sawCredits = false;
10692
+ const addCredits = (stats) => {
10693
+ const c = stats?.total_credits;
10694
+ if (typeof c === "number") {
10695
+ credits += c;
10696
+ sawCredits = true;
10697
+ }
10698
+ };
10699
+ let blueprint;
10700
+ let transcript;
9735
10701
  try {
9736
- const result = await engine.run(canvas, {});
9737
- outputsByNode = result.outputs_by_node;
9738
- creditsSpent = result.stats?.total_credits;
10702
+ const r1 = await engine.run(deconstructCanvas, {});
10703
+ addCredits(r1.stats);
10704
+ blueprint = JSON.parse(await loadAssetText2(r1.outputs_by_node.deconstruct?.analysis, "deconstruct output"));
10705
+ transcript = await loadTranscriptBestEffort(r1.outputs_by_node.deconstruct?.transcript);
9739
10706
  } catch (e) {
9740
10707
  if (e instanceof ValidationError) return fail2("validation", JSON.stringify(e.issues));
10708
+ if (e instanceof SyntaxError) return fail2("read_outputs", e.message);
9741
10709
  return fail2("deconstruct", e instanceof Error ? e.message : String(e));
9742
10710
  }
10711
+ const slimJson = JSON.stringify(slimBlueprintForSelection(blueprint));
9743
10712
  try {
9744
- const blueprint = JSON.parse(await loadAssetText2(outputsByNode.deconstruct?.analysis, "deconstruct output"));
9745
- const elements = parseElements2(await loadAssetText2(outputsByNode.select?.text, "selection output"));
9746
- return { blueprint, elements, creditsSpent };
10713
+ const r2 = await engine.run(buildSelectCanvas(selectModel, slimJson), {});
10714
+ addCredits(r2.stats);
10715
+ const elements = parseElements2(await loadAssetText2(r2.outputs_by_node.select?.text, "selection output"));
10716
+ return { blueprint, elements, transcript, creditsSpent: sawCredits ? credits : void 0 };
9747
10717
  } catch (e) {
9748
- return fail2("read_outputs", e instanceof Error ? e.message : String(e));
10718
+ if (e instanceof ValidationError) return fail2("validation", JSON.stringify(e.issues));
10719
+ if (e instanceof SyntaxError) return fail2("read_outputs", e.message);
10720
+ return fail2("deconstruct", e instanceof Error ? e.message : String(e));
9749
10721
  }
9750
10722
  }
9751
10723
  var scaffoldVideoCommand = defineCommand76({
@@ -9761,11 +10733,11 @@ var scaffoldVideoCommand = defineCommand76({
9761
10733
  type: "boolean",
9762
10734
  description: "Give silent b-roll scenes native diegetic ambient mixed deep under the music bed (off by default)"
9763
10735
  },
9764
- "actor-sheets": {
9765
- type: "boolean",
9766
- description: "Lock a recast person/animal that recurs across \u22652 scenes to ONE turnaround sheet grounding every frame"
9767
- },
9768
10736
  "max-scenes": { type: "string", description: "Cap the number of scenes the deconstruct emits" },
10737
+ "shot-threshold": {
10738
+ type: "string",
10739
+ description: "PySceneDetect content threshold. Default is adaptive (18, auto re-checked at 27 when a continuous shot looks over-segmented); pinning a value disables the re-check. Lower = more/softer cuts, higher = fewer."
10740
+ },
9769
10741
  language: { type: "string", description: "Transcript/dialogue language hint (e.g. fr, en)" },
9770
10742
  focus: { type: "string", description: "Known provenance/emphasis to ground the deconstruct" },
9771
10743
  "deconstruct-model": { type: "string", description: "Override the video_deconstruct model id" },
@@ -9788,12 +10760,15 @@ var scaffoldVideoCommand = defineCommand76({
9788
10760
  );
9789
10761
  }
9790
10762
  const { deconstructModel, selectModel, imageModel, videoModel } = resolveModels2(args);
9791
- const analysisCanvas = buildAnalysisCanvas(videoPath, deconstructModel, selectModel, {
10763
+ const shotThreshold = args["shot-threshold"] ? Number(args["shot-threshold"]) : void 0;
10764
+ const shotCuts = await detectShotCutsBestEffort(videoPath, shotThreshold);
10765
+ const deconstructCanvas = buildDeconstructCanvas(videoPath, deconstructModel, {
9792
10766
  maxScenes: Number.isFinite(maxScenes) ? maxScenes : void 0,
9793
10767
  language: args.language ? String(args.language) : void 0,
9794
- focus: args.focus ? String(args.focus) : void 0
10768
+ focus: args.focus ? String(args.focus) : void 0,
10769
+ shotCuts
9795
10770
  });
9796
- const { blueprint, elements, creditsSpent } = await runAnalysisPasses(analysisCanvas);
10771
+ const { blueprint, elements, transcript, creditsSpent } = await runAnalysisPasses(deconstructCanvas, selectModel);
9797
10772
  await mkdir(outDir, { recursive: true });
9798
10773
  const annotated = annotateBlueprintWithElements(blueprint, elements);
9799
10774
  await writeFile2(blueprintPath, `${JSON.stringify(annotated, null, 2)}
@@ -9802,7 +10777,7 @@ var scaffoldVideoCommand = defineCommand76({
9802
10777
  await cp(SHIPPED_COMPOSITION_DIR, compositionDest, { recursive: true });
9803
10778
  const indexPath = path5.join(compositionDest, "index.html");
9804
10779
  const overlayHtml = buildOverlayHtml(blueprint);
9805
- const indexHtml = await readFile4(indexPath, "utf8");
10780
+ const indexHtml = await readFile5(indexPath, "utf8");
9806
10781
  const injected = indexHtml.replace("<!--OVERLAYS-->", () => overlayHtml);
9807
10782
  if (injected === indexHtml && overlayHtml.trim()) {
9808
10783
  fail2(
@@ -9811,14 +10786,16 @@ var scaffoldVideoCommand = defineCommand76({
9811
10786
  );
9812
10787
  }
9813
10788
  await writeFile2(indexPath, injected, "utf8");
10789
+ const captions = await stageCaptions(outDir, transcript);
9814
10790
  const opts = {
9815
10791
  imageModel,
9816
10792
  videoModel,
9817
10793
  overlayCompositionPath: compositionDest,
10794
+ captionsCompositionPath: captions.compositionPath,
10795
+ transcriptPath: captions.transcriptPath,
9818
10796
  blueprintPath,
9819
10797
  frames,
9820
- ambient: Boolean(args.ambient),
9821
- actorSheets: Boolean(args["actor-sheets"])
10798
+ ambient: Boolean(args.ambient)
9822
10799
  };
9823
10800
  let canvas;
9824
10801
  let report;
@@ -9851,7 +10828,7 @@ var scaffoldVideoCommand = defineCommand76({
9851
10828
  stats: {
9852
10829
  scene_count: report.scene_count,
9853
10830
  total_nodes: canvas.nodes.length,
9854
- deconstruct_credits_spent: creditsSpent,
10831
+ analysis_credits_spent: creditsSpent,
9855
10832
  run_estimated_credits: validation.estimatedCredits
9856
10833
  },
9857
10834
  checklist: {
@@ -9879,7 +10856,7 @@ var scaffoldVideoCommand = defineCommand76({
9879
10856
  });
9880
10857
 
9881
10858
  // src/commands/canvas/validate.ts
9882
- import { readFile as readFile5 } from "fs/promises";
10859
+ import { readFile as readFile6 } from "fs/promises";
9883
10860
  import path6 from "path";
9884
10861
  import { defineCommand as defineCommand77 } from "citty";
9885
10862
  var validateCommand = defineCommand77({
@@ -9890,7 +10867,7 @@ var validateCommand = defineCommand77({
9890
10867
  args: { file: { type: "positional", required: true, description: "Path to canvas JSON" } },
9891
10868
  async run({ args }) {
9892
10869
  const filePath = path6.resolve(String(args.file));
9893
- const raw = await readFile5(filePath, "utf8");
10870
+ const raw = await readFile6(filePath, "utf8");
9894
10871
  let parsed;
9895
10872
  try {
9896
10873
  parsed = JSON.parse(raw);
@@ -10779,8 +11756,8 @@ function cropSprite(input, region) {
10779
11756
 
10780
11757
  // src/lib/image/io.ts
10781
11758
  import { randomBytes } from "crypto";
10782
- import { glob as fsGlob, readFile as readFile6, rename, stat as stat2, writeFile as writeFile3 } from "fs/promises";
10783
- import { dirname, extname, join as join2, resolve as resolve4 } from "path";
11759
+ import { glob as fsGlob, readFile as readFile7, rename, stat as stat2, writeFile as writeFile3 } from "fs/promises";
11760
+ import { dirname, extname, join as join3, resolve as resolve4 } from "path";
10784
11761
  var REMOTE_RE = /^https?:\/\//i;
10785
11762
  var GLOB_RE = /[*?[\]{}]/;
10786
11763
  function isRemoteUrl(value) {
@@ -10815,7 +11792,7 @@ async function readImageBuffer(pathOrUrl) {
10815
11792
  }
10816
11793
  return Buffer.from(await response.arrayBuffer());
10817
11794
  }
10818
- return readFile6(pathOrUrl);
11795
+ return readFile7(pathOrUrl);
10819
11796
  }
10820
11797
  async function isDirectory(path7) {
10821
11798
  try {
@@ -10830,14 +11807,14 @@ async function resolveOutputPath(inputPath, outputArg, options) {
10830
11807
  if (!outputArg) return base;
10831
11808
  if (options.multipleInputs || await isDirectory(outputArg)) {
10832
11809
  const filename = base.split("/").pop() ?? "out.png";
10833
- return join2(outputArg, filename);
11810
+ return join3(outputArg, filename);
10834
11811
  }
10835
11812
  return outputArg;
10836
11813
  }
10837
11814
  async function atomicWrite(targetPath, data) {
10838
11815
  const absolute = resolve4(targetPath);
10839
11816
  const dir = dirname(absolute);
10840
- const tmp = join2(dir, `.baker-image-${randomBytes(8).toString("hex")}.tmp`);
11817
+ const tmp = join3(dir, `.baker-image-${randomBytes(8).toString("hex")}.tmp`);
10841
11818
  await writeFile3(tmp, data);
10842
11819
  await rename(tmp, absolute);
10843
11820
  }
@@ -11179,7 +12156,7 @@ var findCommand = defineCommand91({
11179
12156
  });
11180
12157
 
11181
12158
  // src/commands/images/generate.ts
11182
- import { readFile as readFile7 } from "fs/promises";
12159
+ import { readFile as readFile8 } from "fs/promises";
11183
12160
  import { defineCommand as defineCommand92 } from "citty";
11184
12161
  import sharp2 from "sharp";
11185
12162
  var GENERATE_TIMEOUT_MS = 18e4;
@@ -11262,7 +12239,7 @@ async function resolveReferences(spec) {
11262
12239
  }
11263
12240
  let raw;
11264
12241
  try {
11265
- raw = await readFile7(entry);
12242
+ raw = await readFile8(entry);
11266
12243
  } catch {
11267
12244
  throw new ApiError("VALIDATION_ERROR", `Reference file not found: ${entry}`);
11268
12245
  }
@@ -12983,7 +13960,7 @@ var stockCommand = defineCommand105({
12983
13960
  });
12984
13961
 
12985
13962
  // src/commands/images/upload.ts
12986
- import { readFile as readFile8 } from "fs/promises";
13963
+ import { readFile as readFile9 } from "fs/promises";
12987
13964
  import { extname as extname2 } from "path";
12988
13965
  import { defineCommand as defineCommand106 } from "citty";
12989
13966
  var MIME_MAP = {
@@ -13123,7 +14100,7 @@ async function uploadLocal(target, args) {
13123
14100
  });
13124
14101
  return;
13125
14102
  }
13126
- const fileBuffer = await readFile8(target);
14103
+ const fileBuffer = await readFile9(target);
13127
14104
  const base64 = fileBuffer.toString("base64");
13128
14105
  const body = { base64, contentType };
13129
14106
  if (args.source) body.source = args.source;
@@ -15088,7 +16065,7 @@ var searchCommand3 = defineCommand135({
15088
16065
  });
15089
16066
 
15090
16067
  // src/commands/videos/upload.ts
15091
- import { readFile as readFile9, stat as stat3 } from "fs/promises";
16068
+ import { readFile as readFile10, stat as stat3 } from "fs/promises";
15092
16069
  import { extname as extname3 } from "path";
15093
16070
  import { defineCommand as defineCommand136 } from "citty";
15094
16071
  var MIME_MAP2 = {
@@ -15153,7 +16130,7 @@ var uploadCommand2 = defineCommand136({
15153
16130
  return;
15154
16131
  }
15155
16132
  const { uploadUrl, videoId } = await apiPost("/api/videos/upload", {});
15156
- const fileBuffer = await readFile9(filePath);
16133
+ const fileBuffer = await readFile10(filePath);
15157
16134
  const uploadResponse = await fetch(uploadUrl, {
15158
16135
  method: "PUT",
15159
16136
  headers: { "Content-Type": contentType },