@koda-sl/baker-cli 0.82.0 → 0.90.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  defaultRegistry,
10
10
  generateCatalog,
11
11
  validateCanvasDeep
12
- } from "./chunk-KIL2ZJST.js";
12
+ } from "./chunk-2E4H2GIJ.js";
13
13
 
14
14
  // src/cli.ts
15
15
  import { defineCommand as defineCommand141, runMain } from "citty";
@@ -8274,10 +8274,68 @@ var scaffoldStaticAdCommand = defineCommand75({
8274
8274
  });
8275
8275
 
8276
8276
  // src/commands/canvas/scaffold-video.ts
8277
- import { cp, mkdir, readFile as readFile4, writeFile as writeFile2 } from "fs/promises";
8277
+ import { cp, mkdir, readFile as readFile5, writeFile as writeFile2 } from "fs/promises";
8278
8278
  import path5 from "path";
8279
8279
  import { defineCommand as defineCommand76 } from "citty";
8280
8280
 
8281
+ // src/engine/nodes/local/lib/sceneDetect.ts
8282
+ import { execFile as execFile2 } from "child_process";
8283
+ import { mkdtemp, readdir as readdir2, readFile as readFile4, rm } from "fs/promises";
8284
+ import { tmpdir } from "os";
8285
+ import { join as join2 } from "path";
8286
+ import { promisify as promisify2 } from "util";
8287
+ var execFileAsync2 = promisify2(execFile2);
8288
+ var PYSCENEDETECT_THRESHOLD = 18;
8289
+ var PYSCENEDETECT_MIN_SCENE_LEN_S = 0.25;
8290
+ function timecodeToSeconds(tc) {
8291
+ const m = tc.trim().match(/^(\d+):(\d{1,2}):(\d{1,2}(?:\.\d+)?)$/);
8292
+ if (!m) return null;
8293
+ const h = Number.parseInt(m[1] ?? "", 10);
8294
+ const min = Number.parseInt(m[2] ?? "", 10);
8295
+ const s = Number.parseFloat(m[3] ?? "");
8296
+ if (!Number.isFinite(h) || !Number.isFinite(min) || !Number.isFinite(s)) return null;
8297
+ return h * 3600 + min * 60 + s;
8298
+ }
8299
+ function parsePySceneDetectCsvCuts(csv) {
8300
+ const firstLine = csv.split(/\r?\n/, 1)[0] ?? "";
8301
+ if (!/^\s*Timecode List:/i.test(firstLine)) return [];
8302
+ const cuts = [];
8303
+ for (const cell of firstLine.split(",").slice(1)) {
8304
+ const t = timecodeToSeconds(cell);
8305
+ if (t !== null && t > 0) cuts.push(Math.round(t * 1e3) / 1e3);
8306
+ }
8307
+ return [...new Set(cuts)].sort((a, b) => a - b);
8308
+ }
8309
+ async function detectSceneCutsPySceneDetect(filePath, opts = {}) {
8310
+ const threshold = opts.threshold ?? PYSCENEDETECT_THRESHOLD;
8311
+ const minSceneLenS = opts.minSceneLenS ?? PYSCENEDETECT_MIN_SCENE_LEN_S;
8312
+ const outDir = await mkdtemp(join2(tmpdir(), "baker-scenedetect-"));
8313
+ try {
8314
+ await execFileAsync2(
8315
+ "scenedetect",
8316
+ [
8317
+ "--input",
8318
+ filePath,
8319
+ "--output",
8320
+ outDir,
8321
+ "detect-content",
8322
+ "--threshold",
8323
+ String(threshold),
8324
+ "--min-scene-len",
8325
+ String(minSceneLenS),
8326
+ "list-scenes",
8327
+ "--quiet"
8328
+ ],
8329
+ { encoding: "utf-8", maxBuffer: 32 * 1024 * 1024, timeout: opts.timeout_ms ?? 12e4 }
8330
+ );
8331
+ const csvName = (await readdir2(outDir)).find((f) => f.toLowerCase().endsWith(".csv"));
8332
+ if (!csvName) return [];
8333
+ return parsePySceneDetectCsvCuts(await readFile4(join2(outDir, csvName), "utf-8"));
8334
+ } finally {
8335
+ await rm(outDir, { recursive: true, force: true });
8336
+ }
8337
+ }
8338
+
8281
8339
  // src/engine/scaffold/video.ts
8282
8340
  import { z as z3 } from "zod";
8283
8341
 
@@ -8432,10 +8490,78 @@ function sceneDurationS(scene) {
8432
8490
  const max = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
8433
8491
  return Math.min(Math.max(raw, 0.5), max);
8434
8492
  }
8435
- function trimArgs(durationS) {
8493
+ function canvasDims(ar) {
8494
+ switch (ar) {
8495
+ case "1:1":
8496
+ return { w: 1080, h: 1080 };
8497
+ case "16:9":
8498
+ return { w: 1920, h: 1080 };
8499
+ case "4:3":
8500
+ return { w: 1440, h: 1080 };
8501
+ case "3:4":
8502
+ return { w: 1080, h: 1440 };
8503
+ case "21:9":
8504
+ return { w: 1920, h: 822 };
8505
+ default:
8506
+ return { w: 1080, h: 1920 };
8507
+ }
8508
+ }
8509
+ function fillPanel(label, w, h, out) {
8510
+ return `[${label}]scale=${w}:${h}:force_original_aspect_ratio=increase,crop=${w}:${h},setsar=1,fps=30[${out}]`;
8511
+ }
8512
+ function splitStackArgs(count, axis, dims) {
8513
+ const pw = axis === "horizontal" ? Math.round(dims.w / count) : dims.w;
8514
+ const ph = axis === "vertical" ? Math.round(dims.h / count) : dims.h;
8515
+ const inputs = [];
8516
+ const filt = [];
8517
+ let labels = "";
8518
+ for (let i = 0; i < count; i++) {
8519
+ inputs.push("-i", `{{in.c${i}}}`);
8520
+ filt.push(fillPanel(`${i}:v`, pw, ph, `p${i}`));
8521
+ labels += `[p${i}]`;
8522
+ }
8523
+ const stack = axis === "vertical" ? "vstack" : "hstack";
8524
+ filt.push(`${labels}${stack}=inputs=${count}[v]`);
8525
+ return [...inputs, "-filter_complex", filt.join(";"), "-map", "[v]", "{{out.video}}"];
8526
+ }
8527
+ function overlayXY(position, marginPx) {
8528
+ const p = (position ?? "bottom_right").toLowerCase();
8529
+ const x = p.includes("left") ? `${marginPx}` : p.includes("right") ? `W-w-${marginPx}` : "(W-w)/2";
8530
+ const y = p.includes("top") ? `${marginPx}` : p.includes("bottom") ? `H-h-${marginPx}` : "(H-h)/2";
8531
+ return { x, y };
8532
+ }
8533
+ function pipOverlayArgs(dims, position, insetWpct) {
8534
+ const iw = Math.round(dims.w * insetWpct);
8535
+ const margin = Math.round(dims.w * 0.04);
8536
+ const { x, y } = overlayXY(position, margin);
8537
+ const filt = `${fillPanel("0:v", dims.w, dims.h, "bg")};[1:v]scale=${iw}:-2,setsar=1,fps=30[fg];[bg][fg]overlay=x=${x}:y=${y}:format=auto[v]`;
8538
+ return ["-i", "{{in.c0}}", "-i", "{{in.c1}}", "-filter_complex", filt, "-map", "[v]", "{{out.video}}"];
8539
+ }
8540
+ var FLASH_HOLD_MAX_S = 2;
8541
+ function stillHoldArgs(durationS, dims) {
8542
+ return [
8543
+ "-loop",
8544
+ "1",
8545
+ "-i",
8546
+ "{{in.frame}}",
8547
+ "-t",
8548
+ durationS.toFixed(3),
8549
+ "-r",
8550
+ "30",
8551
+ "-vf",
8552
+ `scale=${dims.w}:${dims.h}:force_original_aspect_ratio=increase,crop=${dims.w}:${dims.h},setsar=1,format=yuv420p`,
8553
+ "-c:v",
8554
+ "libx264",
8555
+ "-pix_fmt",
8556
+ "yuv420p",
8557
+ "{{out.video}}"
8558
+ ];
8559
+ }
8560
+ function trimArgs(durationS, offsetS = 0) {
8436
8561
  return [
8437
8562
  "-i",
8438
8563
  "{{in.clip}}",
8564
+ ...offsetS > 0 ? ["-ss", offsetS.toFixed(3)] : [],
8439
8565
  "-t",
8440
8566
  durationS.toFixed(3),
8441
8567
  "-an",
@@ -8462,6 +8588,25 @@ var Sfx = z3.object({
8462
8588
  sound_effect_prompt: z3.string().optional(),
8463
8589
  description: z3.string().optional()
8464
8590
  }).loose();
8591
+ var CompositionRegion = z3.object({
8592
+ // full | top | bottom | left | right | inset
8593
+ panel: z3.string().optional(),
8594
+ // 9-grid anchor for an `inset` presenter box.
8595
+ position: z3.string().optional(),
8596
+ is_presenter: z3.boolean().optional(),
8597
+ // The cast id shown/speaking in this region (routes lip-sync + element refs).
8598
+ cast_ref: z3.string().optional(),
8599
+ summary: z3.string().optional(),
8600
+ frame_prompt: z3.string().optional(),
8601
+ motion_prompt: z3.string().optional()
8602
+ }).loose();
8603
+ var SceneComposition = z3.object({
8604
+ // full_frame (default) | split_screen | pip | keyed_overlay
8605
+ layout: z3.string().optional(),
8606
+ // split_screen only: vertical (top/bottom) | horizontal (left/right).
8607
+ split_axis: z3.string().optional(),
8608
+ regions: z3.array(CompositionRegion).optional()
8609
+ }).loose();
8465
8610
  var CameraMotion = z3.object({ movement: z3.string().optional(), detail: z3.string().optional() }).loose();
8466
8611
  var TranscriptWord = z3.object({ text: z3.string().optional() }).loose();
8467
8612
  var Scene = z3.object({
@@ -8470,6 +8615,10 @@ var Scene = z3.object({
8470
8615
  duration_s: z3.number().optional(),
8471
8616
  summary: z3.string().optional(),
8472
8617
  action_detail: z3.string().optional(),
8618
+ // The scene's spatial layout. Absent/full_frame ⇒ one uncut shot (default path).
8619
+ // A layered layout (split_screen/pip/keyed_overlay) with regions ⇒ the scaffold
8620
+ // builds one clip per region and stacks/overlays them into the scene picture.
8621
+ composition: SceneComposition.optional(),
8473
8622
  // The capture "look" for this scene — selected from the ad-native shoot-mode
8474
8623
  // grammar (see lib/shoot-modes.ts). When absent the scaffold auto-derives a
8475
8624
  // UGC/product mode; a human can override per scene by setting this.
@@ -8495,7 +8644,12 @@ var Scene = z3.object({
8495
8644
  floating_elements: z3.array(z3.unknown()).optional(),
8496
8645
  transcript_slice: z3.array(TranscriptWord).optional(),
8497
8646
  start_frame_asset: FrameAsset,
8498
- end_frame_asset: FrameAsset
8647
+ end_frame_asset: FrameAsset,
8648
+ // DECON-supplied: true when this scene is a length-split CONTINUATION of the
8649
+ // previous one (the SAME physical shot, broken up only because it exceeded the
8650
+ // clip ceiling). The scaffold then shares the splice keyframe — this scene's
8651
+ // start frame IS the previous scene's end frame — so the join is seamless.
8652
+ continues_previous: z3.boolean().optional()
8499
8653
  }).loose();
8500
8654
  var VideoBlueprint = z3.object({
8501
8655
  source: z3.object({ aspect_ratio: z3.string().optional(), duration_s: z3.number().optional() }).loose().optional(),
@@ -8600,6 +8754,40 @@ function annotateBlueprintWithElements(blueprintInput, elementsInput) {
8600
8754
  clone.reference_elements = summary;
8601
8755
  return clone;
8602
8756
  }
8757
+ var SELECT_SCENE_FIELDS = [
8758
+ "index",
8759
+ "start_s",
8760
+ "end_s",
8761
+ "duration_s",
8762
+ "summary",
8763
+ "narrative_role",
8764
+ "action_detail",
8765
+ "start_frame_prompt",
8766
+ "end_frame_prompt"
8767
+ ];
8768
+ var SELECT_GLOBAL_FIELDS = ["cast", "branding", "voiceover"];
8769
+ function slimBlueprintForSelection(blueprintInput) {
8770
+ if (!blueprintInput || typeof blueprintInput !== "object" || Array.isArray(blueprintInput)) return blueprintInput;
8771
+ const bp = blueprintInput;
8772
+ const out = {};
8773
+ for (const k of ["version", "source"]) if (k in bp) out[k] = bp[k];
8774
+ if (bp.global && typeof bp.global === "object" && !Array.isArray(bp.global)) {
8775
+ const g = bp.global;
8776
+ const slimG = {};
8777
+ for (const k of SELECT_GLOBAL_FIELDS) if (k in g) slimG[k] = g[k];
8778
+ out.global = slimG;
8779
+ }
8780
+ if (Array.isArray(bp.scenes)) {
8781
+ out.scenes = bp.scenes.map((s) => {
8782
+ if (!s || typeof s !== "object" || Array.isArray(s)) return s;
8783
+ const sr = s;
8784
+ const slim = {};
8785
+ for (const k of SELECT_SCENE_FIELDS) if (k in sr) slim[k] = sr[k];
8786
+ return slim;
8787
+ });
8788
+ }
8789
+ return out;
8790
+ }
8603
8791
  function roleForType2(type) {
8604
8792
  switch (type.toLowerCase()) {
8605
8793
  case "logo":
@@ -8646,6 +8834,7 @@ function buildElementSlots(elements) {
8646
8834
  type: el.type,
8647
8835
  description: el.description,
8648
8836
  sameAs: el.same_as ?? void 0,
8837
+ castId: el.cast_id ?? void 0,
8649
8838
  presence: presenceOf(el)
8650
8839
  });
8651
8840
  });
@@ -8684,7 +8873,7 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
8684
8873
  const legend = [
8685
8874
  ...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
8686
8875
  ...hasAnchor ? [
8687
- "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions of THIS frame. IGNORE its overlay text, captions, and any brand that is being swapped."
8876
+ "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions. IGNORE its text, its logo, its brand name, and its colors entirely \u2014 it is a DIFFERENT brand's footage, here only to anchor layout/pose, never identity or palette."
8688
8877
  ] : []
8689
8878
  ].join("\n");
8690
8879
  const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
@@ -8730,41 +8919,66 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
8730
8919
  "REFERENCE IMAGES (in the order provided):",
8731
8920
  legend,
8732
8921
  "",
8733
- "Identity comes from the reference images, not from this prose \u2014 render each person,",
8734
- "product, and set to MATCH its reference image, and describe only pose, expression, action,",
8735
- "and camera in the FRAME DESCRIPTION below.",
8736
- "",
8922
+ // RECAST is the whole point of a transform: the dropped el_* images define who/
8923
+ // what is on screen, NOT the source footage and NOT the prose. Without this, the
8924
+ // model reproduces the original ad's people (a proven failure mode).
8925
+ ...present.length > 0 ? [
8926
+ "IDENTITY & AESTHETIC \u2014 RECAST (this is a transform, not a copy):",
8927
+ "Identity comes from the reference image, never from the source footage or this prose. Render every",
8928
+ "person, animal, product, and set to MATCH its labeled reference image above \u2014 that image is the ONLY",
8929
+ "source of their identity, wardrobe, styling, and look. This is a complete recast: do NOT reproduce,",
8930
+ "trace, or resemble any individual, animal, product, or set from the source ad. Where the FRAME",
8931
+ "DESCRIPTION below names an appearance detail (hair, outfit, color, age, breed, brand of an object),",
8932
+ "IGNORE that wording \u2014 the reference image is the truth; use the description ONLY for pose, expression,",
8933
+ "action, framing, lighting, and palette.",
8934
+ ""
8935
+ ] : [
8936
+ "Identity comes from the reference image, never from prose \u2014 render the subject to MATCH it and",
8937
+ "describe only pose, expression, action, framing, and lighting in the FRAME DESCRIPTION below.",
8938
+ ""
8939
+ ],
8737
8940
  "FRAME DESCRIPTION (this frame's editable prompt):",
8738
8941
  description,
8739
8942
  "",
8740
- "Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
8943
+ "Render exactly what the FRAME DESCRIPTION and the SHARED AD SPEC specify \u2014 this is the authoritative ad: its cast identity (via the reference images), palette, brand, and intent are law. Keep every recurring element identical to its reference image across all frames. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
8741
8944
  "",
8742
- "GLOBAL STYLE REFERENCE (shared across frames; not this frame's content):",
8945
+ "SHARED AD SPEC (authoritative \u2014 the ad blueprint this frame belongs to; align cast/palette/brand/type with it):",
8743
8946
  "{{target_blueprint}}"
8744
8947
  ].join("\n");
8745
8948
  }
8949
+ function ingestFrameRef(url, edge, ctx, nodes) {
8950
+ const cached2 = ctx.ingestCache?.get(url);
8951
+ if (cached2) return cached2;
8952
+ const tag = ctx.tag ?? "";
8953
+ const refId = `s${ctx.sceneIndex}${tag}_${edge}_ref`;
8954
+ nodes.push({ id: refId, type: "ingest", params: { source: "url", url, expect: "image" } });
8955
+ const ref = `$ref:${refId}.asset`;
8956
+ ctx.ingestCache?.set(url, ref);
8957
+ return ref;
8958
+ }
8746
8959
  function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
8747
- const refId = `s${ctx.sceneIndex}_${edge}_ref`;
8748
- if (url) nodes.push({ id: refId, type: "ingest", params: { source: "url", url, expect: "image" } });
8749
- if (ctx.reuse && url) return `$ref:${refId}.asset`;
8750
- const reference = [...present.map((s) => s.ref), ...url ? [`$ref:${refId}.asset`] : []];
8960
+ const tag = ctx.tag ?? "";
8961
+ if (ctx.reuse && url) return ingestFrameRef(url, edge, ctx, nodes);
8962
+ const hasOriginal = Boolean(url);
8963
+ const originalRef = hasOriginal && url ? ingestFrameRef(url, edge, ctx, nodes) : void 0;
8964
+ const reference = [...present.map((s) => s.ref), ...originalRef ? [originalRef] : []];
8751
8965
  const genParams = {
8752
8966
  model: ctx.imageModel,
8753
8967
  image_size: "2K",
8754
- prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, Boolean(url), ctx.shootMode)
8968
+ prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, hasOriginal, ctx.shootMode)
8755
8969
  };
8756
8970
  if (ctx.ar) genParams.aspect_ratio = ctx.ar;
8757
- const genNode = {
8758
- id: `s${ctx.sceneIndex}_${edge}`,
8971
+ const genId = `s${ctx.sceneIndex}${tag}_${edge}`;
8972
+ nodes.push({
8973
+ id: genId,
8759
8974
  type: "image_generate",
8760
8975
  // `params.prompt` is this frame's authoritative, edit-per-frame description.
8761
- // `target_blueprint` is kept only as a demoted shared style reference (global
8762
- // cast/palette/typography), so editing one frame never touches another.
8976
+ // `target_blueprint` is the shared ad spec (cast identity, palette, brand, type)
8977
+ // the frame must stay consistent with — editing one frame never touches another.
8763
8978
  inputs: { target_blueprint: "$ref:prompt.asset", ...reference.length > 0 ? { reference } : {} },
8764
8979
  params: genParams
8765
- };
8766
- nodes.push(genNode);
8767
- return `$ref:s${ctx.sceneIndex}_${edge}.images#0`;
8980
+ });
8981
+ return `$ref:${genId}.images#0`;
8768
8982
  }
8769
8983
  function seedanceAudioLine(scene, mode, audio, nativeLine) {
8770
8984
  const ambient = scene.ambient?.trim() || diegeticFor(mode);
@@ -8810,10 +9024,11 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
8810
9024
  );
8811
9025
  return parts.join("\n");
8812
9026
  }
8813
- function audioExtractArgs(durationS) {
9027
+ function audioExtractArgs(durationS, offsetS = 0) {
8814
9028
  return [
8815
9029
  "-i",
8816
9030
  "{{in.clip}}",
9031
+ ...offsetS > 0.05 ? ["-ss", offsetS.toFixed(3)] : [],
8817
9032
  "-t",
8818
9033
  durationS.toFixed(3),
8819
9034
  "-vn",
@@ -8841,27 +9056,21 @@ function sceneShootMode(scene, present, nativeTurn, cameraOn, casts) {
8841
9056
  hasProduct: present.some((s) => s.type.toLowerCase() === "product")
8842
9057
  });
8843
9058
  }
8844
- function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks) {
9059
+ function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks, nativeSegments, clipRef = `$ref:s${i}_clip.video`) {
8845
9060
  if (nativeTurn) {
8846
- const extractLen = Math.min(Math.max(lengths.dur, lengths.speech), lengths.genDur);
9061
+ const speechWindow = Math.max(0.5, nativeTurn.end_s - nativeTurn.start_s);
9062
+ const extractLen = Math.min(speechWindow, lengths.genDur);
8847
9063
  nodes.push({
8848
9064
  id: `s${i}_voextract`,
8849
9065
  type: "ffmpeg",
8850
- inputs: { clip: `$ref:s${i}_clip.video` },
9066
+ inputs: { clip: clipRef },
8851
9067
  params: { args: audioExtractArgs(extractLen), outputs: { audio: { kind: "audio", ext: "mp3" } } }
8852
9068
  });
8853
- nodes.push({
8854
- id: `s${i}_voconv`,
8855
- type: "audio_voice_convert",
8856
- inputs: { audio: `$ref:s${i}_voextract.audio`, voice_ref: `$ref:${nativeTurn.voiceNode}.voice_id` },
8857
- params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
8858
- });
8859
- voTracks.push({
8860
- slot: `s${i}_voconv`,
8861
- ref: `$ref:s${i}_voconv.audio`,
9069
+ nativeSegments.push({
9070
+ voiceNode: nativeTurn.voiceNode,
9071
+ ref: `$ref:s${i}_voextract.audio`,
8862
9072
  start_s: nativeTurn.start_s,
8863
- end_s: nativeTurn.start_s + extractLen,
8864
- kind: "vo"
9073
+ end_s: nativeTurn.start_s + extractLen
8865
9074
  });
8866
9075
  } else if (ambientBroll) {
8867
9076
  const ambientStart = scene.start_s ?? 0;
@@ -8881,85 +9090,260 @@ function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes
8881
9090
  });
8882
9091
  }
8883
9092
  }
8884
- function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
8885
- const ar = aspectRatioParam(blueprint);
8886
- const reuse = opts.frames === "reuse";
8887
- const clips = [];
8888
- const voTracks = [];
8889
- const lastIndex = blueprint.scenes.length - 1;
8890
- const cameraOn = onCameraDialogue(blueprint);
8891
- const casts = castIdSet(blueprint);
8892
- blueprint.scenes.forEach((scene, i) => {
8893
- const nativeTurn = (sceneTurns.get(i) ?? []).find((t) => t.native);
8894
- const present = slotsForScene(slots, i);
8895
- const mode = sceneShootMode(scene, present, nativeTurn, cameraOn, casts);
8896
- const ambientBroll = Boolean(opts.ambient) && !nativeTurn && mode !== "ugc_selfie";
8897
- const ctx = { sceneIndex: i, ar, reuse, imageModel: opts.imageModel, shootMode: mode };
8898
- const firstFrame = buildFrameRef(
8899
- "start",
8900
- scene.start_frame_asset?.url,
8901
- scene.start_frame_prompt,
8902
- slotsForFrame(slots, i, "start"),
8903
- ctx,
8904
- nodes
8905
- );
8906
- const lastFrame = buildFrameRef(
8907
- "end",
8908
- scene.end_frame_asset?.url,
8909
- scene.end_frame_prompt,
8910
- slotsForFrame(slots, i, "end"),
8911
- ctx,
8912
- nodes
8913
- );
8914
- const dur = sceneDurationS(scene);
8915
- let out = sceneOutTransition(scene, i === lastIndex);
8916
- let trimTarget = dur + (out?.dur ?? 0);
8917
- if (out && ceilToSeedance(trimTarget) < trimTarget) {
8918
- out = null;
8919
- trimTarget = dur;
8920
- }
8921
- const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
8922
- const genDur = ceilToSeedance(Math.max(trimTarget, speech));
8923
- const clipParams = {
8924
- model: opts.videoModel,
8925
- prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
8926
- duration: genDur,
8927
- // Native talking scene → Seedance generates the spoken audio + lip-sync;
8928
- // an opt-in ambient b-roll beat generates diegetic ambient only; otherwise the
8929
- // clip is silent and audio comes from the tts/music timeline.
8930
- generate_audio: Boolean(nativeTurn) || ambientBroll
8931
- };
8932
- if (ar) clipParams.aspect_ratio = ar;
9093
+ function buildPerSpeakerVoiceConversion(segments, totalMs, nodes) {
9094
+ const bySpeaker = /* @__PURE__ */ new Map();
9095
+ for (const seg of segments) {
9096
+ const arr = bySpeaker.get(seg.voiceNode) ?? [];
9097
+ arr.push(seg);
9098
+ bySpeaker.set(seg.voiceNode, arr);
9099
+ }
9100
+ const tracks = [];
9101
+ for (const [voiceNode, segs] of bySpeaker) {
9102
+ const trackId = `${voiceNode}_track`;
9103
+ const convId = `${voiceNode}_conv`;
9104
+ const mixInputs = {};
9105
+ segs.forEach((s, k) => {
9106
+ mixInputs[`seg${k}`] = s.ref;
9107
+ });
8933
9108
  nodes.push({
8934
- id: `s${i}_clip`,
8935
- type: "video_generate",
8936
- inputs: { first_frame: firstFrame, last_frame: lastFrame },
8937
- params: clipParams
9109
+ id: trackId,
9110
+ type: "audio_timeline",
9111
+ inputs: mixInputs,
9112
+ params: {
9113
+ tracks: segs.map((s, k) => ({ slot: `seg${k}`, start_s: s.start_s })),
9114
+ total_ms: totalMs
9115
+ }
8938
9116
  });
8939
- emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, { dur, speech, genDur }, nodes, voTracks);
8940
- const base = `$ref:s${i}_clip.video`;
8941
- if (genDur === trimTarget) {
8942
- clips.push({ ref: base, scene_s: dur, out });
8943
- } else {
8944
- nodes.push({
8945
- id: `s${i}_trim`,
8946
- type: "ffmpeg",
8947
- inputs: { clip: base },
8948
- params: { args: trimArgs(trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
8949
- });
8950
- clips.push({ ref: `$ref:s${i}_trim.video`, scene_s: dur, out });
9117
+ nodes.push({
9118
+ id: convId,
9119
+ type: "audio_voice_convert",
9120
+ inputs: { audio: `$ref:${trackId}.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
9121
+ params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
9122
+ });
9123
+ tracks.push({ slot: convId, ref: `$ref:${convId}.audio`, start_s: 0, kind: "vo" });
9124
+ }
9125
+ return tracks;
9126
+ }
9127
+ function emitSceneClip(i, scene, present, mode, nativeTurn, ambientBroll, frames, lengths, out, opts, nodes, tag = "") {
9128
+ const clipParams = {
9129
+ model: opts.videoModel,
9130
+ prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
9131
+ duration: lengths.genDur,
9132
+ // Native talking scene → Seedance generates the spoken audio + lip-sync; an opt-in
9133
+ // ambient b-roll beat generates diegetic ambient only; otherwise the clip is silent.
9134
+ generate_audio: Boolean(nativeTurn) || ambientBroll
9135
+ };
9136
+ if (opts.ar) clipParams.aspect_ratio = opts.ar;
9137
+ nodes.push({
9138
+ id: `s${i}${tag}_clip`,
9139
+ type: "video_generate",
9140
+ inputs: { first_frame: frames.first, ...frames.last ? { last_frame: frames.last } : {} },
9141
+ params: clipParams
9142
+ });
9143
+ const base = `$ref:s${i}${tag}_clip.video`;
9144
+ if (lengths.genDur === lengths.trimTarget) return { ref: base, scene_s: lengths.dur, out };
9145
+ nodes.push({
9146
+ id: `s${i}${tag}_clip_trim`,
9147
+ type: "ffmpeg",
9148
+ inputs: { clip: base },
9149
+ params: { args: trimArgs(lengths.trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
9150
+ });
9151
+ return { ref: `$ref:s${i}${tag}_clip_trim.video`, scene_s: lengths.dur, out };
9152
+ }
9153
+ var COMPOSITE_LAYOUTS = /* @__PURE__ */ new Set(["split_screen", "pip", "keyed_overlay"]);
9154
+ function layeredComposition(scene) {
9155
+ const comp = scene.composition;
9156
+ const layout = (comp?.layout ?? "").toLowerCase();
9157
+ if (!COMPOSITE_LAYOUTS.has(layout)) return null;
9158
+ const regions = (comp?.regions ?? []).filter((r) => Boolean(r) && typeof r === "object");
9159
+ if (regions.length < 2) return null;
9160
+ return { layout, regions, comp: comp ?? {} };
9161
+ }
9162
+ function splitAxisOf(comp, regions) {
9163
+ const panels = regions.map((r) => (r.panel ?? "").toLowerCase());
9164
+ if (panels.some((p) => p === "top" || p === "bottom")) return "vertical";
9165
+ if (panels.some((p) => p === "left" || p === "right")) return "horizontal";
9166
+ return (comp.split_axis ?? "").toLowerCase() === "horizontal" ? "horizontal" : "vertical";
9167
+ }
9168
+ function orderSplitRefs(regions, regionRefs, axis) {
9169
+ const rank = (panel) => {
9170
+ const p = (panel ?? "").toLowerCase();
9171
+ if (axis === "vertical") return p === "top" ? 0 : p === "bottom" ? 2 : 1;
9172
+ return p === "left" ? 0 : p === "right" ? 2 : 1;
9173
+ };
9174
+ return regionRefs.map((ref, k) => ({ ref, k, rank: rank(regions[k]?.panel) })).sort((a, b) => a.rank - b.rank || a.k - b.k).map((x) => x.ref);
9175
+ }
9176
+ function presenterIndexOf(regions, hasNative) {
9177
+ const flagged = regions.findIndex((r) => r.is_presenter);
9178
+ if (flagged >= 0) return flagged;
9179
+ return hasNative ? 0 : -1;
9180
+ }
9181
+ function slotsForRegion(present, isPresenter) {
9182
+ return present.filter((s) => {
9183
+ const t = s.type.toLowerCase();
9184
+ const person = t === "person" || t === "animal";
9185
+ return isPresenter ? person : !person;
9186
+ });
9187
+ }
9188
+ function buildCompositeScene(layout, regions, comp, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes) {
9189
+ const dims = canvasDims(opts.ar);
9190
+ const presIdx = presenterIndexOf(regions, Boolean(nativeTurn));
9191
+ const regionRefs = [];
9192
+ let presenterPosition;
9193
+ regions.forEach((region, r) => {
9194
+ const isPresenter = r === presIdx;
9195
+ const tag = `_r${r}`;
9196
+ const regionSlots = slotsForRegion(present, isPresenter);
9197
+ const ctx = {
9198
+ sceneIndex: i,
9199
+ ar: opts.ar,
9200
+ reuse: opts.reuse,
9201
+ imageModel: opts.imageModel,
9202
+ shootMode: mode,
9203
+ tag
9204
+ };
9205
+ const startPrompt = region.frame_prompt ?? scene.start_frame_prompt;
9206
+ const endPrompt = region.frame_prompt ?? scene.end_frame_prompt;
9207
+ const first = buildFrameRef("start", void 0, startPrompt, regionSlots, ctx, nodes);
9208
+ const last = buildFrameRef("end", void 0, endPrompt, regionSlots, ctx, nodes);
9209
+ const regionNative = isPresenter ? nativeTurn : void 0;
9210
+ const regionScene = {
9211
+ ...scene,
9212
+ summary: region.summary ?? scene.summary,
9213
+ motion_prompt: region.motion_prompt ?? scene.motion_prompt,
9214
+ dialogue: isPresenter ? scene.dialogue : []
9215
+ };
9216
+ const clip = emitSceneClip(
9217
+ i,
9218
+ regionScene,
9219
+ regionSlots,
9220
+ mode,
9221
+ regionNative,
9222
+ false,
9223
+ { first, last },
9224
+ lengths,
9225
+ null,
9226
+ { ar: opts.ar, videoModel: opts.videoModel },
9227
+ nodes,
9228
+ tag
9229
+ );
9230
+ regionRefs.push(clip.ref);
9231
+ if (isPresenter) presenterPosition = region.position;
9232
+ });
9233
+ const compInputs = {};
9234
+ let args;
9235
+ if (layout === "split_screen") {
9236
+ const axis = splitAxisOf(comp, regions);
9237
+ orderSplitRefs(regions, regionRefs, axis).forEach((ref, k) => {
9238
+ compInputs[`c${k}`] = ref;
9239
+ });
9240
+ args = splitStackArgs(regionRefs.length, axis, dims);
9241
+ } else {
9242
+ const bgIdx = regions.findIndex((_, k) => k !== presIdx);
9243
+ const bgRef = regionRefs[bgIdx >= 0 ? bgIdx : 0];
9244
+ let presRef = regionRefs[presIdx >= 0 ? presIdx : 1];
9245
+ if (layout === "keyed_overlay" && presIdx >= 0) {
9246
+ const keyId = `s${i}_key`;
9247
+ nodes.push({ id: keyId, type: "video_background_remove", inputs: { video: presRef }, params: {} });
9248
+ presRef = `$ref:${keyId}.video`;
9249
+ }
9250
+ compInputs.c0 = bgRef;
9251
+ compInputs.c1 = presRef;
9252
+ args = pipOverlayArgs(dims, presenterPosition, layout === "keyed_overlay" ? 0.5 : 0.34);
9253
+ }
9254
+ nodes.push({
9255
+ id: `s${i}_composite`,
9256
+ type: "ffmpeg",
9257
+ inputs: compInputs,
9258
+ params: { args, outputs: { video: { kind: "video", ext: "mp4" } } }
9259
+ });
9260
+ const presenterClipRef = presIdx >= 0 ? `$ref:s${i}_r${presIdx}_clip.video` : void 0;
9261
+ return { clip: { ref: `$ref:s${i}_composite.video`, scene_s: lengths.dur, out }, presenterClipRef };
9262
+ }
9263
+ function sceneTiming(scene, isLast, nativeTurn) {
9264
+ const dur = sceneDurationS(scene);
9265
+ let out = sceneOutTransition(scene, isLast);
9266
+ let trimTarget = dur + (out?.dur ?? 0);
9267
+ if (out && ceilToSeedance(trimTarget) < trimTarget) {
9268
+ out = null;
9269
+ trimTarget = dur;
9270
+ }
9271
+ const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
9272
+ const genDur = ceilToSeedance(Math.max(trimTarget, speech));
9273
+ return { dur, out, trimTarget, genDur, speech };
9274
+ }
9275
+ function emitCompositeScene(composite, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes, voTracks, nativeSegments, clips) {
9276
+ const built = buildCompositeScene(
9277
+ composite.layout,
9278
+ composite.regions,
9279
+ composite.comp,
9280
+ scene,
9281
+ i,
9282
+ present,
9283
+ mode,
9284
+ nativeTurn,
9285
+ { dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur },
9286
+ out,
9287
+ opts,
9288
+ nodes
9289
+ );
9290
+ emitSceneNativeAudio(
9291
+ i,
9292
+ scene,
9293
+ nativeTurn,
9294
+ false,
9295
+ { dur: lengths.dur, speech: lengths.speech, genDur: lengths.genDur },
9296
+ nodes,
9297
+ voTracks,
9298
+ nativeSegments,
9299
+ built.presenterClipRef
9300
+ );
9301
+ clips.push(built.clip);
9302
+ }
9303
+ function emitFlashHold(i, scene, slots, ctx, lengths, out, ar, nodes, clips) {
9304
+ const frame = buildFrameRef(
9305
+ "start",
9306
+ scene.start_frame_asset?.url,
9307
+ scene.start_frame_prompt,
9308
+ slotsForFrame(slots, i, "start"),
9309
+ ctx,
9310
+ nodes
9311
+ );
9312
+ nodes.push({
9313
+ id: `s${i}_clip`,
9314
+ type: "ffmpeg",
9315
+ inputs: { frame },
9316
+ params: {
9317
+ args: stillHoldArgs(lengths.trimTarget, canvasDims(ar)),
9318
+ outputs: { video: { kind: "video", ext: "mp4" } }
8951
9319
  }
8952
9320
  });
8953
- return { clips, voTracks };
9321
+ clips.push({ ref: `$ref:s${i}_clip.video`, scene_s: lengths.dur, out });
9322
+ }
9323
+ function musicScriptDigest(blueprint) {
9324
+ const lines = blueprint.scenes.flatMap((s) => (s.dialogue ?? []).map((d) => d.line?.trim())).filter((l) => Boolean(l));
9325
+ const script = lines.join(" ").slice(0, 500);
9326
+ const roles = blueprint.scenes.map((s) => s.narrative_role).filter((r) => Boolean(r));
9327
+ const arc = roles.length > 0 ? roles.join(" \u2192 ") : "";
9328
+ const parts = [];
9329
+ if (script) {
9330
+ parts.push(
9331
+ `Ad script (the bed must SUPPORT these words \u2014 leave room for the voice, swell on the payoff): "${script}"`
9332
+ );
9333
+ }
9334
+ if (arc) parts.push(`Emotional arc across scenes: ${arc}. Shape the bed's energy to this arc.`);
9335
+ return parts.length > 0 ? `
9336
+
9337
+ ${parts.join("\n")}` : "";
8954
9338
  }
8955
9339
  function musicBedPrompt(blueprint, musicPrompt) {
9340
+ const digest = musicScriptDigest(blueprint);
8956
9341
  const track2 = blueprint.global?.music?.identified_track;
8957
9342
  const title = track2?.title?.trim();
8958
- if (!title) return musicPrompt;
8959
- const by = track2?.artist?.trim() ? ` by ${track2.artist.trim()}` : "";
8960
- return `${musicPrompt}
9343
+ const vibe = title ? `
8961
9344
 
8962
- Reference vibe: the original used "${title}"${by} (identified via AudD). Match its mood, tempo, and energy with ORIGINAL music \u2014 do not reproduce the track.`;
9345
+ Reference vibe: the original used "${title}"${track2?.artist?.trim() ? ` by ${track2.artist.trim()}` : ""} (identified via AudD). Match its mood, tempo, and energy with ORIGINAL music \u2014 do not reproduce the track.` : "";
9346
+ return `${musicPrompt}${digest}${vibe}`;
8963
9347
  }
8964
9348
  function onCameraDialogue(blueprint) {
8965
9349
  const mode = blueprint.global?.voiceover?.mode;
@@ -8998,92 +9382,483 @@ function isOnCameraSpeaker(speaker, casts, cameraOn) {
8998
9382
  if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
8999
9383
  return casts.has(speaker);
9000
9384
  }
9001
- function buildDialogue(blueprint, nodes) {
9002
- const tracks = [];
9003
- const sceneTurns = /* @__PURE__ */ new Map();
9385
/**
 * Creates a predicate answering "is this speaker's person element on screen in
 * this scene?".
 *
 * @param slots Element slots; only those whose `type` is "person"
 *   (case-insensitive) are considered. Each may carry `castId` and a `presence`
 *   collection exposing `.has(sceneIndex)`.
 * @param canonical Maps a cast id to the speaker label used for lookups.
 * @returns {(speaker: string, sceneIndex: number) => boolean} Returns true when
 *   no presence information is known for the speaker.
 */
function makePresenterPresent(slots, canonical) {
  const people = slots.filter((slot) => slot.type.toLowerCase() === "person");
  const presenceBySpeaker = new Map(
    people.filter((p) => p.castId).map((p) => [canonical(p.castId), p.presence])
  );
  // With exactly one person in the cast, that person stands in for any speaker
  // that has no slot of its own.
  const fallback = people.length === 1 ? people[0].presence : null;
  return function isPresent(speaker, sceneIndex) {
    const presence = presenceBySpeaker.get(speaker) ?? fallback;
    return presence ? presence.has(sceneIndex) : true;
  };
}
9396
+ var PAUSE_GAP_S = 0.6;
9397
+ var PHRASE_MAX_S = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
9398
+ function collapseVoiceover(blueprint) {
9004
9399
  const casts = castIdSet(blueprint);
9005
9400
  const cameraOn = onCameraDialogue(blueprint);
9006
- const voiceNodeBySpeaker = /* @__PURE__ */ new Map();
9007
- const speakerDescription = (speaker) => {
9008
- for (const scene of blueprint.scenes) {
9009
- for (const line of scene.dialogue ?? []) {
9010
- if ((line.speaker ?? "voiceover") === speaker && line.voice_description) return line.voice_description;
9011
- }
9401
+ const presenters = /* @__PURE__ */ new Set();
9402
+ for (const scene of blueprint.scenes)
9403
+ for (const l of scene.dialogue ?? []) {
9404
+ const sp = l.speaker ?? "voiceover";
9405
+ if (isOnCameraSpeaker(sp, casts, cameraOn)) presenters.add(sp);
9012
9406
  }
9407
+ if (presenters.size !== 1) return (s) => s;
9408
+ const presenter = [...presenters][0];
9409
+ return (speaker) => NARRATOR_SPEAKERS.has(speaker.toLowerCase()) ? presenter : speaker;
9410
+ }
9411
/**
 * Groups dialogue lines into continuous-speech phrases. Runs are cut on a
 * speaker change, on a pause longer than PAUSE_GAP_S, or when the covered
 * scene span would exceed PHRASE_MAX_S.
 *
 * @param blueprint Reads `scenes[].dialogue`, `scenes[].start_s`, `scenes[].end_s`.
 * @param canonical Maps a raw speaker label to the label used for grouping.
 * @param compositeScenes Set of scene indices whose dialogue is skipped here.
 * @param presenterPresent (speaker, sceneIndex) => boolean.
 * @returns Array of `{ speaker, start_s, end_s, text, firstScene, shownScenes,
 *   presenterShown }`, in order of line start time.
 */
function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent) {
  const casts = castIdSet(blueprint);
  const cameraOn = onCameraDialogue(blueprint);
  const rawSpeakerOf = (l) => l.speaker ?? "voiceover";
  const sceneEndS = (i) => blueprint.scenes[i]?.end_s ?? blueprint.scenes[i]?.start_s ?? 0;

  // Scenes with two or more distinct on-camera speakers never count as "shown".
  const multiSpeaker = new Set();
  blueprint.scenes.forEach((scene, i) => {
    const onCam = new Set();
    for (const l of scene.dialogue ?? []) {
      const sp = rawSpeakerOf(l);
      if (isOnCameraSpeaker(sp, casts, cameraOn)) onCam.add(sp);
    }
    if (onCam.size >= 2) multiSpeaker.add(i);
  });

  // Flatten every non-empty dialogue line outside composite scenes.
  const lines = [];
  blueprint.scenes.forEach((scene, sceneIndex) => {
    if (compositeScenes.has(sceneIndex)) return;
    for (const l of scene.dialogue ?? []) {
      if (!l.line?.trim()) continue;
      const raw = rawSpeakerOf(l);
      const sp = canonical(raw);
      const text = l.line.trim();
      const start = l.start_s ?? scene.start_s ?? 0;
      lines.push({
        sceneIndex,
        speaker: sp,
        // "Shown" means an on-camera cast member is speaking, the scene is not
        // multi-speaker, and the speaker's element is on screen here. A cutaway
        // fails this and is handled as its own picture under the same voice.
        shown: isOnCameraSpeaker(raw, casts, cameraOn) && !multiSpeaker.has(sceneIndex) && presenterPresent(sp, sceneIndex),
        start,
        // Without an explicit end_s, estimate the end from the words, not from
        // the scene end, so a long silent stretch does not glue two phrases together.
        end: l.end_s ?? start + estSpeechS(text),
        text
      });
    }
  });
  lines.sort((a, b) => a.start - b.start);

  const phrases = [];
  let run = null;

  // Pushes the open run (if any) as a finished phrase.
  const closeRun = () => {
    if (!run) return;
    const shownScenes = [...run.shown].sort((a, b) => a - b);
    phrases.push({
      speaker: run.speaker,
      start_s: run.start,
      end_s: run.end,
      text: run.texts.join(" "),
      firstScene: run.firstScene,
      shownScenes,
      presenterShown: shownScenes.length > 0
    });
    run = null;
  };

  // Decides whether `ln` must open a new run instead of extending the open one.
  const startsNewRun = (ln, cover, clipStart) => {
    if (!run) return true;
    if (run.speaker !== ln.speaker) return true;
    if (ln.start - run.end > PAUSE_GAP_S) return true;
    // The cap applies to the scene-coverage span, not the line end, so a run is
    // split at a scene boundary before it outgrows PHRASE_MAX_S.
    return Math.max(run.coverEnd, cover) - Math.min(run.clipStart, clipStart) > PHRASE_MAX_S;
  };

  for (const ln of lines) {
    let cover = ln.end;
    let clipStart = ln.start;
    if (ln.shown) {
      cover = Math.max(ln.end, sceneEndS(ln.sceneIndex));
      clipStart = Math.min(ln.start, blueprint.scenes[ln.sceneIndex]?.start_s ?? ln.start);
    }
    if (startsNewRun(ln, cover, clipStart)) {
      closeRun();
      run = {
        speaker: ln.speaker,
        firstScene: ln.sceneIndex,
        start: ln.start,
        end: ln.end,
        coverEnd: cover,
        clipStart,
        texts: [ln.text],
        shown: new Set()
      };
    } else {
      run.texts.push(ln.text);
      run.end = Math.max(run.end, ln.end);
      run.coverEnd = Math.max(run.coverEnd, cover);
      run.clipStart = Math.min(run.clipStart, clipStart);
    }
    if (ln.shown) run.shown.add(ln.sceneIndex);
  }
  closeRun();
  return phrases;
}
9490
+ function makeVoiceFactory(blueprint, canonical, nodes) {
9491
+ const bySpeaker = /* @__PURE__ */ new Map();
9492
+ const describe = (speaker) => {
9493
+ for (const scene of blueprint.scenes)
9494
+ for (const line of scene.dialogue ?? [])
9495
+ if (canonical(line.speaker ?? "voiceover") === speaker && line.voice_description) return line.voice_description;
9013
9496
  const cast = blueprint.global?.cast?.find((c) => c.id === speaker);
9014
9497
  return cast?.description ?? blueprint.global?.voiceover?.voice_description ?? `${speaker} voice`;
9015
9498
  };
9016
- const ensureVoiceNode = (speaker) => {
9017
- const existing = voiceNodeBySpeaker.get(speaker);
9499
+ return (speaker) => {
9500
+ const existing = bySpeaker.get(speaker);
9018
9501
  if (existing) return existing;
9019
- const id = sanitizeId2(`voice_${speaker}`, `voice_${voiceNodeBySpeaker.size}`);
9020
- const description = speakerDescription(speaker);
9021
- const traits = parseVoiceTraits(description);
9022
- nodes.push({ id, type: "voice_select", params: { description, ...traits } });
9023
- voiceNodeBySpeaker.set(speaker, id);
9502
+ const id = sanitizeId2(`voice_${speaker}`, `voice_${bySpeaker.size}`);
9503
+ const description = describe(speaker);
9504
+ nodes.push({ id, type: "voice_select", params: { description, ...parseVoiceTraits(description) } });
9505
+ bySpeaker.set(speaker, id);
9024
9506
  return id;
9025
9507
  };
9508
+ }
9509
/**
 * Emits the nodes for one spoken phrase whose speaker is shown on screen: a
 * video_generate clip (with generated audio), an ffmpeg audio extract, and an
 * audio_voice_convert node. Records the voice track, segment, talking-scene
 * metadata, and a per-scene slice of the clip for every shown scene.
 *
 * @param phrase `{ speaker, start_s, end_s, text, shownScenes }`; the first
 *   entry of `shownScenes` is the anchor scene that names the nodes.
 * @param voiceNode Id of the voice_select node to convert the speech to.
 * @param env Shared build environment (blueprint, slots, opts, ar, reuse,
 *   cameraOn, casts, ingestCache).
 * @param nodes Canvas node list, appended to in place.
 * @param out Accumulator with voTracks, voSegments, talkingScenes, sceneSlice.
 * @returns {void} Returns early, emitting nothing, when the anchor scene is missing.
 */
function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
  const scenes = env.blueprint.scenes;
  const shown = phrase.shownScenes;
  const anchor = shown[0];
  const anchorScene = scenes[anchor];
  if (!anchorScene) return;

  const present = slotsForScene(env.slots, anchor);
  const nativeTurn = {
    sceneIndex: anchor,
    speaker: phrase.speaker,
    start_s: phrase.start_s,
    end_s: phrase.end_s,
    text: phrase.text,
    voiceNode,
    native: true
  };
  const mode = sceneShootMode(anchorScene, present, nativeTurn, env.cameraOn, env.casts);
  const ctx = {
    sceneIndex: anchor,
    ar: env.ar,
    reuse: env.reuse,
    imageModel: env.opts.imageModel,
    shootMode: mode,
    ingestCache: env.ingestCache
  };

  // Start keyframe comes from the anchor scene, end keyframe from the last shown scene.
  const first = buildFrameRef(
    "start",
    anchorScene.start_frame_asset?.url,
    anchorScene.start_frame_prompt,
    slotsForFrame(env.slots, anchor, "start"),
    ctx,
    nodes
  );
  const lastShown = shown[shown.length - 1] ?? anchor;
  const lastScene = scenes[lastShown] ?? anchorScene;
  const last = buildFrameRef(
    "end",
    lastScene.end_frame_asset?.url,
    lastScene.end_frame_prompt,
    slotsForFrame(env.slots, lastShown, "end"),
    ctx,
    nodes
  );

  // The clip must cover the speech and every shown scene's full window.
  let clipStart = phrase.start_s;
  let coverEnd = phrase.end_s;
  for (const s of shown) {
    clipStart = Math.min(clipStart, scenes[s]?.start_s ?? phrase.start_s);
    coverEnd = Math.max(coverEnd, scenes[s]?.end_s ?? 0);
  }
  const phraseLen = Math.max(0.5, coverEnd - clipStart);
  const genDur = ceilToSeedance(phraseLen);

  const clipId = `s${anchor}_clip`;
  const extractId = `s${anchor}_voextract`;
  const convId = `s${anchor}_conv`;
  const clipRef = `$ref:${clipId}.video`;

  nodes.push({
    id: clipId,
    type: "video_generate",
    inputs: { first_frame: first, last_frame: last },
    params: {
      model: env.opts.videoModel,
      prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text),
      duration: genDur,
      generate_audio: true,
      ...(env.ar ? { aspect_ratio: env.ar } : {})
    }
  });

  // Extract only the spoken stretch, clamped so it never reads past the generated clip.
  const speechOffset = Math.max(0, phrase.start_s - clipStart);
  const speechLen = Math.max(0.5, phrase.end_s - phrase.start_s);
  const extractLen = Math.min(speechLen, Math.max(0.5, genDur - speechOffset));
  nodes.push({
    id: extractId,
    type: "ffmpeg",
    inputs: { clip: clipRef },
    params: { args: audioExtractArgs(extractLen, speechOffset), outputs: { audio: { kind: "audio", ext: "mp3" } } }
  });
  nodes.push({
    id: convId,
    type: "audio_voice_convert",
    inputs: { audio: `$ref:${extractId}.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
    params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
  });

  out.voTracks.push({
    slot: convId,
    ref: `$ref:${convId}.audio`,
    start_s: phrase.start_s,
    end_s: phrase.end_s,
    kind: "vo"
  });
  out.voSegments.push({
    slot: convId,
    start_s: phrase.start_s,
    end_s: phrase.end_s,
    scene: anchor,
    speaker: phrase.speaker
  });
  out.talkingScenes.push({
    scene: anchor,
    voice_convert_node: convId,
    scene_s: Math.round(phraseLen * 100) / 100,
    est_speech_s: Math.round(estSpeechS(phrase.text) * 100) / 100
  });

  shown.forEach((s) => {
    const sc = scenes[s];
    if (!sc) return;
    const rawOffset = (sc.start_s ?? clipStart) - clipStart;
    out.sceneSlice.set(s, {
      clipRef,
      // Offsets under 0.05s are snapped to 0.
      offset: rawOffset < 0.05 ? 0 : rawOffset,
      len: sceneDurationS(sc),
      clipDur: genDur
    });
  });
}
9620
/**
 * Emits a tts node for a phrase and records its voice track and segment.
 *
 * @param phrase `{ speaker, text, start_s, end_s, firstScene }`.
 * @param voiceNode Id of the voice_select node supplying `voice_id`.
 * @param idx Index used to build the node id (`vo_ph<idx>_<speaker>`).
 * @param used Set of ids already taken; the chosen id is added to it.
 * @param nodes Canvas node list, appended to in place.
 * @param out Accumulator with `voTracks` and `voSegments`.
 */
function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
  const { speaker, text, start_s, end_s, firstScene } = phrase;
  let id = sanitizeId2(`vo_ph${idx}_${speaker}`, `vo_ph${idx}`);
  // Keep appending "_x" until the id is free.
  while (used.has(id)) {
    id = `${id}_x`;
  }
  used.add(id);

  const ttsNode = {
    id,
    type: "tts",
    inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
    params: { model: FIXED_TTS_MODEL, text, voice: "{{voice_ref}}" }
  };
  nodes.push(ttsNode);

  const track = { slot: id, ref: `$ref:${id}.audio`, start_s, end_s, kind: "vo" };
  out.voTracks.push(track);

  const segment = { slot: id, start_s, end_s, scene: firstScene, speaker };
  out.voSegments.push(segment);
}
9639
/**
 * Emits one composite scene into the timeline. If exactly one canonical
 * on-camera speaker talks in the scene, a native turn is built and recorded in
 * `out.talkingScenes`. If two or more do, their lines are emitted as tts
 * instead.
 *
 * @param composite Composition descriptor, passed through to emitCompositeScene.
 * @param scene The blueprint scene.
 * @param i Scene index.
 * @param isLast Whether this is the final scene.
 * @param env Shared build environment.
 * @param canonical Maps a raw speaker label to its canonical label.
 * @param ensureVoiceNode Returns the voice node id for a speaker.
 * @param usedVoIds Set of tts node ids already taken.
 * @param nodes Canvas node list, appended to in place.
 * @param out Timeline accumulator.
 */
function emitCompositeInTimeline(composite, scene, i, isLast, env, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
  const speakerOf = (l) => l.speaker ?? "voiceover";
  const present = slotsForScene(env.slots, i);

  // Non-empty lines spoken by an on-camera speaker.
  const onCam = [];
  for (const l of scene.dialogue ?? []) {
    if (l.line?.trim() && isOnCameraSpeaker(speakerOf(l), env.casts, env.cameraOn)) onCam.push(l);
  }
  const distinctSpeakers = new Set();
  for (const l of onCam) distinctSpeakers.add(canonical(speakerOf(l)));

  let nativeTurn;
  if (onCam.length > 0 && distinctSpeakers.size === 1) {
    const head = onCam[0];
    const tail = onCam[onCam.length - 1];
    const speaker = canonical(speakerOf(head));
    const voiceNode = ensureVoiceNode(speaker);
    const start = head.start_s ?? scene.start_s ?? 0;
    const end = tail.end_s ?? scene.end_s ?? start;
    const text = onCam.map((l) => l.line.trim()).join(" ");
    nativeTurn = { sceneIndex: i, speaker, start_s: start, end_s: end, text, voiceNode, native: true };
    out.talkingScenes.push({
      scene: i,
      voice_convert_node: `${voiceNode}_conv`,
      scene_s: Math.round(sceneDurationS(scene) * 100) / 100,
      est_speech_s: Math.round(estSpeechS(text) * 100) / 100
    });
  }

  const mode = sceneShootMode(scene, present, nativeTurn, env.cameraOn, env.casts);
  const lengths = sceneTiming(scene, isLast, nativeTurn);
  const renderOpts = {
    ar: env.ar,
    reuse: env.reuse,
    imageModel: env.opts.imageModel,
    videoModel: env.opts.videoModel
  };
  emitCompositeScene(
    composite,
    scene,
    i,
    present,
    mode,
    nativeTurn,
    lengths,
    lengths.out,
    renderOpts,
    nodes,
    out.voTracks,
    out.nativeSegments,
    out.clips
  );

  if (nativeTurn || distinctSpeakers.size < 2) return;
  emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out);
}
9681
/**
 * Voices a multi-speaker composite scene with tts: all lines of one canonical
 * speaker are merged into a single phrase spanning that speaker's earliest
 * start to latest end, and one tts node is emitted per speaker.
 *
 * @param onCam On-camera dialogue lines; each is expected to have a non-empty `line`.
 * @param scene The blueprint scene (its `start_s` is the fallback start).
 * @param i Scene index; also used as the tts id index.
 * @param canonical Maps a raw speaker label to its canonical label.
 * @param ensureVoiceNode Returns the voice node id for a speaker.
 * @param usedVoIds Set of tts node ids already taken.
 * @param nodes Canvas node list, appended to in place.
 * @param out Timeline accumulator.
 */
function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
  const spans = new Map();
  for (const l of onCam) {
    const speaker = canonical(l.speaker ?? "voiceover");
    const text = l.line.trim();
    const start = l.start_s ?? scene.start_s ?? 0;
    // No explicit end: estimate it from the text length.
    const end = l.end_s ?? start + estSpeechS(text);
    const agg = spans.get(speaker);
    if (!agg) {
      spans.set(speaker, { lines: [text], start, end });
      continue;
    }
    agg.lines.push(text);
    agg.start = Math.min(agg.start, start);
    agg.end = Math.max(agg.end, end);
  }

  spans.forEach((agg, speaker) => {
    const voiceNode = ensureVoiceNode(speaker);
    const phrase = {
      speaker,
      start_s: agg.start,
      end_s: agg.end,
      text: agg.lines.join(" "),
      firstScene: i,
      shownScenes: [],
      presenterShown: false
    };
    emitPhraseTts(phrase, voiceNode, i, usedVoIds, nodes, out);
  });
}
9717
/**
 * Emits a scene with no native speaking turn: either a flash hold (non-ambient
 * scene no longer than FLASH_HOLD_MAX_S) or a generated clip between a start
 * and end keyframe, optionally with native ambient audio.
 *
 * @param scene The blueprint scene.
 * @param i Scene index.
 * @param isLast Whether this is the final scene.
 * @param env Shared build environment.
 * @param nodes Canvas node list, appended to in place.
 * @param out Timeline accumulator (`clips`, `voTracks`, `nativeSegments`).
 * @param prevEndFrame End-frame ref of the previous scene, reused as this
 *   scene's start frame when `scene.continues_previous` is set.
 * @returns This scene's end-frame ref, or undefined for a flash hold.
 */
function emitBrollScene(scene, i, isLast, env, nodes, out, prevEndFrame) {
  const present = slotsForScene(env.slots, i);
  const mode = sceneShootMode(scene, present, void 0, env.cameraOn, env.casts);
  const ambientBroll = Boolean(env.opts.ambient) && mode !== "ugc_selfie";
  const lengths = sceneTiming(scene, isLast, void 0);
  const ctx = {
    sceneIndex: i,
    ar: env.ar,
    reuse: env.reuse,
    imageModel: env.opts.imageModel,
    shootMode: mode,
    ingestCache: env.ingestCache
  };

  const isFlash = !ambientBroll && lengths.dur <= FLASH_HOLD_MAX_S;
  if (isFlash) {
    emitFlashHold(i, scene, env.slots, ctx, lengths, lengths.out, env.ar, nodes, out.clips);
    return void 0;
  }

  // A continuation scene reuses the previous end frame instead of building its own start frame.
  let first;
  if (scene.continues_previous && prevEndFrame) {
    first = prevEndFrame;
  } else {
    first = buildFrameRef(
      "start",
      scene.start_frame_asset?.url,
      scene.start_frame_prompt,
      slotsForFrame(env.slots, i, "start"),
      ctx,
      nodes
    );
  }
  const last = buildFrameRef(
    "end",
    scene.end_frame_asset?.url,
    scene.end_frame_prompt,
    slotsForFrame(env.slots, i, "end"),
    ctx,
    nodes
  );

  const { dur, trimTarget, genDur } = lengths;
  const clip = emitSceneClip(
    i,
    scene,
    present,
    mode,
    void 0,
    ambientBroll,
    { first, last },
    { dur, trimTarget, genDur },
    lengths.out,
    { ar: env.ar, videoModel: env.opts.videoModel },
    nodes
  );
  if (ambientBroll) {
    emitSceneNativeAudio(i, scene, void 0, true, { dur, speech: 0, genDur }, nodes, out.voTracks, out.nativeSegments);
  }
  out.clips.push(clip);
  return last;
}
9778
+ function buildTimeline(blueprint, slots, opts, nodes) {
9779
+ const reuse = opts.frames === "reuse";
9780
+ const compositeScenes = /* @__PURE__ */ new Set();
9781
+ if (!reuse) {
9782
+ blueprint.scenes.forEach((s, i) => {
9783
+ if (layeredComposition(s)) compositeScenes.add(i);
9784
+ });
9785
+ }
9786
+ const canonical = collapseVoiceover(blueprint);
9787
+ const ensureVoiceNode = makeVoiceFactory(blueprint, canonical, nodes);
9788
+ const env = {
9789
+ blueprint,
9790
+ slots,
9791
+ opts,
9792
+ ar: aspectRatioParam(blueprint),
9793
+ reuse,
9794
+ cameraOn: onCameraDialogue(blueprint),
9795
+ casts: castIdSet(blueprint),
9796
+ ingestCache: /* @__PURE__ */ new Map()
9797
+ };
9798
+ const out = {
9799
+ clips: [],
9800
+ voTracks: [],
9801
+ voSegments: [],
9802
+ talkingScenes: [],
9803
+ nativeSegments: [],
9804
+ sceneSlice: /* @__PURE__ */ new Map()
9805
+ };
9806
+ const presenterPresent = makePresenterPresent(slots, canonical);
9807
+ const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent);
9026
9808
  const usedVoIds = /* @__PURE__ */ new Set();
9027
- blueprint.scenes.forEach((scene, sceneIndex) => {
9028
- const lines = (scene.dialogue ?? []).filter((l) => Boolean(l.line?.trim())).slice().sort((a, b) => (a.start_s ?? 0) - (b.start_s ?? 0));
9029
- if (lines.length === 0) return;
9030
- const groups = [];
9031
- for (const line of lines) {
9032
- const speaker = line.speaker ?? "voiceover";
9033
- const last = groups[groups.length - 1];
9034
- if (last && last.speaker === speaker) last.lines.push(line);
9035
- else groups.push({ speaker, lines: [line] });
9036
- }
9037
- const shells = groups.map((group) => {
9038
- const first = group.lines[0];
9039
- const last = group.lines[group.lines.length - 1];
9040
- if (!first || !last) return void 0;
9041
- return {
9042
- group,
9043
- start: first.start_s ?? scene.start_s ?? 0,
9044
- end: last.end_s ?? last.start_s ?? scene.end_s ?? first.start_s ?? scene.start_s ?? 0,
9045
- onCamera: isOnCameraSpeaker(group.speaker, casts, cameraOn)
9046
- };
9047
- }).filter((s) => Boolean(s));
9048
- const onCamCount = shells.filter((s) => s.onCamera).length;
9049
- const list = [];
9050
- shells.forEach((shell, gi) => {
9051
- const { group, start, end, onCamera } = shell;
9052
- const voiceNode = ensureVoiceNode(group.speaker);
9053
- const text = group.lines.map((l) => l.line.trim()).join(" ");
9054
- const native = onCamera && onCamCount === 1;
9055
- const turn = {
9056
- sceneIndex,
9057
- speaker: group.speaker,
9058
- start_s: start,
9059
- end_s: end,
9060
- text,
9061
- voiceNode,
9062
- native
9063
- };
9064
- if (!native) {
9065
- let id = sanitizeId2(`vo_s${sceneIndex}_${group.speaker}`, `vo_${sceneIndex}_${gi}`);
9066
- if (usedVoIds.has(id)) {
9067
- let n = 2;
9068
- while (usedVoIds.has(`${id}_${n}`)) n++;
9069
- id = `${id}_${n}`;
9070
- }
9071
- usedVoIds.add(id);
9809
+ const claimed = /* @__PURE__ */ new Set();
9810
+ phrases.forEach((phrase, k) => {
9811
+ const voiceNode = ensureVoiceNode(phrase.speaker);
9812
+ const available = phrase.shownScenes.filter((s) => !claimed.has(s));
9813
+ if (phrase.presenterShown && available.length > 0) {
9814
+ for (const s of available) claimed.add(s);
9815
+ emitPhraseClip({ ...phrase, shownScenes: available }, voiceNode, env, nodes, out);
9816
+ } else {
9817
+ emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out);
9818
+ }
9819
+ });
9820
+ const lastIndex = blueprint.scenes.length - 1;
9821
+ let prevEndFrame;
9822
+ blueprint.scenes.forEach((scene, i) => {
9823
+ const composite = compositeScenes.has(i) ? layeredComposition(scene) : null;
9824
+ if (composite) {
9825
+ emitCompositeInTimeline(
9826
+ composite,
9827
+ scene,
9828
+ i,
9829
+ i === lastIndex,
9830
+ env,
9831
+ canonical,
9832
+ ensureVoiceNode,
9833
+ usedVoIds,
9834
+ nodes,
9835
+ out
9836
+ );
9837
+ prevEndFrame = void 0;
9838
+ return;
9839
+ }
9840
+ const slice = out.sceneSlice.get(i);
9841
+ if (slice) {
9842
+ const whole = slice.offset === 0 && Math.abs(slice.len - slice.clipDur) <= 0.05;
9843
+ if (whole) {
9844
+ out.clips.push({ ref: slice.clipRef, scene_s: slice.len, out: null });
9845
+ } else {
9072
9846
  nodes.push({
9073
- id,
9074
- type: "tts",
9075
- inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
9076
- params: { model: FIXED_TTS_MODEL, text, voice: "{{voice_ref}}" }
9847
+ id: `s${i}_seg`,
9848
+ type: "ffmpeg",
9849
+ inputs: { clip: slice.clipRef },
9850
+ params: { args: trimArgs(slice.len, slice.offset), outputs: { video: { kind: "video", ext: "mp4" } } }
9077
9851
  });
9078
- turn.ttsId = id;
9079
- const audioRef = `$ref:${id}.audio`;
9080
- tracks.push({ slot: id, ref: audioRef, start_s: start, end_s: end, kind: "vo" });
9852
+ out.clips.push({ ref: `$ref:s${i}_seg.video`, scene_s: slice.len, out: null });
9081
9853
  }
9082
- list.push(turn);
9083
- });
9084
- sceneTurns.set(sceneIndex, list);
9854
+ prevEndFrame = void 0;
9855
+ return;
9856
+ }
9857
+ prevEndFrame = emitBrollScene(scene, i, i === lastIndex, env, nodes, out, prevEndFrame);
9085
9858
  });
9086
- return { tracks, sceneTurns };
9859
+ const totalMs = Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3);
9860
+ out.voTracks.push(...buildPerSpeakerVoiceConversion(out.nativeSegments, totalMs, nodes));
9861
+ return { clips: out.clips, voTracks: out.voTracks, vo_segments: out.voSegments, talking_scenes: out.talkingScenes };
9087
9862
  }
9088
9863
  function buildSfxMusic(blueprint, nodes) {
9089
9864
  const tracks = [];
@@ -9160,18 +9935,48 @@ function positionClass(position) {
9160
9935
  const p = (position ?? "bottom_center").toLowerCase().replace(/[^a-z]+/g, "-");
9161
9936
  return `pos-${p}`;
9162
9937
  }
9163
- function overlayElement(ov, sceneStart) {
9938
/**
 * Collects every text overlay across all scenes as a timed caption.
 *
 * A scene whose `overlays` fail schema validation contributes nothing, and
 * overlays with blank text are skipped.
 *
 * @param blueprint Object with a `scenes` array.
 * @returns Array of `{ text, at, end, ov }` sorted by `at`. `at` defaults to
 *   the scene start and the duration defaults to 2.5 seconds.
 */
function collectCaptions(blueprint) {
  const captions = [];
  for (const scene of blueprint.scenes) {
    const parsed = z3.array(Overlay).safeParse(scene.overlays ?? []);
    if (!parsed.success) continue;
    const sceneStart = scene.start_s ?? 0;
    for (const ov of parsed.data) {
      if (!ov.text?.trim()) continue;
      const at = ov.appears_at_s ?? sceneStart;
      captions.push({ text: ov.text.trim(), at, end: at + (ov.duration_s ?? 2.5), ov });
    }
  }
  return captions.sort((a, b) => a.at - b.at);
}
9948
/**
 * Merges captions with identical text that overlap or sit within 0.35 seconds
 * of each other into one caption spanning both.
 *
 * @param blueprint Passed to collectCaptions.
 * @returns Array of merged `{ text, at, end, ov }` sorted by `at`. Each merged
 *   caption keeps the `ov` and `at` of its first occurrence.
 */
function mergeCaptions(blueprint) {
  // Bucket captions by text; each bucket keeps the time order from collectCaptions.
  const buckets = new Map();
  for (const cap of collectCaptions(blueprint)) {
    const bucket = buckets.get(cap.text);
    if (bucket) {
      bucket.push(cap);
    } else {
      buckets.set(cap.text, [cap]);
    }
  }

  const merged = [];
  buckets.forEach((bucket) => {
    let open = null;
    bucket.forEach((cap) => {
      if (open && cap.at <= open.end + 0.35) {
        // Close enough to the running caption: extend it.
        open.end = Math.max(open.end, cap.end);
        return;
      }
      open = { ...cap };
      merged.push(open);
    });
  });
  return merged.sort((a, b) => a.at - b.at);
}
9968
+ function overlayElement(ov, at, dur) {
9164
9969
  if (!ov.text?.trim()) return "";
9165
- const at = ov.appears_at_s ?? sceneStart;
9166
- const dur = ov.duration_s ?? 2.5;
9167
9970
  const role = ov.role ? ` data-role="${escapeHtml(ov.role)}"` : "";
9168
9971
  const normAnim = normalizeAnim(ov.animation);
9169
9972
  const anim = normAnim ? ` data-anim="${normAnim}"` : "";
9170
9973
  const detail = ov.animation_detail ? ` data-anim-detail="${escapeHtml(ov.animation_detail)}"` : "";
9171
9974
  return `<div class="ov ${positionClass(ov.position)}" data-start="${at}" data-dur="${dur}"${role}${anim}${detail}>${escapeHtml(ov.text.trim())}</div>`;
9172
9975
  }
9976
+ var RICH_OVERLAY_RE = /notif|tweet|\bx post\b|post\b|comment|message|chat|bubble|card|review|rating|stat|counter|toast|popup/;
9173
9977
  function sourceHint(fe) {
9174
9978
  const desc = fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element";
9979
+ const haystack = `${fe.kind ?? ""} ${fe.description ?? ""} ${fe.what_it_represents ?? ""}`.toLowerCase();
9175
9980
  switch ((fe.kind ?? "").toLowerCase()) {
9176
9981
  case "logo":
9177
9982
  return "baker images logo <domain> (or baker images library)";
@@ -9181,6 +9986,9 @@ function sourceHint(fe) {
9181
9986
  case "product_cutout":
9182
9987
  return `baker images library "${desc}" (the client's own product)`;
9183
9988
  default:
9989
+ if (RICH_OVERLAY_RE.test(haystack)) {
9990
+ return `npx hyperframes add <social-card/notification block> for "${desc}" (animated overlay, not a static icon \u2014 see references/hyperframes/catalog.md)`;
9991
+ }
9184
9992
  return `baker images icon "${desc}"`;
9185
9993
  }
9186
9994
  }
@@ -9215,14 +10023,12 @@ function buildOverlayHtml(input) {
9215
10023
  " Positions: edit the .pos-* classes or add your own. -->"
9216
10024
  ].join("\n")
9217
10025
  ];
10026
+ const ovParts = mergeCaptions(blueprint).map((e) => overlayElement(e.ov, e.at, Math.round((e.end - e.at) * 1e3) / 1e3)).filter(Boolean);
10027
+ if (ovParts.length > 0) blocks.push(ovParts.join("\n"));
9218
10028
  for (const scene of blueprint.scenes) {
9219
10029
  const sceneStart = scene.start_s ?? 0;
9220
- const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
9221
10030
  const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
9222
- const parts = [
9223
- ...overlays.success ? overlays.data.map((ov) => overlayElement(ov, sceneStart)) : [],
9224
- ...floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []
9225
- ].filter(Boolean);
10031
+ const parts = (floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []).filter(Boolean);
9226
10032
  if (parts.length > 0) blocks.push(parts.join("\n"));
9227
10033
  }
9228
10034
  return blocks.join("\n\n");
@@ -9255,15 +10061,15 @@ function xfadeSpineArgs(clips) {
9255
10061
  let cur = "c0";
9256
10062
  let accLen = clipInputLen(clips[0]);
9257
10063
  for (let k = 0; k < n - 1; k++) {
9258
- const join3 = clips[k].out;
10064
+ const join4 = clips[k].out;
9259
10065
  const next = `c${k + 1}`;
9260
10066
  const out = k === n - 2 ? "v" : `j${k + 1}`;
9261
- if (join3) {
9262
- const offset = Math.max(0, accLen - join3.dur);
10067
+ if (join4) {
10068
+ const offset = Math.max(0, accLen - join4.dur);
9263
10069
  filt.push(
9264
- `[${cur}][${next}]xfade=transition=${join3.xfade}:duration=${join3.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
10070
+ `[${cur}][${next}]xfade=transition=${join4.xfade}:duration=${join4.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
9265
10071
  );
9266
- accLen = accLen - join3.dur + clipInputLen(clips[k + 1]);
10072
+ accLen = accLen - join4.dur + clipInputLen(clips[k + 1]);
9267
10073
  } else {
9268
10074
  filt.push(`[${cur}][${next}]concat=n=2:v=1[${out}]`);
9269
10075
  accLen += clipInputLen(clips[k + 1]);
@@ -9305,9 +10111,7 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
9305
10111
  });
9306
10112
  });
9307
10113
  if (opts.actorSheets) applyActorSheets(slots, nodes);
9308
- const { tracks: ttsTracks, sceneTurns } = buildDialogue(blueprint, nodes);
9309
- const { clips, voTracks: nativeVoTracks } = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
9310
- const voTracks = [...ttsTracks, ...nativeVoTracks];
10114
+ const { clips, voTracks, vo_segments, talking_scenes } = buildTimeline(blueprint, slots, opts, nodes);
9311
10115
  let videoRef = buildSpine(clips, nodes);
9312
10116
  let videoNode = "spine";
9313
10117
  const overlays = blueprint.scenes.flatMap((s) => s.overlays ?? []);
@@ -9384,45 +10188,31 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
9384
10188
  // The timing plan `baker canvas validate` checks before any billed render:
9385
10189
  // sequenced voiceover turns (no overlap), audio ≈ video length, and which
9386
10190
  // scenes must be lip-synced.
9387
- video: buildVideoMeta(blueprint, sceneTurns)
10191
+ video: buildVideoMeta(blueprint, { vo_segments, talking_scenes })
9388
10192
  },
9389
10193
  nodes,
9390
10194
  output: { node: videoNode, output: "video" }
9391
10195
  };
9392
10196
  }
9393
- function buildVideoMeta(blueprint, sceneTurns) {
9394
- const vo_segments = [];
9395
- const talking_scenes = [];
9396
- for (const [scene, turns] of [...sceneTurns.entries()].sort((a, b) => a[0] - b[0])) {
9397
- for (const t of turns) {
9398
- if (t.ttsId) vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
9399
- }
9400
- const nativeTurn = turns.find((t) => t.native);
9401
- if (nativeTurn) {
9402
- const sceneObj = blueprint.scenes[scene];
9403
- talking_scenes.push({
9404
- scene,
9405
- voice_convert_node: `s${scene}_voconv`,
9406
- scene_s: sceneObj ? Math.round(sceneDurationS(sceneObj) * 100) / 100 : void 0,
9407
- est_speech_s: Math.round(estSpeechS(nativeTurn.text) * 100) / 100
9408
- });
9409
- }
9410
- }
10197
+ function buildVideoMeta(blueprint, meta) {
9411
10198
  return {
9412
10199
  duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
9413
- vo_segments,
9414
- talking_scenes,
9415
- motion_board: buildMotionBoard(blueprint, sceneTurns)
10200
+ vo_segments: [...meta.vo_segments].sort((a, b) => a.start_s - b.start_s),
10201
+ talking_scenes: meta.talking_scenes,
10202
+ motion_board: buildMotionBoard(blueprint)
9416
10203
  };
9417
10204
  }
9418
- function buildMotionBoard(blueprint, sceneTurns) {
10205
/**
 * Joins a scene's non-empty, trimmed dialogue lines with single spaces.
 *
 * @param scene Scene object; `dialogue` may be absent.
 * @returns {string|null} The joined text, or null when nothing is spoken.
 */
function sceneSpokenText(scene) {
  const spoken = [];
  for (const entry of scene.dialogue ?? []) {
    const text = entry.line?.trim();
    if (text) spoken.push(text);
  }
  return spoken.length > 0 ? spoken.join(" ") : null;
}
10208
+ function buildMotionBoard(blueprint) {
9419
10209
  const round = (n) => Math.round(n * 100) / 100;
9420
10210
  let cursor = 0;
9421
10211
  return blueprint.scenes.map((scene, i) => {
9422
10212
  const start_s = scene.start_s ?? cursor;
9423
10213
  const end_s = scene.end_s ?? start_s + sceneDurationS(scene);
9424
10214
  cursor = end_s;
9425
- const spoken = (sceneTurns.get(i) ?? []).map((t) => t.text?.trim()).filter((l) => Boolean(l)).join(" ") || null;
10215
+ const spoken = sceneSpokenText(scene);
9426
10216
  const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
9427
10217
  const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
9428
10218
  const graphics = [
@@ -9445,19 +10235,21 @@ function buildMotionBoard(blueprint, sceneTurns) {
9445
10235
  scene: i,
9446
10236
  role: resolveSceneRole(scene, i, blueprint.scenes.length),
9447
10237
  window_s: [round(start_s), round(end_s)],
9448
- storyboard_frames: [`s${i}_start`, `s${i}_end`],
10238
+ // A continuation b-roll scene shares the previous scene's end frame as its start
10239
+ // (no own `s<i>_start` node), so point the storyboard at that shared keyframe.
10240
+ storyboard_frames: [scene.continues_previous && i > 0 ? `s${i - 1}_end` : `s${i}_start`],
9449
10241
  spoken,
9450
10242
  graphics
9451
10243
  };
9452
10244
  });
9453
10245
  }
9454
10246
  var VIDEO_GUIDE = [
9455
- "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192 concatenated. Talking heads are voiced NATIVELY by Seedance (lips+voice generated together) then re-voiced to one brand voice; off-camera narration is sequenced tts. On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voiceover + SFX + a ducked music bed, normalized stereo. It is a DRAFT: edit it, supply the real assets, then validate and run.",
10247
+ "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video, built like an editing timeline. The VOICE is cut at PAUSES, not at visual cuts: each continuous-speech PHRASE is ONE Seedance clip (native lip-sync + audio) re-voiced to one brand voice, so a sentence never breaks mid-word across a cut. Each scene's PICTURE is independent: a scene that SHOWS the speaker slices its window out of the phrase clip; a b-roll cutaway gets its own silent clip (or a still hold for a sub-2s flash) laid over the continuing voice; a pure-voiceover stretch is one ElevenLabs tts read. Every clip gets a CLEAN-PLATE start AND end keyframe (no baked text), RECAST to your dropped reference assets \u2014 Seedance interpolates real in-shot motion between them. Each frame grounds ONLY on its own extracted frame + el_* slots (never another generated frame), so all frames render in PARALLEL (no cross-frame cascade). A SPLIT-SCREEN / PICTURE-IN-PICTURE / KEYED-PRESENTER scene is reproduced as one clip PER REGION, stacked or overlaid (see `metadata.todo.composition`). On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voice + SFX + a ducked music bed, normalized stereo. It is a STARTING POINT, not a locked render: add, delete, reorder, split, merge, or re-time scenes freely (a b-roll cutaway INSIDE a phrase lands at an approximate beat \u2014 nudge it) \u2014 see `metadata.todo.full_flexibility`.",
9456
10248
  "",
9457
10249
  "WHAT TO DO NEXT:",
9458
10250
  "0. RE-CRAFT THE SCRIPT FIRST (don't clone). This reference already won in-market, but copying a video is much harder than a static: the hook is targeting and may not transfer, and the message must become TRUE for our brand. Work the `metadata.todo.script_recraft` checklist \u2014 for each scene judge its role (hook/body/CTA), decide keep/cut/reorder/replace, and re-author every line for OUR customer's pain + OUR offer. See `references/script-craft.md` (hook/body/CTA framework) and the `meta-ads-playbook` skill. Most of the work lives here.",
9459
- "1. Edit each frame's prompt IN PLACE. Every `s<i>_start` / `s<i>_end` node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want.",
9460
- "1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. The boundary frames ARE your storyboard; `metadata.video.motion_board` lays out each scene's frames, time window, spoken line, and the graphics scheduled in it. Lock the frames + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
10251
+ "1. Edit each frame's prompt IN PLACE. Every `s<i>_start` keyframe node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want. The frame is RECAST to the el_* reference images you drop (the source ad's people are never reused), so describe pose/action/framing here and let the references carry identity.",
10252
+ "1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. Each scene's keyframe IS your storyboard; `metadata.video.motion_board` lays out each scene's frame, time window, spoken line, and the graphics scheduled in it. Lock the keyframes + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
9461
10253
  "2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent. `baker canvas run` REFUSES to start until every `[TODO]` slot holds a real source \u2014 so this is mandatory, not optional.",
9462
10254
  "3. Talking heads are NATIVE: a scene with one on-camera speaker is voiced by Seedance itself (the line is in the clip's prompt + `generate_audio`), so lips and voice are generated together \u2014 no separate tts, no post-hoc lip-sync. Edit the line in the scene's `s<i>_clip` prompt to re-author the words TRUE for your brand. Off-camera narration scenes still use a sequenced `tts` per turn.",
9463
10255
  "4. Voice consistency: every native talking clip's audio is re-voiced to ONE brand voice via `audio_voice_convert` (timing preserved \u2192 lips stay matched). Confirm the `voice_select` casting (one per speaker) \u2014 its `voice_id` is the brand voice; set its gender/language so the voice matches the creator.",
@@ -9468,11 +10260,11 @@ var VIDEO_GUIDE = [
9468
10260
  "- Iterate cheaply, do NOT brute-force takes: credits are scarce. Strong inputs (locked reference + a precise move-by-move prompt) make 1\u20132 takes land. When a take misses, change the SMALLEST thing and re-run \u2014 the cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
9469
10261
  "- Keep clips SHORT (trust the scene-length trim \u2014 don't pad) and LOCK THE CAMERA: unmotivated AI drift is the top tell.",
9470
10262
  "- One generation = one speaking voice \u2014 Seedance can't voice two on-camera speakers in one clip; split a two-speaker beat into separate scenes (or one stays silent). Lip-sync follows the line verbatim; emotion in [brackets].",
9471
- "- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it). Match-cut continuous action by setting scene N+1's start frame = scene N's end frame (costs no extra gens).",
10263
+ "- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it).",
9472
10264
  "- Nail the pack shot (final product hero \u2014 motivated move, matched light). Geometry drifting? drop a high-angle SCHEMATIC still as an extra reference (a map beats a paragraph). Before/after (dry\u2192wet) = two separate locked references, not words mid-scene.",
9473
10265
  "- The hook reads SOUND-OFF: scene 1's frame + overlay carry it in ~1s \u2014 don't bury the hook in the spoken line (meta-ads-playbook \xA739/\xA748).",
9474
10266
  "",
9475
- "Tip: `prompt.json` is the deconstruction provenance + the demoted GLOBAL STYLE REFERENCE each frame reads for shared palette/cast cohesion. It is NOT the per-frame editing surface \u2014 the frame nodes are.",
10267
+ "Tip: `prompt.json` is the deconstruction provenance + the authoritative SHARED AD SPEC each frame reads for cast identity, palette, brand, and type cohesion. The per-frame editing surface is the frame node's own FRAME DESCRIPTION.",
9476
10268
  "Production craft: references/video-craft.md. Hook/message/script-structure layer: references/script-craft.md + the meta-ads-playbook skill."
9477
10269
  ].join("\n");
9478
10270
  function inferNarrativeRole(index, total) {
@@ -9514,14 +10306,16 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
9514
10306
  const hookSceneIndex = findHookSceneIndex(blueprint);
9515
10307
  const h = hookSceneIndex;
9516
10308
  return {
10309
+ full_flexibility: "THIS CANVAS IS A STARTING POINT, NOT A LOCKED RENDER. It mirrors the reference's structure so you have a faithful scaffold \u2014 but you have FULL EDITING FREEDOM and should use it. You can: ADD a scene (new s<i>_start/_end + s<i>_clip + wire it into `spine`), DELETE a scene (drop its nodes + its `spine` input), REORDER scenes, SPLIT one beat into two or MERGE two into one, change any frame prompt or motion brief, swap an element reference, re-time or rewrite any overlay/voice, or change a scene's LAYOUT (make a full-frame beat a split-screen/PIP, or flatten a composite to one shot \u2014 see `composition`). Re-craft for OUR brand and OUR best ad; the reference is inspiration, not a spec to trace. The content-addressed cache re-bills only what you actually change, so iterate freely. `baker canvas validate` re-checks timing/lip-sync after any edit.",
10310
+ composition: "Some scenes are COMPOSITED, not single shots \u2014 `prompt.json`'s scene.composition.layout tells you which: `split_screen` (panels each showing different footage \u2014 e.g. b-roll on top, presenter on the bottom), `pip` (a presenter boxed in a corner over full-frame background), or `keyed_overlay` (a green-screen/cut-out presenter over background). Each is reproduced as ONE generated clip PER REGION (`s<i>_r0_*`, `s<i>_r1_*`, \u2026) stacked (vstack/hstack) or overlaid by an `s<i>_composite` ffmpeg node; a keyed presenter runs through `s<i>_key` (video_background_remove) for a transparent cut-out first. Edit each region's own keyframe prompt + motion brief independently. The presenter region (is_presenter) carries the lip-synced voice. To CHANGE a layout, edit composition in prompt.json and re-scaffold, or hand-edit the s<i>_composite ffmpeg args (splitStackArgs/pipOverlayArgs patterns). A clean full-frame talking head is simpler than a composite \u2014 flatten when the brand's version doesn't need the split.",
9517
10311
  recraft_the_script_first: `VIDEO IS INSPIRATION, NOT A CLONE. Before rendering, re-craft the script (hook \u2192 body \u2192 CTA) for OUR brand \u2014 the reference already won in-market, but its hook is targeting and may not transfer.${h >= 0 ? " The HOOK is the #1 decision (see the `hook` todo);" : ""} ${h >= 0 ? "then work" : "Work"} the per-scene \`script_recraft\` checklist. References: references/hook-craft.md (the hook), references/script-craft.md (body/CTA) + the meta-ads-playbook skill.`,
9518
10312
  ...h >= 0 ? {
9519
10313
  hook: `THE HOOK IS THE HIGHEST-LEVERAGE BEAT \u2014 the first frame + first 3\u20134s decide whether the ad is watched at all, and the hook is TARGETING. But highest-leverage does NOT mean always rewrite: this hook already won, so MOST OF THE TIME you KEEP it and build on top (swap only the specifics). REBUILD is the exception \u2014 only when it doesn't transfer (a claim we lack or a different funnel/awareness stage), and then by reaching for its deeper INNER MECHANIC and delivering that truthfully, not inventing a new opener from nothing. For scene ${h}: DIAGNOSE it (device + mechanic + what stage it targets), DECIDE keep/adapt/rebuild, then hold the opener to the criteria \u2014 ${HOOK_OPENER_CRITERIA}. The hook lives across s${h}_start (the scroll-stopping first frame), the scene-${h} overlay text, the s${h}_clip line, an optional ~0.5s micro-hook, and the ramp into the body. Full diagnose\u2192decide\u2192(keep/adapt/rebuild) discipline + the proven hook-type menu: references/hook-craft.md (+ meta-ads-playbook \xA710/\xA717/\xA739).`
9520
10314
  } : {},
9521
10315
  script_recraft: buildScriptRecraft(blueprint),
9522
- edit_frames_in_place: "Each s<i>_start / s<i>_end node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is only a shared style reference. Frames are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
10316
+ edit_frames_in_place: "Each s<i>_start keyframe node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is the authoritative shared ad spec (cast identity, palette, brand). Frames are RECAST to the el_* reference images (the source ad's cast is never reused) and are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
9523
10317
  frames_mode: opts.frames ?? "generate",
9524
- review_storyboard_before_clips: "STORYBOARD FIRST. The per-scene boundary frames (s<i>_start / s<i>_end) ARE your storyboard \u2014 align the LOOK on them before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's frames, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
10318
+ review_storyboard_before_clips: "STORYBOARD FIRST. Each scene's keyframe (s<i>_start) IS your storyboard \u2014 align the LOOK on it before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's keyframe, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
9525
10319
  motion_board: "`metadata.video.motion_board` maps which overlay/graphic fires on which spoken beat (per scene: window_s, spoken line, each graphic's at_s). Review it so every graphic lands on the word it punctuates \u2014 don't let an overlay drift off its beat. Adjust an overlay's data-start in video-overlay-composition/index.html (or appears_at_s in prompt.json) to retime it.",
9526
10320
  assets_required: "MANDATORY: drop a real image at every el_* [TODO] ingest before running \u2014 `baker canvas run` refuses to start (and bills nothing) until each placeholder holds a real source.",
9527
10321
  sourcing: 'Resolve each el_* [TODO]: (1) the client\'s OWN assets first \u2014 `baker images library "<subject>"` (e.g. the company owner as the actor) and `baker images logo <domain>` for the brand mark; (2) otherwise search a FRESH real reference \u2014 `baker images find "<subject>" --sources pinterest` (Pinterest is photo-real/candid, best for people & sets) or generate one with the image model. el_creator_* and el_location are [SOURCE FRESH]: cast a DIFFERENT person/animal/set than the original ad \u2014 never the source\'s individual.',
@@ -9534,18 +10328,17 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
9534
10328
  voice_description: d.voice_description,
9535
10329
  line: d.line
9536
10330
  })),
9537
- talking_head_note: "NATIVE: a single-on-camera-speaker scene is voiced by Seedance itself (line in s<i>_clip prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Edit the line in the clip's prompt to re-author it.",
9538
- voice_note: "Every native talking clip's audio is re-voiced to ONE brand voice via audio_voice_convert (eleven_multilingual_sts_v2), timing preserved so lips stay matched. voice_select.voice_id is that brand voice \u2014 set its gender/language to match the creator. Off-camera narration uses a sequenced tts per turn.",
9539
- native_timing: "Seedance paces the spoken line to fill the clip, so each native talking clip is generated long enough for the estimated speech and its audio is kept full-length (not hard-trimmed to the visual scene) \u2014 the line is never cut mid-word; the voice may continue a beat past the visual cut (natural VO continuity). `metadata.video.talking_scenes` carries each scene's scene_s vs est_speech_s. If a rendered line still sounds clipped, the line is simply longer than the scene: shorten the line or lengthen the scene in the deconstruct.",
10331
+ talking_head_note: "PHRASE-NATIVE: a continuous-speech phrase where the speaker is shown is ONE Seedance clip (the full phrase quoted in s<anchor>_clip's prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Scenes that show the speaker slice their window out of that clip (s<i>_seg); edit the phrase line in the s<anchor>_clip prompt to re-author it. A pure-voiceover phrase (speaker never shown) is one ElevenLabs tts read instead.",
10332
+ voice_note: "ONE voice per person: a single voice_select is reused across all that person's phrases (on-camera AND off \u2014 the deconstruct's `voiceover` label folds into the sole presenter). Each presenter phrase's native audio is re-voiced to that brand voice via audio_voice_convert (eleven_multilingual_sts_v2, one convert per phrase, timing preserved so lips stay matched). Set voice_select.voice_id's gender/language to match the creator.",
10333
+ native_timing: "The voice is cut at PAUSES, not at visual cuts, so a sentence spanning a cut stays one continuous read (no mid-word break). The clip is generated long enough for the estimated speech; if a line runs longer than its phrase window the voice continues a beat into the following pause (natural VO continuity). `metadata.video.talking_scenes` carries each phrase's scene_s vs est_speech_s. CAVEAT: a b-roll cutaway INSIDE a phrase lands at an approximate (proportional) time \u2014 Seedance exposes no word timing \u2014 so if a cutaway is off its beat, nudge the scene boundary (it's a starting point).",
9540
10334
  craft: {
9541
10335
  note: "Production-craft principles that raise every clip's realism. Full rationale: references/video-craft.md (production craft); references/script-craft.md + meta-ads-playbook for the hook/message layer.",
9542
10336
  principles: [
9543
10337
  "Iterate cheaply \u2014 do NOT brute-force takes. Credits are scarce. Strong inputs (a locked reference + a precise move-by-move prompt) make 1\u20132 takes land; bad input = bad output. When a take misses, change the SMALLEST thing and re-run \u2014 the content-addressed cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
9544
10338
  "Keep clips SHORT \u2014 the longer a shot holds, the more the eye catches the AI tell. Trust the scene-length trim; don't pad.",
9545
- "LOCK THE CAMERA \u2014 a first/last-frame clip holds the framing the two frames define; only move when a move is specified. Unmotivated camera drift is the top realism tell.",
10339
+ "LOCK THE CAMERA \u2014 Seedance animates forward from the single keyframe; only move when the motion brief specifies a move. Unmotivated camera drift is the top realism tell.",
9546
10340
  "One generation = one speaking voice. Seedance can't voice two on-camera speakers in one clip \u2014 split a two-speaker beat into separate scenes (or one stays silent). Fragment long dialogue at a natural pause and stitch via the shared boundary frame. Lip-sync follows the line verbatim; delivery cues go in [brackets].",
9547
10341
  "Preview cheap \u2192 finalize high-res (credit discipline). Never prompt the aspect ratio in prose ('make it vertical' \u2192 stretch artifacts) \u2014 the pipeline sets aspect_ratio as a param.",
9548
- "Match-cut continuous action across a cut by reusing the boundary frame: scene N+1's start frame = scene N's end frame (a composition choice, costs no extra gens).",
9549
10342
  "Nail the PACK SHOT \u2014 the final product hero sells (motivated camera move, light matched to the rest of the ad).",
9550
10343
  "Geometry/objects drifting frame to frame? A MAP beats a paragraph \u2014 drop a high-angle schematic still (marked positions) as an extra el_*/reference rather than adding more words.",
9551
10344
  "Before/after (dry\u2192wet, skeptic\u2192believer) = two SEPARATE locked reference images \u2014 don't ask words to transform a subject mid-scene (cheaper and more reliable than re-rolling clips).",
@@ -9659,10 +10452,10 @@ DROP one-off background extras and incidental props \u2014 but the shared set/lo
9659
10452
 
9660
10453
  ONE PERSON, MULTIPLE LOOKS: if a single individual plays MULTIPLE personas or wardrobes (e.g. a creator as a skeptic in one outfit and a believer in another, or a before/after), emit ONE element PER look (so each outfit gets its own real reference image) and link the extra looks to the first via "same_as" \u2014 they are the SAME person, only the wardrobe/state differs, so the face must stay identical.
9661
10454
 
9662
- For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of EVERY scene the element is visually present in \u2014 read each scene's action_detail, its start_frame/end_frame subjects, its dialogue speaker, and its floating_elements. Be generous: if the same person narrates throughout, list every scene index. Output ONLY the JSON object.`;
10455
+ For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of ONLY the scenes where the element is ACTUALLY VISIBLE ON SCREEN \u2014 judged from that scene's start_frame_prompt / end_frame_prompt subjects and its action_detail, NOT from who is merely speaking. A narrator heard over b-roll is NOT present in that b-roll scene; a dog-running cutaway does NOT contain the couch creator just because she talks across it. Do NOT pad the list \u2014 an element wrongly listed in a scene makes the reproduction render the wrong subject there (e.g. the creator appearing in a pure-dog b-roll). When in doubt, leave a scene OUT. Output ONLY the JSON object.`;
9663
10456
  async function loadAssetText2(ref, label) {
9664
10457
  const r = ref;
9665
- if (typeof r?.path === "string") return readFile4(r.path, "utf8");
10458
+ if (typeof r?.path === "string") return readFile5(r.path, "utf8");
9666
10459
  if (typeof r?.url === "string") {
9667
10460
  const res = await fetch(r.url);
9668
10461
  if (!res.ok) throw new Error(`failed to fetch ${label} (${res.status})`);
@@ -9678,6 +10471,31 @@ function parseElements2(raw) {
9678
10471
  }
9679
10472
  return [];
9680
10473
  }
10474
+ async function detectShotCutsBestEffort(videoPath, threshold) {
10475
+ try {
10476
+ const cuts = await detectSceneCutsPySceneDetect(videoPath, threshold ? { threshold } : {});
10477
+ if (cuts.length > 0) {
10478
+ process.stderr.write(`Detected ${cuts.length} shot cut(s) via PySceneDetect: ${cuts.join(", ")}s
10479
+ `);
10480
+ } else {
10481
+ process.stderr.write("PySceneDetect ran but found no hard cuts; using LLM scene boundaries.\n");
10482
+ }
10483
+ return cuts;
10484
+ } catch (e) {
10485
+ const msg = e instanceof Error ? e.message : String(e);
10486
+ const code = e?.code;
10487
+ const missing = code === "ENOENT" || /ENOENT|not found|command not found/i.test(msg);
10488
+ if (missing) {
10489
+ process.stderr.write(
10490
+ "WARNING: `scenedetect` (PySceneDetect) is NOT installed \u2014 falling back to LLM-only scene boundaries, which under-segments (coarse 9-15s scenes instead of the real 1-4s cuts). Install it (`pipx install scenedetect[opencv]` or `pip install scenedetect[opencv]`) for accurate shot-cut detection.\n"
10491
+ );
10492
+ } else {
10493
+ process.stderr.write(`Shot-cut detection skipped (${msg}); using LLM boundaries.
10494
+ `);
10495
+ }
10496
+ return [];
10497
+ }
10498
+ }
9681
10499
  function fail2(code, message) {
9682
10500
  process.stderr.write(`${JSON.stringify({ ok: false, error: { code, message } }, null, 2)}
9683
10501
  `);
@@ -9699,53 +10517,76 @@ function resolveModels2(args) {
9699
10517
  videoModel: pick("video-model", "video_generate", "bytedance/seedance-2.0")
9700
10518
  };
9701
10519
  }
9702
- function buildAnalysisCanvas(videoPath, deconstructModel, selectModel, opts) {
10520
+ function buildDeconstructCanvas(videoPath, deconstructModel, opts) {
9703
10521
  const deconstructParams = { model: deconstructModel, mode: "full" };
9704
10522
  if (typeof opts.maxScenes === "number") deconstructParams.max_scenes = opts.maxScenes;
9705
10523
  if (opts.language) deconstructParams.language = opts.language;
9706
10524
  if (opts.focus) deconstructParams.focus = opts.focus;
10525
+ if (opts.shotCuts && opts.shotCuts.length > 0) deconstructParams.shot_cuts = opts.shotCuts;
10526
+ deconstructParams.max_clip_s = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
9707
10527
  return {
9708
10528
  schema: "baker-canvas/1",
9709
10529
  metadata: { name: "video deconstruct pass" },
9710
10530
  nodes: [
9711
10531
  { id: "src", type: "ingest", params: { source: "path", path: videoPath, expect: "video" } },
9712
- { id: "deconstruct", type: "video_deconstruct", inputs: { video: "$ref:src.asset" }, params: deconstructParams },
10532
+ { id: "deconstruct", type: "video_deconstruct", inputs: { video: "$ref:src.asset" }, params: deconstructParams }
10533
+ ],
10534
+ output: { node: "deconstruct", output: "analysis" }
10535
+ };
10536
+ }
10537
+ function buildSelectCanvas(selectModel, slimmedBlueprintJson) {
10538
+ return {
10539
+ schema: "baker-canvas/1",
10540
+ metadata: { name: "element selection pass" },
10541
+ nodes: [
9713
10542
  {
9714
10543
  id: "select",
9715
10544
  type: "text_generate",
9716
- inputs: { blueprint: "$ref:deconstruct.analysis" },
9717
10545
  params: {
9718
10546
  model: selectModel,
9719
10547
  max_tokens: 6e3,
9720
10548
  temperature: 0,
9721
10549
  response_format: "json_object",
9722
10550
  system: SELECT_SYSTEM2,
9723
- prompt: SELECT_PROMPT2
10551
+ prompt: SELECT_PROMPT2.replace("{{blueprint}}", () => slimmedBlueprintJson)
9724
10552
  }
9725
10553
  }
9726
10554
  ],
9727
10555
  output: { node: "select", output: "text" }
9728
10556
  };
9729
10557
  }
9730
- async function runAnalysisPasses(canvas) {
10558
+ async function runAnalysisPasses(deconstructCanvas, selectModel) {
9731
10559
  const engine = createEngineFromEnv({ log: (line) => process.stderr.write(`${line}
9732
10560
  `) });
9733
- let outputsByNode;
9734
- let creditsSpent;
10561
+ let credits = 0;
10562
+ let sawCredits = false;
10563
+ const addCredits = (stats) => {
10564
+ const c = stats?.total_credits;
10565
+ if (typeof c === "number") {
10566
+ credits += c;
10567
+ sawCredits = true;
10568
+ }
10569
+ };
10570
+ let blueprint;
9735
10571
  try {
9736
- const result = await engine.run(canvas, {});
9737
- outputsByNode = result.outputs_by_node;
9738
- creditsSpent = result.stats?.total_credits;
10572
+ const r1 = await engine.run(deconstructCanvas, {});
10573
+ addCredits(r1.stats);
10574
+ blueprint = JSON.parse(await loadAssetText2(r1.outputs_by_node.deconstruct?.analysis, "deconstruct output"));
9739
10575
  } catch (e) {
9740
10576
  if (e instanceof ValidationError) return fail2("validation", JSON.stringify(e.issues));
10577
+ if (e instanceof SyntaxError) return fail2("read_outputs", e.message);
9741
10578
  return fail2("deconstruct", e instanceof Error ? e.message : String(e));
9742
10579
  }
10580
+ const slimJson = JSON.stringify(slimBlueprintForSelection(blueprint));
9743
10581
  try {
9744
- const blueprint = JSON.parse(await loadAssetText2(outputsByNode.deconstruct?.analysis, "deconstruct output"));
9745
- const elements = parseElements2(await loadAssetText2(outputsByNode.select?.text, "selection output"));
9746
- return { blueprint, elements, creditsSpent };
10582
+ const r2 = await engine.run(buildSelectCanvas(selectModel, slimJson), {});
10583
+ addCredits(r2.stats);
10584
+ const elements = parseElements2(await loadAssetText2(r2.outputs_by_node.select?.text, "selection output"));
10585
+ return { blueprint, elements, creditsSpent: sawCredits ? credits : void 0 };
9747
10586
  } catch (e) {
9748
- return fail2("read_outputs", e instanceof Error ? e.message : String(e));
10587
+ if (e instanceof ValidationError) return fail2("validation", JSON.stringify(e.issues));
10588
+ if (e instanceof SyntaxError) return fail2("read_outputs", e.message);
10589
+ return fail2("deconstruct", e instanceof Error ? e.message : String(e));
9749
10590
  }
9750
10591
  }
9751
10592
  var scaffoldVideoCommand = defineCommand76({
@@ -9766,6 +10607,10 @@ var scaffoldVideoCommand = defineCommand76({
9766
10607
  description: "Lock a recast person/animal that recurs across \u22652 scenes to ONE turnaround sheet grounding every frame"
9767
10608
  },
9768
10609
  "max-scenes": { type: "string", description: "Cap the number of scenes the deconstruct emits" },
10610
+ "shot-threshold": {
10611
+ type: "string",
10612
+ description: "PySceneDetect content threshold (default 18; lower = more/softer cuts, higher = fewer)"
10613
+ },
9769
10614
  language: { type: "string", description: "Transcript/dialogue language hint (e.g. fr, en)" },
9770
10615
  focus: { type: "string", description: "Known provenance/emphasis to ground the deconstruct" },
9771
10616
  "deconstruct-model": { type: "string", description: "Override the video_deconstruct model id" },
@@ -9788,12 +10633,15 @@ var scaffoldVideoCommand = defineCommand76({
9788
10633
  );
9789
10634
  }
9790
10635
  const { deconstructModel, selectModel, imageModel, videoModel } = resolveModels2(args);
9791
- const analysisCanvas = buildAnalysisCanvas(videoPath, deconstructModel, selectModel, {
10636
+ const shotThreshold = args["shot-threshold"] ? Number(args["shot-threshold"]) : void 0;
10637
+ const shotCuts = await detectShotCutsBestEffort(videoPath, shotThreshold);
10638
+ const deconstructCanvas = buildDeconstructCanvas(videoPath, deconstructModel, {
9792
10639
  maxScenes: Number.isFinite(maxScenes) ? maxScenes : void 0,
9793
10640
  language: args.language ? String(args.language) : void 0,
9794
- focus: args.focus ? String(args.focus) : void 0
10641
+ focus: args.focus ? String(args.focus) : void 0,
10642
+ shotCuts
9795
10643
  });
9796
- const { blueprint, elements, creditsSpent } = await runAnalysisPasses(analysisCanvas);
10644
+ const { blueprint, elements, creditsSpent } = await runAnalysisPasses(deconstructCanvas, selectModel);
9797
10645
  await mkdir(outDir, { recursive: true });
9798
10646
  const annotated = annotateBlueprintWithElements(blueprint, elements);
9799
10647
  await writeFile2(blueprintPath, `${JSON.stringify(annotated, null, 2)}
@@ -9802,7 +10650,7 @@ var scaffoldVideoCommand = defineCommand76({
9802
10650
  await cp(SHIPPED_COMPOSITION_DIR, compositionDest, { recursive: true });
9803
10651
  const indexPath = path5.join(compositionDest, "index.html");
9804
10652
  const overlayHtml = buildOverlayHtml(blueprint);
9805
- const indexHtml = await readFile4(indexPath, "utf8");
10653
+ const indexHtml = await readFile5(indexPath, "utf8");
9806
10654
  const injected = indexHtml.replace("<!--OVERLAYS-->", () => overlayHtml);
9807
10655
  if (injected === indexHtml && overlayHtml.trim()) {
9808
10656
  fail2(
@@ -9851,7 +10699,7 @@ var scaffoldVideoCommand = defineCommand76({
9851
10699
  stats: {
9852
10700
  scene_count: report.scene_count,
9853
10701
  total_nodes: canvas.nodes.length,
9854
- deconstruct_credits_spent: creditsSpent,
10702
+ analysis_credits_spent: creditsSpent,
9855
10703
  run_estimated_credits: validation.estimatedCredits
9856
10704
  },
9857
10705
  checklist: {
@@ -9879,7 +10727,7 @@ var scaffoldVideoCommand = defineCommand76({
9879
10727
  });
9880
10728
 
9881
10729
  // src/commands/canvas/validate.ts
9882
- import { readFile as readFile5 } from "fs/promises";
10730
+ import { readFile as readFile6 } from "fs/promises";
9883
10731
  import path6 from "path";
9884
10732
  import { defineCommand as defineCommand77 } from "citty";
9885
10733
  var validateCommand = defineCommand77({
@@ -9890,7 +10738,7 @@ var validateCommand = defineCommand77({
9890
10738
  args: { file: { type: "positional", required: true, description: "Path to canvas JSON" } },
9891
10739
  async run({ args }) {
9892
10740
  const filePath = path6.resolve(String(args.file));
9893
- const raw = await readFile5(filePath, "utf8");
10741
+ const raw = await readFile6(filePath, "utf8");
9894
10742
  let parsed;
9895
10743
  try {
9896
10744
  parsed = JSON.parse(raw);
@@ -10779,8 +11627,8 @@ function cropSprite(input, region) {
10779
11627
 
10780
11628
  // src/lib/image/io.ts
10781
11629
  import { randomBytes } from "crypto";
10782
- import { glob as fsGlob, readFile as readFile6, rename, stat as stat2, writeFile as writeFile3 } from "fs/promises";
10783
- import { dirname, extname, join as join2, resolve as resolve4 } from "path";
11630
+ import { glob as fsGlob, readFile as readFile7, rename, stat as stat2, writeFile as writeFile3 } from "fs/promises";
11631
+ import { dirname, extname, join as join3, resolve as resolve4 } from "path";
10784
11632
  var REMOTE_RE = /^https?:\/\//i;
10785
11633
  var GLOB_RE = /[*?[\]{}]/;
10786
11634
  function isRemoteUrl(value) {
@@ -10815,7 +11663,7 @@ async function readImageBuffer(pathOrUrl) {
10815
11663
  }
10816
11664
  return Buffer.from(await response.arrayBuffer());
10817
11665
  }
10818
- return readFile6(pathOrUrl);
11666
+ return readFile7(pathOrUrl);
10819
11667
  }
10820
11668
  async function isDirectory(path7) {
10821
11669
  try {
@@ -10830,14 +11678,14 @@ async function resolveOutputPath(inputPath, outputArg, options) {
10830
11678
  if (!outputArg) return base;
10831
11679
  if (options.multipleInputs || await isDirectory(outputArg)) {
10832
11680
  const filename = base.split("/").pop() ?? "out.png";
10833
- return join2(outputArg, filename);
11681
+ return join3(outputArg, filename);
10834
11682
  }
10835
11683
  return outputArg;
10836
11684
  }
10837
11685
  async function atomicWrite(targetPath, data) {
10838
11686
  const absolute = resolve4(targetPath);
10839
11687
  const dir = dirname(absolute);
10840
- const tmp = join2(dir, `.baker-image-${randomBytes(8).toString("hex")}.tmp`);
11688
+ const tmp = join3(dir, `.baker-image-${randomBytes(8).toString("hex")}.tmp`);
10841
11689
  await writeFile3(tmp, data);
10842
11690
  await rename(tmp, absolute);
10843
11691
  }
@@ -11179,7 +12027,7 @@ var findCommand = defineCommand91({
11179
12027
  });
11180
12028
 
11181
12029
  // src/commands/images/generate.ts
11182
- import { readFile as readFile7 } from "fs/promises";
12030
+ import { readFile as readFile8 } from "fs/promises";
11183
12031
  import { defineCommand as defineCommand92 } from "citty";
11184
12032
  import sharp2 from "sharp";
11185
12033
  var GENERATE_TIMEOUT_MS = 18e4;
@@ -11262,7 +12110,7 @@ async function resolveReferences(spec) {
11262
12110
  }
11263
12111
  let raw;
11264
12112
  try {
11265
- raw = await readFile7(entry);
12113
+ raw = await readFile8(entry);
11266
12114
  } catch {
11267
12115
  throw new ApiError("VALIDATION_ERROR", `Reference file not found: ${entry}`);
11268
12116
  }
@@ -12983,7 +13831,7 @@ var stockCommand = defineCommand105({
12983
13831
  });
12984
13832
 
12985
13833
  // src/commands/images/upload.ts
12986
- import { readFile as readFile8 } from "fs/promises";
13834
+ import { readFile as readFile9 } from "fs/promises";
12987
13835
  import { extname as extname2 } from "path";
12988
13836
  import { defineCommand as defineCommand106 } from "citty";
12989
13837
  var MIME_MAP = {
@@ -13123,7 +13971,7 @@ async function uploadLocal(target, args) {
13123
13971
  });
13124
13972
  return;
13125
13973
  }
13126
- const fileBuffer = await readFile8(target);
13974
+ const fileBuffer = await readFile9(target);
13127
13975
  const base64 = fileBuffer.toString("base64");
13128
13976
  const body = { base64, contentType };
13129
13977
  if (args.source) body.source = args.source;
@@ -15088,7 +15936,7 @@ var searchCommand3 = defineCommand135({
15088
15936
  });
15089
15937
 
15090
15938
  // src/commands/videos/upload.ts
15091
- import { readFile as readFile9, stat as stat3 } from "fs/promises";
15939
+ import { readFile as readFile10, stat as stat3 } from "fs/promises";
15092
15940
  import { extname as extname3 } from "path";
15093
15941
  import { defineCommand as defineCommand136 } from "citty";
15094
15942
  var MIME_MAP2 = {
@@ -15153,7 +16001,7 @@ var uploadCommand2 = defineCommand136({
15153
16001
  return;
15154
16002
  }
15155
16003
  const { uploadUrl, videoId } = await apiPost("/api/videos/upload", {});
15156
- const fileBuffer = await readFile9(filePath);
16004
+ const fileBuffer = await readFile10(filePath);
15157
16005
  const uploadResponse = await fetch(uploadUrl, {
15158
16006
  method: "PUT",
15159
16007
  headers: { "Content-Type": contentType },