@koda-sl/baker-cli 0.81.1 → 0.90.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -7
- package/dist/{chunk-NBNUNCY7.js → chunk-2E4H2GIJ.js} +130 -15
- package/dist/chunk-2E4H2GIJ.js.map +1 -0
- package/dist/cli.js +1120 -268
- package/dist/cli.js.map +1 -1
- package/dist/engine/index.d.ts +1 -0
- package/dist/engine/index.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-NBNUNCY7.js.map +0 -1
package/dist/cli.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
defaultRegistry,
|
|
10
10
|
generateCatalog,
|
|
11
11
|
validateCanvasDeep
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-2E4H2GIJ.js";
|
|
13
13
|
|
|
14
14
|
// src/cli.ts
|
|
15
15
|
import { defineCommand as defineCommand141, runMain } from "citty";
|
|
@@ -8274,10 +8274,68 @@ var scaffoldStaticAdCommand = defineCommand75({
|
|
|
8274
8274
|
});
|
|
8275
8275
|
|
|
8276
8276
|
// src/commands/canvas/scaffold-video.ts
|
|
8277
|
-
import { cp, mkdir, readFile as
|
|
8277
|
+
import { cp, mkdir, readFile as readFile5, writeFile as writeFile2 } from "fs/promises";
|
|
8278
8278
|
import path5 from "path";
|
|
8279
8279
|
import { defineCommand as defineCommand76 } from "citty";
|
|
8280
8280
|
|
|
8281
|
+
// src/engine/nodes/local/lib/sceneDetect.ts
|
|
8282
|
+
import { execFile as execFile2 } from "child_process";
|
|
8283
|
+
import { mkdtemp, readdir as readdir2, readFile as readFile4, rm } from "fs/promises";
|
|
8284
|
+
import { tmpdir } from "os";
|
|
8285
|
+
import { join as join2 } from "path";
|
|
8286
|
+
import { promisify as promisify2 } from "util";
|
|
8287
|
+
var execFileAsync2 = promisify2(execFile2);
|
|
8288
|
+
var PYSCENEDETECT_THRESHOLD = 18;
|
|
8289
|
+
var PYSCENEDETECT_MIN_SCENE_LEN_S = 0.25;
|
|
8290
|
+
function timecodeToSeconds(tc) {
|
|
8291
|
+
const m = tc.trim().match(/^(\d+):(\d{1,2}):(\d{1,2}(?:\.\d+)?)$/);
|
|
8292
|
+
if (!m) return null;
|
|
8293
|
+
const h = Number.parseInt(m[1] ?? "", 10);
|
|
8294
|
+
const min = Number.parseInt(m[2] ?? "", 10);
|
|
8295
|
+
const s = Number.parseFloat(m[3] ?? "");
|
|
8296
|
+
if (!Number.isFinite(h) || !Number.isFinite(min) || !Number.isFinite(s)) return null;
|
|
8297
|
+
return h * 3600 + min * 60 + s;
|
|
8298
|
+
}
|
|
8299
|
+
function parsePySceneDetectCsvCuts(csv) {
|
|
8300
|
+
const firstLine = csv.split(/\r?\n/, 1)[0] ?? "";
|
|
8301
|
+
if (!/^\s*Timecode List:/i.test(firstLine)) return [];
|
|
8302
|
+
const cuts = [];
|
|
8303
|
+
for (const cell of firstLine.split(",").slice(1)) {
|
|
8304
|
+
const t = timecodeToSeconds(cell);
|
|
8305
|
+
if (t !== null && t > 0) cuts.push(Math.round(t * 1e3) / 1e3);
|
|
8306
|
+
}
|
|
8307
|
+
return [...new Set(cuts)].sort((a, b) => a - b);
|
|
8308
|
+
}
|
|
8309
|
+
async function detectSceneCutsPySceneDetect(filePath, opts = {}) {
|
|
8310
|
+
const threshold = opts.threshold ?? PYSCENEDETECT_THRESHOLD;
|
|
8311
|
+
const minSceneLenS = opts.minSceneLenS ?? PYSCENEDETECT_MIN_SCENE_LEN_S;
|
|
8312
|
+
const outDir = await mkdtemp(join2(tmpdir(), "baker-scenedetect-"));
|
|
8313
|
+
try {
|
|
8314
|
+
await execFileAsync2(
|
|
8315
|
+
"scenedetect",
|
|
8316
|
+
[
|
|
8317
|
+
"--input",
|
|
8318
|
+
filePath,
|
|
8319
|
+
"--output",
|
|
8320
|
+
outDir,
|
|
8321
|
+
"detect-content",
|
|
8322
|
+
"--threshold",
|
|
8323
|
+
String(threshold),
|
|
8324
|
+
"--min-scene-len",
|
|
8325
|
+
String(minSceneLenS),
|
|
8326
|
+
"list-scenes",
|
|
8327
|
+
"--quiet"
|
|
8328
|
+
],
|
|
8329
|
+
{ encoding: "utf-8", maxBuffer: 32 * 1024 * 1024, timeout: opts.timeout_ms ?? 12e4 }
|
|
8330
|
+
);
|
|
8331
|
+
const csvName = (await readdir2(outDir)).find((f) => f.toLowerCase().endsWith(".csv"));
|
|
8332
|
+
if (!csvName) return [];
|
|
8333
|
+
return parsePySceneDetectCsvCuts(await readFile4(join2(outDir, csvName), "utf-8"));
|
|
8334
|
+
} finally {
|
|
8335
|
+
await rm(outDir, { recursive: true, force: true });
|
|
8336
|
+
}
|
|
8337
|
+
}
|
|
8338
|
+
|
|
8281
8339
|
// src/engine/scaffold/video.ts
|
|
8282
8340
|
import { z as z3 } from "zod";
|
|
8283
8341
|
|
|
@@ -8432,10 +8490,78 @@ function sceneDurationS(scene) {
|
|
|
8432
8490
|
const max = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
|
|
8433
8491
|
return Math.min(Math.max(raw, 0.5), max);
|
|
8434
8492
|
}
|
|
8435
|
-
function
|
|
8493
|
+
function canvasDims(ar) {
|
|
8494
|
+
switch (ar) {
|
|
8495
|
+
case "1:1":
|
|
8496
|
+
return { w: 1080, h: 1080 };
|
|
8497
|
+
case "16:9":
|
|
8498
|
+
return { w: 1920, h: 1080 };
|
|
8499
|
+
case "4:3":
|
|
8500
|
+
return { w: 1440, h: 1080 };
|
|
8501
|
+
case "3:4":
|
|
8502
|
+
return { w: 1080, h: 1440 };
|
|
8503
|
+
case "21:9":
|
|
8504
|
+
return { w: 1920, h: 822 };
|
|
8505
|
+
default:
|
|
8506
|
+
return { w: 1080, h: 1920 };
|
|
8507
|
+
}
|
|
8508
|
+
}
|
|
8509
|
+
function fillPanel(label, w, h, out) {
|
|
8510
|
+
return `[${label}]scale=${w}:${h}:force_original_aspect_ratio=increase,crop=${w}:${h},setsar=1,fps=30[${out}]`;
|
|
8511
|
+
}
|
|
8512
|
+
function splitStackArgs(count, axis, dims) {
|
|
8513
|
+
const pw = axis === "horizontal" ? Math.round(dims.w / count) : dims.w;
|
|
8514
|
+
const ph = axis === "vertical" ? Math.round(dims.h / count) : dims.h;
|
|
8515
|
+
const inputs = [];
|
|
8516
|
+
const filt = [];
|
|
8517
|
+
let labels = "";
|
|
8518
|
+
for (let i = 0; i < count; i++) {
|
|
8519
|
+
inputs.push("-i", `{{in.c${i}}}`);
|
|
8520
|
+
filt.push(fillPanel(`${i}:v`, pw, ph, `p${i}`));
|
|
8521
|
+
labels += `[p${i}]`;
|
|
8522
|
+
}
|
|
8523
|
+
const stack = axis === "vertical" ? "vstack" : "hstack";
|
|
8524
|
+
filt.push(`${labels}${stack}=inputs=${count}[v]`);
|
|
8525
|
+
return [...inputs, "-filter_complex", filt.join(";"), "-map", "[v]", "{{out.video}}"];
|
|
8526
|
+
}
|
|
8527
|
+
function overlayXY(position, marginPx) {
|
|
8528
|
+
const p = (position ?? "bottom_right").toLowerCase();
|
|
8529
|
+
const x = p.includes("left") ? `${marginPx}` : p.includes("right") ? `W-w-${marginPx}` : "(W-w)/2";
|
|
8530
|
+
const y = p.includes("top") ? `${marginPx}` : p.includes("bottom") ? `H-h-${marginPx}` : "(H-h)/2";
|
|
8531
|
+
return { x, y };
|
|
8532
|
+
}
|
|
8533
|
+
function pipOverlayArgs(dims, position, insetWpct) {
|
|
8534
|
+
const iw = Math.round(dims.w * insetWpct);
|
|
8535
|
+
const margin = Math.round(dims.w * 0.04);
|
|
8536
|
+
const { x, y } = overlayXY(position, margin);
|
|
8537
|
+
const filt = `${fillPanel("0:v", dims.w, dims.h, "bg")};[1:v]scale=${iw}:-2,setsar=1,fps=30[fg];[bg][fg]overlay=x=${x}:y=${y}:format=auto[v]`;
|
|
8538
|
+
return ["-i", "{{in.c0}}", "-i", "{{in.c1}}", "-filter_complex", filt, "-map", "[v]", "{{out.video}}"];
|
|
8539
|
+
}
|
|
8540
|
+
var FLASH_HOLD_MAX_S = 2;
|
|
8541
|
+
function stillHoldArgs(durationS, dims) {
|
|
8542
|
+
return [
|
|
8543
|
+
"-loop",
|
|
8544
|
+
"1",
|
|
8545
|
+
"-i",
|
|
8546
|
+
"{{in.frame}}",
|
|
8547
|
+
"-t",
|
|
8548
|
+
durationS.toFixed(3),
|
|
8549
|
+
"-r",
|
|
8550
|
+
"30",
|
|
8551
|
+
"-vf",
|
|
8552
|
+
`scale=${dims.w}:${dims.h}:force_original_aspect_ratio=increase,crop=${dims.w}:${dims.h},setsar=1,format=yuv420p`,
|
|
8553
|
+
"-c:v",
|
|
8554
|
+
"libx264",
|
|
8555
|
+
"-pix_fmt",
|
|
8556
|
+
"yuv420p",
|
|
8557
|
+
"{{out.video}}"
|
|
8558
|
+
];
|
|
8559
|
+
}
|
|
8560
|
+
function trimArgs(durationS, offsetS = 0) {
|
|
8436
8561
|
return [
|
|
8437
8562
|
"-i",
|
|
8438
8563
|
"{{in.clip}}",
|
|
8564
|
+
...offsetS > 0 ? ["-ss", offsetS.toFixed(3)] : [],
|
|
8439
8565
|
"-t",
|
|
8440
8566
|
durationS.toFixed(3),
|
|
8441
8567
|
"-an",
|
|
@@ -8462,6 +8588,25 @@ var Sfx = z3.object({
|
|
|
8462
8588
|
sound_effect_prompt: z3.string().optional(),
|
|
8463
8589
|
description: z3.string().optional()
|
|
8464
8590
|
}).loose();
|
|
8591
|
+
var CompositionRegion = z3.object({
|
|
8592
|
+
// full | top | bottom | left | right | inset
|
|
8593
|
+
panel: z3.string().optional(),
|
|
8594
|
+
// 9-grid anchor for an `inset` presenter box.
|
|
8595
|
+
position: z3.string().optional(),
|
|
8596
|
+
is_presenter: z3.boolean().optional(),
|
|
8597
|
+
// The cast id shown/speaking in this region (routes lip-sync + element refs).
|
|
8598
|
+
cast_ref: z3.string().optional(),
|
|
8599
|
+
summary: z3.string().optional(),
|
|
8600
|
+
frame_prompt: z3.string().optional(),
|
|
8601
|
+
motion_prompt: z3.string().optional()
|
|
8602
|
+
}).loose();
|
|
8603
|
+
var SceneComposition = z3.object({
|
|
8604
|
+
// full_frame (default) | split_screen | pip | keyed_overlay
|
|
8605
|
+
layout: z3.string().optional(),
|
|
8606
|
+
// split_screen only: vertical (top/bottom) | horizontal (left/right).
|
|
8607
|
+
split_axis: z3.string().optional(),
|
|
8608
|
+
regions: z3.array(CompositionRegion).optional()
|
|
8609
|
+
}).loose();
|
|
8465
8610
|
var CameraMotion = z3.object({ movement: z3.string().optional(), detail: z3.string().optional() }).loose();
|
|
8466
8611
|
var TranscriptWord = z3.object({ text: z3.string().optional() }).loose();
|
|
8467
8612
|
var Scene = z3.object({
|
|
@@ -8470,6 +8615,10 @@ var Scene = z3.object({
|
|
|
8470
8615
|
duration_s: z3.number().optional(),
|
|
8471
8616
|
summary: z3.string().optional(),
|
|
8472
8617
|
action_detail: z3.string().optional(),
|
|
8618
|
+
// The scene's spatial layout. Absent/full_frame ⇒ one uncut shot (default path).
|
|
8619
|
+
// A layered layout (split_screen/pip/keyed_overlay) with regions ⇒ the scaffold
|
|
8620
|
+
// builds one clip per region and stacks/overlays them into the scene picture.
|
|
8621
|
+
composition: SceneComposition.optional(),
|
|
8473
8622
|
// The capture "look" for this scene — selected from the ad-native shoot-mode
|
|
8474
8623
|
// grammar (see lib/shoot-modes.ts). When absent the scaffold auto-derives a
|
|
8475
8624
|
// UGC/product mode; a human can override per scene by setting this.
|
|
@@ -8495,7 +8644,12 @@ var Scene = z3.object({
|
|
|
8495
8644
|
floating_elements: z3.array(z3.unknown()).optional(),
|
|
8496
8645
|
transcript_slice: z3.array(TranscriptWord).optional(),
|
|
8497
8646
|
start_frame_asset: FrameAsset,
|
|
8498
|
-
end_frame_asset: FrameAsset
|
|
8647
|
+
end_frame_asset: FrameAsset,
|
|
8648
|
+
// DECON-supplied: true when this scene is a length-split CONTINUATION of the
|
|
8649
|
+
// previous one (the SAME physical shot, broken up only because it exceeded the
|
|
8650
|
+
// clip ceiling). The scaffold then shares the splice keyframe — this scene's
|
|
8651
|
+
// start frame IS the previous scene's end frame — so the join is seamless.
|
|
8652
|
+
continues_previous: z3.boolean().optional()
|
|
8499
8653
|
}).loose();
|
|
8500
8654
|
var VideoBlueprint = z3.object({
|
|
8501
8655
|
source: z3.object({ aspect_ratio: z3.string().optional(), duration_s: z3.number().optional() }).loose().optional(),
|
|
@@ -8600,6 +8754,40 @@ function annotateBlueprintWithElements(blueprintInput, elementsInput) {
|
|
|
8600
8754
|
clone.reference_elements = summary;
|
|
8601
8755
|
return clone;
|
|
8602
8756
|
}
|
|
8757
|
+
var SELECT_SCENE_FIELDS = [
|
|
8758
|
+
"index",
|
|
8759
|
+
"start_s",
|
|
8760
|
+
"end_s",
|
|
8761
|
+
"duration_s",
|
|
8762
|
+
"summary",
|
|
8763
|
+
"narrative_role",
|
|
8764
|
+
"action_detail",
|
|
8765
|
+
"start_frame_prompt",
|
|
8766
|
+
"end_frame_prompt"
|
|
8767
|
+
];
|
|
8768
|
+
var SELECT_GLOBAL_FIELDS = ["cast", "branding", "voiceover"];
|
|
8769
|
+
function slimBlueprintForSelection(blueprintInput) {
|
|
8770
|
+
if (!blueprintInput || typeof blueprintInput !== "object" || Array.isArray(blueprintInput)) return blueprintInput;
|
|
8771
|
+
const bp = blueprintInput;
|
|
8772
|
+
const out = {};
|
|
8773
|
+
for (const k of ["version", "source"]) if (k in bp) out[k] = bp[k];
|
|
8774
|
+
if (bp.global && typeof bp.global === "object" && !Array.isArray(bp.global)) {
|
|
8775
|
+
const g = bp.global;
|
|
8776
|
+
const slimG = {};
|
|
8777
|
+
for (const k of SELECT_GLOBAL_FIELDS) if (k in g) slimG[k] = g[k];
|
|
8778
|
+
out.global = slimG;
|
|
8779
|
+
}
|
|
8780
|
+
if (Array.isArray(bp.scenes)) {
|
|
8781
|
+
out.scenes = bp.scenes.map((s) => {
|
|
8782
|
+
if (!s || typeof s !== "object" || Array.isArray(s)) return s;
|
|
8783
|
+
const sr = s;
|
|
8784
|
+
const slim = {};
|
|
8785
|
+
for (const k of SELECT_SCENE_FIELDS) if (k in sr) slim[k] = sr[k];
|
|
8786
|
+
return slim;
|
|
8787
|
+
});
|
|
8788
|
+
}
|
|
8789
|
+
return out;
|
|
8790
|
+
}
|
|
8603
8791
|
function roleForType2(type) {
|
|
8604
8792
|
switch (type.toLowerCase()) {
|
|
8605
8793
|
case "logo":
|
|
@@ -8646,6 +8834,7 @@ function buildElementSlots(elements) {
|
|
|
8646
8834
|
type: el.type,
|
|
8647
8835
|
description: el.description,
|
|
8648
8836
|
sameAs: el.same_as ?? void 0,
|
|
8837
|
+
castId: el.cast_id ?? void 0,
|
|
8649
8838
|
presence: presenceOf(el)
|
|
8650
8839
|
});
|
|
8651
8840
|
});
|
|
@@ -8684,7 +8873,7 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
|
|
|
8684
8873
|
const legend = [
|
|
8685
8874
|
...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
|
|
8686
8875
|
...hasAnchor ? [
|
|
8687
|
-
"- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions
|
|
8876
|
+
"- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions. IGNORE its text, its logo, its brand name, and its colors entirely \u2014 it is a DIFFERENT brand's footage, here only to anchor layout/pose, never identity or palette."
|
|
8688
8877
|
] : []
|
|
8689
8878
|
].join("\n");
|
|
8690
8879
|
const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
|
|
@@ -8730,41 +8919,66 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
|
|
|
8730
8919
|
"REFERENCE IMAGES (in the order provided):",
|
|
8731
8920
|
legend,
|
|
8732
8921
|
"",
|
|
8733
|
-
|
|
8734
|
-
|
|
8735
|
-
|
|
8736
|
-
|
|
8922
|
+
// RECAST is the whole point of a transform: the dropped el_* images define who/
|
|
8923
|
+
// what is on screen, NOT the source footage and NOT the prose. Without this, the
|
|
8924
|
+
// model reproduces the original ad's people (a proven failure mode).
|
|
8925
|
+
...present.length > 0 ? [
|
|
8926
|
+
"IDENTITY & AESTHETIC \u2014 RECAST (this is a transform, not a copy):",
|
|
8927
|
+
"Identity comes from the reference image, never from the source footage or this prose. Render every",
|
|
8928
|
+
"person, animal, product, and set to MATCH its labeled reference image above \u2014 that image is the ONLY",
|
|
8929
|
+
"source of their identity, wardrobe, styling, and look. This is a complete recast: do NOT reproduce,",
|
|
8930
|
+
"trace, or resemble any individual, animal, product, or set from the source ad. Where the FRAME",
|
|
8931
|
+
"DESCRIPTION below names an appearance detail (hair, outfit, color, age, breed, brand of an object),",
|
|
8932
|
+
"IGNORE that wording \u2014 the reference image is the truth; use the description ONLY for pose, expression,",
|
|
8933
|
+
"action, framing, lighting, and palette.",
|
|
8934
|
+
""
|
|
8935
|
+
] : [
|
|
8936
|
+
"Identity comes from the reference image, never from prose \u2014 render the subject to MATCH it and",
|
|
8937
|
+
"describe only pose, expression, action, framing, and lighting in the FRAME DESCRIPTION below.",
|
|
8938
|
+
""
|
|
8939
|
+
],
|
|
8737
8940
|
"FRAME DESCRIPTION (this frame's editable prompt):",
|
|
8738
8941
|
description,
|
|
8739
8942
|
"",
|
|
8740
|
-
"
|
|
8943
|
+
"Render exactly what the FRAME DESCRIPTION and the SHARED AD SPEC specify \u2014 this is the authoritative ad: its cast identity (via the reference images), palette, brand, and intent are law. Keep every recurring element identical to its reference image across all frames. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
|
|
8741
8944
|
"",
|
|
8742
|
-
"
|
|
8945
|
+
"SHARED AD SPEC (authoritative \u2014 the ad blueprint this frame belongs to; align cast/palette/brand/type with it):",
|
|
8743
8946
|
"{{target_blueprint}}"
|
|
8744
8947
|
].join("\n");
|
|
8745
8948
|
}
|
|
8949
|
+
function ingestFrameRef(url, edge, ctx, nodes) {
|
|
8950
|
+
const cached2 = ctx.ingestCache?.get(url);
|
|
8951
|
+
if (cached2) return cached2;
|
|
8952
|
+
const tag = ctx.tag ?? "";
|
|
8953
|
+
const refId = `s${ctx.sceneIndex}${tag}_${edge}_ref`;
|
|
8954
|
+
nodes.push({ id: refId, type: "ingest", params: { source: "url", url, expect: "image" } });
|
|
8955
|
+
const ref = `$ref:${refId}.asset`;
|
|
8956
|
+
ctx.ingestCache?.set(url, ref);
|
|
8957
|
+
return ref;
|
|
8958
|
+
}
|
|
8746
8959
|
function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
|
|
8747
|
-
const
|
|
8748
|
-
if (
|
|
8749
|
-
|
|
8750
|
-
const
|
|
8960
|
+
const tag = ctx.tag ?? "";
|
|
8961
|
+
if (ctx.reuse && url) return ingestFrameRef(url, edge, ctx, nodes);
|
|
8962
|
+
const hasOriginal = Boolean(url);
|
|
8963
|
+
const originalRef = hasOriginal && url ? ingestFrameRef(url, edge, ctx, nodes) : void 0;
|
|
8964
|
+
const reference = [...present.map((s) => s.ref), ...originalRef ? [originalRef] : []];
|
|
8751
8965
|
const genParams = {
|
|
8752
8966
|
model: ctx.imageModel,
|
|
8753
8967
|
image_size: "2K",
|
|
8754
|
-
prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present,
|
|
8968
|
+
prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, hasOriginal, ctx.shootMode)
|
|
8755
8969
|
};
|
|
8756
8970
|
if (ctx.ar) genParams.aspect_ratio = ctx.ar;
|
|
8757
|
-
const
|
|
8758
|
-
|
|
8971
|
+
const genId = `s${ctx.sceneIndex}${tag}_${edge}`;
|
|
8972
|
+
nodes.push({
|
|
8973
|
+
id: genId,
|
|
8759
8974
|
type: "image_generate",
|
|
8760
8975
|
// `params.prompt` is this frame's authoritative, edit-per-frame description.
|
|
8761
|
-
// `target_blueprint` is
|
|
8762
|
-
//
|
|
8976
|
+
// `target_blueprint` is the shared ad spec (cast identity, palette, brand, type)
|
|
8977
|
+
// the frame must stay consistent with — editing one frame never touches another.
|
|
8763
8978
|
inputs: { target_blueprint: "$ref:prompt.asset", ...reference.length > 0 ? { reference } : {} },
|
|
8764
8979
|
params: genParams
|
|
8765
|
-
};
|
|
8766
|
-
|
|
8767
|
-
return `$ref:s${ctx.sceneIndex}_${edge}.images#0`;
|
|
8980
|
+
});
|
|
8981
|
+
return `$ref:${genId}.images#0`;
|
|
8768
8982
|
}
|
|
8769
8983
|
function seedanceAudioLine(scene, mode, audio, nativeLine) {
|
|
8770
8984
|
const ambient = scene.ambient?.trim() || diegeticFor(mode);
|
|
@@ -8810,10 +9024,11 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
|
|
|
8810
9024
|
);
|
|
8811
9025
|
return parts.join("\n");
|
|
8812
9026
|
}
|
|
8813
|
-
function audioExtractArgs(durationS) {
|
|
9027
|
+
function audioExtractArgs(durationS, offsetS = 0) {
|
|
8814
9028
|
return [
|
|
8815
9029
|
"-i",
|
|
8816
9030
|
"{{in.clip}}",
|
|
9031
|
+
...offsetS > 0.05 ? ["-ss", offsetS.toFixed(3)] : [],
|
|
8817
9032
|
"-t",
|
|
8818
9033
|
durationS.toFixed(3),
|
|
8819
9034
|
"-vn",
|
|
@@ -8841,27 +9056,21 @@ function sceneShootMode(scene, present, nativeTurn, cameraOn, casts) {
|
|
|
8841
9056
|
hasProduct: present.some((s) => s.type.toLowerCase() === "product")
|
|
8842
9057
|
});
|
|
8843
9058
|
}
|
|
8844
|
-
function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks) {
|
|
9059
|
+
function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks, nativeSegments, clipRef = `$ref:s${i}_clip.video`) {
|
|
8845
9060
|
if (nativeTurn) {
|
|
8846
|
-
const
|
|
9061
|
+
const speechWindow = Math.max(0.5, nativeTurn.end_s - nativeTurn.start_s);
|
|
9062
|
+
const extractLen = Math.min(speechWindow, lengths.genDur);
|
|
8847
9063
|
nodes.push({
|
|
8848
9064
|
id: `s${i}_voextract`,
|
|
8849
9065
|
type: "ffmpeg",
|
|
8850
|
-
inputs: { clip:
|
|
9066
|
+
inputs: { clip: clipRef },
|
|
8851
9067
|
params: { args: audioExtractArgs(extractLen), outputs: { audio: { kind: "audio", ext: "mp3" } } }
|
|
8852
9068
|
});
|
|
8853
|
-
|
|
8854
|
-
|
|
8855
|
-
|
|
8856
|
-
inputs: { audio: `$ref:s${i}_voextract.audio`, voice_ref: `$ref:${nativeTurn.voiceNode}.voice_id` },
|
|
8857
|
-
params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
|
|
8858
|
-
});
|
|
8859
|
-
voTracks.push({
|
|
8860
|
-
slot: `s${i}_voconv`,
|
|
8861
|
-
ref: `$ref:s${i}_voconv.audio`,
|
|
9069
|
+
nativeSegments.push({
|
|
9070
|
+
voiceNode: nativeTurn.voiceNode,
|
|
9071
|
+
ref: `$ref:s${i}_voextract.audio`,
|
|
8862
9072
|
start_s: nativeTurn.start_s,
|
|
8863
|
-
end_s: nativeTurn.start_s + extractLen
|
|
8864
|
-
kind: "vo"
|
|
9073
|
+
end_s: nativeTurn.start_s + extractLen
|
|
8865
9074
|
});
|
|
8866
9075
|
} else if (ambientBroll) {
|
|
8867
9076
|
const ambientStart = scene.start_s ?? 0;
|
|
@@ -8881,85 +9090,260 @@ function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes
|
|
|
8881
9090
|
});
|
|
8882
9091
|
}
|
|
8883
9092
|
}
|
|
8884
|
-
function
|
|
8885
|
-
const
|
|
8886
|
-
const
|
|
8887
|
-
|
|
8888
|
-
|
|
8889
|
-
|
|
8890
|
-
|
|
8891
|
-
const
|
|
8892
|
-
|
|
8893
|
-
const
|
|
8894
|
-
const
|
|
8895
|
-
const
|
|
8896
|
-
|
|
8897
|
-
|
|
8898
|
-
|
|
8899
|
-
"start",
|
|
8900
|
-
scene.start_frame_asset?.url,
|
|
8901
|
-
scene.start_frame_prompt,
|
|
8902
|
-
slotsForFrame(slots, i, "start"),
|
|
8903
|
-
ctx,
|
|
8904
|
-
nodes
|
|
8905
|
-
);
|
|
8906
|
-
const lastFrame = buildFrameRef(
|
|
8907
|
-
"end",
|
|
8908
|
-
scene.end_frame_asset?.url,
|
|
8909
|
-
scene.end_frame_prompt,
|
|
8910
|
-
slotsForFrame(slots, i, "end"),
|
|
8911
|
-
ctx,
|
|
8912
|
-
nodes
|
|
8913
|
-
);
|
|
8914
|
-
const dur = sceneDurationS(scene);
|
|
8915
|
-
let out = sceneOutTransition(scene, i === lastIndex);
|
|
8916
|
-
let trimTarget = dur + (out?.dur ?? 0);
|
|
8917
|
-
if (out && ceilToSeedance(trimTarget) < trimTarget) {
|
|
8918
|
-
out = null;
|
|
8919
|
-
trimTarget = dur;
|
|
8920
|
-
}
|
|
8921
|
-
const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
|
|
8922
|
-
const genDur = ceilToSeedance(Math.max(trimTarget, speech));
|
|
8923
|
-
const clipParams = {
|
|
8924
|
-
model: opts.videoModel,
|
|
8925
|
-
prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
|
|
8926
|
-
duration: genDur,
|
|
8927
|
-
// Native talking scene → Seedance generates the spoken audio + lip-sync;
|
|
8928
|
-
// an opt-in ambient b-roll beat generates diegetic ambient only; otherwise the
|
|
8929
|
-
// clip is silent and audio comes from the tts/music timeline.
|
|
8930
|
-
generate_audio: Boolean(nativeTurn) || ambientBroll
|
|
8931
|
-
};
|
|
8932
|
-
if (ar) clipParams.aspect_ratio = ar;
|
|
9093
|
+
function buildPerSpeakerVoiceConversion(segments, totalMs, nodes) {
|
|
9094
|
+
const bySpeaker = /* @__PURE__ */ new Map();
|
|
9095
|
+
for (const seg of segments) {
|
|
9096
|
+
const arr = bySpeaker.get(seg.voiceNode) ?? [];
|
|
9097
|
+
arr.push(seg);
|
|
9098
|
+
bySpeaker.set(seg.voiceNode, arr);
|
|
9099
|
+
}
|
|
9100
|
+
const tracks = [];
|
|
9101
|
+
for (const [voiceNode, segs] of bySpeaker) {
|
|
9102
|
+
const trackId = `${voiceNode}_track`;
|
|
9103
|
+
const convId = `${voiceNode}_conv`;
|
|
9104
|
+
const mixInputs = {};
|
|
9105
|
+
segs.forEach((s, k) => {
|
|
9106
|
+
mixInputs[`seg${k}`] = s.ref;
|
|
9107
|
+
});
|
|
8933
9108
|
nodes.push({
|
|
8934
|
-
id:
|
|
8935
|
-
type: "
|
|
8936
|
-
inputs:
|
|
8937
|
-
params:
|
|
9109
|
+
id: trackId,
|
|
9110
|
+
type: "audio_timeline",
|
|
9111
|
+
inputs: mixInputs,
|
|
9112
|
+
params: {
|
|
9113
|
+
tracks: segs.map((s, k) => ({ slot: `seg${k}`, start_s: s.start_s })),
|
|
9114
|
+
total_ms: totalMs
|
|
9115
|
+
}
|
|
8938
9116
|
});
|
|
8939
|
-
|
|
8940
|
-
|
|
8941
|
-
|
|
8942
|
-
|
|
8943
|
-
|
|
8944
|
-
|
|
8945
|
-
|
|
8946
|
-
|
|
8947
|
-
|
|
8948
|
-
|
|
8949
|
-
|
|
8950
|
-
|
|
9117
|
+
nodes.push({
|
|
9118
|
+
id: convId,
|
|
9119
|
+
type: "audio_voice_convert",
|
|
9120
|
+
inputs: { audio: `$ref:${trackId}.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
|
|
9121
|
+
params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
|
|
9122
|
+
});
|
|
9123
|
+
tracks.push({ slot: convId, ref: `$ref:${convId}.audio`, start_s: 0, kind: "vo" });
|
|
9124
|
+
}
|
|
9125
|
+
return tracks;
|
|
9126
|
+
}
|
|
9127
|
+
function emitSceneClip(i, scene, present, mode, nativeTurn, ambientBroll, frames, lengths, out, opts, nodes, tag = "") {
|
|
9128
|
+
const clipParams = {
|
|
9129
|
+
model: opts.videoModel,
|
|
9130
|
+
prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
|
|
9131
|
+
duration: lengths.genDur,
|
|
9132
|
+
// Native talking scene → Seedance generates the spoken audio + lip-sync; an opt-in
|
|
9133
|
+
// ambient b-roll beat generates diegetic ambient only; otherwise the clip is silent.
|
|
9134
|
+
generate_audio: Boolean(nativeTurn) || ambientBroll
|
|
9135
|
+
};
|
|
9136
|
+
if (opts.ar) clipParams.aspect_ratio = opts.ar;
|
|
9137
|
+
nodes.push({
|
|
9138
|
+
id: `s${i}${tag}_clip`,
|
|
9139
|
+
type: "video_generate",
|
|
9140
|
+
inputs: { first_frame: frames.first, ...frames.last ? { last_frame: frames.last } : {} },
|
|
9141
|
+
params: clipParams
|
|
9142
|
+
});
|
|
9143
|
+
const base = `$ref:s${i}${tag}_clip.video`;
|
|
9144
|
+
if (lengths.genDur === lengths.trimTarget) return { ref: base, scene_s: lengths.dur, out };
|
|
9145
|
+
nodes.push({
|
|
9146
|
+
id: `s${i}${tag}_clip_trim`,
|
|
9147
|
+
type: "ffmpeg",
|
|
9148
|
+
inputs: { clip: base },
|
|
9149
|
+
params: { args: trimArgs(lengths.trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
|
|
9150
|
+
});
|
|
9151
|
+
return { ref: `$ref:s${i}${tag}_clip_trim.video`, scene_s: lengths.dur, out };
|
|
9152
|
+
}
|
|
9153
|
+
var COMPOSITE_LAYOUTS = /* @__PURE__ */ new Set(["split_screen", "pip", "keyed_overlay"]);
|
|
9154
|
+
function layeredComposition(scene) {
|
|
9155
|
+
const comp = scene.composition;
|
|
9156
|
+
const layout = (comp?.layout ?? "").toLowerCase();
|
|
9157
|
+
if (!COMPOSITE_LAYOUTS.has(layout)) return null;
|
|
9158
|
+
const regions = (comp?.regions ?? []).filter((r) => Boolean(r) && typeof r === "object");
|
|
9159
|
+
if (regions.length < 2) return null;
|
|
9160
|
+
return { layout, regions, comp: comp ?? {} };
|
|
9161
|
+
}
|
|
9162
|
+
function splitAxisOf(comp, regions) {
|
|
9163
|
+
const panels = regions.map((r) => (r.panel ?? "").toLowerCase());
|
|
9164
|
+
if (panels.some((p) => p === "top" || p === "bottom")) return "vertical";
|
|
9165
|
+
if (panels.some((p) => p === "left" || p === "right")) return "horizontal";
|
|
9166
|
+
return (comp.split_axis ?? "").toLowerCase() === "horizontal" ? "horizontal" : "vertical";
|
|
9167
|
+
}
|
|
9168
|
+
function orderSplitRefs(regions, regionRefs, axis) {
|
|
9169
|
+
const rank = (panel) => {
|
|
9170
|
+
const p = (panel ?? "").toLowerCase();
|
|
9171
|
+
if (axis === "vertical") return p === "top" ? 0 : p === "bottom" ? 2 : 1;
|
|
9172
|
+
return p === "left" ? 0 : p === "right" ? 2 : 1;
|
|
9173
|
+
};
|
|
9174
|
+
return regionRefs.map((ref, k) => ({ ref, k, rank: rank(regions[k]?.panel) })).sort((a, b) => a.rank - b.rank || a.k - b.k).map((x) => x.ref);
|
|
9175
|
+
}
|
|
9176
|
+
function presenterIndexOf(regions, hasNative) {
|
|
9177
|
+
const flagged = regions.findIndex((r) => r.is_presenter);
|
|
9178
|
+
if (flagged >= 0) return flagged;
|
|
9179
|
+
return hasNative ? 0 : -1;
|
|
9180
|
+
}
|
|
9181
|
+
function slotsForRegion(present, isPresenter) {
|
|
9182
|
+
return present.filter((s) => {
|
|
9183
|
+
const t = s.type.toLowerCase();
|
|
9184
|
+
const person = t === "person" || t === "animal";
|
|
9185
|
+
return isPresenter ? person : !person;
|
|
9186
|
+
});
|
|
9187
|
+
}
|
|
9188
|
+
function buildCompositeScene(layout, regions, comp, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes) {
|
|
9189
|
+
const dims = canvasDims(opts.ar);
|
|
9190
|
+
const presIdx = presenterIndexOf(regions, Boolean(nativeTurn));
|
|
9191
|
+
const regionRefs = [];
|
|
9192
|
+
let presenterPosition;
|
|
9193
|
+
regions.forEach((region, r) => {
|
|
9194
|
+
const isPresenter = r === presIdx;
|
|
9195
|
+
const tag = `_r${r}`;
|
|
9196
|
+
const regionSlots = slotsForRegion(present, isPresenter);
|
|
9197
|
+
const ctx = {
|
|
9198
|
+
sceneIndex: i,
|
|
9199
|
+
ar: opts.ar,
|
|
9200
|
+
reuse: opts.reuse,
|
|
9201
|
+
imageModel: opts.imageModel,
|
|
9202
|
+
shootMode: mode,
|
|
9203
|
+
tag
|
|
9204
|
+
};
|
|
9205
|
+
const startPrompt = region.frame_prompt ?? scene.start_frame_prompt;
|
|
9206
|
+
const endPrompt = region.frame_prompt ?? scene.end_frame_prompt;
|
|
9207
|
+
const first = buildFrameRef("start", void 0, startPrompt, regionSlots, ctx, nodes);
|
|
9208
|
+
const last = buildFrameRef("end", void 0, endPrompt, regionSlots, ctx, nodes);
|
|
9209
|
+
const regionNative = isPresenter ? nativeTurn : void 0;
|
|
9210
|
+
const regionScene = {
|
|
9211
|
+
...scene,
|
|
9212
|
+
summary: region.summary ?? scene.summary,
|
|
9213
|
+
motion_prompt: region.motion_prompt ?? scene.motion_prompt,
|
|
9214
|
+
dialogue: isPresenter ? scene.dialogue : []
|
|
9215
|
+
};
|
|
9216
|
+
const clip = emitSceneClip(
|
|
9217
|
+
i,
|
|
9218
|
+
regionScene,
|
|
9219
|
+
regionSlots,
|
|
9220
|
+
mode,
|
|
9221
|
+
regionNative,
|
|
9222
|
+
false,
|
|
9223
|
+
{ first, last },
|
|
9224
|
+
lengths,
|
|
9225
|
+
null,
|
|
9226
|
+
{ ar: opts.ar, videoModel: opts.videoModel },
|
|
9227
|
+
nodes,
|
|
9228
|
+
tag
|
|
9229
|
+
);
|
|
9230
|
+
regionRefs.push(clip.ref);
|
|
9231
|
+
if (isPresenter) presenterPosition = region.position;
|
|
9232
|
+
});
|
|
9233
|
+
const compInputs = {};
|
|
9234
|
+
let args;
|
|
9235
|
+
if (layout === "split_screen") {
|
|
9236
|
+
const axis = splitAxisOf(comp, regions);
|
|
9237
|
+
orderSplitRefs(regions, regionRefs, axis).forEach((ref, k) => {
|
|
9238
|
+
compInputs[`c${k}`] = ref;
|
|
9239
|
+
});
|
|
9240
|
+
args = splitStackArgs(regionRefs.length, axis, dims);
|
|
9241
|
+
} else {
|
|
9242
|
+
const bgIdx = regions.findIndex((_, k) => k !== presIdx);
|
|
9243
|
+
const bgRef = regionRefs[bgIdx >= 0 ? bgIdx : 0];
|
|
9244
|
+
let presRef = regionRefs[presIdx >= 0 ? presIdx : 1];
|
|
9245
|
+
if (layout === "keyed_overlay" && presIdx >= 0) {
|
|
9246
|
+
const keyId = `s${i}_key`;
|
|
9247
|
+
nodes.push({ id: keyId, type: "video_background_remove", inputs: { video: presRef }, params: {} });
|
|
9248
|
+
presRef = `$ref:${keyId}.video`;
|
|
9249
|
+
}
|
|
9250
|
+
compInputs.c0 = bgRef;
|
|
9251
|
+
compInputs.c1 = presRef;
|
|
9252
|
+
args = pipOverlayArgs(dims, presenterPosition, layout === "keyed_overlay" ? 0.5 : 0.34);
|
|
9253
|
+
}
|
|
9254
|
+
nodes.push({
|
|
9255
|
+
id: `s${i}_composite`,
|
|
9256
|
+
type: "ffmpeg",
|
|
9257
|
+
inputs: compInputs,
|
|
9258
|
+
params: { args, outputs: { video: { kind: "video", ext: "mp4" } } }
|
|
9259
|
+
});
|
|
9260
|
+
const presenterClipRef = presIdx >= 0 ? `$ref:s${i}_r${presIdx}_clip.video` : void 0;
|
|
9261
|
+
return { clip: { ref: `$ref:s${i}_composite.video`, scene_s: lengths.dur, out }, presenterClipRef };
|
|
9262
|
+
}
|
|
9263
|
+
function sceneTiming(scene, isLast, nativeTurn) {
|
|
9264
|
+
const dur = sceneDurationS(scene);
|
|
9265
|
+
let out = sceneOutTransition(scene, isLast);
|
|
9266
|
+
let trimTarget = dur + (out?.dur ?? 0);
|
|
9267
|
+
if (out && ceilToSeedance(trimTarget) < trimTarget) {
|
|
9268
|
+
out = null;
|
|
9269
|
+
trimTarget = dur;
|
|
9270
|
+
}
|
|
9271
|
+
const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
|
|
9272
|
+
const genDur = ceilToSeedance(Math.max(trimTarget, speech));
|
|
9273
|
+
return { dur, out, trimTarget, genDur, speech };
|
|
9274
|
+
}
|
|
9275
|
+
function emitCompositeScene(composite, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes, voTracks, nativeSegments, clips) {
|
|
9276
|
+
const built = buildCompositeScene(
|
|
9277
|
+
composite.layout,
|
|
9278
|
+
composite.regions,
|
|
9279
|
+
composite.comp,
|
|
9280
|
+
scene,
|
|
9281
|
+
i,
|
|
9282
|
+
present,
|
|
9283
|
+
mode,
|
|
9284
|
+
nativeTurn,
|
|
9285
|
+
{ dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur },
|
|
9286
|
+
out,
|
|
9287
|
+
opts,
|
|
9288
|
+
nodes
|
|
9289
|
+
);
|
|
9290
|
+
emitSceneNativeAudio(
|
|
9291
|
+
i,
|
|
9292
|
+
scene,
|
|
9293
|
+
nativeTurn,
|
|
9294
|
+
false,
|
|
9295
|
+
{ dur: lengths.dur, speech: lengths.speech, genDur: lengths.genDur },
|
|
9296
|
+
nodes,
|
|
9297
|
+
voTracks,
|
|
9298
|
+
nativeSegments,
|
|
9299
|
+
built.presenterClipRef
|
|
9300
|
+
);
|
|
9301
|
+
clips.push(built.clip);
|
|
9302
|
+
}
|
|
9303
|
+
function emitFlashHold(i, scene, slots, ctx, lengths, out, ar, nodes, clips) {
|
|
9304
|
+
const frame = buildFrameRef(
|
|
9305
|
+
"start",
|
|
9306
|
+
scene.start_frame_asset?.url,
|
|
9307
|
+
scene.start_frame_prompt,
|
|
9308
|
+
slotsForFrame(slots, i, "start"),
|
|
9309
|
+
ctx,
|
|
9310
|
+
nodes
|
|
9311
|
+
);
|
|
9312
|
+
nodes.push({
|
|
9313
|
+
id: `s${i}_clip`,
|
|
9314
|
+
type: "ffmpeg",
|
|
9315
|
+
inputs: { frame },
|
|
9316
|
+
params: {
|
|
9317
|
+
args: stillHoldArgs(lengths.trimTarget, canvasDims(ar)),
|
|
9318
|
+
outputs: { video: { kind: "video", ext: "mp4" } }
|
|
8951
9319
|
}
|
|
8952
9320
|
});
|
|
8953
|
-
|
|
9321
|
+
clips.push({ ref: `$ref:s${i}_clip.video`, scene_s: lengths.dur, out });
|
|
9322
|
+
}
|
|
9323
|
+
function musicScriptDigest(blueprint) {
|
|
9324
|
+
const lines = blueprint.scenes.flatMap((s) => (s.dialogue ?? []).map((d) => d.line?.trim())).filter((l) => Boolean(l));
|
|
9325
|
+
const script = lines.join(" ").slice(0, 500);
|
|
9326
|
+
const roles = blueprint.scenes.map((s) => s.narrative_role).filter((r) => Boolean(r));
|
|
9327
|
+
const arc = roles.length > 0 ? roles.join(" \u2192 ") : "";
|
|
9328
|
+
const parts = [];
|
|
9329
|
+
if (script) {
|
|
9330
|
+
parts.push(
|
|
9331
|
+
`Ad script (the bed must SUPPORT these words \u2014 leave room for the voice, swell on the payoff): "${script}"`
|
|
9332
|
+
);
|
|
9333
|
+
}
|
|
9334
|
+
if (arc) parts.push(`Emotional arc across scenes: ${arc}. Shape the bed's energy to this arc.`);
|
|
9335
|
+
return parts.length > 0 ? `
|
|
9336
|
+
|
|
9337
|
+
${parts.join("\n")}` : "";
|
|
8954
9338
|
}
|
|
8955
9339
|
function musicBedPrompt(blueprint, musicPrompt) {
|
|
9340
|
+
const digest = musicScriptDigest(blueprint);
|
|
8956
9341
|
const track2 = blueprint.global?.music?.identified_track;
|
|
8957
9342
|
const title = track2?.title?.trim();
|
|
8958
|
-
|
|
8959
|
-
const by = track2?.artist?.trim() ? ` by ${track2.artist.trim()}` : "";
|
|
8960
|
-
return `${musicPrompt}
|
|
9343
|
+
const vibe = title ? `
|
|
8961
9344
|
|
|
8962
|
-
Reference vibe: the original used "${title}"${by} (identified via AudD). Match its mood, tempo, and energy with ORIGINAL music \u2014 do not reproduce the track
|
|
9345
|
+
Reference vibe: the original used "${title}"${track2?.artist?.trim() ? ` by ${track2.artist.trim()}` : ""} (identified via AudD). Match its mood, tempo, and energy with ORIGINAL music \u2014 do not reproduce the track.` : "";
|
|
9346
|
+
return `${musicPrompt}${digest}${vibe}`;
|
|
8963
9347
|
}
|
|
8964
9348
|
function onCameraDialogue(blueprint) {
|
|
8965
9349
|
const mode = blueprint.global?.voiceover?.mode;
|
|
@@ -8998,92 +9382,483 @@ function isOnCameraSpeaker(speaker, casts, cameraOn) {
|
|
|
8998
9382
|
if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
|
|
8999
9383
|
return casts.has(speaker);
|
|
9000
9384
|
}
|
|
9001
|
-
function
|
|
9002
|
-
const
|
|
9003
|
-
const
|
|
9385
|
+
function makePresenterPresent(slots, canonical) {
|
|
9386
|
+
const personSlots = slots.filter((s) => s.type.toLowerCase() === "person");
|
|
9387
|
+
const bySpeaker = /* @__PURE__ */ new Map();
|
|
9388
|
+
for (const slot of personSlots) if (slot.castId) bySpeaker.set(canonical(slot.castId), slot.presence);
|
|
9389
|
+
const solePerson = personSlots.length === 1 ? personSlots[0].presence : null;
|
|
9390
|
+
return (speaker, sceneIndex) => {
|
|
9391
|
+
const presence = bySpeaker.get(speaker) ?? solePerson;
|
|
9392
|
+
if (!presence) return true;
|
|
9393
|
+
return presence.has(sceneIndex);
|
|
9394
|
+
};
|
|
9395
|
+
}
|
|
9396
|
+
var PAUSE_GAP_S = 0.6;
|
|
9397
|
+
var PHRASE_MAX_S = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
|
|
9398
|
+
function collapseVoiceover(blueprint) {
|
|
9004
9399
|
const casts = castIdSet(blueprint);
|
|
9005
9400
|
const cameraOn = onCameraDialogue(blueprint);
|
|
9006
|
-
const
|
|
9007
|
-
const
|
|
9008
|
-
for (const
|
|
9009
|
-
|
|
9010
|
-
|
|
9011
|
-
}
|
|
9401
|
+
const presenters = /* @__PURE__ */ new Set();
|
|
9402
|
+
for (const scene of blueprint.scenes)
|
|
9403
|
+
for (const l of scene.dialogue ?? []) {
|
|
9404
|
+
const sp = l.speaker ?? "voiceover";
|
|
9405
|
+
if (isOnCameraSpeaker(sp, casts, cameraOn)) presenters.add(sp);
|
|
9012
9406
|
}
|
|
9407
|
+
if (presenters.size !== 1) return (s) => s;
|
|
9408
|
+
const presenter = [...presenters][0];
|
|
9409
|
+
return (speaker) => NARRATOR_SPEAKERS.has(speaker.toLowerCase()) ? presenter : speaker;
|
|
9410
|
+
}
|
|
9411
|
+
function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent) {
|
|
9412
|
+
const casts = castIdSet(blueprint);
|
|
9413
|
+
const cameraOn = onCameraDialogue(blueprint);
|
|
9414
|
+
const sceneEndS = (i) => blueprint.scenes[i]?.end_s ?? blueprint.scenes[i]?.start_s ?? 0;
|
|
9415
|
+
const multiSpeaker = /* @__PURE__ */ new Set();
|
|
9416
|
+
blueprint.scenes.forEach((scene, i) => {
|
|
9417
|
+
const onCam = new Set(
|
|
9418
|
+
(scene.dialogue ?? []).map((l) => l.speaker ?? "voiceover").filter((sp) => isOnCameraSpeaker(sp, casts, cameraOn))
|
|
9419
|
+
);
|
|
9420
|
+
if (onCam.size >= 2) multiSpeaker.add(i);
|
|
9421
|
+
});
|
|
9422
|
+
const lines = blueprint.scenes.flatMap(
|
|
9423
|
+
(scene, sceneIndex) => compositeScenes.has(sceneIndex) ? [] : (scene.dialogue ?? []).filter((l) => Boolean(l.line?.trim())).map((l) => {
|
|
9424
|
+
const raw = l.speaker ?? "voiceover";
|
|
9425
|
+
const sp = canonical(raw);
|
|
9426
|
+
const text = l.line.trim();
|
|
9427
|
+
const start = l.start_s ?? scene.start_s ?? 0;
|
|
9428
|
+
return {
|
|
9429
|
+
sceneIndex,
|
|
9430
|
+
speaker: sp,
|
|
9431
|
+
// Shown = a cast member speaking AND their element is actually on screen
|
|
9432
|
+
// here (not a cutaway). A b-roll cutaway mid-phrase fails this and gets
|
|
9433
|
+
// its own clip while the phrase voice plays under it.
|
|
9434
|
+
shown: isOnCameraSpeaker(raw, casts, cameraOn) && !multiSpeaker.has(sceneIndex) && presenterPresent(sp, sceneIndex),
|
|
9435
|
+
start,
|
|
9436
|
+
// Real speech end. When the deconstruct gives no end_s, estimate it from
|
|
9437
|
+
// the words — NOT the scene end (which would fabricate continuity across
|
|
9438
|
+
// a long silent b-roll gap and wrongly merge two separate phrases).
|
|
9439
|
+
end: l.end_s ?? start + estSpeechS(text),
|
|
9440
|
+
text
|
|
9441
|
+
};
|
|
9442
|
+
})
|
|
9443
|
+
).sort((a, b) => a.start - b.start);
|
|
9444
|
+
const phrases = [];
|
|
9445
|
+
let cur = null;
|
|
9446
|
+
const flush = () => {
|
|
9447
|
+
if (!cur) return;
|
|
9448
|
+
const shownScenes = [...cur.shown].sort((a, b) => a - b);
|
|
9449
|
+
phrases.push({
|
|
9450
|
+
speaker: cur.speaker,
|
|
9451
|
+
start_s: cur.start,
|
|
9452
|
+
end_s: cur.end,
|
|
9453
|
+
text: cur.texts.join(" "),
|
|
9454
|
+
firstScene: cur.firstScene,
|
|
9455
|
+
shownScenes,
|
|
9456
|
+
presenterShown: shownScenes.length > 0
|
|
9457
|
+
});
|
|
9458
|
+
cur = null;
|
|
9459
|
+
};
|
|
9460
|
+
for (const ln of lines) {
|
|
9461
|
+
const lineCover = ln.shown ? Math.max(ln.end, sceneEndS(ln.sceneIndex)) : ln.end;
|
|
9462
|
+
const lineClipStart = ln.shown ? Math.min(ln.start, blueprint.scenes[ln.sceneIndex]?.start_s ?? ln.start) : ln.start;
|
|
9463
|
+
const breakRun = !cur || cur.speaker !== ln.speaker || ln.start - cur.end > PAUSE_GAP_S || // Cap by SCENE COVERAGE span, not line end — a presenter run whose sliced scenes span
|
|
9464
|
+
// more than one Seedance clip splits into the next take here (at this scene's
|
|
9465
|
+
// boundary, never mid-scene), so no segment ever reads past the generated clip.
|
|
9466
|
+
Math.max(cur.coverEnd, lineCover) - Math.min(cur.clipStart, lineClipStart) > PHRASE_MAX_S;
|
|
9467
|
+
if (breakRun || !cur) {
|
|
9468
|
+
flush();
|
|
9469
|
+
cur = {
|
|
9470
|
+
speaker: ln.speaker,
|
|
9471
|
+
firstScene: ln.sceneIndex,
|
|
9472
|
+
start: ln.start,
|
|
9473
|
+
end: ln.end,
|
|
9474
|
+
coverEnd: lineCover,
|
|
9475
|
+
clipStart: lineClipStart,
|
|
9476
|
+
texts: [ln.text],
|
|
9477
|
+
shown: /* @__PURE__ */ new Set()
|
|
9478
|
+
};
|
|
9479
|
+
} else {
|
|
9480
|
+
cur.texts.push(ln.text);
|
|
9481
|
+
cur.end = Math.max(cur.end, ln.end);
|
|
9482
|
+
cur.coverEnd = Math.max(cur.coverEnd, lineCover);
|
|
9483
|
+
cur.clipStart = Math.min(cur.clipStart, lineClipStart);
|
|
9484
|
+
}
|
|
9485
|
+
if (ln.shown) cur.shown.add(ln.sceneIndex);
|
|
9486
|
+
}
|
|
9487
|
+
flush();
|
|
9488
|
+
return phrases;
|
|
9489
|
+
}
|
|
9490
|
+
function makeVoiceFactory(blueprint, canonical, nodes) {
|
|
9491
|
+
const bySpeaker = /* @__PURE__ */ new Map();
|
|
9492
|
+
const describe = (speaker) => {
|
|
9493
|
+
for (const scene of blueprint.scenes)
|
|
9494
|
+
for (const line of scene.dialogue ?? [])
|
|
9495
|
+
if (canonical(line.speaker ?? "voiceover") === speaker && line.voice_description) return line.voice_description;
|
|
9013
9496
|
const cast = blueprint.global?.cast?.find((c) => c.id === speaker);
|
|
9014
9497
|
return cast?.description ?? blueprint.global?.voiceover?.voice_description ?? `${speaker} voice`;
|
|
9015
9498
|
};
|
|
9016
|
-
|
|
9017
|
-
const existing =
|
|
9499
|
+
return (speaker) => {
|
|
9500
|
+
const existing = bySpeaker.get(speaker);
|
|
9018
9501
|
if (existing) return existing;
|
|
9019
|
-
const id = sanitizeId2(`voice_${speaker}`, `voice_${
|
|
9020
|
-
const description =
|
|
9021
|
-
|
|
9022
|
-
|
|
9023
|
-
voiceNodeBySpeaker.set(speaker, id);
|
|
9502
|
+
const id = sanitizeId2(`voice_${speaker}`, `voice_${bySpeaker.size}`);
|
|
9503
|
+
const description = describe(speaker);
|
|
9504
|
+
nodes.push({ id, type: "voice_select", params: { description, ...parseVoiceTraits(description) } });
|
|
9505
|
+
bySpeaker.set(speaker, id);
|
|
9024
9506
|
return id;
|
|
9025
9507
|
};
|
|
9508
|
+
}
|
|
9509
|
+
function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
|
|
9510
|
+
const anchor = phrase.shownScenes[0];
|
|
9511
|
+
const anchorScene = env.blueprint.scenes[anchor];
|
|
9512
|
+
if (!anchorScene) return;
|
|
9513
|
+
const present = slotsForScene(env.slots, anchor);
|
|
9514
|
+
const nativeTurn = {
|
|
9515
|
+
sceneIndex: anchor,
|
|
9516
|
+
speaker: phrase.speaker,
|
|
9517
|
+
start_s: phrase.start_s,
|
|
9518
|
+
end_s: phrase.end_s,
|
|
9519
|
+
text: phrase.text,
|
|
9520
|
+
voiceNode,
|
|
9521
|
+
native: true
|
|
9522
|
+
};
|
|
9523
|
+
const mode = sceneShootMode(anchorScene, present, nativeTurn, env.cameraOn, env.casts);
|
|
9524
|
+
const ctx = {
|
|
9525
|
+
sceneIndex: anchor,
|
|
9526
|
+
ar: env.ar,
|
|
9527
|
+
reuse: env.reuse,
|
|
9528
|
+
imageModel: env.opts.imageModel,
|
|
9529
|
+
shootMode: mode,
|
|
9530
|
+
ingestCache: env.ingestCache
|
|
9531
|
+
};
|
|
9532
|
+
const first = buildFrameRef(
|
|
9533
|
+
"start",
|
|
9534
|
+
anchorScene.start_frame_asset?.url,
|
|
9535
|
+
anchorScene.start_frame_prompt,
|
|
9536
|
+
slotsForFrame(env.slots, anchor, "start"),
|
|
9537
|
+
ctx,
|
|
9538
|
+
nodes
|
|
9539
|
+
);
|
|
9540
|
+
const lastShown = phrase.shownScenes[phrase.shownScenes.length - 1] ?? anchor;
|
|
9541
|
+
const lastScene = env.blueprint.scenes[lastShown] ?? anchorScene;
|
|
9542
|
+
const last = buildFrameRef(
|
|
9543
|
+
"end",
|
|
9544
|
+
lastScene.end_frame_asset?.url,
|
|
9545
|
+
lastScene.end_frame_prompt,
|
|
9546
|
+
slotsForFrame(env.slots, lastShown, "end"),
|
|
9547
|
+
ctx,
|
|
9548
|
+
nodes
|
|
9549
|
+
);
|
|
9550
|
+
const clipStart = phrase.shownScenes.reduce(
|
|
9551
|
+
(m, s) => Math.min(m, env.blueprint.scenes[s]?.start_s ?? phrase.start_s),
|
|
9552
|
+
phrase.start_s
|
|
9553
|
+
);
|
|
9554
|
+
const coverEnd = phrase.shownScenes.reduce((m, s) => Math.max(m, env.blueprint.scenes[s]?.end_s ?? 0), phrase.end_s);
|
|
9555
|
+
const phraseLen = Math.max(0.5, coverEnd - clipStart);
|
|
9556
|
+
const genDur = ceilToSeedance(phraseLen);
|
|
9557
|
+
const clipParams = {
|
|
9558
|
+
model: env.opts.videoModel,
|
|
9559
|
+
prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text),
|
|
9560
|
+
duration: genDur,
|
|
9561
|
+
generate_audio: true
|
|
9562
|
+
};
|
|
9563
|
+
if (env.ar) clipParams.aspect_ratio = env.ar;
|
|
9564
|
+
nodes.push({
|
|
9565
|
+
id: `s${anchor}_clip`,
|
|
9566
|
+
type: "video_generate",
|
|
9567
|
+
inputs: { first_frame: first, last_frame: last },
|
|
9568
|
+
params: clipParams
|
|
9569
|
+
});
|
|
9570
|
+
const clipRef = `$ref:s${anchor}_clip.video`;
|
|
9571
|
+
const speechOffset = Math.max(0, phrase.start_s - clipStart);
|
|
9572
|
+
const extractLen = Math.min(Math.max(0.5, phrase.end_s - phrase.start_s), Math.max(0.5, genDur - speechOffset));
|
|
9573
|
+
nodes.push({
|
|
9574
|
+
id: `s${anchor}_voextract`,
|
|
9575
|
+
type: "ffmpeg",
|
|
9576
|
+
inputs: { clip: clipRef },
|
|
9577
|
+
params: { args: audioExtractArgs(extractLen, speechOffset), outputs: { audio: { kind: "audio", ext: "mp3" } } }
|
|
9578
|
+
});
|
|
9579
|
+
const convId = `s${anchor}_conv`;
|
|
9580
|
+
nodes.push({
|
|
9581
|
+
id: convId,
|
|
9582
|
+
type: "audio_voice_convert",
|
|
9583
|
+
inputs: { audio: `$ref:s${anchor}_voextract.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
|
|
9584
|
+
params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
|
|
9585
|
+
});
|
|
9586
|
+
out.voTracks.push({
|
|
9587
|
+
slot: convId,
|
|
9588
|
+
ref: `$ref:${convId}.audio`,
|
|
9589
|
+
start_s: phrase.start_s,
|
|
9590
|
+
end_s: phrase.end_s,
|
|
9591
|
+
kind: "vo"
|
|
9592
|
+
});
|
|
9593
|
+
out.voSegments.push({
|
|
9594
|
+
slot: convId,
|
|
9595
|
+
start_s: phrase.start_s,
|
|
9596
|
+
end_s: phrase.end_s,
|
|
9597
|
+
scene: anchor,
|
|
9598
|
+
speaker: phrase.speaker
|
|
9599
|
+
});
|
|
9600
|
+
out.talkingScenes.push({
|
|
9601
|
+
scene: anchor,
|
|
9602
|
+
voice_convert_node: convId,
|
|
9603
|
+
scene_s: Math.round(phraseLen * 100) / 100,
|
|
9604
|
+
est_speech_s: Math.round(estSpeechS(phrase.text) * 100) / 100
|
|
9605
|
+
});
|
|
9606
|
+
for (const s of phrase.shownScenes) {
|
|
9607
|
+
const sc = env.blueprint.scenes[s];
|
|
9608
|
+
if (!sc) continue;
|
|
9609
|
+
const rawOffset = (sc.start_s ?? clipStart) - clipStart;
|
|
9610
|
+
out.sceneSlice.set(s, {
|
|
9611
|
+
clipRef,
|
|
9612
|
+
// Snap a sub-frame offset (line-start vs scene-start drift) to 0 so a single-scene
|
|
9613
|
+
// phrase hits the whole-clip fast path instead of a needless re-encode + tiny shift.
|
|
9614
|
+
offset: rawOffset < 0.05 ? 0 : rawOffset,
|
|
9615
|
+
len: sceneDurationS(sc),
|
|
9616
|
+
clipDur: genDur
|
|
9617
|
+
});
|
|
9618
|
+
}
|
|
9619
|
+
}
|
|
9620
|
+
function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
|
|
9621
|
+
let id = sanitizeId2(`vo_ph${idx}_${phrase.speaker}`, `vo_ph${idx}`);
|
|
9622
|
+
while (used.has(id)) id = `${id}_x`;
|
|
9623
|
+
used.add(id);
|
|
9624
|
+
nodes.push({
|
|
9625
|
+
id,
|
|
9626
|
+
type: "tts",
|
|
9627
|
+
inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
|
|
9628
|
+
params: { model: FIXED_TTS_MODEL, text: phrase.text, voice: "{{voice_ref}}" }
|
|
9629
|
+
});
|
|
9630
|
+
out.voTracks.push({ slot: id, ref: `$ref:${id}.audio`, start_s: phrase.start_s, end_s: phrase.end_s, kind: "vo" });
|
|
9631
|
+
out.voSegments.push({
|
|
9632
|
+
slot: id,
|
|
9633
|
+
start_s: phrase.start_s,
|
|
9634
|
+
end_s: phrase.end_s,
|
|
9635
|
+
scene: phrase.firstScene,
|
|
9636
|
+
speaker: phrase.speaker
|
|
9637
|
+
});
|
|
9638
|
+
}
|
|
9639
|
+
function emitCompositeInTimeline(composite, scene, i, isLast, env, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
|
|
9640
|
+
const present = slotsForScene(env.slots, i);
|
|
9641
|
+
const onCam = (scene.dialogue ?? []).filter(
|
|
9642
|
+
(l) => Boolean(l.line?.trim()) && isOnCameraSpeaker(l.speaker ?? "voiceover", env.casts, env.cameraOn)
|
|
9643
|
+
);
|
|
9644
|
+
const distinctSpeakers = new Set(onCam.map((l) => canonical(l.speaker ?? "voiceover")));
|
|
9645
|
+
let nativeTurn;
|
|
9646
|
+
if (onCam.length > 0 && distinctSpeakers.size === 1) {
|
|
9647
|
+
const speaker = canonical(onCam[0]?.speaker ?? "voiceover");
|
|
9648
|
+
const voiceNode = ensureVoiceNode(speaker);
|
|
9649
|
+
const start = onCam[0]?.start_s ?? scene.start_s ?? 0;
|
|
9650
|
+
const end = onCam[onCam.length - 1]?.end_s ?? scene.end_s ?? start;
|
|
9651
|
+
const text = onCam.map((l) => l.line.trim()).join(" ");
|
|
9652
|
+
nativeTurn = { sceneIndex: i, speaker, start_s: start, end_s: end, text, voiceNode, native: true };
|
|
9653
|
+
out.talkingScenes.push({
|
|
9654
|
+
scene: i,
|
|
9655
|
+
voice_convert_node: `${voiceNode}_conv`,
|
|
9656
|
+
scene_s: Math.round(sceneDurationS(scene) * 100) / 100,
|
|
9657
|
+
est_speech_s: Math.round(estSpeechS(text) * 100) / 100
|
|
9658
|
+
});
|
|
9659
|
+
}
|
|
9660
|
+
const mode = sceneShootMode(scene, present, nativeTurn, env.cameraOn, env.casts);
|
|
9661
|
+
const lengths = sceneTiming(scene, isLast, nativeTurn);
|
|
9662
|
+
emitCompositeScene(
|
|
9663
|
+
composite,
|
|
9664
|
+
scene,
|
|
9665
|
+
i,
|
|
9666
|
+
present,
|
|
9667
|
+
mode,
|
|
9668
|
+
nativeTurn,
|
|
9669
|
+
lengths,
|
|
9670
|
+
lengths.out,
|
|
9671
|
+
{ ar: env.ar, reuse: env.reuse, imageModel: env.opts.imageModel, videoModel: env.opts.videoModel },
|
|
9672
|
+
nodes,
|
|
9673
|
+
out.voTracks,
|
|
9674
|
+
out.nativeSegments,
|
|
9675
|
+
out.clips
|
|
9676
|
+
);
|
|
9677
|
+
if (!nativeTurn && distinctSpeakers.size >= 2) {
|
|
9678
|
+
emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out);
|
|
9679
|
+
}
|
|
9680
|
+
}
|
|
9681
|
+
function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
|
|
9682
|
+
const bySpeaker = /* @__PURE__ */ new Map();
|
|
9683
|
+
for (const l of onCam) {
|
|
9684
|
+
const speaker = canonical(l.speaker ?? "voiceover");
|
|
9685
|
+
const text = l.line.trim();
|
|
9686
|
+
const start = l.start_s ?? scene.start_s ?? 0;
|
|
9687
|
+
const end = l.end_s ?? start + estSpeechS(text);
|
|
9688
|
+
const cur = bySpeaker.get(speaker);
|
|
9689
|
+
if (cur) {
|
|
9690
|
+
cur.lines.push(text);
|
|
9691
|
+
cur.start = Math.min(cur.start, start);
|
|
9692
|
+
cur.end = Math.max(cur.end, end);
|
|
9693
|
+
} else {
|
|
9694
|
+
bySpeaker.set(speaker, { lines: [text], start, end });
|
|
9695
|
+
}
|
|
9696
|
+
}
|
|
9697
|
+
for (const [speaker, agg] of bySpeaker) {
|
|
9698
|
+
const voiceNode = ensureVoiceNode(speaker);
|
|
9699
|
+
emitPhraseTts(
|
|
9700
|
+
{
|
|
9701
|
+
speaker,
|
|
9702
|
+
start_s: agg.start,
|
|
9703
|
+
end_s: agg.end,
|
|
9704
|
+
text: agg.lines.join(" "),
|
|
9705
|
+
firstScene: i,
|
|
9706
|
+
shownScenes: [],
|
|
9707
|
+
presenterShown: false
|
|
9708
|
+
},
|
|
9709
|
+
voiceNode,
|
|
9710
|
+
i,
|
|
9711
|
+
usedVoIds,
|
|
9712
|
+
nodes,
|
|
9713
|
+
out
|
|
9714
|
+
);
|
|
9715
|
+
}
|
|
9716
|
+
}
|
|
9717
|
+
function emitBrollScene(scene, i, isLast, env, nodes, out, prevEndFrame) {
|
|
9718
|
+
const present = slotsForScene(env.slots, i);
|
|
9719
|
+
const mode = sceneShootMode(scene, present, void 0, env.cameraOn, env.casts);
|
|
9720
|
+
const ambientBroll = Boolean(env.opts.ambient) && mode !== "ugc_selfie";
|
|
9721
|
+
const lengths = sceneTiming(scene, isLast, void 0);
|
|
9722
|
+
const ctx = {
|
|
9723
|
+
sceneIndex: i,
|
|
9724
|
+
ar: env.ar,
|
|
9725
|
+
reuse: env.reuse,
|
|
9726
|
+
imageModel: env.opts.imageModel,
|
|
9727
|
+
shootMode: mode,
|
|
9728
|
+
ingestCache: env.ingestCache
|
|
9729
|
+
};
|
|
9730
|
+
if (!ambientBroll && lengths.dur <= FLASH_HOLD_MAX_S) {
|
|
9731
|
+
emitFlashHold(i, scene, env.slots, ctx, lengths, lengths.out, env.ar, nodes, out.clips);
|
|
9732
|
+
return void 0;
|
|
9733
|
+
}
|
|
9734
|
+
const first = scene.continues_previous && prevEndFrame ? prevEndFrame : buildFrameRef(
|
|
9735
|
+
"start",
|
|
9736
|
+
scene.start_frame_asset?.url,
|
|
9737
|
+
scene.start_frame_prompt,
|
|
9738
|
+
slotsForFrame(env.slots, i, "start"),
|
|
9739
|
+
ctx,
|
|
9740
|
+
nodes
|
|
9741
|
+
);
|
|
9742
|
+
const last = buildFrameRef(
|
|
9743
|
+
"end",
|
|
9744
|
+
scene.end_frame_asset?.url,
|
|
9745
|
+
scene.end_frame_prompt,
|
|
9746
|
+
slotsForFrame(env.slots, i, "end"),
|
|
9747
|
+
ctx,
|
|
9748
|
+
nodes
|
|
9749
|
+
);
|
|
9750
|
+
const clip = emitSceneClip(
|
|
9751
|
+
i,
|
|
9752
|
+
scene,
|
|
9753
|
+
present,
|
|
9754
|
+
mode,
|
|
9755
|
+
void 0,
|
|
9756
|
+
ambientBroll,
|
|
9757
|
+
{ first, last },
|
|
9758
|
+
{ dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur },
|
|
9759
|
+
lengths.out,
|
|
9760
|
+
{ ar: env.ar, videoModel: env.opts.videoModel },
|
|
9761
|
+
nodes
|
|
9762
|
+
);
|
|
9763
|
+
if (ambientBroll) {
|
|
9764
|
+
emitSceneNativeAudio(
|
|
9765
|
+
i,
|
|
9766
|
+
scene,
|
|
9767
|
+
void 0,
|
|
9768
|
+
true,
|
|
9769
|
+
{ dur: lengths.dur, speech: 0, genDur: lengths.genDur },
|
|
9770
|
+
nodes,
|
|
9771
|
+
out.voTracks,
|
|
9772
|
+
out.nativeSegments
|
|
9773
|
+
);
|
|
9774
|
+
}
|
|
9775
|
+
out.clips.push(clip);
|
|
9776
|
+
return last;
|
|
9777
|
+
}
|
|
9778
|
+
function buildTimeline(blueprint, slots, opts, nodes) {
|
|
9779
|
+
const reuse = opts.frames === "reuse";
|
|
9780
|
+
const compositeScenes = /* @__PURE__ */ new Set();
|
|
9781
|
+
if (!reuse) {
|
|
9782
|
+
blueprint.scenes.forEach((s, i) => {
|
|
9783
|
+
if (layeredComposition(s)) compositeScenes.add(i);
|
|
9784
|
+
});
|
|
9785
|
+
}
|
|
9786
|
+
const canonical = collapseVoiceover(blueprint);
|
|
9787
|
+
const ensureVoiceNode = makeVoiceFactory(blueprint, canonical, nodes);
|
|
9788
|
+
const env = {
|
|
9789
|
+
blueprint,
|
|
9790
|
+
slots,
|
|
9791
|
+
opts,
|
|
9792
|
+
ar: aspectRatioParam(blueprint),
|
|
9793
|
+
reuse,
|
|
9794
|
+
cameraOn: onCameraDialogue(blueprint),
|
|
9795
|
+
casts: castIdSet(blueprint),
|
|
9796
|
+
ingestCache: /* @__PURE__ */ new Map()
|
|
9797
|
+
};
|
|
9798
|
+
const out = {
|
|
9799
|
+
clips: [],
|
|
9800
|
+
voTracks: [],
|
|
9801
|
+
voSegments: [],
|
|
9802
|
+
talkingScenes: [],
|
|
9803
|
+
nativeSegments: [],
|
|
9804
|
+
sceneSlice: /* @__PURE__ */ new Map()
|
|
9805
|
+
};
|
|
9806
|
+
const presenterPresent = makePresenterPresent(slots, canonical);
|
|
9807
|
+
const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent);
|
|
9026
9808
|
const usedVoIds = /* @__PURE__ */ new Set();
|
|
9027
|
-
|
|
9028
|
-
|
|
9029
|
-
|
|
9030
|
-
const
|
|
9031
|
-
|
|
9032
|
-
const
|
|
9033
|
-
|
|
9034
|
-
|
|
9035
|
-
|
|
9036
|
-
}
|
|
9037
|
-
|
|
9038
|
-
|
|
9039
|
-
|
|
9040
|
-
|
|
9041
|
-
|
|
9042
|
-
|
|
9043
|
-
|
|
9044
|
-
|
|
9045
|
-
|
|
9046
|
-
|
|
9047
|
-
|
|
9048
|
-
|
|
9049
|
-
|
|
9050
|
-
|
|
9051
|
-
|
|
9052
|
-
|
|
9053
|
-
|
|
9054
|
-
|
|
9055
|
-
|
|
9056
|
-
|
|
9057
|
-
|
|
9058
|
-
|
|
9059
|
-
|
|
9060
|
-
|
|
9061
|
-
|
|
9062
|
-
|
|
9063
|
-
}
|
|
9064
|
-
if (!native) {
|
|
9065
|
-
let id = sanitizeId2(`vo_s${sceneIndex}_${group.speaker}`, `vo_${sceneIndex}_${gi}`);
|
|
9066
|
-
if (usedVoIds.has(id)) {
|
|
9067
|
-
let n = 2;
|
|
9068
|
-
while (usedVoIds.has(`${id}_${n}`)) n++;
|
|
9069
|
-
id = `${id}_${n}`;
|
|
9070
|
-
}
|
|
9071
|
-
usedVoIds.add(id);
|
|
9809
|
+
const claimed = /* @__PURE__ */ new Set();
|
|
9810
|
+
phrases.forEach((phrase, k) => {
|
|
9811
|
+
const voiceNode = ensureVoiceNode(phrase.speaker);
|
|
9812
|
+
const available = phrase.shownScenes.filter((s) => !claimed.has(s));
|
|
9813
|
+
if (phrase.presenterShown && available.length > 0) {
|
|
9814
|
+
for (const s of available) claimed.add(s);
|
|
9815
|
+
emitPhraseClip({ ...phrase, shownScenes: available }, voiceNode, env, nodes, out);
|
|
9816
|
+
} else {
|
|
9817
|
+
emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out);
|
|
9818
|
+
}
|
|
9819
|
+
});
|
|
9820
|
+
const lastIndex = blueprint.scenes.length - 1;
|
|
9821
|
+
let prevEndFrame;
|
|
9822
|
+
blueprint.scenes.forEach((scene, i) => {
|
|
9823
|
+
const composite = compositeScenes.has(i) ? layeredComposition(scene) : null;
|
|
9824
|
+
if (composite) {
|
|
9825
|
+
emitCompositeInTimeline(
|
|
9826
|
+
composite,
|
|
9827
|
+
scene,
|
|
9828
|
+
i,
|
|
9829
|
+
i === lastIndex,
|
|
9830
|
+
env,
|
|
9831
|
+
canonical,
|
|
9832
|
+
ensureVoiceNode,
|
|
9833
|
+
usedVoIds,
|
|
9834
|
+
nodes,
|
|
9835
|
+
out
|
|
9836
|
+
);
|
|
9837
|
+
prevEndFrame = void 0;
|
|
9838
|
+
return;
|
|
9839
|
+
}
|
|
9840
|
+
const slice = out.sceneSlice.get(i);
|
|
9841
|
+
if (slice) {
|
|
9842
|
+
const whole = slice.offset === 0 && Math.abs(slice.len - slice.clipDur) <= 0.05;
|
|
9843
|
+
if (whole) {
|
|
9844
|
+
out.clips.push({ ref: slice.clipRef, scene_s: slice.len, out: null });
|
|
9845
|
+
} else {
|
|
9072
9846
|
nodes.push({
|
|
9073
|
-
id
|
|
9074
|
-
type: "
|
|
9075
|
-
inputs: {
|
|
9076
|
-
params: {
|
|
9847
|
+
id: `s${i}_seg`,
|
|
9848
|
+
type: "ffmpeg",
|
|
9849
|
+
inputs: { clip: slice.clipRef },
|
|
9850
|
+
params: { args: trimArgs(slice.len, slice.offset), outputs: { video: { kind: "video", ext: "mp4" } } }
|
|
9077
9851
|
});
|
|
9078
|
-
|
|
9079
|
-
const audioRef = `$ref:${id}.audio`;
|
|
9080
|
-
tracks.push({ slot: id, ref: audioRef, start_s: start, end_s: end, kind: "vo" });
|
|
9852
|
+
out.clips.push({ ref: `$ref:s${i}_seg.video`, scene_s: slice.len, out: null });
|
|
9081
9853
|
}
|
|
9082
|
-
|
|
9083
|
-
|
|
9084
|
-
|
|
9854
|
+
prevEndFrame = void 0;
|
|
9855
|
+
return;
|
|
9856
|
+
}
|
|
9857
|
+
prevEndFrame = emitBrollScene(scene, i, i === lastIndex, env, nodes, out, prevEndFrame);
|
|
9085
9858
|
});
|
|
9086
|
-
|
|
9859
|
+
const totalMs = Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3);
|
|
9860
|
+
out.voTracks.push(...buildPerSpeakerVoiceConversion(out.nativeSegments, totalMs, nodes));
|
|
9861
|
+
return { clips: out.clips, voTracks: out.voTracks, vo_segments: out.voSegments, talking_scenes: out.talkingScenes };
|
|
9087
9862
|
}
|
|
9088
9863
|
function buildSfxMusic(blueprint, nodes) {
|
|
9089
9864
|
const tracks = [];
|
|
@@ -9160,18 +9935,48 @@ function positionClass(position) {
|
|
|
9160
9935
|
const p = (position ?? "bottom_center").toLowerCase().replace(/[^a-z]+/g, "-");
|
|
9161
9936
|
return `pos-${p}`;
|
|
9162
9937
|
}
|
|
9163
|
-
function
|
|
9938
|
+
function collectCaptions(blueprint) {
|
|
9939
|
+
return blueprint.scenes.flatMap((scene) => {
|
|
9940
|
+
const sceneStart = scene.start_s ?? 0;
|
|
9941
|
+
const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
|
|
9942
|
+
return overlays.success ? overlays.data.filter((ov) => Boolean(ov.text?.trim())).map((ov) => {
|
|
9943
|
+
const at = ov.appears_at_s ?? sceneStart;
|
|
9944
|
+
return { text: ov.text.trim(), at, end: at + (ov.duration_s ?? 2.5), ov };
|
|
9945
|
+
}) : [];
|
|
9946
|
+
}).sort((a, b) => a.at - b.at);
|
|
9947
|
+
}
|
|
9948
|
+
function mergeCaptions(blueprint) {
|
|
9949
|
+
const byText = /* @__PURE__ */ new Map();
|
|
9950
|
+
for (const e of collectCaptions(blueprint)) {
|
|
9951
|
+
const arr = byText.get(e.text);
|
|
9952
|
+
if (arr) arr.push(e);
|
|
9953
|
+
else byText.set(e.text, [e]);
|
|
9954
|
+
}
|
|
9955
|
+
const merged = [];
|
|
9956
|
+
for (const arr of byText.values()) {
|
|
9957
|
+
let cur = null;
|
|
9958
|
+
for (const e of arr) {
|
|
9959
|
+
if (cur && e.at <= cur.end + 0.35) cur.end = Math.max(cur.end, e.end);
|
|
9960
|
+
else {
|
|
9961
|
+
cur = { ...e };
|
|
9962
|
+
merged.push(cur);
|
|
9963
|
+
}
|
|
9964
|
+
}
|
|
9965
|
+
}
|
|
9966
|
+
return merged.sort((a, b) => a.at - b.at);
|
|
9967
|
+
}
|
|
9968
|
+
function overlayElement(ov, at, dur) {
|
|
9164
9969
|
if (!ov.text?.trim()) return "";
|
|
9165
|
-
const at = ov.appears_at_s ?? sceneStart;
|
|
9166
|
-
const dur = ov.duration_s ?? 2.5;
|
|
9167
9970
|
const role = ov.role ? ` data-role="${escapeHtml(ov.role)}"` : "";
|
|
9168
9971
|
const normAnim = normalizeAnim(ov.animation);
|
|
9169
9972
|
const anim = normAnim ? ` data-anim="${normAnim}"` : "";
|
|
9170
9973
|
const detail = ov.animation_detail ? ` data-anim-detail="${escapeHtml(ov.animation_detail)}"` : "";
|
|
9171
9974
|
return `<div class="ov ${positionClass(ov.position)}" data-start="${at}" data-dur="${dur}"${role}${anim}${detail}>${escapeHtml(ov.text.trim())}</div>`;
|
|
9172
9975
|
}
|
|
9976
|
+
var RICH_OVERLAY_RE = /notif|tweet|\bx post\b|post\b|comment|message|chat|bubble|card|review|rating|stat|counter|toast|popup/;
|
|
9173
9977
|
function sourceHint(fe) {
|
|
9174
9978
|
const desc = fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element";
|
|
9979
|
+
const haystack = `${fe.kind ?? ""} ${fe.description ?? ""} ${fe.what_it_represents ?? ""}`.toLowerCase();
|
|
9175
9980
|
switch ((fe.kind ?? "").toLowerCase()) {
|
|
9176
9981
|
case "logo":
|
|
9177
9982
|
return "baker images logo <domain> (or baker images library)";
|
|
@@ -9181,6 +9986,9 @@ function sourceHint(fe) {
|
|
|
9181
9986
|
case "product_cutout":
|
|
9182
9987
|
return `baker images library "${desc}" (the client's own product)`;
|
|
9183
9988
|
default:
|
|
9989
|
+
if (RICH_OVERLAY_RE.test(haystack)) {
|
|
9990
|
+
return `npx hyperframes add <social-card/notification block> for "${desc}" (animated overlay, not a static icon \u2014 see references/hyperframes/catalog.md)`;
|
|
9991
|
+
}
|
|
9184
9992
|
return `baker images icon "${desc}"`;
|
|
9185
9993
|
}
|
|
9186
9994
|
}
|
|
@@ -9215,14 +10023,12 @@ function buildOverlayHtml(input) {
|
|
|
9215
10023
|
" Positions: edit the .pos-* classes or add your own. -->"
|
|
9216
10024
|
].join("\n")
|
|
9217
10025
|
];
|
|
10026
|
+
const ovParts = mergeCaptions(blueprint).map((e) => overlayElement(e.ov, e.at, Math.round((e.end - e.at) * 1e3) / 1e3)).filter(Boolean);
|
|
10027
|
+
if (ovParts.length > 0) blocks.push(ovParts.join("\n"));
|
|
9218
10028
|
for (const scene of blueprint.scenes) {
|
|
9219
10029
|
const sceneStart = scene.start_s ?? 0;
|
|
9220
|
-
const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
|
|
9221
10030
|
const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
|
|
9222
|
-
const parts = [
|
|
9223
|
-
...overlays.success ? overlays.data.map((ov) => overlayElement(ov, sceneStart)) : [],
|
|
9224
|
-
...floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []
|
|
9225
|
-
].filter(Boolean);
|
|
10031
|
+
const parts = (floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []).filter(Boolean);
|
|
9226
10032
|
if (parts.length > 0) blocks.push(parts.join("\n"));
|
|
9227
10033
|
}
|
|
9228
10034
|
return blocks.join("\n\n");
|
|
@@ -9255,15 +10061,15 @@ function xfadeSpineArgs(clips) {
|
|
|
9255
10061
|
let cur = "c0";
|
|
9256
10062
|
let accLen = clipInputLen(clips[0]);
|
|
9257
10063
|
for (let k = 0; k < n - 1; k++) {
|
|
9258
|
-
const
|
|
10064
|
+
const join4 = clips[k].out;
|
|
9259
10065
|
const next = `c${k + 1}`;
|
|
9260
10066
|
const out = k === n - 2 ? "v" : `j${k + 1}`;
|
|
9261
|
-
if (
|
|
9262
|
-
const offset = Math.max(0, accLen -
|
|
10067
|
+
if (join4) {
|
|
10068
|
+
const offset = Math.max(0, accLen - join4.dur);
|
|
9263
10069
|
filt.push(
|
|
9264
|
-
`[${cur}][${next}]xfade=transition=${
|
|
10070
|
+
`[${cur}][${next}]xfade=transition=${join4.xfade}:duration=${join4.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
|
|
9265
10071
|
);
|
|
9266
|
-
accLen = accLen -
|
|
10072
|
+
accLen = accLen - join4.dur + clipInputLen(clips[k + 1]);
|
|
9267
10073
|
} else {
|
|
9268
10074
|
filt.push(`[${cur}][${next}]concat=n=2:v=1[${out}]`);
|
|
9269
10075
|
accLen += clipInputLen(clips[k + 1]);
|
|
@@ -9305,9 +10111,7 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
9305
10111
|
});
|
|
9306
10112
|
});
|
|
9307
10113
|
if (opts.actorSheets) applyActorSheets(slots, nodes);
|
|
9308
|
-
const {
|
|
9309
|
-
const { clips, voTracks: nativeVoTracks } = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
|
|
9310
|
-
const voTracks = [...ttsTracks, ...nativeVoTracks];
|
|
10114
|
+
const { clips, voTracks, vo_segments, talking_scenes } = buildTimeline(blueprint, slots, opts, nodes);
|
|
9311
10115
|
let videoRef = buildSpine(clips, nodes);
|
|
9312
10116
|
let videoNode = "spine";
|
|
9313
10117
|
const overlays = blueprint.scenes.flatMap((s) => s.overlays ?? []);
|
|
@@ -9384,45 +10188,31 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
9384
10188
|
// The timing plan `baker canvas validate` checks before any billed render:
|
|
9385
10189
|
// sequenced voiceover turns (no overlap), audio ≈ video length, and which
|
|
9386
10190
|
// scenes must be lip-synced.
|
|
9387
|
-
video: buildVideoMeta(blueprint,
|
|
10191
|
+
video: buildVideoMeta(blueprint, { vo_segments, talking_scenes })
|
|
9388
10192
|
},
|
|
9389
10193
|
nodes,
|
|
9390
10194
|
output: { node: videoNode, output: "video" }
|
|
9391
10195
|
};
|
|
9392
10196
|
}
|
|
9393
|
-
function buildVideoMeta(blueprint,
|
|
9394
|
-
const vo_segments = [];
|
|
9395
|
-
const talking_scenes = [];
|
|
9396
|
-
for (const [scene, turns] of [...sceneTurns.entries()].sort((a, b) => a[0] - b[0])) {
|
|
9397
|
-
for (const t of turns) {
|
|
9398
|
-
if (t.ttsId) vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
|
|
9399
|
-
}
|
|
9400
|
-
const nativeTurn = turns.find((t) => t.native);
|
|
9401
|
-
if (nativeTurn) {
|
|
9402
|
-
const sceneObj = blueprint.scenes[scene];
|
|
9403
|
-
talking_scenes.push({
|
|
9404
|
-
scene,
|
|
9405
|
-
voice_convert_node: `s${scene}_voconv`,
|
|
9406
|
-
scene_s: sceneObj ? Math.round(sceneDurationS(sceneObj) * 100) / 100 : void 0,
|
|
9407
|
-
est_speech_s: Math.round(estSpeechS(nativeTurn.text) * 100) / 100
|
|
9408
|
-
});
|
|
9409
|
-
}
|
|
9410
|
-
}
|
|
10197
|
+
/**
 * Assembles the `metadata.video` timing plan for a scaffolded canvas.
 *
 * @param blueprint Deconstructed video blueprint; `source.duration_s` is used
 *   when present, otherwise `lastSceneEnd(blueprint)` supplies the duration.
 * @param meta Timeline by-products: `vo_segments` (copied, then ordered by
 *   ascending `start_s`) and `talking_scenes` (passed through untouched).
 * @returns `{ duration_s, vo_segments, talking_scenes, motion_board }`.
 */
function buildVideoMeta(blueprint, meta) {
  const duration_s = blueprint.source?.duration_s ?? lastSceneEnd(blueprint);
  // Sort a copy so the caller's segment list keeps its original order.
  const orderedSegments = [...meta.vo_segments];
  orderedSegments.sort((left, right) => left.start_s - right.start_s);
  const videoMeta = {
    duration_s,
    vo_segments: orderedSegments,
    talking_scenes: meta.talking_scenes,
    motion_board: buildMotionBoard(blueprint)
  };
  return videoMeta;
}
|
|
9418
|
-
function
|
|
10205
|
+
/**
 * Joins a scene's dialogue into one space-separated spoken line.
 *
 * Tolerates malformed dialogue data (a non-array `dialogue`, null entries, or a
 * non-string `line`) by skipping it instead of throwing, so one bad turn does
 * not abort the whole scaffold.
 *
 * @param scene Blueprint scene; `scene.dialogue` is an optional list of turns,
 *   each with an optional `line` string.
 * @returns The trimmed, non-empty lines joined by a single space, or `null`
 *   when the scene has no usable spoken text.
 */
function sceneSpokenText(scene) {
  const turns = Array.isArray(scene.dialogue) ? scene.dialogue : [];
  const lines = [];
  for (const turn of turns) {
    // Only string lines can be spoken; anything else counts as "no line".
    const line = typeof turn?.line === "string" ? turn.line.trim() : "";
    if (line) lines.push(line);
  }
  return lines.length > 0 ? lines.join(" ") : null;
}
|
|
10208
|
+
function buildMotionBoard(blueprint) {
|
|
9419
10209
|
const round = (n) => Math.round(n * 100) / 100;
|
|
9420
10210
|
let cursor = 0;
|
|
9421
10211
|
return blueprint.scenes.map((scene, i) => {
|
|
9422
10212
|
const start_s = scene.start_s ?? cursor;
|
|
9423
10213
|
const end_s = scene.end_s ?? start_s + sceneDurationS(scene);
|
|
9424
10214
|
cursor = end_s;
|
|
9425
|
-
const spoken = (
|
|
10215
|
+
const spoken = sceneSpokenText(scene);
|
|
9426
10216
|
const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
|
|
9427
10217
|
const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
|
|
9428
10218
|
const graphics = [
|
|
@@ -9445,19 +10235,21 @@ function buildMotionBoard(blueprint, sceneTurns) {
|
|
|
9445
10235
|
scene: i,
|
|
9446
10236
|
role: resolveSceneRole(scene, i, blueprint.scenes.length),
|
|
9447
10237
|
window_s: [round(start_s), round(end_s)],
|
|
9448
|
-
|
|
10238
|
+
// A continuation b-roll scene shares the previous scene's end frame as its start
|
|
10239
|
+
// (no own `s<i>_start` node), so point the storyboard at that shared keyframe.
|
|
10240
|
+
storyboard_frames: [scene.continues_previous && i > 0 ? `s${i - 1}_end` : `s${i}_start`],
|
|
9449
10241
|
spoken,
|
|
9450
10242
|
graphics
|
|
9451
10243
|
};
|
|
9452
10244
|
});
|
|
9453
10245
|
}
|
|
9454
10246
|
var VIDEO_GUIDE = [
|
|
9455
|
-
"Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video.
|
|
10247
|
+
"Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video, built like an editing timeline. The VOICE is cut at PAUSES, not at visual cuts: each continuous-speech PHRASE is ONE Seedance clip (native lip-sync + audio) re-voiced to one brand voice, so a sentence never breaks mid-word across a cut. Each scene's PICTURE is independent: a scene that SHOWS the speaker slices its window out of the phrase clip; a b-roll cutaway gets its own silent clip (or a still hold for a sub-2s flash) laid over the continuing voice; a pure-voiceover stretch is one ElevenLabs tts read. Every clip gets a CLEAN-PLATE start AND end keyframe (no baked text), RECAST to your dropped reference assets \u2014 Seedance interpolates real in-shot motion between them. Each frame grounds ONLY on its own extracted frame + el_* slots (never another generated frame), so all frames render in PARALLEL (no cross-frame cascade). A SPLIT-SCREEN / PICTURE-IN-PICTURE / KEYED-PRESENTER scene is reproduced as one clip PER REGION, stacked or overlaid (see `metadata.todo.composition`). On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voice + SFX + a ducked music bed, normalized stereo. It is a STARTING POINT, not a locked render: add, delete, reorder, split, merge, or re-time scenes freely (a b-roll cutaway INSIDE a phrase lands at an approximate beat \u2014 nudge it) \u2014 see `metadata.todo.full_flexibility`.",
|
|
9456
10248
|
"",
|
|
9457
10249
|
"WHAT TO DO NEXT:",
|
|
9458
10250
|
"0. RE-CRAFT THE SCRIPT FIRST (don't clone). This reference already won in-market, but copying a video is much harder than a static: the hook is targeting and may not transfer, and the message must become TRUE for our brand. Work the `metadata.todo.script_recraft` checklist \u2014 for each scene judge its role (hook/body/CTA), decide keep/cut/reorder/replace, and re-author every line for OUR customer's pain + OUR offer. See `references/script-craft.md` (hook/body/CTA framework) and the `meta-ads-playbook` skill. Most of the work lives here.",
|
|
9459
|
-
"1. Edit each frame's prompt IN PLACE. Every `s<i>_start`
|
|
9460
|
-
"1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill.
|
|
10251
|
+
"1. Edit each frame's prompt IN PLACE. Every `s<i>_start` keyframe node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want. The frame is RECAST to the el_* reference images you drop (the source ad's people are never reused), so describe pose/action/framing here and let the references carry identity.",
|
|
10252
|
+
"1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. Each scene's keyframe IS your storyboard; `metadata.video.motion_board` lays out each scene's frame, time window, spoken line, and the graphics scheduled in it. Lock the keyframes + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
|
|
9461
10253
|
"2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent. `baker canvas run` REFUSES to start until every `[TODO]` slot holds a real source \u2014 so this is mandatory, not optional.",
|
|
9462
10254
|
"3. Talking heads are NATIVE: a scene with one on-camera speaker is voiced by Seedance itself (the line is in the clip's prompt + `generate_audio`), so lips and voice are generated together \u2014 no separate tts, no post-hoc lip-sync. Edit the line in the scene's `s<i>_clip` prompt to re-author the words TRUE for your brand. Off-camera narration scenes still use a sequenced `tts` per turn.",
|
|
9463
10255
|
"4. Voice consistency: every native talking clip's audio is re-voiced to ONE brand voice via `audio_voice_convert` (timing preserved \u2192 lips stay matched). Confirm the `voice_select` casting (one per speaker) \u2014 its `voice_id` is the brand voice; set its gender/language so the voice matches the creator.",
|
|
@@ -9468,11 +10260,11 @@ var VIDEO_GUIDE = [
|
|
|
9468
10260
|
"- Iterate cheaply, do NOT brute-force takes: credits are scarce. Strong inputs (locked reference + a precise move-by-move prompt) make 1\u20132 takes land. When a take misses, change the SMALLEST thing and re-run \u2014 the cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
|
|
9469
10261
|
"- Keep clips SHORT (trust the scene-length trim \u2014 don't pad) and LOCK THE CAMERA: unmotivated AI drift is the top tell.",
|
|
9470
10262
|
"- One generation = one speaking voice \u2014 Seedance can't voice two on-camera speakers in one clip; split a two-speaker beat into separate scenes (or one stays silent). Lip-sync follows the line verbatim; emotion in [brackets].",
|
|
9471
|
-
"- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it).
|
|
10263
|
+
"- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it).",
|
|
9472
10264
|
"- Nail the pack shot (final product hero \u2014 motivated move, matched light). Geometry drifting? drop a high-angle SCHEMATIC still as an extra reference (a map beats a paragraph). Before/after (dry\u2192wet) = two separate locked references, not words mid-scene.",
|
|
9473
10265
|
"- The hook reads SOUND-OFF: scene 1's frame + overlay carry it in ~1s \u2014 don't bury the hook in the spoken line (meta-ads-playbook \xA739/\xA748).",
|
|
9474
10266
|
"",
|
|
9475
|
-
"Tip: `prompt.json` is the deconstruction provenance + the
|
|
10267
|
+
"Tip: `prompt.json` is the deconstruction provenance + the authoritative SHARED AD SPEC each frame reads for cast identity, palette, brand, and type cohesion. The per-frame editing surface is the frame node's own FRAME DESCRIPTION.",
|
|
9476
10268
|
"Production craft: references/video-craft.md. Hook/message/script-structure layer: references/script-craft.md + the meta-ads-playbook skill."
|
|
9477
10269
|
].join("\n");
|
|
9478
10270
|
function inferNarrativeRole(index, total) {
|
|
@@ -9514,14 +10306,16 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
|
|
|
9514
10306
|
const hookSceneIndex = findHookSceneIndex(blueprint);
|
|
9515
10307
|
const h = hookSceneIndex;
|
|
9516
10308
|
return {
|
|
10309
|
+
full_flexibility: "THIS CANVAS IS A STARTING POINT, NOT A LOCKED RENDER. It mirrors the reference's structure so you have a faithful scaffold \u2014 but you have FULL EDITING FREEDOM and should use it. You can: ADD a scene (new s<i>_start/_end + s<i>_clip + wire it into `spine`), DELETE a scene (drop its nodes + its `spine` input), REORDER scenes, SPLIT one beat into two or MERGE two into one, change any frame prompt or motion brief, swap an element reference, re-time or rewrite any overlay/voice, or change a scene's LAYOUT (make a full-frame beat a split-screen/PIP, or flatten a composite to one shot \u2014 see `composition`). Re-craft for OUR brand and OUR best ad; the reference is inspiration, not a spec to trace. The content-addressed cache re-bills only what you actually change, so iterate freely. `baker canvas validate` re-checks timing/lip-sync after any edit.",
|
|
10310
|
+
composition: "Some scenes are COMPOSITED, not single shots \u2014 `prompt.json`'s scene.composition.layout tells you which: `split_screen` (panels each showing different footage \u2014 e.g. b-roll on top, presenter on the bottom), `pip` (a presenter boxed in a corner over full-frame background), or `keyed_overlay` (a green-screen/cut-out presenter over background). Each is reproduced as ONE generated clip PER REGION (`s<i>_r0_*`, `s<i>_r1_*`, \u2026) stacked (vstack/hstack) or overlaid by an `s<i>_composite` ffmpeg node; a keyed presenter runs through `s<i>_key` (video_background_remove) for a transparent cut-out first. Edit each region's own keyframe prompt + motion brief independently. The presenter region (is_presenter) carries the lip-synced voice. To CHANGE a layout, edit composition in prompt.json and re-scaffold, or hand-edit the s<i>_composite ffmpeg args (splitStackArgs/pipOverlayArgs patterns). A clean full-frame talking head is simpler than a composite \u2014 flatten when the brand's version doesn't need the split.",
|
|
9517
10311
|
recraft_the_script_first: `VIDEO IS INSPIRATION, NOT A CLONE. Before rendering, re-craft the script (hook \u2192 body \u2192 CTA) for OUR brand \u2014 the reference already won in-market, but its hook is targeting and may not transfer.${h >= 0 ? " The HOOK is the #1 decision (see the `hook` todo);" : ""} ${h >= 0 ? "then work" : "Work"} the per-scene \`script_recraft\` checklist. References: references/hook-craft.md (the hook), references/script-craft.md (body/CTA) + the meta-ads-playbook skill.`,
|
|
9518
10312
|
...h >= 0 ? {
|
|
9519
10313
|
hook: `THE HOOK IS THE HIGHEST-LEVERAGE BEAT \u2014 the first frame + first 3\u20134s decide whether the ad is watched at all, and the hook is TARGETING. But highest-leverage does NOT mean always rewrite: this hook already won, so MOST OF THE TIME you KEEP it and build on top (swap only the specifics). REBUILD is the exception \u2014 only when it doesn't transfer (a claim we lack or a different funnel/awareness stage), and then by reaching for its deeper INNER MECHANIC and delivering that truthfully, not inventing a new opener from nothing. For scene ${h}: DIAGNOSE it (device + mechanic + what stage it targets), DECIDE keep/adapt/rebuild, then hold the opener to the criteria \u2014 ${HOOK_OPENER_CRITERIA}. The hook lives across s${h}_start (the scroll-stopping first frame), the scene-${h} overlay text, the s${h}_clip line, an optional ~0.5s micro-hook, and the ramp into the body. Full diagnose\u2192decide\u2192(keep/adapt/rebuild) discipline + the proven hook-type menu: references/hook-craft.md (+ meta-ads-playbook \xA710/\xA717/\xA739).`
|
|
9520
10314
|
} : {},
|
|
9521
10315
|
script_recraft: buildScriptRecraft(blueprint),
|
|
9522
|
-
edit_frames_in_place: "Each s<i>_start
|
|
10316
|
+
edit_frames_in_place: "Each s<i>_start keyframe node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is the authoritative shared ad spec (cast identity, palette, brand). Frames are RECAST to the el_* reference images (the source ad's cast is never reused) and are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
|
|
9523
10317
|
frames_mode: opts.frames ?? "generate",
|
|
9524
|
-
review_storyboard_before_clips: "STORYBOARD FIRST.
|
|
10318
|
+
review_storyboard_before_clips: "STORYBOARD FIRST. Each scene's keyframe (s<i>_start) IS your storyboard \u2014 align the LOOK on it before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's keyframe, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
|
|
9525
10319
|
motion_board: "`metadata.video.motion_board` maps which overlay/graphic fires on which spoken beat (per scene: window_s, spoken line, each graphic's at_s). Review it so every graphic lands on the word it punctuates \u2014 don't let an overlay drift off its beat. Adjust an overlay's data-start in video-overlay-composition/index.html (or appears_at_s in prompt.json) to retime it.",
|
|
9526
10320
|
assets_required: "MANDATORY: drop a real image at every el_* [TODO] ingest before running \u2014 `baker canvas run` refuses to start (and bills nothing) until each placeholder holds a real source.",
|
|
9527
10321
|
sourcing: 'Resolve each el_* [TODO]: (1) the client\'s OWN assets first \u2014 `baker images library "<subject>"` (e.g. the company owner as the actor) and `baker images logo <domain>` for the brand mark; (2) otherwise search a FRESH real reference \u2014 `baker images find "<subject>" --sources pinterest` (Pinterest is photo-real/candid, best for people & sets) or generate one with the image model. el_creator_* and el_location are [SOURCE FRESH]: cast a DIFFERENT person/animal/set than the original ad \u2014 never the source\'s individual.',
|
|
@@ -9534,18 +10328,17 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
|
|
|
9534
10328
|
voice_description: d.voice_description,
|
|
9535
10329
|
line: d.line
|
|
9536
10330
|
})),
|
|
9537
|
-
talking_head_note: "NATIVE: a
|
|
9538
|
-
voice_note: "
|
|
9539
|
-
native_timing: "
|
|
10331
|
+
talking_head_note: "PHRASE-NATIVE: a continuous-speech phrase where the speaker is shown is ONE Seedance clip (the full phrase quoted in s<anchor>_clip's prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Scenes that show the speaker slice their window out of that clip (s<i>_seg); edit the phrase line in the s<anchor>_clip prompt to re-author it. A pure-voiceover phrase (speaker never shown) is one ElevenLabs tts read instead.",
|
|
10332
|
+
voice_note: "ONE voice per person: a single voice_select is reused across all that person's phrases (on-camera AND off \u2014 the deconstruct's `voiceover` label folds into the sole presenter). Each presenter phrase's native audio is re-voiced to that brand voice via audio_voice_convert (eleven_multilingual_sts_v2, one convert per phrase, timing preserved so lips stay matched). Set voice_select.voice_id's gender/language to match the creator.",
|
|
10333
|
+
native_timing: "The voice is cut at PAUSES, not at visual cuts, so a sentence spanning a cut stays one continuous read (no mid-word break). The clip is generated long enough for the estimated speech; if a line runs longer than its phrase window the voice continues a beat into the following pause (natural VO continuity). `metadata.video.talking_scenes` carries each phrase's scene_s vs est_speech_s. CAVEAT: a b-roll cutaway INSIDE a phrase lands at an approximate (proportional) time \u2014 Seedance exposes no word timing \u2014 so if a cutaway is off its beat, nudge the scene boundary (it's a starting point).",
|
|
9540
10334
|
craft: {
|
|
9541
10335
|
note: "Production-craft principles that raise every clip's realism. Full rationale: references/video-craft.md (production craft); references/script-craft.md + meta-ads-playbook for the hook/message layer.",
|
|
9542
10336
|
principles: [
|
|
9543
10337
|
"Iterate cheaply \u2014 do NOT brute-force takes. Credits are scarce. Strong inputs (a locked reference + a precise move-by-move prompt) make 1\u20132 takes land; bad input = bad output. When a take misses, change the SMALLEST thing and re-run \u2014 the content-addressed cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
|
|
9544
10338
|
"Keep clips SHORT \u2014 the longer a shot holds, the more the eye catches the AI tell. Trust the scene-length trim; don't pad.",
|
|
9545
|
-
"LOCK THE CAMERA \u2014
|
|
10339
|
+
"LOCK THE CAMERA \u2014 Seedance animates forward from the single keyframe; only move when the motion brief specifies a move. Unmotivated camera drift is the top realism tell.",
|
|
9546
10340
|
"One generation = one speaking voice. Seedance can't voice two on-camera speakers in one clip \u2014 split a two-speaker beat into separate scenes (or one stays silent). Fragment long dialogue at a natural pause and stitch via the shared boundary frame. Lip-sync follows the line verbatim; delivery cues go in [brackets].",
|
|
9547
10341
|
"Preview cheap \u2192 finalize high-res (credit discipline). Never prompt the aspect ratio in prose ('make it vertical' \u2192 stretch artifacts) \u2014 the pipeline sets aspect_ratio as a param.",
|
|
9548
|
-
"Match-cut continuous action across a cut by reusing the boundary frame: scene N+1's start frame = scene N's end frame (a composition choice, costs no extra gens).",
|
|
9549
10342
|
"Nail the PACK SHOT \u2014 the final product hero sells (motivated camera move, light matched to the rest of the ad).",
|
|
9550
10343
|
"Geometry/objects drifting frame to frame? A MAP beats a paragraph \u2014 drop a high-angle schematic still (marked positions) as an extra el_*/reference rather than adding more words.",
|
|
9551
10344
|
"Before/after (dry\u2192wet, skeptic\u2192believer) = two SEPARATE locked reference images \u2014 don't ask words to transform a subject mid-scene (cheaper and more reliable than re-rolling clips).",
|
|
@@ -9659,10 +10452,10 @@ DROP one-off background extras and incidental props \u2014 but the shared set/lo
|
|
|
9659
10452
|
|
|
9660
10453
|
ONE PERSON, MULTIPLE LOOKS: if a single individual plays MULTIPLE personas or wardrobes (e.g. a creator as a skeptic in one outfit and a believer in another, or a before/after), emit ONE element PER look (so each outfit gets its own real reference image) and link the extra looks to the first via "same_as" \u2014 they are the SAME person, only the wardrobe/state differs, so the face must stay identical.
|
|
9661
10454
|
|
|
9662
|
-
For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of
|
|
10455
|
+
For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of ONLY the scenes where the element is ACTUALLY VISIBLE ON SCREEN \u2014 judged from that scene's start_frame_prompt / end_frame_prompt subjects and its action_detail, NOT from who is merely speaking. A narrator heard over b-roll is NOT present in that b-roll scene; a dog-running cutaway does NOT contain the couch creator just because she talks across it. Do NOT pad the list \u2014 an element wrongly listed in a scene makes the reproduction render the wrong subject there (e.g. the creator appearing in a pure-dog b-roll). When in doubt, leave a scene OUT. Output ONLY the JSON object.`;
|
|
9663
10456
|
async function loadAssetText2(ref, label) {
|
|
9664
10457
|
const r = ref;
|
|
9665
|
-
if (typeof r?.path === "string") return
|
|
10458
|
+
if (typeof r?.path === "string") return readFile5(r.path, "utf8");
|
|
9666
10459
|
if (typeof r?.url === "string") {
|
|
9667
10460
|
const res = await fetch(r.url);
|
|
9668
10461
|
if (!res.ok) throw new Error(`failed to fetch ${label} (${res.status})`);
|
|
@@ -9678,6 +10471,31 @@ function parseElements2(raw) {
|
|
|
9678
10471
|
}
|
|
9679
10472
|
return [];
|
|
9680
10473
|
}
|
|
10474
|
+
/**
 * Runs PySceneDetect shot-cut detection without ever failing the command.
 *
 * Any error is reported on stderr and turned into an empty result. A missing
 * `scenedetect` binary (error code `ENOENT` or a "not found" message) gets a
 * dedicated install warning.
 *
 * @param videoPath Path of the video handed to `detectSceneCutsPySceneDetect`.
 * @param threshold Optional detector threshold; forwarded only when truthy.
 * @returns The detected cuts as returned by the detector, or `[]` on failure.
 */
async function detectShotCutsBestEffort(videoPath, threshold) {
  try {
    const detectOpts = threshold ? { threshold } : {};
    const found = await detectSceneCutsPySceneDetect(videoPath, detectOpts);
    const report = found.length > 0 ? `Detected ${found.length} shot cut(s) via PySceneDetect: ${found.join(", ")}s
` : "PySceneDetect ran but found no hard cuts; using LLM scene boundaries.\n";
    process.stderr.write(report);
    return found;
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    const errCode = e?.code;
    // Either the spawn error code or the message text can reveal a missing binary.
    const notInstalled = errCode === "ENOENT" || /ENOENT|not found|command not found/i.test(reason);
    if (!notInstalled) {
      process.stderr.write(`Shot-cut detection skipped (${reason}); using LLM boundaries.
`);
      return [];
    }
    process.stderr.write(
      "WARNING: `scenedetect` (PySceneDetect) is NOT installed \u2014 falling back to LLM-only scene boundaries, which under-segments (coarse 9-15s scenes instead of the real 1-4s cuts). Install it (`pipx install scenedetect[opencv]` or `pip install scenedetect[opencv]`) for accurate shot-cut detection.\n"
    );
    return [];
  }
}
|
|
9681
10499
|
function fail2(code, message) {
|
|
9682
10500
|
process.stderr.write(`${JSON.stringify({ ok: false, error: { code, message } }, null, 2)}
|
|
9683
10501
|
`);
|
|
@@ -9686,7 +10504,11 @@ function fail2(code, message) {
|
|
|
9686
10504
|
function resolveModels2(args) {
|
|
9687
10505
|
const pick = (flag, kind, fallback) => args[flag] ? String(args[flag]) : resolveModel2(kind, fallback);
|
|
9688
10506
|
return {
|
|
9689
|
-
|
|
10507
|
+
// Flash by default: ~2-3× faster per call than Pro, which keeps each
|
|
10508
|
+
// per-scene deconstruct step well inside the action time budget and turns a
|
|
10509
|
+
// ~9 min run into ~3-4 min. Override with --deconstruct-model
|
|
10510
|
+
// ~google/gemini-pro-latest for the densest extraction when speed matters less.
|
|
10511
|
+
deconstructModel: pick("deconstruct-model", "video_deconstruct", "~google/gemini-flash-latest"),
|
|
9690
10512
|
selectModel: pick("select-model", "text_generate", "~google/gemini-flash-latest"),
|
|
9691
10513
|
// Default to the strongest image model (matches the static-ad scaffold); the
|
|
9692
10514
|
// frame generators need the most faithful text/identity reproduction. Override
|
|
@@ -9695,53 +10517,76 @@ function resolveModels2(args) {
|
|
|
9695
10517
|
videoModel: pick("video-model", "video_generate", "bytedance/seedance-2.0")
|
|
9696
10518
|
};
|
|
9697
10519
|
}
|
|
9698
|
-
function
|
|
10520
|
+
/**
 * Builds the two-node canvas (`src` ingest -> `deconstruct`) for the video
 * deconstruct pass.
 *
 * @param videoPath Local path ingested as the source video.
 * @param deconstructModel Model id for the `video_deconstruct` node.
 * @param opts Optional tuning: `maxScenes` (number), `language`, `focus`, and
 *   `shotCuts` (forwarded as `shot_cuts` only when non-empty).
 * @returns A `baker-canvas/1` canvas whose output is `deconstruct.analysis`.
 */
function buildDeconstructCanvas(videoPath, deconstructModel, opts) {
  const hasShotCuts = opts.shotCuts && opts.shotCuts.length > 0;
  // Key order mirrors the incremental assignment order of the original builder.
  const deconstructParams = {
    model: deconstructModel,
    mode: "full",
    ...(typeof opts.maxScenes === "number" ? { max_scenes: opts.maxScenes } : {}),
    ...(opts.language ? { language: opts.language } : {}),
    ...(opts.focus ? { focus: opts.focus } : {}),
    ...(hasShotCuts ? { shot_cuts: opts.shotCuts } : {}),
    // Last entry of SEEDANCE_DURATIONS; presumably the longest clip length — confirm at its definition.
    max_clip_s: SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1]
  };
  const sourceNode = {
    id: "src",
    type: "ingest",
    params: { source: "path", path: videoPath, expect: "video" }
  };
  const deconstructNode = {
    id: "deconstruct",
    type: "video_deconstruct",
    inputs: { video: "$ref:src.asset" },
    params: deconstructParams
  };
  return {
    schema: "baker-canvas/1",
    metadata: { name: "video deconstruct pass" },
    nodes: [sourceNode, deconstructNode],
    output: { node: "deconstruct", output: "analysis" }
  };
}
|
|
10537
|
+
/**
 * Builds the single-node canvas for the element-selection pass.
 *
 * The slimmed blueprint JSON is inlined into the prompt in place of the first
 * `{{blueprint}}` placeholder.
 *
 * @param selectModel Model id for the `text_generate` node.
 * @param slimmedBlueprintJson Pre-serialized blueprint JSON to embed.
 * @returns A `baker-canvas/1` canvas whose output is `select.text`.
 */
function buildSelectCanvas(selectModel, slimmedBlueprintJson) {
  // A replacer function keeps `$`-sequences in the JSON from being read as
  // replacement patterns ($&, $1, ...).
  const prompt = SELECT_PROMPT2.replace("{{blueprint}}", () => slimmedBlueprintJson);
  const selectNode = {
    id: "select",
    type: "text_generate",
    params: {
      model: selectModel,
      max_tokens: 6e3,
      temperature: 0,
      response_format: "json_object",
      system: SELECT_SYSTEM2,
      prompt
    }
  };
  return {
    schema: "baker-canvas/1",
    metadata: { name: "element selection pass" },
    nodes: [selectNode],
    output: { node: "select", output: "text" }
  };
}
|
|
9726
|
-
async function runAnalysisPasses(
|
|
10558
|
+
/**
 * Runs the two analysis passes in sequence: the deconstruct canvas first, then
 * an element-selection canvas built from the slimmed blueprint.
 *
 * @param deconstructCanvas Canvas whose `deconstruct` node yields the blueprint.
 * @param selectModel Model id for the selection pass.
 * @returns `{ blueprint, elements, creditsSpent }`; `creditsSpent` is
 *   `undefined` when neither run reported a numeric `total_credits`. On
 *   failure, returns whatever `fail2` returns.
 */
async function runAnalysisPasses(deconstructCanvas, selectModel) {
  const engine = createEngineFromEnv({
    log: (line) => process.stderr.write(`${line}
`)
  });
  let creditTotal = 0;
  let anyCreditsReported = false;
  const tallyCredits = (stats) => {
    const reported = stats?.total_credits;
    if (typeof reported !== "number") return;
    creditTotal += reported;
    anyCreditsReported = true;
  };
  // Both passes map errors to the same failure codes, in the same order.
  const reportFailure = (e) => {
    if (e instanceof ValidationError) return fail2("validation", JSON.stringify(e.issues));
    if (e instanceof SyntaxError) return fail2("read_outputs", e.message);
    return fail2("deconstruct", e instanceof Error ? e.message : String(e));
  };
  let blueprint;
  try {
    const deconstructRun = await engine.run(deconstructCanvas, {});
    tallyCredits(deconstructRun.stats);
    const analysisRef = deconstructRun.outputs_by_node.deconstruct?.analysis;
    blueprint = JSON.parse(await loadAssetText2(analysisRef, "deconstruct output"));
  } catch (e) {
    return reportFailure(e);
  }
  // Intentionally outside both try blocks, as in the original: an error here propagates.
  const slimJson = JSON.stringify(slimBlueprintForSelection(blueprint));
  try {
    const selectRun = await engine.run(buildSelectCanvas(selectModel, slimJson), {});
    tallyCredits(selectRun.stats);
    const selectionRef = selectRun.outputs_by_node.select?.text;
    const elements = parseElements2(await loadAssetText2(selectionRef, "selection output"));
    return { blueprint, elements, creditsSpent: anyCreditsReported ? creditTotal : void 0 };
  } catch (e) {
    return reportFailure(e);
  }
}
|
|
9747
10592
|
var scaffoldVideoCommand = defineCommand76({
|
|
@@ -9762,6 +10607,10 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9762
10607
|
description: "Lock a recast person/animal that recurs across \u22652 scenes to ONE turnaround sheet grounding every frame"
|
|
9763
10608
|
},
|
|
9764
10609
|
"max-scenes": { type: "string", description: "Cap the number of scenes the deconstruct emits" },
|
|
10610
|
+
"shot-threshold": {
|
|
10611
|
+
type: "string",
|
|
10612
|
+
description: "PySceneDetect content threshold (default 18; lower = more/softer cuts, higher = fewer)"
|
|
10613
|
+
},
|
|
9765
10614
|
language: { type: "string", description: "Transcript/dialogue language hint (e.g. fr, en)" },
|
|
9766
10615
|
focus: { type: "string", description: "Known provenance/emphasis to ground the deconstruct" },
|
|
9767
10616
|
"deconstruct-model": { type: "string", description: "Override the video_deconstruct model id" },
|
|
@@ -9784,12 +10633,15 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9784
10633
|
);
|
|
9785
10634
|
}
|
|
9786
10635
|
const { deconstructModel, selectModel, imageModel, videoModel } = resolveModels2(args);
|
|
9787
|
-
const
|
|
10636
|
+
const shotThreshold = args["shot-threshold"] ? Number(args["shot-threshold"]) : void 0;
|
|
10637
|
+
const shotCuts = await detectShotCutsBestEffort(videoPath, shotThreshold);
|
|
10638
|
+
const deconstructCanvas = buildDeconstructCanvas(videoPath, deconstructModel, {
|
|
9788
10639
|
maxScenes: Number.isFinite(maxScenes) ? maxScenes : void 0,
|
|
9789
10640
|
language: args.language ? String(args.language) : void 0,
|
|
9790
|
-
focus: args.focus ? String(args.focus) : void 0
|
|
10641
|
+
focus: args.focus ? String(args.focus) : void 0,
|
|
10642
|
+
shotCuts
|
|
9791
10643
|
});
|
|
9792
|
-
const { blueprint, elements, creditsSpent } = await runAnalysisPasses(
|
|
10644
|
+
const { blueprint, elements, creditsSpent } = await runAnalysisPasses(deconstructCanvas, selectModel);
|
|
9793
10645
|
await mkdir(outDir, { recursive: true });
|
|
9794
10646
|
const annotated = annotateBlueprintWithElements(blueprint, elements);
|
|
9795
10647
|
await writeFile2(blueprintPath, `${JSON.stringify(annotated, null, 2)}
|
|
@@ -9798,7 +10650,7 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9798
10650
|
await cp(SHIPPED_COMPOSITION_DIR, compositionDest, { recursive: true });
|
|
9799
10651
|
const indexPath = path5.join(compositionDest, "index.html");
|
|
9800
10652
|
const overlayHtml = buildOverlayHtml(blueprint);
|
|
9801
|
-
const indexHtml = await
|
|
10653
|
+
const indexHtml = await readFile5(indexPath, "utf8");
|
|
9802
10654
|
const injected = indexHtml.replace("<!--OVERLAYS-->", () => overlayHtml);
|
|
9803
10655
|
if (injected === indexHtml && overlayHtml.trim()) {
|
|
9804
10656
|
fail2(
|
|
@@ -9847,7 +10699,7 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9847
10699
|
stats: {
|
|
9848
10700
|
scene_count: report.scene_count,
|
|
9849
10701
|
total_nodes: canvas.nodes.length,
|
|
9850
|
-
|
|
10702
|
+
analysis_credits_spent: creditsSpent,
|
|
9851
10703
|
run_estimated_credits: validation.estimatedCredits
|
|
9852
10704
|
},
|
|
9853
10705
|
checklist: {
|
|
@@ -9875,7 +10727,7 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9875
10727
|
});
|
|
9876
10728
|
|
|
9877
10729
|
// src/commands/canvas/validate.ts
|
|
9878
|
-
import { readFile as
|
|
10730
|
+
import { readFile as readFile6 } from "fs/promises";
|
|
9879
10731
|
import path6 from "path";
|
|
9880
10732
|
import { defineCommand as defineCommand77 } from "citty";
|
|
9881
10733
|
var validateCommand = defineCommand77({
|
|
@@ -9886,7 +10738,7 @@ var validateCommand = defineCommand77({
|
|
|
9886
10738
|
args: { file: { type: "positional", required: true, description: "Path to canvas JSON" } },
|
|
9887
10739
|
async run({ args }) {
|
|
9888
10740
|
const filePath = path6.resolve(String(args.file));
|
|
9889
|
-
const raw = await
|
|
10741
|
+
const raw = await readFile6(filePath, "utf8");
|
|
9890
10742
|
let parsed;
|
|
9891
10743
|
try {
|
|
9892
10744
|
parsed = JSON.parse(raw);
|
|
@@ -10775,8 +11627,8 @@ function cropSprite(input, region) {
|
|
|
10775
11627
|
|
|
10776
11628
|
// src/lib/image/io.ts
|
|
10777
11629
|
import { randomBytes } from "crypto";
|
|
10778
|
-
import { glob as fsGlob, readFile as
|
|
10779
|
-
import { dirname, extname, join as
|
|
11630
|
+
import { glob as fsGlob, readFile as readFile7, rename, stat as stat2, writeFile as writeFile3 } from "fs/promises";
|
|
11631
|
+
import { dirname, extname, join as join3, resolve as resolve4 } from "path";
|
|
10780
11632
|
var REMOTE_RE = /^https?:\/\//i;
|
|
10781
11633
|
var GLOB_RE = /[*?[\]{}]/;
|
|
10782
11634
|
function isRemoteUrl(value) {
|
|
@@ -10811,7 +11663,7 @@ async function readImageBuffer(pathOrUrl) {
|
|
|
10811
11663
|
}
|
|
10812
11664
|
return Buffer.from(await response.arrayBuffer());
|
|
10813
11665
|
}
|
|
10814
|
-
return
|
|
11666
|
+
return readFile7(pathOrUrl);
|
|
10815
11667
|
}
|
|
10816
11668
|
async function isDirectory(path7) {
|
|
10817
11669
|
try {
|
|
@@ -10826,14 +11678,14 @@ async function resolveOutputPath(inputPath, outputArg, options) {
|
|
|
10826
11678
|
if (!outputArg) return base;
|
|
10827
11679
|
if (options.multipleInputs || await isDirectory(outputArg)) {
|
|
10828
11680
|
const filename = base.split("/").pop() ?? "out.png";
|
|
10829
|
-
return
|
|
11681
|
+
return join3(outputArg, filename);
|
|
10830
11682
|
}
|
|
10831
11683
|
return outputArg;
|
|
10832
11684
|
}
|
|
10833
11685
|
async function atomicWrite(targetPath, data) {
|
|
10834
11686
|
const absolute = resolve4(targetPath);
|
|
10835
11687
|
const dir = dirname(absolute);
|
|
10836
|
-
const tmp =
|
|
11688
|
+
const tmp = join3(dir, `.baker-image-${randomBytes(8).toString("hex")}.tmp`);
|
|
10837
11689
|
await writeFile3(tmp, data);
|
|
10838
11690
|
await rename(tmp, absolute);
|
|
10839
11691
|
}
|
|
@@ -11175,7 +12027,7 @@ var findCommand = defineCommand91({
|
|
|
11175
12027
|
});
|
|
11176
12028
|
|
|
11177
12029
|
// src/commands/images/generate.ts
|
|
11178
|
-
import { readFile as
|
|
12030
|
+
import { readFile as readFile8 } from "fs/promises";
|
|
11179
12031
|
import { defineCommand as defineCommand92 } from "citty";
|
|
11180
12032
|
import sharp2 from "sharp";
|
|
11181
12033
|
var GENERATE_TIMEOUT_MS = 18e4;
|
|
@@ -11258,7 +12110,7 @@ async function resolveReferences(spec) {
|
|
|
11258
12110
|
}
|
|
11259
12111
|
let raw;
|
|
11260
12112
|
try {
|
|
11261
|
-
raw = await
|
|
12113
|
+
raw = await readFile8(entry);
|
|
11262
12114
|
} catch {
|
|
11263
12115
|
throw new ApiError("VALIDATION_ERROR", `Reference file not found: ${entry}`);
|
|
11264
12116
|
}
|
|
@@ -12979,7 +13831,7 @@ var stockCommand = defineCommand105({
|
|
|
12979
13831
|
});
|
|
12980
13832
|
|
|
12981
13833
|
// src/commands/images/upload.ts
|
|
12982
|
-
import { readFile as
|
|
13834
|
+
import { readFile as readFile9 } from "fs/promises";
|
|
12983
13835
|
import { extname as extname2 } from "path";
|
|
12984
13836
|
import { defineCommand as defineCommand106 } from "citty";
|
|
12985
13837
|
var MIME_MAP = {
|
|
@@ -13119,7 +13971,7 @@ async function uploadLocal(target, args) {
|
|
|
13119
13971
|
});
|
|
13120
13972
|
return;
|
|
13121
13973
|
}
|
|
13122
|
-
const fileBuffer = await
|
|
13974
|
+
const fileBuffer = await readFile9(target);
|
|
13123
13975
|
const base64 = fileBuffer.toString("base64");
|
|
13124
13976
|
const body = { base64, contentType };
|
|
13125
13977
|
if (args.source) body.source = args.source;
|
|
@@ -15084,7 +15936,7 @@ var searchCommand3 = defineCommand135({
|
|
|
15084
15936
|
});
|
|
15085
15937
|
|
|
15086
15938
|
// src/commands/videos/upload.ts
|
|
15087
|
-
import { readFile as
|
|
15939
|
+
import { readFile as readFile10, stat as stat3 } from "fs/promises";
|
|
15088
15940
|
import { extname as extname3 } from "path";
|
|
15089
15941
|
import { defineCommand as defineCommand136 } from "citty";
|
|
15090
15942
|
var MIME_MAP2 = {
|
|
@@ -15149,7 +16001,7 @@ var uploadCommand2 = defineCommand136({
|
|
|
15149
16001
|
return;
|
|
15150
16002
|
}
|
|
15151
16003
|
const { uploadUrl, videoId } = await apiPost("/api/videos/upload", {});
|
|
15152
|
-
const fileBuffer = await
|
|
16004
|
+
const fileBuffer = await readFile10(filePath);
|
|
15153
16005
|
const uploadResponse = await fetch(uploadUrl, {
|
|
15154
16006
|
method: "PUT",
|
|
15155
16007
|
headers: { "Content-Type": contentType },
|