@koda-sl/baker-cli 0.82.0 → 0.91.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -9
- package/canvas/video-overlay-composition/index.html +31 -5
- package/dist/{chunk-KIL2ZJST.js → chunk-LMVDA3EZ.js} +151 -17
- package/dist/chunk-LMVDA3EZ.js.map +1 -0
- package/dist/cli.js +1258 -281
- package/dist/cli.js.map +1 -1
- package/dist/engine/index.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-KIL2ZJST.js.map +0 -1
package/dist/cli.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
defaultRegistry,
|
|
10
10
|
generateCatalog,
|
|
11
11
|
validateCanvasDeep
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-LMVDA3EZ.js";
|
|
13
13
|
|
|
14
14
|
// src/cli.ts
|
|
15
15
|
import { defineCommand as defineCommand141, runMain } from "citty";
|
|
@@ -8274,10 +8274,100 @@ var scaffoldStaticAdCommand = defineCommand75({
|
|
|
8274
8274
|
});
|
|
8275
8275
|
|
|
8276
8276
|
// src/commands/canvas/scaffold-video.ts
|
|
8277
|
-
import { cp, mkdir, readFile as
|
|
8277
|
+
import { cp, mkdir, readFile as readFile5, writeFile as writeFile2 } from "fs/promises";
|
|
8278
8278
|
import path5 from "path";
|
|
8279
8279
|
import { defineCommand as defineCommand76 } from "citty";
|
|
8280
8280
|
|
|
8281
|
+
// src/engine/nodes/local/lib/sceneDetect.ts
|
|
8282
|
+
import { execFile as execFile2 } from "child_process";
|
|
8283
|
+
import { mkdtemp, readdir as readdir2, readFile as readFile4, rm } from "fs/promises";
|
|
8284
|
+
import { tmpdir } from "os";
|
|
8285
|
+
import { join as join2 } from "path";
|
|
8286
|
+
import { promisify as promisify2 } from "util";
|
|
8287
|
+
var execFileAsync2 = promisify2(execFile2);
|
|
8288
|
+
var PYSCENEDETECT_THRESHOLD = 18;
|
|
8289
|
+
var PYSCENEDETECT_MIN_SCENE_LEN_S = 0.25;
|
|
8290
|
+
var PYSCENEDETECT_RECHECK_THRESHOLD = 27;
|
|
8291
|
+
var PYSCENEDETECT_RECHECK_MIN_SCENE_LEN_S = 0.6;
|
|
8292
|
+
/**
 * Heuristic check for an over-segmented cut list.
 *
 * Only finite, strictly positive cut times are considered. The gaps are
 * measured between consecutive cuts, with the first gap measured from 0.
 *
 * @param {Iterable<number>} cuts - Cut timestamps.
 * @param {{ minCuts?: number, medianGapS?: number }} [opts] - `minCuts`
 *   (default 6) is the minimum number of usable cuts required before the
 *   list can be flagged; `medianGapS` (default 2) is the exclusive upper
 *   bound on the median gap.
 * @returns {boolean} `true` when there are at least `minCuts` usable cuts
 *   and their median gap is below `medianGapS`.
 */
function isLikelyOverSegmented(cuts, opts = {}) {
  const requiredCuts = opts.minCuts ?? 6;
  const gapCeiling = opts.medianGapS ?? 2;

  const times = [...cuts].filter((t) => Number.isFinite(t) && t > 0);
  times.sort((x, y) => x - y);
  if (times.length < requiredCuts) {
    return false;
  }

  // Gap i is the distance to the previous cut; the first cut is measured from 0.
  const gaps = times.map((t, idx) => t - (idx === 0 ? 0 : times[idx - 1]));
  gaps.sort((x, y) => x - y);

  const half = Math.floor(gaps.length / 2);
  let median;
  if (gaps.length % 2) {
    median = gaps[half];
  } else {
    median = (gaps[half - 1] + gaps[half]) / 2;
  }
  return median < gapCeiling;
}
|
|
8308
|
+
/**
 * Converts an `H:MM:SS[.fff]` timecode string into seconds.
 *
 * Surrounding whitespace is ignored. Anything that is not a well-formed
 * timecode yields `null` rather than throwing: non-string input, text that
 * does not match the pattern, and fields that are out of range (minutes or
 * seconds of 60 or more, e.g. `00:75:99`).
 *
 * @param {string} tc - Timecode text.
 * @returns {number | null} Total seconds, or `null` when `tc` is invalid.
 */
function timecodeToSeconds(tc) {
  // Keep the "null on invalid" contract for non-string cells instead of
  // throwing on `.trim()`.
  if (typeof tc !== "string") return null;
  const m = tc.trim().match(/^(\d+):(\d{1,2}):(\d{1,2}(?:\.\d+)?)$/);
  if (!m) return null;
  const h = Number.parseInt(m[1], 10);
  const min = Number.parseInt(m[2], 10);
  const s = Number.parseFloat(m[3]);
  if (!Number.isFinite(h) || !Number.isFinite(min) || !Number.isFinite(s)) return null;
  // The pattern allows two digits, so 60-99 must be rejected explicitly.
  if (min >= 60 || s >= 60) return null;
  return h * 3600 + min * 60 + s;
}
|
|
8317
|
+
/**
 * Extracts cut times from the first line of a scene-list CSV.
 *
 * Only the first line is read, and only when it starts with
 * `Timecode List:` (case-insensitive, leading whitespace allowed). Every
 * following comma-separated cell is parsed with `timecodeToSeconds`; cells
 * that fail to parse or are not strictly positive are dropped.
 *
 * @param {string} csv - Full CSV text.
 * @returns {number[]} Unique cut times in seconds, rounded to milliseconds
 *   and sorted ascending; empty when the header line is missing.
 */
function parsePySceneDetectCsvCuts(csv) {
  const [header = ""] = csv.split(/\r?\n/, 1);
  if (!/^\s*Timecode List:/i.test(header)) {
    return [];
  }
  // The first cell is the "Timecode List:" label itself.
  const [, ...cells] = header.split(",");
  const seconds = cells
    .map((cell) => timecodeToSeconds(cell))
    .filter((t) => t !== null && t > 0)
    .map((t) => Math.round(t * 1e3) / 1e3);
  return Array.from(new Set(seconds)).sort((x, y) => x - y);
}
|
|
8327
|
+
/**
 * Runs the `scenedetect` CLI once (`detect-content` + `list-scenes`) against
 * `filePath` and returns the parsed cut times.
 *
 * Output goes to a fresh temporary directory that is always removed
 * afterwards, including when the command fails or times out.
 *
 * @param {string} filePath - Video file passed as `--input`.
 * @param {number} threshold - Value for `detect-content --threshold`.
 * @param {number} minSceneLenS - Minimum scene length in seconds.
 * @param {number} timeoutMs - Child-process timeout in milliseconds.
 * @returns {Promise<number[]>} Cut times from the first `.csv` file found in
 *   the output directory, or `[]` when no CSV was written.
 */
async function runSceneDetectOnce(filePath, threshold, minSceneLenS, timeoutMs) {
  // PySceneDetect reads a bare integer timecode ("1") as a FRAME count, so a
  // whole number of seconds must carry the explicit "s" suffix. Non-number
  // values are passed through unchanged for backward compatibility.
  const minSceneLenArg = typeof minSceneLenS === "number" ? `${minSceneLenS}s` : String(minSceneLenS);
  const outDir = await mkdtemp(join2(tmpdir(), "baker-scenedetect-"));
  try {
    await execFileAsync2(
      "scenedetect",
      [
        "--input",
        filePath,
        "--output",
        outDir,
        "detect-content",
        "--threshold",
        String(threshold),
        "--min-scene-len",
        minSceneLenArg,
        "list-scenes",
        "--quiet"
      ],
      { encoding: "utf-8", maxBuffer: 32 * 1024 * 1024, timeout: timeoutMs }
    );
    const csvName = (await readdir2(outDir)).find((f) => f.toLowerCase().endsWith(".csv"));
    if (!csvName) return [];
    return parsePySceneDetectCsvCuts(await readFile4(join2(outDir, csvName), "utf-8"));
  } finally {
    await rm(outDir, { recursive: true, force: true });
  }
}
|
|
8354
|
+
/**
 * Detects scene cuts in `filePath`, with one automatic re-check.
 *
 * The first pass uses `opts.threshold` / `opts.minSceneLenS`, or the module
 * defaults. If the caller did not pin a threshold and the result looks
 * over-segmented, a second pass runs with the stricter re-check threshold
 * and minimum scene length, and that result is returned instead.
 *
 * @param {string} filePath - Video file to analyse.
 * @param {{ threshold?: number, minSceneLenS?: number, timeout_ms?: number }} [opts]
 *   `timeout_ms` defaults to 120000 and applies to each pass separately.
 * @returns {Promise<number[]>} Cut times in seconds.
 */
async function detectSceneCutsPySceneDetect(filePath, opts = {}) {
  // Treat null like undefined, matching the `??` default below. Otherwise
  // `threshold: null` would run at the default threshold yet skip the re-check.
  const pinned = opts.threshold != null;
  const threshold = opts.threshold ?? PYSCENEDETECT_THRESHOLD;
  const minSceneLenS = opts.minSceneLenS ?? PYSCENEDETECT_MIN_SCENE_LEN_S;
  const timeoutMs = opts.timeout_ms ?? 12e4;
  const cuts = await runSceneDetectOnce(filePath, threshold, minSceneLenS, timeoutMs);
  if (!pinned && isLikelyOverSegmented(cuts)) {
    return await runSceneDetectOnce(
      filePath,
      PYSCENEDETECT_RECHECK_THRESHOLD,
      PYSCENEDETECT_RECHECK_MIN_SCENE_LEN_S,
      timeoutMs
    );
  }
  return cuts;
}
|
|
8370
|
+
|
|
8281
8371
|
// src/engine/scaffold/video.ts
|
|
8282
8372
|
import { z as z3 } from "zod";
|
|
8283
8373
|
|
|
@@ -8380,7 +8470,7 @@ var FIXED_TTS_MODEL = "elevenlabs/eleven_v3";
|
|
|
8380
8470
|
var FIXED_SFX_MODEL = "elevenlabs/eleven_text_to_sound_v2";
|
|
8381
8471
|
var FIXED_MUSIC_MODEL = "elevenlabs/music-v1";
|
|
8382
8472
|
var FIXED_VOICE_CONVERT_MODEL = "elevenlabs/eleven_multilingual_sts_v2";
|
|
8383
|
-
var MUSIC_BED_GAIN_DB = -
|
|
8473
|
+
var MUSIC_BED_GAIN_DB = -20;
|
|
8384
8474
|
var AMBIENT_BED_GAIN_DB = -20;
|
|
8385
8475
|
var TRANSITION_DEFAULT_S = 0.4;
|
|
8386
8476
|
var XFADE_BY_TYPE = {
|
|
@@ -8432,10 +8522,78 @@ function sceneDurationS(scene) {
|
|
|
8432
8522
|
const max = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
|
|
8433
8523
|
return Math.min(Math.max(raw, 0.5), max);
|
|
8434
8524
|
}
|
|
8435
|
-
function
|
|
8525
|
+
/**
 * Maps an aspect-ratio string to canvas pixel dimensions.
 *
 * @param {string | undefined} ar - One of "1:1", "16:9", "4:3", "3:4", "21:9".
 * @returns {{ w: number, h: number }} A fresh dimensions object. Any other
 *   value (including `undefined`) yields 1080x1920.
 */
function canvasDims(ar) {
  const sizes = [
    ["1:1", 1080, 1080],
    ["16:9", 1920, 1080],
    ["4:3", 1440, 1080],
    ["3:4", 1080, 1440],
    ["21:9", 1920, 822]
  ];
  const hit = sizes.find(([ratio]) => ratio === ar);
  if (!hit) {
    return { w: 1080, h: 1920 };
  }
  const [, w, h] = hit;
  return { w, h };
}
|
|
8541
|
+
/**
 * Builds one filtergraph chain that scales input `label` up to cover
 * `w`x`h`, crops it to exactly that size, and sets `setsar=1` and `fps=30`.
 *
 * @param {string} label - Input pad label, without brackets.
 * @param {number} w - Panel width.
 * @param {number} h - Panel height.
 * @param {string} out - Output pad label, without brackets.
 * @returns {string} The filter chain text.
 */
function fillPanel(label, w, h, out) {
  const size = `${w}:${h}`;
  const steps = [
    `scale=${size}:force_original_aspect_ratio=increase`,
    `crop=${size}`,
    "setsar=1",
    "fps=30"
  ];
  return `[${label}]${steps.join(",")}[${out}]`;
}
|
|
8544
|
+
/**
 * Builds the ffmpeg argument template for a split-screen stack of `count`
 * inputs (`{{in.c0}}` ... `{{in.cN}}`).
 *
 * Each input is fitted to an equal share of the canvas along `axis` via
 * `fillPanel`, then the panels are joined with `vstack` (vertical) or
 * `hstack` (anything else) into `[v]`, which is mapped to `{{out.video}}`.
 *
 * @param {number} count - Number of panels.
 * @param {string} axis - "vertical" or "horizontal".
 * @param {{ w: number, h: number }} dims - Canvas dimensions.
 * @returns {string[]} ffmpeg argument list.
 */
function splitStackArgs(count, axis, dims) {
  const vertical = axis === "vertical";
  const panelW = axis === "horizontal" ? Math.round(dims.w / count) : dims.w;
  const panelH = vertical ? Math.round(dims.h / count) : dims.h;

  const inputArgs = [];
  const panelFilters = [];
  const stackInputs = [];
  for (let n = 0; n < count; n += 1) {
    const outLabel = `p${n}`;
    inputArgs.push("-i", `{{in.c${n}}}`);
    panelFilters.push(fillPanel(`${n}:v`, panelW, panelH, outLabel));
    stackInputs.push(`[${outLabel}]`);
  }

  const stackFilter = `${stackInputs.join("")}${vertical ? "vstack" : "hstack"}=inputs=${count}[v]`;
  const graph = [...panelFilters, stackFilter].join(";");
  return inputArgs.concat(["-filter_complex", graph, "-map", "[v]", "{{out.video}}"]);
}
|
|
8559
|
+
/**
 * Translates an anchor name into ffmpeg `overlay` x/y expressions.
 *
 * Matching is a case-insensitive substring test: "left"/"right" pick the
 * horizontal edge and "top"/"bottom" the vertical edge, each offset by
 * `marginPx`. An axis with no match is centred.
 *
 * @param {string | null | undefined} position - Anchor name; defaults to
 *   "bottom_right" when nullish.
 * @param {number} marginPx - Distance from the chosen edge, in pixels.
 * @returns {{ x: string, y: string }} Expressions for the overlay filter.
 */
function overlayXY(position, marginPx) {
  const anchor = (position ?? "bottom_right").toLowerCase();
  const has = (word) => anchor.includes(word);

  let x = "(W-w)/2";
  if (has("left")) {
    x = `${marginPx}`;
  } else if (has("right")) {
    x = `W-w-${marginPx}`;
  }

  let y = "(H-h)/2";
  if (has("top")) {
    y = `${marginPx}`;
  } else if (has("bottom")) {
    y = `H-h-${marginPx}`;
  }

  return { x, y };
}
|
|
8565
|
+
/**
 * Builds the ffmpeg argument template for a picture-in-picture composite.
 *
 * `{{in.c0}}` is fitted to the full canvas as the background. `{{in.c1}}`
 * is scaled to `insetWpct` of the canvas width (height via `-2`) and
 * overlaid at `position`, with a margin of 4% of the canvas width.
 *
 * @param {{ w: number, h: number }} dims - Canvas dimensions.
 * @param {string | null | undefined} position - Anchor passed to `overlayXY`.
 * @param {number} insetWpct - Inset width as a fraction of the canvas width.
 * @returns {string[]} ffmpeg argument list writing to `{{out.video}}`.
 */
function pipOverlayArgs(dims, position, insetWpct) {
  const insetW = Math.round(dims.w * insetWpct);
  const marginPx = Math.round(dims.w * 0.04);
  const anchor = overlayXY(position, marginPx);
  const graph = [
    fillPanel("0:v", dims.w, dims.h, "bg"),
    `[1:v]scale=${insetW}:-2,setsar=1,fps=30[fg]`,
    `[bg][fg]overlay=x=${anchor.x}:y=${anchor.y}:format=auto[v]`
  ].join(";");
  return ["-i", "{{in.c0}}", "-i", "{{in.c1}}", "-filter_complex", graph, "-map", "[v]", "{{out.video}}"];
}
|
|
8572
|
+
var FLASH_HOLD_MAX_S = 2;
|
|
8573
|
+
/**
 * Builds the ffmpeg argument template that turns the still image
 * `{{in.frame}}` into a video held for `durationS` seconds.
 *
 * The image is looped, limited with `-t`, set to 30 fps, scaled and cropped
 * to cover the canvas, and encoded with libx264 in yuv420p.
 *
 * @param {number} durationS - Hold duration; formatted to three decimals.
 * @param {{ w: number, h: number }} dims - Canvas dimensions.
 * @returns {string[]} ffmpeg argument list writing to `{{out.video}}`.
 */
function stillHoldArgs(durationS, dims) {
  const holdFor = durationS.toFixed(3);
  const videoFilter = [
    `scale=${dims.w}:${dims.h}:force_original_aspect_ratio=increase`,
    `crop=${dims.w}:${dims.h}`,
    "setsar=1",
    "format=yuv420p"
  ].join(",");

  const input = ["-loop", "1", "-i", "{{in.frame}}"];
  const timing = ["-t", holdFor, "-r", "30"];
  const encode = ["-vf", videoFilter, "-c:v", "libx264", "-pix_fmt", "yuv420p"];
  return [...input, ...timing, ...encode, "{{out.video}}"];
}
|
|
8592
|
+
function trimArgs(durationS, offsetS = 0) {
|
|
8436
8593
|
return [
|
|
8437
8594
|
"-i",
|
|
8438
8595
|
"{{in.clip}}",
|
|
8596
|
+
...offsetS > 0 ? ["-ss", offsetS.toFixed(3)] : [],
|
|
8439
8597
|
"-t",
|
|
8440
8598
|
durationS.toFixed(3),
|
|
8441
8599
|
"-an",
|
|
@@ -8462,6 +8620,25 @@ var Sfx = z3.object({
|
|
|
8462
8620
|
sound_effect_prompt: z3.string().optional(),
|
|
8463
8621
|
description: z3.string().optional()
|
|
8464
8622
|
}).loose();
|
|
8623
|
+
var CompositionRegion = z3.object({
|
|
8624
|
+
// full | top | bottom | left | right | inset
|
|
8625
|
+
panel: z3.string().optional(),
|
|
8626
|
+
// 9-grid anchor for an `inset` presenter box.
|
|
8627
|
+
position: z3.string().optional(),
|
|
8628
|
+
is_presenter: z3.boolean().optional(),
|
|
8629
|
+
// The cast id shown/speaking in this region (routes lip-sync + element refs).
|
|
8630
|
+
cast_ref: z3.string().optional(),
|
|
8631
|
+
summary: z3.string().optional(),
|
|
8632
|
+
frame_prompt: z3.string().optional(),
|
|
8633
|
+
motion_prompt: z3.string().optional()
|
|
8634
|
+
}).loose();
|
|
8635
|
+
var SceneComposition = z3.object({
|
|
8636
|
+
// full_frame (default) | split_screen | pip | keyed_overlay
|
|
8637
|
+
layout: z3.string().optional(),
|
|
8638
|
+
// split_screen only: vertical (top/bottom) | horizontal (left/right).
|
|
8639
|
+
split_axis: z3.string().optional(),
|
|
8640
|
+
regions: z3.array(CompositionRegion).optional()
|
|
8641
|
+
}).loose();
|
|
8465
8642
|
var CameraMotion = z3.object({ movement: z3.string().optional(), detail: z3.string().optional() }).loose();
|
|
8466
8643
|
var TranscriptWord = z3.object({ text: z3.string().optional() }).loose();
|
|
8467
8644
|
var Scene = z3.object({
|
|
@@ -8470,6 +8647,10 @@ var Scene = z3.object({
|
|
|
8470
8647
|
duration_s: z3.number().optional(),
|
|
8471
8648
|
summary: z3.string().optional(),
|
|
8472
8649
|
action_detail: z3.string().optional(),
|
|
8650
|
+
// The scene's spatial layout. Absent/full_frame ⇒ one uncut shot (default path).
|
|
8651
|
+
// A layered layout (split_screen/pip/keyed_overlay) with regions ⇒ the scaffold
|
|
8652
|
+
// builds one clip per region and stacks/overlays them into the scene picture.
|
|
8653
|
+
composition: SceneComposition.optional(),
|
|
8473
8654
|
// The capture "look" for this scene — selected from the ad-native shoot-mode
|
|
8474
8655
|
// grammar (see lib/shoot-modes.ts). When absent the scaffold auto-derives a
|
|
8475
8656
|
// UGC/product mode; a human can override per scene by setting this.
|
|
@@ -8495,7 +8676,12 @@ var Scene = z3.object({
|
|
|
8495
8676
|
floating_elements: z3.array(z3.unknown()).optional(),
|
|
8496
8677
|
transcript_slice: z3.array(TranscriptWord).optional(),
|
|
8497
8678
|
start_frame_asset: FrameAsset,
|
|
8498
|
-
end_frame_asset: FrameAsset
|
|
8679
|
+
end_frame_asset: FrameAsset,
|
|
8680
|
+
// DECON-supplied: true when this scene is a length-split CONTINUATION of the
|
|
8681
|
+
// previous one (the SAME physical shot, broken up only because it exceeded the
|
|
8682
|
+
// clip ceiling). The scaffold then shares the splice keyframe — this scene's
|
|
8683
|
+
// start frame IS the previous scene's end frame — so the join is seamless.
|
|
8684
|
+
continues_previous: z3.boolean().optional()
|
|
8499
8685
|
}).loose();
|
|
8500
8686
|
var VideoBlueprint = z3.object({
|
|
8501
8687
|
source: z3.object({ aspect_ratio: z3.string().optional(), duration_s: z3.number().optional() }).loose().optional(),
|
|
@@ -8600,6 +8786,40 @@ function annotateBlueprintWithElements(blueprintInput, elementsInput) {
|
|
|
8600
8786
|
clone.reference_elements = summary;
|
|
8601
8787
|
return clone;
|
|
8602
8788
|
}
|
|
8789
|
+
// Per-scene keys kept by slimBlueprintForSelection.
var SELECT_SCENE_FIELDS = [
  "index",
  "start_s",
  "end_s",
  "duration_s",
  "summary",
  "narrative_role",
  "action_detail",
  "start_frame_prompt",
  "end_frame_prompt"
];
// Keys of `blueprint.global` kept by slimBlueprintForSelection.
var SELECT_GLOBAL_FIELDS = ["cast", "branding", "voiceover"];
/**
 * Returns a reduced copy of a blueprint holding only whitelisted fields.
 *
 * Kept: top-level `version` and `source`, the SELECT_GLOBAL_FIELDS keys of
 * `global`, and the SELECT_SCENE_FIELDS keys of each scene. Kept values are
 * copied by reference, not cloned.
 *
 * @param {unknown} blueprintInput - Blueprint object. Anything that is not
 *   a plain (non-array) object is returned unchanged.
 * @returns {unknown} The slimmed blueprint. `global` is only present when
 *   the input's `global` is a non-array object, and `scenes` only when the
 *   input's `scenes` is an array; scene entries that are not non-array
 *   objects are passed through as-is.
 */
function slimBlueprintForSelection(blueprintInput) {
  const isRecord = (value) => Boolean(value) && typeof value === "object" && !Array.isArray(value);
  const pick = (src, keys) => {
    const picked = {};
    keys.forEach((key) => {
      if (key in src) picked[key] = src[key];
    });
    return picked;
  };

  if (!isRecord(blueprintInput)) {
    return blueprintInput;
  }

  const slimmed = pick(blueprintInput, ["version", "source"]);
  if (isRecord(blueprintInput.global)) {
    slimmed.global = pick(blueprintInput.global, SELECT_GLOBAL_FIELDS);
  }
  if (Array.isArray(blueprintInput.scenes)) {
    slimmed.scenes = blueprintInput.scenes.map((scene) =>
      isRecord(scene) ? pick(scene, SELECT_SCENE_FIELDS) : scene
    );
  }
  return slimmed;
}
|
|
8603
8823
|
function roleForType2(type) {
|
|
8604
8824
|
switch (type.toLowerCase()) {
|
|
8605
8825
|
case "logo":
|
|
@@ -8633,7 +8853,16 @@ function todoPath2(el, label) {
|
|
|
8633
8853
|
return `[TODO: drop one real source image for ${label} (${el.type})${desc}${expr} \u2014 reused across every frame it appears in${fresh}${same}]`;
|
|
8634
8854
|
}
|
|
8635
8855
|
function buildElementSlots(elements) {
|
|
8636
|
-
const usedIds = /* @__PURE__ */ new Set([
|
|
8856
|
+
const usedIds = /* @__PURE__ */ new Set([
|
|
8857
|
+
"prompt",
|
|
8858
|
+
"spine",
|
|
8859
|
+
"overlaid",
|
|
8860
|
+
"captions",
|
|
8861
|
+
"captions_transcript",
|
|
8862
|
+
"audio_mix",
|
|
8863
|
+
"final",
|
|
8864
|
+
"music_bed"
|
|
8865
|
+
]);
|
|
8637
8866
|
const slots = [];
|
|
8638
8867
|
assignElementLabels2(elements).forEach(({ el, label }, i) => {
|
|
8639
8868
|
let id = sanitizeId2(`el_${label}`, `el_${i}`);
|
|
@@ -8646,6 +8875,7 @@ function buildElementSlots(elements) {
|
|
|
8646
8875
|
type: el.type,
|
|
8647
8876
|
description: el.description,
|
|
8648
8877
|
sameAs: el.same_as ?? void 0,
|
|
8878
|
+
castId: el.cast_id ?? void 0,
|
|
8649
8879
|
presence: presenceOf(el)
|
|
8650
8880
|
});
|
|
8651
8881
|
});
|
|
@@ -8684,7 +8914,7 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
|
|
|
8684
8914
|
const legend = [
|
|
8685
8915
|
...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
|
|
8686
8916
|
...hasAnchor ? [
|
|
8687
|
-
"- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions
|
|
8917
|
+
"- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions. IGNORE its text, its logo, its brand name, and its colors entirely \u2014 it is a DIFFERENT brand's footage, here only to anchor layout/pose, never identity or palette."
|
|
8688
8918
|
] : []
|
|
8689
8919
|
].join("\n");
|
|
8690
8920
|
const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
|
|
@@ -8703,6 +8933,9 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
|
|
|
8703
8933
|
"a studio) contains any text or graphics, DO NOT reproduce them \u2014 render the subject/scene",
|
|
8704
8934
|
"only, leaving the regions where overlays will sit clean. Imperfect/garbled letterforms or",
|
|
8705
8935
|
"stray icons are the worst outcome; leave those areas blank.",
|
|
8936
|
+
"A SCREEN/UI surface \u2014 an app, website, chat, dashboard, or phone display \u2014 is NEVER",
|
|
8937
|
+
"rendered here: leave any phone/screen OFF or blank-screened. The real interface is",
|
|
8938
|
+
"composited later as a screenshot or a brand HTML block, never AI-generated.",
|
|
8706
8939
|
"",
|
|
8707
8940
|
"FRAMING \u2014 ONE UNCUT FRAME:",
|
|
8708
8941
|
"Render ONE single uncut photographic frame: NO split screen, NO panels, NO dividing line,",
|
|
@@ -8730,41 +8963,71 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
|
|
|
8730
8963
|
"REFERENCE IMAGES (in the order provided):",
|
|
8731
8964
|
legend,
|
|
8732
8965
|
"",
|
|
8733
|
-
|
|
8734
|
-
|
|
8735
|
-
|
|
8736
|
-
|
|
8966
|
+
// RECAST is the whole point of a transform: the dropped el_* images define who/
|
|
8967
|
+
// what is on screen, NOT the source footage and NOT the prose. Without this, the
|
|
8968
|
+
// model reproduces the original ad's people (a proven failure mode).
|
|
8969
|
+
...present.length > 0 ? [
|
|
8970
|
+
"IDENTITY & AESTHETIC \u2014 RECAST (this is a transform, not a copy):",
|
|
8971
|
+
"Identity comes from the reference image, never from the source footage or this prose. Render every",
|
|
8972
|
+
"person, animal, product, and set to MATCH its labeled reference image above \u2014 that image is the ONLY",
|
|
8973
|
+
"source of their identity, wardrobe, styling, and look. This is a complete recast: do NOT reproduce,",
|
|
8974
|
+
"trace, or resemble any individual, animal, product, or set from the source ad. Where the FRAME",
|
|
8975
|
+
"DESCRIPTION below names an appearance detail (hair, outfit, color, age, breed, brand of an object),",
|
|
8976
|
+
"IGNORE that wording \u2014 the reference image is the truth; use the description ONLY for pose, expression,",
|
|
8977
|
+
"action, framing, lighting, and palette.",
|
|
8978
|
+
""
|
|
8979
|
+
] : [
|
|
8980
|
+
"Identity comes from the reference image, never from prose \u2014 render the subject to MATCH it and",
|
|
8981
|
+
"describe only pose, expression, action, framing, and lighting in the FRAME DESCRIPTION below.",
|
|
8982
|
+
""
|
|
8983
|
+
],
|
|
8737
8984
|
"FRAME DESCRIPTION (this frame's editable prompt):",
|
|
8738
8985
|
description,
|
|
8739
8986
|
"",
|
|
8740
|
-
"
|
|
8987
|
+
"Render exactly what the FRAME DESCRIPTION and the SHARED AD SPEC specify \u2014 this is the authoritative ad: its cast identity (via the reference images), palette, brand, and intent are law. Keep every recurring element identical to its reference image across all frames. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
|
|
8741
8988
|
"",
|
|
8742
|
-
"
|
|
8989
|
+
"SHARED AD SPEC (authoritative \u2014 the ad blueprint this frame belongs to; align cast/palette/brand/type with it):",
|
|
8743
8990
|
"{{target_blueprint}}"
|
|
8744
8991
|
].join("\n");
|
|
8745
8992
|
}
|
|
8993
|
+
/**
 * Emits an `ingest` node for an image URL and returns a reference to its
 * asset, reusing an earlier reference when `ctx.ingestCache` has one.
 *
 * @param {string} url - Image URL to ingest.
 * @param {string} edge - Frame edge name, used in the node id.
 * @param {{ sceneIndex: number, tag?: string, ingestCache?: Map<string, string> }} ctx
 *   Scene context. The cache is optional; without it every call emits a node.
 * @param {object[]} nodes - Node list; a new node is appended on a cache miss.
 * @returns {string} `$ref:<nodeId>.asset`.
 */
function ingestFrameRef(url, edge, ctx, nodes) {
  const cache = ctx.ingestCache;
  const known = cache?.get(url);
  if (known) {
    return known;
  }

  const nodeId = `s${ctx.sceneIndex}${ctx.tag ?? ""}_${edge}_ref`;
  nodes.push({ id: nodeId, type: "ingest", params: { source: "url", url, expect: "image" } });

  const assetRef = `$ref:${nodeId}.asset`;
  cache?.set(url, assetRef);
  return assetRef;
}
|
|
8746
9003
|
/**
 * Produces the reference to one keyframe image of a scene.
 *
 * With `ctx.reuse` set and a `url` available, the original frame is ingested
 * and returned directly. Otherwise an `image_generate` node is emitted whose
 * references are the present element slots plus, only when no present slot
 * is a person or animal, the ingested original frame as a layout anchor.
 *
 * @param {string} edge - Frame edge name, used in node ids and the prompt.
 * @param {string | undefined} url - URL of the original frame, if any.
 * @param {string | undefined} framePrompt - Editable description of this frame.
 * @param {Array<{ type: string, ref: string }>} present - Element slots in frame.
 * @param {object} ctx - Scene context (`sceneIndex`, `tag`, `reuse`,
 *   `imageModel`, `ar`, `shootMode`, and optionally `ingestCache`).
 * @param {object[]} nodes - Node list that receives the emitted nodes.
 * @returns {string} Reference to the ingested asset or the generated image.
 */
function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
  const tag = ctx.tag ?? "";
  if (ctx.reuse && url) {
    return ingestFrameRef(url, edge, ctx, nodes);
  }

  const livingTypes = ["person", "animal"];
  const hasPersonOrAnimal = present.some((slot) => livingTypes.includes(slot.type.toLowerCase()));
  // The original frame is used as an anchor only when no person/animal slot is present.
  const anchorOnOriginal = Boolean(url) && !hasPersonOrAnimal;

  let originalRef;
  if (anchorOnOriginal && url) {
    originalRef = ingestFrameRef(url, edge, ctx, nodes);
  }

  const reference = present.map((slot) => slot.ref);
  if (originalRef) {
    reference.push(originalRef);
  }

  const genParams = {
    model: ctx.imageModel,
    image_size: "2K",
    prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, anchorOnOriginal, ctx.shootMode)
  };
  if (ctx.ar) {
    genParams.aspect_ratio = ctx.ar;
  }

  const inputs = { target_blueprint: "$ref:prompt.asset" };
  if (reference.length > 0) {
    inputs.reference = reference;
  }

  const genId = `s${ctx.sceneIndex}${tag}_${edge}`;
  nodes.push({
    id: genId,
    type: "image_generate",
    // `params.prompt` is this frame's own editable description, while
    // `target_blueprint` is the shared ad spec every frame stays consistent with.
    inputs,
    params: genParams
  });
  return `$ref:${genId}.images#0`;
}
|
|
8769
9032
|
function seedanceAudioLine(scene, mode, audio, nativeLine) {
|
|
8770
9033
|
const ambient = scene.ambient?.trim() || diegeticFor(mode);
|
|
@@ -8810,10 +9073,11 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
|
|
|
8810
9073
|
);
|
|
8811
9074
|
return parts.join("\n");
|
|
8812
9075
|
}
|
|
8813
|
-
function audioExtractArgs(durationS) {
|
|
9076
|
+
function audioExtractArgs(durationS, offsetS = 0) {
|
|
8814
9077
|
return [
|
|
8815
9078
|
"-i",
|
|
8816
9079
|
"{{in.clip}}",
|
|
9080
|
+
...offsetS > 0.05 ? ["-ss", offsetS.toFixed(3)] : [],
|
|
8817
9081
|
"-t",
|
|
8818
9082
|
durationS.toFixed(3),
|
|
8819
9083
|
"-vn",
|
|
@@ -8841,27 +9105,21 @@ function sceneShootMode(scene, present, nativeTurn, cameraOn, casts) {
|
|
|
8841
9105
|
hasProduct: present.some((s) => s.type.toLowerCase() === "product")
|
|
8842
9106
|
});
|
|
8843
9107
|
}
|
|
8844
|
-
function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks) {
|
|
9108
|
+
function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks, nativeSegments, clipRef = `$ref:s${i}_clip.video`) {
|
|
8845
9109
|
if (nativeTurn) {
|
|
8846
|
-
const
|
|
9110
|
+
const speechWindow = Math.max(0.5, nativeTurn.end_s - nativeTurn.start_s);
|
|
9111
|
+
const extractLen = Math.min(speechWindow, lengths.genDur);
|
|
8847
9112
|
nodes.push({
|
|
8848
9113
|
id: `s${i}_voextract`,
|
|
8849
9114
|
type: "ffmpeg",
|
|
8850
|
-
inputs: { clip:
|
|
9115
|
+
inputs: { clip: clipRef },
|
|
8851
9116
|
params: { args: audioExtractArgs(extractLen), outputs: { audio: { kind: "audio", ext: "mp3" } } }
|
|
8852
9117
|
});
|
|
8853
|
-
|
|
8854
|
-
|
|
8855
|
-
|
|
8856
|
-
inputs: { audio: `$ref:s${i}_voextract.audio`, voice_ref: `$ref:${nativeTurn.voiceNode}.voice_id` },
|
|
8857
|
-
params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
|
|
8858
|
-
});
|
|
8859
|
-
voTracks.push({
|
|
8860
|
-
slot: `s${i}_voconv`,
|
|
8861
|
-
ref: `$ref:s${i}_voconv.audio`,
|
|
9118
|
+
nativeSegments.push({
|
|
9119
|
+
voiceNode: nativeTurn.voiceNode,
|
|
9120
|
+
ref: `$ref:s${i}_voextract.audio`,
|
|
8862
9121
|
start_s: nativeTurn.start_s,
|
|
8863
|
-
end_s: nativeTurn.start_s + extractLen
|
|
8864
|
-
kind: "vo"
|
|
9122
|
+
end_s: nativeTurn.start_s + extractLen
|
|
8865
9123
|
});
|
|
8866
9124
|
} else if (ambientBroll) {
|
|
8867
9125
|
const ambientStart = scene.start_s ?? 0;
|
|
@@ -8881,85 +9139,260 @@ function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes
|
|
|
8881
9139
|
});
|
|
8882
9140
|
}
|
|
8883
9141
|
}
|
|
8884
|
-
function
|
|
8885
|
-
const
|
|
8886
|
-
const
|
|
8887
|
-
|
|
8888
|
-
|
|
8889
|
-
|
|
8890
|
-
|
|
8891
|
-
const
|
|
8892
|
-
|
|
8893
|
-
const
|
|
8894
|
-
const
|
|
8895
|
-
const
|
|
8896
|
-
|
|
8897
|
-
|
|
8898
|
-
|
|
8899
|
-
"start",
|
|
8900
|
-
scene.start_frame_asset?.url,
|
|
8901
|
-
scene.start_frame_prompt,
|
|
8902
|
-
slotsForFrame(slots, i, "start"),
|
|
8903
|
-
ctx,
|
|
8904
|
-
nodes
|
|
8905
|
-
);
|
|
8906
|
-
const lastFrame = buildFrameRef(
|
|
8907
|
-
"end",
|
|
8908
|
-
scene.end_frame_asset?.url,
|
|
8909
|
-
scene.end_frame_prompt,
|
|
8910
|
-
slotsForFrame(slots, i, "end"),
|
|
8911
|
-
ctx,
|
|
8912
|
-
nodes
|
|
8913
|
-
);
|
|
8914
|
-
const dur = sceneDurationS(scene);
|
|
8915
|
-
let out = sceneOutTransition(scene, i === lastIndex);
|
|
8916
|
-
let trimTarget = dur + (out?.dur ?? 0);
|
|
8917
|
-
if (out && ceilToSeedance(trimTarget) < trimTarget) {
|
|
8918
|
-
out = null;
|
|
8919
|
-
trimTarget = dur;
|
|
8920
|
-
}
|
|
8921
|
-
const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
|
|
8922
|
-
const genDur = ceilToSeedance(Math.max(trimTarget, speech));
|
|
8923
|
-
const clipParams = {
|
|
8924
|
-
model: opts.videoModel,
|
|
8925
|
-
prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
|
|
8926
|
-
duration: genDur,
|
|
8927
|
-
// Native talking scene → Seedance generates the spoken audio + lip-sync;
|
|
8928
|
-
// an opt-in ambient b-roll beat generates diegetic ambient only; otherwise the
|
|
8929
|
-
// clip is silent and audio comes from the tts/music timeline.
|
|
8930
|
-
generate_audio: Boolean(nativeTurn) || ambientBroll
|
|
8931
|
-
};
|
|
8932
|
-
if (ar) clipParams.aspect_ratio = ar;
|
|
9142
|
+
/**
 * Groups native speech segments by speaker and emits, per speaker, one
 * `audio_timeline` node placing that speaker's segments and one
 * `audio_voice_convert` node converting the resulting track.
 *
 * @param {Array<{ voiceNode: string, ref: string, start_s: number }>} segments
 *   Extracted speech segments; `voiceNode` identifies the speaker's voice node.
 * @param {number} totalMs - Passed through as each timeline's `total_ms`.
 * @param {object[]} nodes - Node list that receives the emitted nodes.
 * @returns {Array<{ slot: string, ref: string, start_s: number, kind: string }>}
 *   One "vo" track per speaker, starting at 0, in first-appearance order.
 */
function buildPerSpeakerVoiceConversion(segments, totalMs, nodes) {
  const grouped = /* @__PURE__ */ new Map();
  segments.forEach((segment) => {
    if (!grouped.has(segment.voiceNode)) {
      grouped.set(segment.voiceNode, []);
    }
    grouped.get(segment.voiceNode).push(segment);
  });

  return Array.from(grouped, ([voiceNode, speakerSegs]) => {
    const trackId = `${voiceNode}_track`;
    const convId = `${voiceNode}_conv`;
    const slotNames = speakerSegs.map((_, k) => `seg${k}`);

    nodes.push({
      id: trackId,
      type: "audio_timeline",
      inputs: Object.fromEntries(speakerSegs.map((seg, k) => [slotNames[k], seg.ref])),
      params: {
        tracks: speakerSegs.map((seg, k) => ({ slot: slotNames[k], start_s: seg.start_s })),
        total_ms: totalMs
      }
    });
    nodes.push({
      id: convId,
      type: "audio_voice_convert",
      inputs: { audio: `$ref:${trackId}.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
      params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
    });

    return { slot: convId, ref: `$ref:${convId}.audio`, start_s: 0, kind: "vo" };
  });
}
|
|
9176
|
+
/**
 * Emits the `video_generate` node for a scene (or scene region) and, when
 * the generated duration differs from the trim target, a follow-up ffmpeg
 * trim node.
 *
 * @param {number} i - Scene index, used in node ids and the prompt.
 * @param {object} scene - Scene record passed to the prompt builder.
 * @param {object[]} present - Element slots present in the scene.
 * @param {*} mode - Shoot mode passed to the prompt builder.
 * @param {{ text?: string } | null | undefined} nativeTurn - Native speech turn, if any.
 * @param {boolean} ambientBroll - Whether the clip should generate ambient audio.
 * @param {{ first: string, last?: string }} frames - Keyframe references.
 * @param {{ genDur: number, trimTarget: number, dur: number }} lengths - Durations.
 * @param {*} out - Outgoing transition, returned unchanged.
 * @param {{ videoModel: string, ar?: string }} opts - Scaffold options.
 * @param {object[]} nodes - Node list that receives the emitted nodes.
 * @param {string} [tag=""] - Suffix inserted into node ids.
 * @returns {{ ref: string, scene_s: number, out: * }} Reference to the
 *   generated or trimmed clip, plus the scene duration and transition.
 */
function emitSceneClip(i, scene, present, mode, nativeTurn, ambientBroll, frames, lengths, out, opts, nodes, tag = "") {
  const clipId = `s${i}${tag}_clip`;
  // The clip carries audio when there is a native speech turn or ambient
  // b-roll was requested, and is silent otherwise.
  const wantsAudio = Boolean(nativeTurn) || ambientBroll;

  const clipParams = {
    model: opts.videoModel,
    prompt: buildSeedancePrompt(scene, i, present, mode, wantsAudio, nativeTurn?.text),
    duration: lengths.genDur,
    generate_audio: wantsAudio
  };
  if (opts.ar) {
    clipParams.aspect_ratio = opts.ar;
  }

  const inputs = { first_frame: frames.first };
  if (frames.last) {
    inputs.last_frame = frames.last;
  }
  nodes.push({ id: clipId, type: "video_generate", inputs, params: clipParams });

  const generatedRef = `$ref:${clipId}.video`;
  const needsTrim = lengths.genDur !== lengths.trimTarget;
  if (!needsTrim) {
    return { ref: generatedRef, scene_s: lengths.dur, out };
  }

  const trimId = `${clipId}_trim`;
  nodes.push({
    id: trimId,
    type: "ffmpeg",
    inputs: { clip: generatedRef },
    params: { args: trimArgs(lengths.trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
  });
  return { ref: `$ref:${trimId}.video`, scene_s: lengths.dur, out };
}
|
|
9202
|
+
// Layout names that are built as a multi-region composite.
var COMPOSITE_LAYOUTS = /* @__PURE__ */ new Set(["split_screen", "pip", "keyed_overlay"]);
// Whole-word, case-insensitive vocabulary that marks a region as a screen/UI surface.
var UI_SURFACE_RE = /\b(?:app|ui|web ?site|web ?page|website|browser|chat|interface|mock-?up|in[- ]?app|dashboard|app screen|phone screen|screen[- ]?(?:recording|capture|grab|share))\b/i;
/**
 * Tests whether a region's `panel`, `summary` or `frame_prompt` text matches
 * UI_SURFACE_RE.
 *
 * @param {{ panel?: string, summary?: string, frame_prompt?: string }} r
 * @returns {boolean}
 */
function regionIsUiSurface(r) {
  const haystack = [r.panel, r.summary, r.frame_prompt].map((part) => part ?? "").join(" ");
  return UI_SURFACE_RE.test(haystack);
}
/**
 * Tests whether a region list has at least one UI-surface region and at
 * most one region that is not a UI surface.
 *
 * @param {object[]} regions
 * @returns {boolean}
 */
function isUiOnlyComposite(regions) {
  let uiCount = 0;
  for (const region of regions) {
    if (regionIsUiSurface(region)) uiCount += 1;
  }
  const nonUiCount = regions.length - uiCount;
  return uiCount >= 1 && nonUiCount <= 1;
}
/**
 * Decides whether a scene should be built as a layered composite.
 *
 * @param {{ composition?: { layout?: string, regions?: unknown[] } }} scene
 * @returns {{ layout: string, regions: object[], comp: object } | null}
 *   The lower-cased layout, the object-valued regions and the composition
 *   record. `null` when the layout is not in COMPOSITE_LAYOUTS, fewer than
 *   two usable regions remain, or `isUiOnlyComposite` flags the regions.
 */
function layeredComposition(scene) {
  const comp = scene.composition;
  const layout = (comp?.layout ?? "").toLowerCase();
  if (!COMPOSITE_LAYOUTS.has(layout)) {
    return null;
  }

  const declared = comp?.regions ?? [];
  const regions = declared.filter((r) => Boolean(r) && typeof r === "object");
  if (regions.length < 2 || isUiOnlyComposite(regions)) {
    return null;
  }
  return { layout, regions, comp: comp ?? {} };
}
|
|
9220
|
+
/**
 * Chooses the stacking axis for a split-screen composition.
 *
 * Region panel names win over the declared axis: any "top"/"bottom" panel
 * means vertical, otherwise any "left"/"right" panel means horizontal. With
 * no such panel, `comp.split_axis` is used, and anything other than
 * "horizontal" falls back to vertical.
 *
 * @param {{ split_axis?: string }} comp - Composition record.
 * @param {Array<{ panel?: string }>} regions - Composition regions.
 * @returns {"vertical" | "horizontal"}
 */
function splitAxisOf(comp, regions) {
  const panels = regions.map((region) => (region.panel ?? "").toLowerCase());
  if (panels.includes("top") || panels.includes("bottom")) {
    return "vertical";
  }
  if (panels.includes("left") || panels.includes("right")) {
    return "horizontal";
  }
  const declared = (comp.split_axis ?? "").toLowerCase();
  return declared === "horizontal" ? "horizontal" : "vertical";
}
|
|
9226
|
+
/**
 * Orders region clip references for stacking along `axis`.
 *
 * For a vertical axis "top" sorts first and "bottom" last; for any other
 * axis "left" sorts first and "right" last. Every other panel name sits in
 * between, and ties keep their original order.
 *
 * @param {Array<{ panel?: string } | undefined>} regions - Regions, index-aligned with `regionRefs`.
 * @param {string[]} regionRefs - One clip reference per region.
 * @param {string} axis - "vertical" or "horizontal".
 * @returns {string[]} The references in stacking order.
 */
function orderSplitRefs(regions, regionRefs, axis) {
  const [leading, trailing] = axis === "vertical" ? ["top", "bottom"] : ["left", "right"];
  const rankOf = (panel) => {
    const name = (panel ?? "").toLowerCase();
    if (name === leading) return 0;
    if (name === trailing) return 2;
    return 1;
  };

  const decorated = regionRefs.map((ref, k) => ({ ref, k, rank: rankOf(regions[k]?.panel) }));
  decorated.sort((a, b) => a.rank - b.rank || a.k - b.k);
  return decorated.map((entry) => entry.ref);
}
|
|
9234
|
+
/**
 * Finds which region holds the presenter.
 *
 * @param regions   Region descriptors (reads `is_presenter`).
 * @param hasNative Whether the scene has a native-speech turn.
 * @returns {number} Index of the first region flagged `is_presenter`; otherwise
 *   0 when `hasNative` is truthy, otherwise -1.
 */
function presenterIndexOf(regions, hasNative) {
  const flaggedIndex = regions.findIndex((region) => region.is_presenter);
  if (flaggedIndex >= 0) {
    return flaggedIndex;
  }
  if (hasNative) {
    return 0;
  }
  return -1;
}
|
|
9239
|
+
/**
 * Splits the scene's element slots between the presenter region and the rest.
 *
 * The presenter region gets the slots whose type is "person" or "animal"
 * (case-insensitive); every other region gets the remaining slots.
 *
 * @param present     Slots present in the scene (each has a string `type`).
 * @param isPresenter Whether the slots are for the presenter region.
 * @returns {Array} Filtered copy of `present`.
 */
function slotsForRegion(present, isPresenter) {
  const isLivingSubject = (slot) => {
    const kind = slot.type.toLowerCase();
    return kind === "person" || kind === "animal";
  };
  const wantLiving = Boolean(isPresenter);
  return present.filter((slot) => isLivingSubject(slot) === wantLiving);
}
|
|
9246
|
+
/**
 * Emits the nodes for one layered scene: a generated clip per region, then one
 * ffmpeg node (`s<i>_composite`) that combines them.
 *
 * "split_screen" stacks all region clips in panel order; any other layout overlays
 * the presenter clip (input c1) on a background clip (input c0), with a
 * background-removal node inserted first for "keyed_overlay" when a presenter exists.
 *
 * @param layout     Lower-cased layout name.
 * @param regions    Region descriptors of the composition.
 * @param comp       Composition object (read for the split axis).
 * @param scene      Blueprint scene; region fields override its summary and prompts.
 * @param i          Scene index, used in node ids.
 * @param present    Element slots present in the scene.
 * @param mode       Shoot mode forwarded to frame and clip generation.
 * @param nativeTurn Native-speech turn, given only to the presenter region.
 * @param lengths    Timing bundle forwarded to emitSceneClip; `dur` becomes `scene_s`.
 * @param out        Outgoing transition recorded on the returned clip.
 * @param opts       Reads `ar`, `reuse`, `imageModel`, `videoModel`.
 * @param nodes      Node list, appended to in place.
 * @returns {{ clip: { ref: string, scene_s: *, out: * }, presenterClipRef: (string|undefined) }}
 */
function buildCompositeScene(layout, regions, comp, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes) {
  const dims = canvasDims(opts.ar);
  const presIdx = presenterIndexOf(regions, Boolean(nativeTurn));
  const hasPresenter = presIdx >= 0;
  const regionRefs = [];
  let presenterPosition;

  // One start frame, one end frame and one clip per region, tagged `_r<index>`.
  regions.forEach((region, r) => {
    const isPresenter = r === presIdx;
    const tag = `_r${r}`;
    const regionSlots = slotsForRegion(present, isPresenter);
    const ctx = {
      sceneIndex: i,
      ar: opts.ar,
      reuse: opts.reuse,
      imageModel: opts.imageModel,
      shootMode: mode,
      tag
    };
    // A region-level frame prompt replaces both the scene's start and end prompts.
    const startPrompt = region.frame_prompt ?? scene.start_frame_prompt;
    const endPrompt = region.frame_prompt ?? scene.end_frame_prompt;
    const first = buildFrameRef("start", void 0, startPrompt, regionSlots, ctx, nodes);
    const last = buildFrameRef("end", void 0, endPrompt, regionSlots, ctx, nodes);
    const regionScene = {
      ...scene,
      summary: region.summary ?? scene.summary,
      motion_prompt: region.motion_prompt ?? scene.motion_prompt,
      // Only the presenter region keeps the dialogue.
      dialogue: isPresenter ? scene.dialogue : []
    };
    const { ref } = emitSceneClip(
      i,
      regionScene,
      regionSlots,
      mode,
      isPresenter ? nativeTurn : void 0,
      false,
      { first, last },
      lengths,
      null,
      { ar: opts.ar, videoModel: opts.videoModel },
      nodes,
      tag
    );
    regionRefs.push(ref);
    if (isPresenter) {
      presenterPosition = region.position;
    }
  });

  const compInputs = {};
  let args;
  if (layout === "split_screen") {
    const axis = splitAxisOf(comp, regions);
    const ordered = orderSplitRefs(regions, regionRefs, axis);
    for (let k = 0; k < ordered.length; k += 1) {
      compInputs[`c${k}`] = ordered[k];
    }
    args = splitStackArgs(regionRefs.length, axis, dims);
  } else {
    const keyed = layout === "keyed_overlay";
    // Background = first region that is not the presenter (falls back to region 0).
    const bgIdx = regions.findIndex((_, k) => k !== presIdx);
    const bgRef = regionRefs[bgIdx >= 0 ? bgIdx : 0];
    let presRef = regionRefs[hasPresenter ? presIdx : 1];
    if (keyed && hasPresenter) {
      const keyId = `s${i}_key`;
      nodes.push({ id: keyId, type: "video_background_remove", inputs: { video: presRef }, params: {} });
      presRef = `$ref:${keyId}.video`;
    }
    compInputs.c0 = bgRef;
    compInputs.c1 = presRef;
    args = pipOverlayArgs(dims, presenterPosition, keyed ? 0.5 : 0.34);
  }

  nodes.push({
    id: `s${i}_composite`,
    type: "ffmpeg",
    inputs: compInputs,
    params: { args, outputs: { video: { kind: "video", ext: "mp4" } } }
  });

  return {
    clip: { ref: `$ref:s${i}_composite.video`, scene_s: lengths.dur, out },
    presenterClipRef: hasPresenter ? `$ref:s${i}_r${presIdx}_clip.video` : void 0
  };
}
|
|
9321
|
+
/**
 * Computes the timing bundle for one scene.
 *
 * The trim target is the scene duration plus the outgoing transition's duration.
 * When there is a transition and ceilToSeedance() of that target comes out
 * shorter than the target, the transition is dropped and the target falls back
 * to the plain scene duration.
 *
 * @param scene      Blueprint scene.
 * @param isLast     Whether this is the final scene.
 * @param nativeTurn Native-speech turn, if any (its text drives the speech estimate).
 * @returns {{ dur, out, trimTarget, genDur, speech }}
 */
function sceneTiming(scene, isLast, nativeTurn) {
  const dur = sceneDurationS(scene);
  const transition = sceneOutTransition(scene, isLast);
  const withTail = dur + (transition?.dur ?? 0);
  const dropTail = Boolean(transition) && ceilToSeedance(withTail) < withTail;
  const out = dropTail ? null : transition;
  const trimTarget = dropTail ? dur : withTail;
  const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
  // The generated clip must cover both the trimmed picture and the estimated speech.
  const genDur = ceilToSeedance(Math.max(trimTarget, speech));
  return { dur, out, trimTarget, genDur, speech };
}
|
|
9333
|
+
/**
 * Builds a composite scene, wires its native audio, and records the resulting clip.
 *
 * @param composite      `{ layout, regions, comp }` as returned by layeredComposition().
 * @param scene          Blueprint scene.
 * @param i              Scene index.
 * @param present        Element slots present in the scene.
 * @param mode           Shoot mode.
 * @param nativeTurn     Native-speech turn, if any.
 * @param lengths        Timing bundle (`dur`, `trimTarget`, `genDur`, `speech`).
 * @param out            Outgoing transition for the composite clip.
 * @param opts           Options forwarded to buildCompositeScene().
 * @param nodes          Node list, appended to in place.
 * @param voTracks       Voice tracks, forwarded to emitSceneNativeAudio().
 * @param nativeSegments Native segments, forwarded to emitSceneNativeAudio().
 * @param clips          Clip list; the composite clip is pushed onto it.
 */
function emitCompositeScene(composite, scene, i, present, mode, nativeTurn, lengths, out, opts, nodes, voTracks, nativeSegments, clips) {
  const { layout, regions, comp } = composite;
  const clipLengths = { dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur };
  const built = buildCompositeScene(layout, regions, comp, scene, i, present, mode, nativeTurn, clipLengths, out, opts, nodes);
  const audioLengths = { dur: lengths.dur, speech: lengths.speech, genDur: lengths.genDur };
  // Native audio is taken from the presenter's own clip ref.
  emitSceneNativeAudio(i, scene, nativeTurn, false, audioLengths, nodes, voTracks, nativeSegments, built.presenterClipRef);
  clips.push(built.clip);
}
|
|
9361
|
+
/**
 * Emits a still-frame hold for a scene: builds the scene's start frame and turns
 * it into a video of `lengths.trimTarget` via an ffmpeg node (`s<i>_clip`).
 *
 * @param i       Scene index.
 * @param scene   Blueprint scene (reads the start frame asset / prompt).
 * @param slots   All element slots.
 * @param ctx     Frame-building context passed to buildFrameRef().
 * @param lengths Timing bundle (`trimTarget`, `dur`).
 * @param out     Outgoing transition recorded on the clip.
 * @param ar      Aspect ratio used to size the canvas.
 * @param nodes   Node list, appended to in place.
 * @param clips   Clip list; the hold clip is pushed onto it.
 */
function emitFlashHold(i, scene, slots, ctx, lengths, out, ar, nodes, clips) {
  const startSlots = slotsForFrame(slots, i, "start");
  const frame = buildFrameRef("start", scene.start_frame_asset?.url, scene.start_frame_prompt, startSlots, ctx, nodes);
  const nodeId = `s${i}_clip`;
  const holdArgs = stillHoldArgs(lengths.trimTarget, canvasDims(ar));
  nodes.push({
    id: nodeId,
    type: "ffmpeg",
    inputs: { frame },
    params: {
      args: holdArgs,
      outputs: { video: { kind: "video", ext: "mp4" } }
    }
  });
  clips.push({ ref: `$ref:${nodeId}.video`, scene_s: lengths.dur, out });
}
|
|
9381
|
+
/**
 * Builds the "emotional arc" paragraph appended to the music prompt.
 *
 * The scenes' truthy `narrative_role` values are joined with " → ". When there
 * are none the result is the empty string; otherwise it is a paragraph (preceded
 * by a blank line) that also restates the instrumental-only constraint.
 *
 * @param blueprint Blueprint with a `scenes` array.
 * @returns {string}
 */
function musicArcDigest(blueprint) {
  const roles = blueprint.scenes.flatMap((scene) => (scene.narrative_role ? [scene.narrative_role] : []));
  const arc = roles.join(" \u2192 ");
  if (!arc) {
    return "";
  }
  return `\n\nEmotional arc across scenes: ${arc}. Shape the bed's energy to this arc, swelling on the payoff. Purely instrumental \u2014 no vocals, no singing, no spoken words.`;
}
|
|
8955
9388
|
/**
 * Assembles the full prompt for the music bed: the caller's prompt, then the
 * emotional-arc digest, then (when the original track was identified) a
 * "reference vibe" paragraph naming the track's title and, if known, its artist.
 *
 * @param blueprint   Blueprint (reads `global.music.identified_track`).
 * @param musicPrompt Base music prompt.
 * @returns {string}
 */
function musicBedPrompt(blueprint, musicPrompt) {
  const digest = musicArcDigest(blueprint);
  const track = blueprint.global?.music?.identified_track;
  const title = track?.title?.trim();
  let vibe = "";
  if (title) {
    const artist = track?.artist?.trim();
    const by = artist ? ` by ${artist}` : "";
    vibe = `\n\nReference vibe: the original used "${title}"${by} (identified via AudD). Match its mood, tempo, and energy with ORIGINAL music \u2014 do not reproduce the track.`;
  }
  return `${musicPrompt}${digest}${vibe}`;
}
|
|
8964
9397
|
function onCameraDialogue(blueprint) {
|
|
8965
9398
|
const mode = blueprint.global?.voiceover?.mode;
|
|
@@ -8998,92 +9431,483 @@ function isOnCameraSpeaker(speaker, casts, cameraOn) {
|
|
|
8998
9431
|
if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
|
|
8999
9432
|
return casts.has(speaker);
|
|
9000
9433
|
}
|
|
9001
|
-
function
|
|
9002
|
-
const
|
|
9003
|
-
const
|
|
9434
|
+
/**
 * Creates a predicate answering "is this speaker's person element on screen in
 * this scene?".
 *
 * Presence sets come from the "person" slots: each slot with a `castId` is
 * indexed under `canonical(castId)`. A speaker without an entry falls back to
 * the only person slot when exactly one exists. With no presence information at
 * all the predicate answers true.
 *
 * @param slots     Element slots (reads `type`, `castId`, `presence`).
 * @param canonical Maps a cast id to its canonical speaker name.
 * @returns {(speaker: string, sceneIndex: number) => boolean}
 */
function makePresenterPresent(slots, canonical) {
  const people = slots.filter((slot) => slot.type.toLowerCase() === "person");
  const presenceBySpeaker = new Map(
    people.filter((slot) => slot.castId).map((slot) => [canonical(slot.castId), slot.presence])
  );
  const soleFallback = people.length === 1 ? people[0].presence : null;
  return (speaker, sceneIndex) => {
    const presence = presenceBySpeaker.get(speaker) ?? soleFallback;
    return presence ? presence.has(sceneIndex) : true;
  };
}
|
|
9445
|
+
// Pause threshold in seconds: buildPhrases() starts a new phrase when a line
// begins more than this long after the running phrase's current end.
var PAUSE_GAP_S = 0.6;
|
|
9446
|
+
// Upper bound on a phrase's coverage span: the last entry of SEEDANCE_DURATIONS
// (presumably the longest supported clip length — confirm the list is ascending).
// buildPhrases() breaks a run rather than let its span exceed this value.
var PHRASE_MAX_S = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
|
|
9447
|
+
/**
 * Returns the speaker-canonicalisation function for a blueprint.
 *
 * When exactly one distinct speaker across all dialogue is on camera, narrator
 * aliases (NARRATOR_SPEAKERS, matched case-insensitively) are mapped to that
 * presenter and every other name is returned unchanged. In every other case the
 * identity function is returned.
 *
 * @param blueprint Blueprint with `scenes[].dialogue[]`.
 * @returns {(speaker: string) => string}
 */
function collapseVoiceover(blueprint) {
  const castIds = castIdSet(blueprint);
  const dialogueOnCamera = onCameraDialogue(blueprint);
  const onCameraSpeakers = /* @__PURE__ */ new Set();
  for (const scene of blueprint.scenes) {
    for (const line of scene.dialogue ?? []) {
      // Lines without a speaker are attributed to "voiceover".
      const name = line.speaker ?? "voiceover";
      if (isOnCameraSpeaker(name, castIds, dialogueOnCamera)) {
        onCameraSpeakers.add(name);
      }
    }
  }
  if (onCameraSpeakers.size !== 1) {
    return (s) => s;
  }
  const [presenter] = onCameraSpeakers;
  return (speaker) => (NARRATOR_SPEAKERS.has(speaker.toLowerCase()) ? presenter : speaker);
}
|
|
9460
|
+
/**
 * Groups the blueprint's dialogue lines into phrases (continuous runs by one speaker).
 *
 * Lines from composite scenes and blank lines are ignored. The remaining lines
 * are sorted by start time and merged into a run until the speaker changes, the
 * pause before a line exceeds PAUSE_GAP_S, or the run's coverage span would
 * exceed PHRASE_MAX_S.
 *
 * @param blueprint        Blueprint with `scenes[]`.
 * @param canonical        Speaker canonicalisation function.
 * @param compositeScenes  Collection (with `.has`) of scene indices built as composites.
 * @param presenterPresent Predicate `(speaker, sceneIndex)` for on-screen presence.
 * @returns {Array<{speaker, start_s, end_s, text, firstScene, shownScenes, presenterShown}>}
 */
function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent) {
  const casts = castIdSet(blueprint);
  const cameraOn = onCameraDialogue(blueprint);
  const sceneEndOf = (index) => blueprint.scenes[index]?.end_s ?? blueprint.scenes[index]?.start_s ?? 0;

  // Scenes where two or more distinct on-camera speakers talk are never "shown" scenes.
  const multiSpeaker = /* @__PURE__ */ new Set();
  blueprint.scenes.forEach((scene, index) => {
    const speakers = (scene.dialogue ?? []).map((entry) => entry.speaker ?? "voiceover");
    const onCamera = new Set(speakers.filter((name) => isOnCameraSpeaker(name, casts, cameraOn)));
    if (onCamera.size >= 2) multiSpeaker.add(index);
  });

  const toTimedLine = (entry, scene, sceneIndex) => {
    const raw = entry.speaker ?? "voiceover";
    const speaker = canonical(raw);
    const text = entry.line.trim();
    const start = entry.start_s ?? scene.start_s ?? 0;
    return {
      sceneIndex,
      speaker,
      // Shown = a cast member speaking AND their element is actually on screen
      // here (not a cutaway). A b-roll cutaway mid-phrase fails this and gets
      // its own clip while the phrase voice plays under it.
      shown: isOnCameraSpeaker(raw, casts, cameraOn) && !multiSpeaker.has(sceneIndex) && presenterPresent(speaker, sceneIndex),
      start,
      // Real speech end. With no end_s, estimate it from the words — NOT the
      // scene end, which would fabricate continuity across a long silent gap
      // and wrongly merge two separate phrases.
      end: entry.end_s ?? start + estSpeechS(text),
      text
    };
  };

  const timedLines = blueprint.scenes.flatMap((scene, sceneIndex) => {
    if (compositeScenes.has(sceneIndex)) return [];
    return (scene.dialogue ?? []).filter((entry) => Boolean(entry.line?.trim())).map((entry) => toTimedLine(entry, scene, sceneIndex));
  }).sort((a, b) => a.start - b.start);

  const phrases = [];
  let run = null;
  const closeRun = () => {
    if (!run) return;
    const shownScenes = [...run.shown].sort((a, b) => a - b);
    phrases.push({
      speaker: run.speaker,
      start_s: run.start,
      end_s: run.end,
      text: run.texts.join(" "),
      firstScene: run.firstScene,
      shownScenes,
      presenterShown: shownScenes.length > 0
    });
    run = null;
  };

  for (const line of timedLines) {
    // A shown line stretches coverage to its whole scene; an unseen one covers only its speech.
    const cover = line.shown ? Math.max(line.end, sceneEndOf(line.sceneIndex)) : line.end;
    const clipStart = line.shown ? Math.min(line.start, blueprint.scenes[line.sceneIndex]?.start_s ?? line.start) : line.start;
    // Cap by SCENE COVERAGE span, not line end — a presenter run whose scenes span
    // more than one Seedance clip splits at this scene's boundary (never mid-scene),
    // so no segment ever reads past the generated clip.
    const continues = Boolean(run) && run.speaker === line.speaker && !(line.start - run.end > PAUSE_GAP_S) && !(Math.max(run.coverEnd, cover) - Math.min(run.clipStart, clipStart) > PHRASE_MAX_S);
    if (continues) {
      run.texts.push(line.text);
      run.end = Math.max(run.end, line.end);
      run.coverEnd = Math.max(run.coverEnd, cover);
      run.clipStart = Math.min(run.clipStart, clipStart);
    } else {
      closeRun();
      run = {
        speaker: line.speaker,
        firstScene: line.sceneIndex,
        start: line.start,
        end: line.end,
        coverEnd: cover,
        clipStart,
        texts: [line.text],
        shown: /* @__PURE__ */ new Set()
      };
    }
    if (line.shown) run.shown.add(line.sceneIndex);
  }
  closeRun();
  return phrases;
}
|
|
9539
|
+
/**
 * Creates a memoised "ensure voice node" function.
 *
 * The returned function yields the id of the `voice_select` node for a speaker,
 * pushing that node onto `nodes` the first time the speaker is seen.
 *
 * The voice description is taken, in order, from: the first dialogue line of that
 * (canonical) speaker carrying a `voice_description`, the matching cast entry's
 * description, the global voiceover description, and finally "<speaker> voice".
 *
 * @param blueprint Blueprint (dialogue, `global.cast`, `global.voiceover`).
 * @param canonical Speaker canonicalisation function.
 * @param nodes     Node list, appended to in place.
 * @returns {(speaker: string) => string} Returns the voice node id.
 */
function makeVoiceFactory(blueprint, canonical, nodes) {
  const nodeIdBySpeaker = /* @__PURE__ */ new Map();

  const describe = (speaker) => {
    for (const scene of blueprint.scenes) {
      for (const line of scene.dialogue ?? []) {
        const sameSpeaker = canonical(line.speaker ?? "voiceover") === speaker;
        if (sameSpeaker && line.voice_description) {
          return line.voice_description;
        }
      }
    }
    const castEntry = blueprint.global?.cast?.find((member) => member.id === speaker);
    return castEntry?.description ?? blueprint.global?.voiceover?.voice_description ?? `${speaker} voice`;
  };

  return (speaker) => {
    const known = nodeIdBySpeaker.get(speaker);
    if (known) {
      return known;
    }
    // The fallback id is numbered by how many voices exist so far.
    const id = sanitizeId2(`voice_${speaker}`, `voice_${nodeIdBySpeaker.size}`);
    const description = describe(speaker);
    nodes.push({ id, type: "voice_select", params: { description, ...parseVoiceTraits(description) } });
    nodeIdBySpeaker.set(speaker, id);
    return id;
  };
}
|
|
9558
|
+
/**
 * Emits one generated talking clip for a phrase whose presenter is on screen,
 * plus the audio pipeline that extracts and voice-converts its speech.
 *
 * The clip is anchored on the phrase's first shown scene and must cover every
 * shown scene. For each shown scene a slice descriptor is stored in
 * `out.sceneSlice` so the timeline can later cut that scene out of the clip.
 * Does nothing when the anchor scene does not exist.
 *
 * @param phrase    Phrase (`speaker`, `start_s`, `end_s`, `text`, `shownScenes`).
 * @param voiceNode Id of the speaker's voice_select node.
 * @param env       Shared build environment (blueprint, slots, opts, ar, reuse, ...).
 * @param nodes     Node list, appended to in place.
 * @param out       Accumulators: `voTracks`, `voSegments`, `talkingScenes`, `sceneSlice`.
 */
function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
  const shown = phrase.shownScenes;
  const anchor = shown[0];
  const anchorScene = env.blueprint.scenes[anchor];
  if (!anchorScene) return;

  const present = slotsForScene(env.slots, anchor);
  const nativeTurn = {
    sceneIndex: anchor,
    speaker: phrase.speaker,
    start_s: phrase.start_s,
    end_s: phrase.end_s,
    text: phrase.text,
    voiceNode,
    native: true
  };
  const mode = sceneShootMode(anchorScene, present, nativeTurn, env.cameraOn, env.casts);
  const ctx = {
    sceneIndex: anchor,
    ar: env.ar,
    reuse: env.reuse,
    imageModel: env.opts.imageModel,
    shootMode: mode,
    ingestCache: env.ingestCache
  };

  // First frame from the anchor scene, last frame from the final shown scene.
  const first = buildFrameRef(
    "start",
    anchorScene.start_frame_asset?.url,
    anchorScene.start_frame_prompt,
    slotsForFrame(env.slots, anchor, "start"),
    ctx,
    nodes
  );
  const lastShown = shown[shown.length - 1] ?? anchor;
  const lastScene = env.blueprint.scenes[lastShown] ?? anchorScene;
  const last = buildFrameRef(
    "end",
    lastScene.end_frame_asset?.url,
    lastScene.end_frame_prompt,
    slotsForFrame(env.slots, lastShown, "end"),
    ctx,
    nodes
  );

  // Clip window: earliest shown-scene start to latest shown-scene end, never
  // narrower than the speech itself.
  let clipStart = phrase.start_s;
  let coverEnd = phrase.end_s;
  for (const s of shown) {
    const shownScene = env.blueprint.scenes[s];
    clipStart = Math.min(clipStart, shownScene?.start_s ?? phrase.start_s);
    coverEnd = Math.max(coverEnd, shownScene?.end_s ?? 0);
  }
  const phraseLen = Math.max(0.5, coverEnd - clipStart);
  const genDur = ceilToSeedance(phraseLen);

  const clipId = `s${anchor}_clip`;
  const clipParams = {
    model: env.opts.videoModel,
    prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text),
    duration: genDur,
    generate_audio: true
  };
  if (env.ar) clipParams.aspect_ratio = env.ar;
  nodes.push({
    id: clipId,
    type: "video_generate",
    inputs: { first_frame: first, last_frame: last },
    params: clipParams
  });
  const clipRef = `$ref:${clipId}.video`;

  // Pull the spoken audio out of the clip; the extract never runs past the generated duration.
  const speechOffset = Math.max(0, phrase.start_s - clipStart);
  const speechLen = Math.max(0.5, phrase.end_s - phrase.start_s);
  const roomInClip = Math.max(0.5, genDur - speechOffset);
  const extractLen = Math.min(speechLen, roomInClip);
  const extractId = `s${anchor}_voextract`;
  nodes.push({
    id: extractId,
    type: "ffmpeg",
    inputs: { clip: clipRef },
    params: { args: audioExtractArgs(extractLen, speechOffset), outputs: { audio: { kind: "audio", ext: "mp3" } } }
  });

  const convId = `s${anchor}_conv`;
  nodes.push({
    id: convId,
    type: "audio_voice_convert",
    inputs: { audio: `$ref:${extractId}.audio`, voice_ref: `$ref:${voiceNode}.voice_id` },
    params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
  });

  out.voTracks.push({
    slot: convId,
    ref: `$ref:${convId}.audio`,
    start_s: phrase.start_s,
    end_s: phrase.end_s,
    kind: "vo"
  });
  out.voSegments.push({
    slot: convId,
    start_s: phrase.start_s,
    end_s: phrase.end_s,
    scene: anchor,
    speaker: phrase.speaker
  });
  out.talkingScenes.push({
    scene: anchor,
    voice_convert_node: convId,
    scene_s: Math.round(phraseLen * 100) / 100,
    est_speech_s: Math.round(estSpeechS(phrase.text) * 100) / 100
  });

  for (const s of shown) {
    const shownScene = env.blueprint.scenes[s];
    if (!shownScene) continue;
    const rawOffset = (shownScene.start_s ?? clipStart) - clipStart;
    out.sceneSlice.set(s, {
      clipRef,
      // Snap a sub-frame offset (line-start vs scene-start drift) to 0 so a single-scene
      // phrase hits the whole-clip fast path instead of a needless re-encode + tiny shift.
      offset: rawOffset < 0.05 ? 0 : rawOffset,
      len: sceneDurationS(shownScene),
      clipDur: genDur
    });
  }
}
|
|
9669
|
+
/**
 * Emits a text-to-speech node for a phrase and registers its audio as a voice track.
 *
 * The node id is derived from the phrase index and speaker; "_x" is appended
 * until the id is not yet in `used`.
 *
 * @param phrase    Phrase (`speaker`, `start_s`, `end_s`, `text`, `firstScene`).
 * @param voiceNode Id of the speaker's voice_select node.
 * @param idx       Index used in the node id.
 * @param used      Set of already-taken voice-over ids (updated in place).
 * @param nodes     Node list, appended to in place.
 * @param out       Accumulators: `voTracks`, `voSegments`.
 */
function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
  const { speaker, start_s, end_s } = phrase;
  let id = sanitizeId2(`vo_ph${idx}_${speaker}`, `vo_ph${idx}`);
  while (used.has(id)) {
    id = `${id}_x`;
  }
  used.add(id);
  nodes.push({
    id,
    type: "tts",
    inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
    params: { model: FIXED_TTS_MODEL, text: phrase.text, voice: "{{voice_ref}}" }
  });
  out.voTracks.push({ slot: id, ref: `$ref:${id}.audio`, start_s, end_s, kind: "vo" });
  out.voSegments.push({ slot: id, start_s, end_s, scene: phrase.firstScene, speaker });
}
|
|
9688
|
+
/**
 * Emits one composite scene into the timeline, including its voice handling.
 *
 * With exactly one on-camera speaker the scene gets a native-speech turn, so the
 * presenter region speaks in the generated clip. With two or more, the composite
 * is emitted without native speech and each speaker's lines go through
 * emitCompositeMultiSpeakerVoice().
 *
 * @param composite       Result of layeredComposition() for the scene.
 * @param scene           Blueprint scene.
 * @param i               Scene index.
 * @param isLast          Whether this is the final scene.
 * @param env             Shared build environment.
 * @param canonical       Speaker canonicalisation function.
 * @param ensureVoiceNode Returns (creating if needed) a speaker's voice node id.
 * @param usedVoIds       Set of taken voice-over node ids.
 * @param nodes           Node list, appended to in place.
 * @param out             Timeline accumulators.
 */
function emitCompositeInTimeline(composite, scene, i, isLast, env, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
  const present = slotsForScene(env.slots, i);
  // Non-blank lines spoken by someone who appears on camera.
  const onCam = (scene.dialogue ?? []).filter(
    (entry) => Boolean(entry.line?.trim()) && isOnCameraSpeaker(entry.speaker ?? "voiceover", env.casts, env.cameraOn)
  );
  const distinctSpeakers = new Set(onCam.map((entry) => canonical(entry.speaker ?? "voiceover")));

  let nativeTurn;
  if (onCam.length > 0 && distinctSpeakers.size === 1) {
    const firstLine = onCam[0];
    const finalLine = onCam[onCam.length - 1];
    const speaker = canonical(firstLine?.speaker ?? "voiceover");
    const voiceNode = ensureVoiceNode(speaker);
    const start = firstLine?.start_s ?? scene.start_s ?? 0;
    const end = finalLine?.end_s ?? scene.end_s ?? start;
    const text = onCam.map((entry) => entry.line.trim()).join(" ");
    nativeTurn = { sceneIndex: i, speaker, start_s: start, end_s: end, text, voiceNode, native: true };
    out.talkingScenes.push({
      scene: i,
      voice_convert_node: `${voiceNode}_conv`,
      scene_s: Math.round(sceneDurationS(scene) * 100) / 100,
      est_speech_s: Math.round(estSpeechS(text) * 100) / 100
    });
  }

  const mode = sceneShootMode(scene, present, nativeTurn, env.cameraOn, env.casts);
  const lengths = sceneTiming(scene, isLast, nativeTurn);
  const buildOpts = { ar: env.ar, reuse: env.reuse, imageModel: env.opts.imageModel, videoModel: env.opts.videoModel };
  emitCompositeScene(
    composite,
    scene,
    i,
    present,
    mode,
    nativeTurn,
    lengths,
    lengths.out,
    buildOpts,
    nodes,
    out.voTracks,
    out.nativeSegments,
    out.clips
  );

  if (!nativeTurn && distinctSpeakers.size >= 2) {
    emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out);
  }
}
|
|
9730
|
+
/**
 * Emits one TTS track per speaker for a composite scene with several on-camera speakers.
 *
 * Each speaker's lines are concatenated in order. The track spans from that
 * speaker's earliest line start to their latest line end (a missing end is
 * estimated from the text).
 *
 * @param onCam           On-camera dialogue lines of the scene.
 * @param scene           Blueprint scene (fallback start time).
 * @param i               Scene index; also used as the TTS id index.
 * @param canonical       Speaker canonicalisation function.
 * @param ensureVoiceNode Returns (creating if needed) a speaker's voice node id.
 * @param usedVoIds       Set of taken voice-over node ids.
 * @param nodes           Node list, appended to in place.
 * @param out             Timeline accumulators.
 */
function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
  const perSpeaker = /* @__PURE__ */ new Map();
  for (const entry of onCam) {
    const speaker = canonical(entry.speaker ?? "voiceover");
    const text = entry.line.trim();
    const start = entry.start_s ?? scene.start_s ?? 0;
    const end = entry.end_s ?? start + estSpeechS(text);
    const agg = perSpeaker.get(speaker);
    if (!agg) {
      perSpeaker.set(speaker, { lines: [text], start, end });
      continue;
    }
    agg.lines.push(text);
    agg.start = Math.min(agg.start, start);
    agg.end = Math.max(agg.end, end);
  }

  // Map iteration follows insertion order, i.e. the order speakers first appear.
  perSpeaker.forEach((agg, speaker) => {
    const voiceNode = ensureVoiceNode(speaker);
    const phrase = {
      speaker,
      start_s: agg.start,
      end_s: agg.end,
      text: agg.lines.join(" "),
      firstScene: i,
      shownScenes: [],
      presenterShown: false
    };
    emitPhraseTts(phrase, voiceNode, i, usedVoIds, nodes, out);
  });
}
|
|
9766
|
+
/**
 * Emits a scene with no native speech (b-roll).
 *
 * Short scenes (at most FLASH_HOLD_MAX_S) without ambient audio become a
 * still-frame hold. Otherwise a clip is generated between a start and end frame,
 * where the start frame reuses the previous scene's end frame if the scene
 * continues the previous one. With ambient audio enabled (and not in
 * "ugc_selfie" mode) the clip's native audio is wired in too.
 *
 * @param scene        Blueprint scene.
 * @param i            Scene index.
 * @param isLast       Whether this is the final scene.
 * @param env          Shared build environment.
 * @param nodes        Node list, appended to in place.
 * @param out          Timeline accumulators.
 * @param prevEndFrame End-frame ref of the preceding b-roll scene, if any.
 * @returns The scene's end-frame ref, or undefined for a still-frame hold.
 */
function emitBrollScene(scene, i, isLast, env, nodes, out, prevEndFrame) {
  const present = slotsForScene(env.slots, i);
  const mode = sceneShootMode(scene, present, void 0, env.cameraOn, env.casts);
  const ambientBroll = Boolean(env.opts.ambient) && mode !== "ugc_selfie";
  const lengths = sceneTiming(scene, isLast, void 0);
  const ctx = {
    sceneIndex: i,
    ar: env.ar,
    reuse: env.reuse,
    imageModel: env.opts.imageModel,
    shootMode: mode,
    ingestCache: env.ingestCache
  };

  const isFlash = !ambientBroll && lengths.dur <= FLASH_HOLD_MAX_S;
  if (isFlash) {
    emitFlashHold(i, scene, env.slots, ctx, lengths, lengths.out, env.ar, nodes, out.clips);
    return void 0;
  }

  let first;
  if (scene.continues_previous && prevEndFrame) {
    first = prevEndFrame;
  } else {
    first = buildFrameRef(
      "start",
      scene.start_frame_asset?.url,
      scene.start_frame_prompt,
      slotsForFrame(env.slots, i, "start"),
      ctx,
      nodes
    );
  }
  const last = buildFrameRef(
    "end",
    scene.end_frame_asset?.url,
    scene.end_frame_prompt,
    slotsForFrame(env.slots, i, "end"),
    ctx,
    nodes
  );

  const clipLengths = { dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur };
  const clip = emitSceneClip(
    i,
    scene,
    present,
    mode,
    void 0,
    ambientBroll,
    { first, last },
    clipLengths,
    lengths.out,
    { ar: env.ar, videoModel: env.opts.videoModel },
    nodes
  );
  if (ambientBroll) {
    const audioLengths = { dur: lengths.dur, speech: 0, genDur: lengths.genDur };
    emitSceneNativeAudio(i, scene, void 0, true, audioLengths, nodes, out.voTracks, out.nativeSegments);
  }
  out.clips.push(clip);
  return last;
}
|
|
9827
|
+
/**
 * Builds the whole video timeline: voice phrases first, then one clip per scene.
 *
 * Steps:
 *  1. Mark the scenes built as layered composites (never in frame-reuse mode).
 *  2. Group dialogue into phrases. A phrase whose presenter is on screen in a
 *     scene no earlier phrase has claimed becomes a generated talking clip;
 *     every other phrase becomes a TTS track.
 *  3. Walk the scenes in order, emitting a composite, a slice of a talking clip,
 *     or a b-roll clip for each.
 *  4. Append the per-speaker voice-conversion tracks built from the native segments.
 *
 * @param blueprint Blueprint being scaffolded.
 * @param slots     Element slots.
 * @param opts      Build options (reads `frames`, and others via `env.opts`).
 * @param nodes     Node list, appended to in place.
 * @returns {{ clips, voTracks, vo_segments, talking_scenes }}
 */
function buildTimeline(blueprint, slots, opts, nodes) {
  const reuse = opts.frames === "reuse";
  const compositeScenes = /* @__PURE__ */ new Set();
  if (!reuse) {
    blueprint.scenes.forEach((candidate, index) => {
      if (layeredComposition(candidate)) compositeScenes.add(index);
    });
  }

  const canonical = collapseVoiceover(blueprint);
  const ensureVoiceNode = makeVoiceFactory(blueprint, canonical, nodes);
  const env = {
    blueprint,
    slots,
    opts,
    ar: aspectRatioParam(blueprint),
    reuse,
    cameraOn: onCameraDialogue(blueprint),
    casts: castIdSet(blueprint),
    ingestCache: /* @__PURE__ */ new Map()
  };
  const out = {
    clips: [],
    voTracks: [],
    voSegments: [],
    talkingScenes: [],
    nativeSegments: [],
    sceneSlice: /* @__PURE__ */ new Map()
  };

  const presenterPresent = makePresenterPresent(slots, canonical);
  const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent);
  const usedVoIds = /* @__PURE__ */ new Set();
  // Scenes already covered by an earlier phrase's talking clip.
  const claimed = /* @__PURE__ */ new Set();
  for (const [k, phrase] of phrases.entries()) {
    const voiceNode = ensureVoiceNode(phrase.speaker);
    const available = phrase.shownScenes.filter((s) => !claimed.has(s));
    const canShow = phrase.presenterShown && available.length > 0;
    if (!canShow) {
      emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out);
      continue;
    }
    available.forEach((s) => claimed.add(s));
    emitPhraseClip({ ...phrase, shownScenes: available }, voiceNode, env, nodes, out);
  }

  // Adds the clip for a scene that is a slice of a generated talking clip.
  const pushSlice = (i, slice) => {
    const whole = slice.offset === 0 && Math.abs(slice.len - slice.clipDur) <= 0.05;
    if (whole) {
      // The scene is the entire talking clip: use it directly.
      out.clips.push({ ref: slice.clipRef, scene_s: slice.len, out: null });
      return;
    }
    nodes.push({
      id: `s${i}_seg`,
      type: "ffmpeg",
      inputs: { clip: slice.clipRef },
      params: { args: trimArgs(slice.len, slice.offset), outputs: { video: { kind: "video", ext: "mp4" } } }
    });
    out.clips.push({ ref: `$ref:s${i}_seg.video`, scene_s: slice.len, out: null });
  };

  const lastIndex = blueprint.scenes.length - 1;
  // End frame of the previous b-roll scene; cleared after any other kind of scene.
  let prevEndFrame;
  blueprint.scenes.forEach((scene, i) => {
    const composite = compositeScenes.has(i) ? layeredComposition(scene) : null;
    if (composite) {
      emitCompositeInTimeline(composite, scene, i, i === lastIndex, env, canonical, ensureVoiceNode, usedVoIds, nodes, out);
      prevEndFrame = void 0;
      return;
    }
    const slice = out.sceneSlice.get(i);
    if (slice) {
      pushSlice(i, slice);
      prevEndFrame = void 0;
      return;
    }
    prevEndFrame = emitBrollScene(scene, i, i === lastIndex, env, nodes, out, prevEndFrame);
  });

  const totalSeconds = blueprint.source?.duration_s ?? lastSceneEnd(blueprint);
  const totalMs = Math.round(totalSeconds * 1e3);
  out.voTracks.push(...buildPerSpeakerVoiceConversion(out.nativeSegments, totalMs, nodes));
  return { clips: out.clips, voTracks: out.voTracks, vo_segments: out.voSegments, talking_scenes: out.talkingScenes };
}
|
|
9088
9912
|
function buildSfxMusic(blueprint, nodes) {
|
|
9089
9913
|
const tracks = [];
|
|
@@ -9106,13 +9930,21 @@ function buildSfxMusic(blueprint, nodes) {
|
|
|
9106
9930
|
const musicPrompt = blueprint.global?.music?.music_prompt;
|
|
9107
9931
|
if (musicPrompt) {
|
|
9108
9932
|
const total = blueprint.source?.duration_s ?? lastSceneEnd(blueprint);
|
|
9109
|
-
const
|
|
9933
|
+
const hookEnd = blueprint.scenes[0]?.end_s ?? 0;
|
|
9934
|
+
const startAt = Math.min(Math.max(blueprint.global?.music?.starts_at_s ?? 0, hookEnd), Math.max(total - 0.5, 0));
|
|
9110
9935
|
const totalMs = Math.round((total - startAt) * 1e3);
|
|
9111
9936
|
const musicMs = Math.min(Math.max(totalMs, 3e3), ELEVENLABS_MAX_MUSIC_LENGTH_MS);
|
|
9112
9937
|
nodes.push({
|
|
9113
9938
|
id: "music_bed",
|
|
9114
9939
|
type: "music",
|
|
9115
|
-
|
|
9940
|
+
// force_instrumental: the model is vocal-capable; without this it can SING the
|
|
9941
|
+
// mood (and feeding it the script made it sing the ad). The voice owns the words.
|
|
9942
|
+
params: {
|
|
9943
|
+
model: FIXED_MUSIC_MODEL,
|
|
9944
|
+
prompt: musicBedPrompt(blueprint, musicPrompt),
|
|
9945
|
+
music_length_ms: musicMs,
|
|
9946
|
+
force_instrumental: true
|
|
9947
|
+
}
|
|
9116
9948
|
});
|
|
9117
9949
|
tracks.push({
|
|
9118
9950
|
slot: "music",
|
|
@@ -9156,22 +9988,63 @@ function normalizeAnim(animation) {
|
|
|
9156
9988
|
const mapped = animation === "slide" ? "slide_up" : animation;
|
|
9157
9989
|
return SUPPORTED_ANIMS.has(mapped) ? mapped : void 0;
|
|
9158
9990
|
}
|
|
9991
|
+
var FACE_ZONE_POSITIONS = /* @__PURE__ */ new Set([
|
|
9992
|
+
"center",
|
|
9993
|
+
"centre",
|
|
9994
|
+
"mid-center",
|
|
9995
|
+
"mid-centre",
|
|
9996
|
+
"middle-center",
|
|
9997
|
+
"center-center",
|
|
9998
|
+
"mid",
|
|
9999
|
+
"middle"
|
|
10000
|
+
]);
|
|
9159
10001
|
function positionClass(position) {
|
|
9160
10002
|
const p = (position ?? "bottom_center").toLowerCase().replace(/[^a-z]+/g, "-");
|
|
9161
|
-
|
|
10003
|
+
const safe = FACE_ZONE_POSITIONS.has(p) ? "bottom-center" : p;
|
|
10004
|
+
return `pos-${safe}`;
|
|
9162
10005
|
}
|
|
9163
|
-
function
|
|
10006
|
+
function collectCaptions(blueprint) {
|
|
10007
|
+
return blueprint.scenes.flatMap((scene) => {
|
|
10008
|
+
const sceneStart = scene.start_s ?? 0;
|
|
10009
|
+
const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
|
|
10010
|
+
return overlays.success ? overlays.data.filter((ov) => Boolean(ov.text?.trim())).map((ov) => {
|
|
10011
|
+
const at = ov.appears_at_s ?? sceneStart;
|
|
10012
|
+
return { text: ov.text.trim(), at, end: at + (ov.duration_s ?? 2.5), ov };
|
|
10013
|
+
}) : [];
|
|
10014
|
+
}).sort((a, b) => a.at - b.at);
|
|
10015
|
+
}
|
|
10016
|
+
function mergeCaptions(blueprint) {
|
|
10017
|
+
const byText = /* @__PURE__ */ new Map();
|
|
10018
|
+
for (const e of collectCaptions(blueprint)) {
|
|
10019
|
+
const arr = byText.get(e.text);
|
|
10020
|
+
if (arr) arr.push(e);
|
|
10021
|
+
else byText.set(e.text, [e]);
|
|
10022
|
+
}
|
|
10023
|
+
const merged = [];
|
|
10024
|
+
for (const arr of byText.values()) {
|
|
10025
|
+
let cur = null;
|
|
10026
|
+
for (const e of arr) {
|
|
10027
|
+
if (cur && e.at <= cur.end + 0.35) cur.end = Math.max(cur.end, e.end);
|
|
10028
|
+
else {
|
|
10029
|
+
cur = { ...e };
|
|
10030
|
+
merged.push(cur);
|
|
10031
|
+
}
|
|
10032
|
+
}
|
|
10033
|
+
}
|
|
10034
|
+
return merged.sort((a, b) => a.at - b.at);
|
|
10035
|
+
}
|
|
10036
|
+
function overlayElement(ov, at, dur) {
|
|
9164
10037
|
if (!ov.text?.trim()) return "";
|
|
9165
|
-
const at = ov.appears_at_s ?? sceneStart;
|
|
9166
|
-
const dur = ov.duration_s ?? 2.5;
|
|
9167
10038
|
const role = ov.role ? ` data-role="${escapeHtml(ov.role)}"` : "";
|
|
9168
10039
|
const normAnim = normalizeAnim(ov.animation);
|
|
9169
10040
|
const anim = normAnim ? ` data-anim="${normAnim}"` : "";
|
|
9170
10041
|
const detail = ov.animation_detail ? ` data-anim-detail="${escapeHtml(ov.animation_detail)}"` : "";
|
|
9171
10042
|
return `<div class="ov ${positionClass(ov.position)}" data-start="${at}" data-dur="${dur}"${role}${anim}${detail}>${escapeHtml(ov.text.trim())}</div>`;
|
|
9172
10043
|
}
|
|
10044
|
+
var RICH_OVERLAY_RE = /notif|tweet|\bx post\b|post\b|comment|message|chat|bubble|card|review|rating|stat|counter|toast|popup/;
|
|
9173
10045
|
function sourceHint(fe) {
|
|
9174
10046
|
const desc = fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element";
|
|
10047
|
+
const haystack = `${fe.kind ?? ""} ${fe.description ?? ""} ${fe.what_it_represents ?? ""}`.toLowerCase();
|
|
9175
10048
|
switch ((fe.kind ?? "").toLowerCase()) {
|
|
9176
10049
|
case "logo":
|
|
9177
10050
|
return "baker images logo <domain> (or baker images library)";
|
|
@@ -9181,6 +10054,9 @@ function sourceHint(fe) {
|
|
|
9181
10054
|
case "product_cutout":
|
|
9182
10055
|
return `baker images library "${desc}" (the client's own product)`;
|
|
9183
10056
|
default:
|
|
10057
|
+
if (RICH_OVERLAY_RE.test(haystack)) {
|
|
10058
|
+
return `npx hyperframes add <social-card/notification block> for "${desc}" (animated overlay, not a static icon \u2014 see references/hyperframes/catalog.md)`;
|
|
10059
|
+
}
|
|
9184
10060
|
return `baker images icon "${desc}"`;
|
|
9185
10061
|
}
|
|
9186
10062
|
}
|
|
@@ -9196,6 +10072,26 @@ function floatingStub(fe, sceneStart) {
|
|
|
9196
10072
|
`<img class="ov ${positionClass(fe.position)}" src="your-${slug}.png" data-start="${at}" data-dur="${dur}" alt="" /> -->`
|
|
9197
10073
|
].join("\n");
|
|
9198
10074
|
}
|
|
10075
|
+
function uiPipStub(scene) {
|
|
10076
|
+
const comp = scene.composition;
|
|
10077
|
+
const layout = (comp?.layout ?? "").toLowerCase();
|
|
10078
|
+
if (!COMPOSITE_LAYOUTS.has(layout)) return "";
|
|
10079
|
+
const regions = (comp?.regions ?? []).filter((r) => Boolean(r) && typeof r === "object");
|
|
10080
|
+
if (regions.length < 2 || !isUiOnlyComposite(regions)) return "";
|
|
10081
|
+
const ui = regions.find(regionIsUiSurface);
|
|
10082
|
+
const at = scene.start_s ?? 0;
|
|
10083
|
+
const dur = Math.max(0.5, Math.round(((scene.end_s ?? at + 2.5) - at) * 100) / 100);
|
|
10084
|
+
const label = commentSafe(ui?.summary || ui?.frame_prompt || ui?.panel || "the app screen");
|
|
10085
|
+
return [
|
|
10086
|
+
`<!-- PHONE UI @ ${at}s for ${dur}s \u2014 the app/site screen this scene shows: ${label}.`,
|
|
10087
|
+
" Build it as a REAL surface, NEVER AI: capture the live page \u2014",
|
|
10088
|
+
" baker images screenshot https://<brand-domain>/<path> (image-library skill)",
|
|
10089
|
+
" \u2014 OR hand-build a brand-accurate HTML screen; then frame it in a phone mockup:",
|
|
10090
|
+
" npx hyperframes add phone-scroll (writes compositions/phone-scroll.html)",
|
|
10091
|
+
" drop the screenshot as screenshot.png in this dir and nest it as a PIP clip:",
|
|
10092
|
+
` <div data-composition-src="compositions/phone-scroll.html" data-start="${at}" data-duration="${dur}" data-track-index="2" data-width="1080" data-height="1920"></div> -->`
|
|
10093
|
+
].join("\n");
|
|
10094
|
+
}
|
|
9199
10095
|
function buildOverlayHtml(input) {
|
|
9200
10096
|
const blueprint = VideoBlueprint.parse(input);
|
|
9201
10097
|
const blocks = [
|
|
@@ -9215,14 +10111,14 @@ function buildOverlayHtml(input) {
|
|
|
9215
10111
|
" Positions: edit the .pos-* classes or add your own. -->"
|
|
9216
10112
|
].join("\n")
|
|
9217
10113
|
];
|
|
10114
|
+
const ovParts = mergeCaptions(blueprint).map((e) => overlayElement(e.ov, e.at, Math.round((e.end - e.at) * 1e3) / 1e3)).filter(Boolean);
|
|
10115
|
+
if (ovParts.length > 0) blocks.push(ovParts.join("\n"));
|
|
9218
10116
|
for (const scene of blueprint.scenes) {
|
|
9219
10117
|
const sceneStart = scene.start_s ?? 0;
|
|
9220
|
-
const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
|
|
9221
10118
|
const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
|
|
9222
|
-
const parts = [
|
|
9223
|
-
|
|
9224
|
-
|
|
9225
|
-
].filter(Boolean);
|
|
10119
|
+
const parts = (floats.success ? floats.data.map((fe) => floatingStub(fe, sceneStart)) : []).filter(Boolean);
|
|
10120
|
+
const pip = uiPipStub(scene);
|
|
10121
|
+
if (pip) parts.push(pip);
|
|
9226
10122
|
if (parts.length > 0) blocks.push(parts.join("\n"));
|
|
9227
10123
|
}
|
|
9228
10124
|
return blocks.join("\n\n");
|
|
@@ -9255,15 +10151,15 @@ function xfadeSpineArgs(clips) {
|
|
|
9255
10151
|
let cur = "c0";
|
|
9256
10152
|
let accLen = clipInputLen(clips[0]);
|
|
9257
10153
|
for (let k = 0; k < n - 1; k++) {
|
|
9258
|
-
const
|
|
10154
|
+
const join4 = clips[k].out;
|
|
9259
10155
|
const next = `c${k + 1}`;
|
|
9260
10156
|
const out = k === n - 2 ? "v" : `j${k + 1}`;
|
|
9261
|
-
if (
|
|
9262
|
-
const offset = Math.max(0, accLen -
|
|
10157
|
+
if (join4) {
|
|
10158
|
+
const offset = Math.max(0, accLen - join4.dur);
|
|
9263
10159
|
filt.push(
|
|
9264
|
-
`[${cur}][${next}]xfade=transition=${
|
|
10160
|
+
`[${cur}][${next}]xfade=transition=${join4.xfade}:duration=${join4.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
|
|
9265
10161
|
);
|
|
9266
|
-
accLen = accLen -
|
|
10162
|
+
accLen = accLen - join4.dur + clipInputLen(clips[k + 1]);
|
|
9267
10163
|
} else {
|
|
9268
10164
|
filt.push(`[${cur}][${next}]concat=n=2:v=1[${out}]`);
|
|
9269
10165
|
accLen += clipInputLen(clips[k + 1]);
|
|
@@ -9304,15 +10200,14 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
9304
10200
|
params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
|
|
9305
10201
|
});
|
|
9306
10202
|
});
|
|
9307
|
-
|
|
9308
|
-
const {
|
|
9309
|
-
const { clips, voTracks: nativeVoTracks } = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
|
|
9310
|
-
const voTracks = [...ttsTracks, ...nativeVoTracks];
|
|
10203
|
+
applyActorSheets(slots, nodes);
|
|
10204
|
+
const { clips, voTracks, vo_segments, talking_scenes } = buildTimeline(blueprint, slots, opts, nodes);
|
|
9311
10205
|
let videoRef = buildSpine(clips, nodes);
|
|
9312
10206
|
let videoNode = "spine";
|
|
9313
10207
|
const overlays = blueprint.scenes.flatMap((s) => s.overlays ?? []);
|
|
9314
10208
|
const floating = blueprint.scenes.flatMap((s) => s.floating_elements ?? []);
|
|
9315
|
-
|
|
10209
|
+
const hasUiPip = blueprint.scenes.some((s) => uiPipStub(s) !== "");
|
|
10210
|
+
if (overlays.length > 0 || floating.length > 0 || hasUiPip) {
|
|
9316
10211
|
nodes.push({
|
|
9317
10212
|
id: "overlaid",
|
|
9318
10213
|
type: "hyperframe_render",
|
|
@@ -9322,10 +10217,28 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
9322
10217
|
videoRef = "$ref:overlaid.video";
|
|
9323
10218
|
videoNode = "overlaid";
|
|
9324
10219
|
}
|
|
10220
|
+
if (opts.captionsCompositionPath && opts.transcriptPath) {
|
|
10221
|
+
nodes.push({
|
|
10222
|
+
id: "captions_transcript",
|
|
10223
|
+
type: "ingest",
|
|
10224
|
+
params: { source: "path", path: opts.transcriptPath, expect: "json" }
|
|
10225
|
+
});
|
|
10226
|
+
nodes.push({
|
|
10227
|
+
id: "captions",
|
|
10228
|
+
type: "hyperframe_render",
|
|
10229
|
+
inputs: { background: videoRef, transcript: "$ref:captions_transcript.asset" },
|
|
10230
|
+
params: { composition: opts.captionsCompositionPath }
|
|
10231
|
+
});
|
|
10232
|
+
videoRef = "$ref:captions.video";
|
|
10233
|
+
videoNode = "captions";
|
|
10234
|
+
}
|
|
9325
10235
|
const tracks = [...voTracks, ...buildSfxMusic(blueprint, nodes)];
|
|
9326
10236
|
if (tracks.length > 0) {
|
|
9327
10237
|
const mixInputs = {};
|
|
9328
10238
|
for (const t of tracks) mixInputs[t.slot] = t.ref;
|
|
10239
|
+
const musicTrack = tracks.find((t) => t.kind === "music");
|
|
10240
|
+
const voiceSlots = tracks.filter((t) => t.kind === "vo").map((t) => t.slot);
|
|
10241
|
+
const duck = musicTrack && voiceSlots.length > 0 ? { duck: { track: musicTrack.slot, against: voiceSlots } } : {};
|
|
9329
10242
|
nodes.push({
|
|
9330
10243
|
id: "audio_mix",
|
|
9331
10244
|
type: "audio_timeline",
|
|
@@ -9336,7 +10249,8 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
9336
10249
|
start_s: t.start_s,
|
|
9337
10250
|
...t.gain_db !== void 0 ? { gain_db: t.gain_db } : {}
|
|
9338
10251
|
})),
|
|
9339
|
-
total_ms: Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3)
|
|
10252
|
+
total_ms: Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3),
|
|
10253
|
+
...duck
|
|
9340
10254
|
}
|
|
9341
10255
|
});
|
|
9342
10256
|
nodes.push({
|
|
@@ -9384,45 +10298,31 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
9384
10298
|
// The timing plan `baker canvas validate` checks before any billed render:
|
|
9385
10299
|
// sequenced voiceover turns (no overlap), audio ≈ video length, and which
|
|
9386
10300
|
// scenes must be lip-synced.
|
|
9387
|
-
video: buildVideoMeta(blueprint,
|
|
10301
|
+
video: buildVideoMeta(blueprint, { vo_segments, talking_scenes })
|
|
9388
10302
|
},
|
|
9389
10303
|
nodes,
|
|
9390
10304
|
output: { node: videoNode, output: "video" }
|
|
9391
10305
|
};
|
|
9392
10306
|
}
|
|
9393
|
-
function buildVideoMeta(blueprint,
|
|
9394
|
-
const vo_segments = [];
|
|
9395
|
-
const talking_scenes = [];
|
|
9396
|
-
for (const [scene, turns] of [...sceneTurns.entries()].sort((a, b) => a[0] - b[0])) {
|
|
9397
|
-
for (const t of turns) {
|
|
9398
|
-
if (t.ttsId) vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
|
|
9399
|
-
}
|
|
9400
|
-
const nativeTurn = turns.find((t) => t.native);
|
|
9401
|
-
if (nativeTurn) {
|
|
9402
|
-
const sceneObj = blueprint.scenes[scene];
|
|
9403
|
-
talking_scenes.push({
|
|
9404
|
-
scene,
|
|
9405
|
-
voice_convert_node: `s${scene}_voconv`,
|
|
9406
|
-
scene_s: sceneObj ? Math.round(sceneDurationS(sceneObj) * 100) / 100 : void 0,
|
|
9407
|
-
est_speech_s: Math.round(estSpeechS(nativeTurn.text) * 100) / 100
|
|
9408
|
-
});
|
|
9409
|
-
}
|
|
9410
|
-
}
|
|
10307
|
+
function buildVideoMeta(blueprint, meta) {
|
|
9411
10308
|
return {
|
|
9412
10309
|
duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
|
|
9413
|
-
vo_segments,
|
|
9414
|
-
talking_scenes,
|
|
9415
|
-
motion_board: buildMotionBoard(blueprint
|
|
10310
|
+
vo_segments: [...meta.vo_segments].sort((a, b) => a.start_s - b.start_s),
|
|
10311
|
+
talking_scenes: meta.talking_scenes,
|
|
10312
|
+
motion_board: buildMotionBoard(blueprint)
|
|
9416
10313
|
};
|
|
9417
10314
|
}
|
|
9418
|
-
function
|
|
10315
|
+
function sceneSpokenText(scene) {
|
|
10316
|
+
return (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l)).join(" ") || null;
|
|
10317
|
+
}
|
|
10318
|
+
function buildMotionBoard(blueprint) {
|
|
9419
10319
|
const round = (n) => Math.round(n * 100) / 100;
|
|
9420
10320
|
let cursor = 0;
|
|
9421
10321
|
return blueprint.scenes.map((scene, i) => {
|
|
9422
10322
|
const start_s = scene.start_s ?? cursor;
|
|
9423
10323
|
const end_s = scene.end_s ?? start_s + sceneDurationS(scene);
|
|
9424
10324
|
cursor = end_s;
|
|
9425
|
-
const spoken = (
|
|
10325
|
+
const spoken = sceneSpokenText(scene);
|
|
9426
10326
|
const overlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
|
|
9427
10327
|
const floats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
|
|
9428
10328
|
const graphics = [
|
|
@@ -9445,19 +10345,21 @@ function buildMotionBoard(blueprint, sceneTurns) {
|
|
|
9445
10345
|
scene: i,
|
|
9446
10346
|
role: resolveSceneRole(scene, i, blueprint.scenes.length),
|
|
9447
10347
|
window_s: [round(start_s), round(end_s)],
|
|
9448
|
-
|
|
10348
|
+
// A continuation b-roll scene shares the previous scene's end frame as its start
|
|
10349
|
+
// (no own `s<i>_start` node), so point the storyboard at that shared keyframe.
|
|
10350
|
+
storyboard_frames: [scene.continues_previous && i > 0 ? `s${i - 1}_end` : `s${i}_start`],
|
|
9449
10351
|
spoken,
|
|
9450
10352
|
graphics
|
|
9451
10353
|
};
|
|
9452
10354
|
});
|
|
9453
10355
|
}
|
|
9454
10356
|
var VIDEO_GUIDE = [
|
|
9455
|
-
"Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video.
|
|
10357
|
+
"Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video, built like an editing timeline. The VOICE is cut at PAUSES, not at visual cuts: each continuous-speech PHRASE is ONE Seedance clip (native lip-sync + audio) re-voiced to one brand voice, so a sentence never breaks mid-word across a cut. Each scene's PICTURE is independent: a scene that SHOWS the speaker slices its window out of the phrase clip; a b-roll cutaway gets its own silent clip (or a still hold for a sub-2s flash) laid over the continuing voice; a pure-voiceover stretch is one ElevenLabs tts read. Every clip gets a CLEAN-PLATE start AND end keyframe (no baked text), RECAST to your dropped reference assets \u2014 Seedance interpolates real in-shot motion between them. Each frame grounds ONLY on its own extracted frame + el_* slots (never another generated frame), so all frames render in PARALLEL (no cross-frame cascade). A SPLIT-SCREEN / PICTURE-IN-PICTURE / KEYED-PRESENTER scene is reproduced as one clip PER REGION, stacked or overlaid (see `metadata.todo.composition`). On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voice + SFX + a ducked music bed, normalized stereo. It is a STARTING POINT, not a locked render: add, delete, reorder, split, merge, or re-time scenes freely (a b-roll cutaway INSIDE a phrase lands at an approximate beat \u2014 nudge it) \u2014 see `metadata.todo.full_flexibility`.",
|
|
9456
10358
|
"",
|
|
9457
10359
|
"WHAT TO DO NEXT:",
|
|
9458
10360
|
"0. RE-CRAFT THE SCRIPT FIRST (don't clone). This reference already won in-market, but copying a video is much harder than a static: the hook is targeting and may not transfer, and the message must become TRUE for our brand. Work the `metadata.todo.script_recraft` checklist \u2014 for each scene judge its role (hook/body/CTA), decide keep/cut/reorder/replace, and re-author every line for OUR customer's pain + OUR offer. See `references/script-craft.md` (hook/body/CTA framework) and the `meta-ads-playbook` skill. Most of the work lives here.",
|
|
9459
|
-
"1. Edit each frame's prompt IN PLACE. Every `s<i>_start`
|
|
9460
|
-
"1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill.
|
|
10361
|
+
"1. Edit each frame's prompt IN PLACE. Every `s<i>_start` keyframe node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want. The frame is RECAST to the el_* reference images you drop (the source ad's people are never reused), so describe pose/action/framing here and let the references carry identity.",
|
|
10362
|
+
"1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. Each scene's keyframe IS your storyboard; `metadata.video.motion_board` lays out each scene's frame, time window, spoken line, and the graphics scheduled in it. Lock the keyframes + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
|
|
9461
10363
|
"2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent. `baker canvas run` REFUSES to start until every `[TODO]` slot holds a real source \u2014 so this is mandatory, not optional.",
|
|
9462
10364
|
"3. Talking heads are NATIVE: a scene with one on-camera speaker is voiced by Seedance itself (the line is in the clip's prompt + `generate_audio`), so lips and voice are generated together \u2014 no separate tts, no post-hoc lip-sync. Edit the line in the scene's `s<i>_clip` prompt to re-author the words TRUE for your brand. Off-camera narration scenes still use a sequenced `tts` per turn.",
|
|
9463
10365
|
"4. Voice consistency: every native talking clip's audio is re-voiced to ONE brand voice via `audio_voice_convert` (timing preserved \u2192 lips stay matched). Confirm the `voice_select` casting (one per speaker) \u2014 its `voice_id` is the brand voice; set its gender/language so the voice matches the creator.",
|
|
@@ -9468,11 +10370,11 @@ var VIDEO_GUIDE = [
|
|
|
9468
10370
|
"- Iterate cheaply, do NOT brute-force takes: credits are scarce. Strong inputs (locked reference + a precise move-by-move prompt) make 1\u20132 takes land. When a take misses, change the SMALLEST thing and re-run \u2014 the cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
|
|
9469
10371
|
"- Keep clips SHORT (trust the scene-length trim \u2014 don't pad) and LOCK THE CAMERA: unmotivated AI drift is the top tell.",
|
|
9470
10372
|
"- One generation = one speaking voice \u2014 Seedance can't voice two on-camera speakers in one clip; split a two-speaker beat into separate scenes (or one stays silent). Lip-sync follows the line verbatim; emotion in [brackets].",
|
|
9471
|
-
"- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it).
|
|
10373
|
+
"- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it).",
|
|
9472
10374
|
"- Nail the pack shot (final product hero \u2014 motivated move, matched light). Geometry drifting? drop a high-angle SCHEMATIC still as an extra reference (a map beats a paragraph). Before/after (dry\u2192wet) = two separate locked references, not words mid-scene.",
|
|
9473
10375
|
"- The hook reads SOUND-OFF: scene 1's frame + overlay carry it in ~1s \u2014 don't bury the hook in the spoken line (meta-ads-playbook \xA739/\xA748).",
|
|
9474
10376
|
"",
|
|
9475
|
-
"Tip: `prompt.json` is the deconstruction provenance + the
|
|
10377
|
+
"Tip: `prompt.json` is the deconstruction provenance + the authoritative SHARED AD SPEC each frame reads for cast identity, palette, brand, and type cohesion. The per-frame editing surface is the frame node's own FRAME DESCRIPTION.",
|
|
9476
10378
|
"Production craft: references/video-craft.md. Hook/message/script-structure layer: references/script-craft.md + the meta-ads-playbook skill."
|
|
9477
10379
|
].join("\n");
|
|
9478
10380
|
function inferNarrativeRole(index, total) {
|
|
@@ -9514,14 +10416,16 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
|
|
|
9514
10416
|
const hookSceneIndex = findHookSceneIndex(blueprint);
|
|
9515
10417
|
const h = hookSceneIndex;
|
|
9516
10418
|
return {
|
|
10419
|
+
full_flexibility: "THIS CANVAS IS A STARTING POINT, NOT A LOCKED RENDER. It mirrors the reference's structure so you have a faithful scaffold \u2014 but you have FULL EDITING FREEDOM and should use it. You can: ADD a scene (new s<i>_start/_end + s<i>_clip + wire it into `spine`), DELETE a scene (drop its nodes + its `spine` input), REORDER scenes, SPLIT one beat into two or MERGE two into one, change any frame prompt or motion brief, swap an element reference, re-time or rewrite any overlay/voice, or change a scene's LAYOUT (make a full-frame beat a split-screen/PIP, or flatten a composite to one shot \u2014 see `composition`). Re-craft for OUR brand and OUR best ad; the reference is inspiration, not a spec to trace. The content-addressed cache re-bills only what you actually change, so iterate freely. `baker canvas validate` re-checks timing/lip-sync after any edit.",
|
|
10420
|
+
composition: "Some scenes are COMPOSITED, not single shots \u2014 `prompt.json`'s scene.composition.layout tells you which: `split_screen` (panels each showing different footage \u2014 e.g. b-roll on top, presenter on the bottom), `pip` (a presenter boxed in a corner over full-frame background), or `keyed_overlay` (a green-screen/cut-out presenter over background). Each is reproduced as ONE generated clip PER REGION (`s<i>_r0_*`, `s<i>_r1_*`, \u2026) stacked (vstack/hstack) or overlaid by an `s<i>_composite` ffmpeg node; a keyed presenter runs through `s<i>_key` (video_background_remove) for a transparent cut-out first. Edit each region's own keyframe prompt + motion brief independently. The presenter region (is_presenter) carries the lip-synced voice. To CHANGE a layout, edit composition in prompt.json and re-scaffold, or hand-edit the s<i>_composite ffmpeg args (splitStackArgs/pipOverlayArgs patterns). A clean full-frame talking head is simpler than a composite \u2014 flatten when the brand's version doesn't need the split.",
|
|
9517
10421
|
recraft_the_script_first: `VIDEO IS INSPIRATION, NOT A CLONE. Before rendering, re-craft the script (hook \u2192 body \u2192 CTA) for OUR brand \u2014 the reference already won in-market, but its hook is targeting and may not transfer.${h >= 0 ? " The HOOK is the #1 decision (see the `hook` todo);" : ""} ${h >= 0 ? "then work" : "Work"} the per-scene \`script_recraft\` checklist. References: references/hook-craft.md (the hook), references/script-craft.md (body/CTA) + the meta-ads-playbook skill.`,
|
|
9518
10422
|
...h >= 0 ? {
|
|
9519
10423
|
hook: `THE HOOK IS THE HIGHEST-LEVERAGE BEAT \u2014 the first frame + first 3\u20134s decide whether the ad is watched at all, and the hook is TARGETING. But highest-leverage does NOT mean always rewrite: this hook already won, so MOST OF THE TIME you KEEP it and build on top (swap only the specifics). REBUILD is the exception \u2014 only when it doesn't transfer (a claim we lack or a different funnel/awareness stage), and then by reaching for its deeper INNER MECHANIC and delivering that truthfully, not inventing a new opener from nothing. For scene ${h}: DIAGNOSE it (device + mechanic + what stage it targets), DECIDE keep/adapt/rebuild, then hold the opener to the criteria \u2014 ${HOOK_OPENER_CRITERIA}. The hook lives across s${h}_start (the scroll-stopping first frame), the scene-${h} overlay text, the s${h}_clip line, an optional ~0.5s micro-hook, and the ramp into the body. Full diagnose\u2192decide\u2192(keep/adapt/rebuild) discipline + the proven hook-type menu: references/hook-craft.md (+ meta-ads-playbook \xA710/\xA717/\xA739).`
|
|
9520
10424
|
} : {},
|
|
9521
10425
|
script_recraft: buildScriptRecraft(blueprint),
|
|
9522
|
-
edit_frames_in_place: "Each s<i>_start
|
|
10426
|
+
edit_frames_in_place: "Each s<i>_start keyframe node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is the authoritative shared ad spec (cast identity, palette, brand). Frames are RECAST to the el_* reference images (the source ad's cast is never reused) and are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
|
|
9523
10427
|
frames_mode: opts.frames ?? "generate",
|
|
9524
|
-
review_storyboard_before_clips: "STORYBOARD FIRST.
|
|
10428
|
+
review_storyboard_before_clips: "STORYBOARD FIRST. Each scene's keyframe (s<i>_start) IS your storyboard \u2014 align the LOOK on it before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's keyframe, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
|
|
9525
10429
|
motion_board: "`metadata.video.motion_board` maps which overlay/graphic fires on which spoken beat (per scene: window_s, spoken line, each graphic's at_s). Review it so every graphic lands on the word it punctuates \u2014 don't let an overlay drift off its beat. Adjust an overlay's data-start in video-overlay-composition/index.html (or appears_at_s in prompt.json) to retime it.",
|
|
9526
10430
|
assets_required: "MANDATORY: drop a real image at every el_* [TODO] ingest before running \u2014 `baker canvas run` refuses to start (and bills nothing) until each placeholder holds a real source.",
|
|
9527
10431
|
sourcing: 'Resolve each el_* [TODO]: (1) the client\'s OWN assets first \u2014 `baker images library "<subject>"` (e.g. the company owner as the actor) and `baker images logo <domain>` for the brand mark; (2) otherwise search a FRESH real reference \u2014 `baker images find "<subject>" --sources pinterest` (Pinterest is photo-real/candid, best for people & sets) or generate one with the image model. el_creator_* and el_location are [SOURCE FRESH]: cast a DIFFERENT person/animal/set than the original ad \u2014 never the source\'s individual.',
|
|
@@ -9534,18 +10438,17 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
|
|
|
9534
10438
|
voice_description: d.voice_description,
|
|
9535
10439
|
line: d.line
|
|
9536
10440
|
})),
|
|
9537
|
-
talking_head_note: "NATIVE: a
|
|
9538
|
-
voice_note: "
|
|
9539
|
-
native_timing: "
|
|
10441
|
+
talking_head_note: "PHRASE-NATIVE: a continuous-speech phrase where the speaker is shown is ONE Seedance clip (the full phrase quoted in s<anchor>_clip's prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Scenes that show the speaker slice their window out of that clip (s<i>_seg); edit the phrase line in the s<anchor>_clip prompt to re-author it. A pure-voiceover phrase (speaker never shown) is one ElevenLabs tts read instead.",
|
|
10442
|
+
voice_note: "ONE voice per person: a single voice_select is reused across all that person's phrases (on-camera AND off \u2014 the deconstruct's `voiceover` label folds into the sole presenter). Each presenter phrase's native audio is re-voiced to that brand voice via audio_voice_convert (eleven_multilingual_sts_v2, one convert per phrase, timing preserved so lips stay matched). Set voice_select.voice_id's gender/language to match the creator.",
|
|
10443
|
+
native_timing: "The voice is cut at PAUSES, not at visual cuts, so a sentence spanning a cut stays one continuous read (no mid-word break). The clip is generated long enough for the estimated speech; if a line runs longer than its phrase window the voice continues a beat into the following pause (natural VO continuity). `metadata.video.talking_scenes` carries each phrase's scene_s vs est_speech_s. CAVEAT: a b-roll cutaway INSIDE a phrase lands at an approximate (proportional) time \u2014 Seedance exposes no word timing \u2014 so if a cutaway is off its beat, nudge the scene boundary (it's a starting point).",
|
|
9540
10444
|
craft: {
|
|
9541
10445
|
note: "Production-craft principles that raise every clip's realism. Full rationale: references/video-craft.md (production craft); references/script-craft.md + meta-ads-playbook for the hook/message layer.",
|
|
9542
10446
|
principles: [
|
|
9543
10447
|
"Iterate cheaply \u2014 do NOT brute-force takes. Credits are scarce. Strong inputs (a locked reference + a precise move-by-move prompt) make 1\u20132 takes land; bad input = bad output. When a take misses, change the SMALLEST thing and re-run \u2014 the content-addressed cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
|
|
9544
10448
|
"Keep clips SHORT \u2014 the longer a shot holds, the more the eye catches the AI tell. Trust the scene-length trim; don't pad.",
|
|
9545
|
-
"LOCK THE CAMERA \u2014
|
|
10449
|
+
"LOCK THE CAMERA \u2014 Seedance animates forward from the single keyframe; only move when the motion brief specifies a move. Unmotivated camera drift is the top realism tell.",
|
|
9546
10450
|
"One generation = one speaking voice. Seedance can't voice two on-camera speakers in one clip \u2014 split a two-speaker beat into separate scenes (or one stays silent). Fragment long dialogue at a natural pause and stitch via the shared boundary frame. Lip-sync follows the line verbatim; delivery cues go in [brackets].",
|
|
9547
10451
|
"Preview cheap \u2192 finalize high-res (credit discipline). Never prompt the aspect ratio in prose ('make it vertical' \u2192 stretch artifacts) \u2014 the pipeline sets aspect_ratio as a param.",
|
|
9548
|
-
"Match-cut continuous action across a cut by reusing the boundary frame: scene N+1's start frame = scene N's end frame (a composition choice, costs no extra gens).",
|
|
9549
10452
|
"Nail the PACK SHOT \u2014 the final product hero sells (motivated camera move, light matched to the rest of the ad).",
|
|
9550
10453
|
"Geometry/objects drifting frame to frame? A MAP beats a paragraph \u2014 drop a high-angle schematic still (marked positions) as an extra el_*/reference rather than adding more words.",
|
|
9551
10454
|
"Before/after (dry\u2192wet, skeptic\u2192believer) = two SEPARATE locked reference images \u2014 don't ask words to transform a subject mid-scene (cheaper and more reliable than re-rolling clips).",
|
|
@@ -9639,6 +10542,7 @@ function resolveShippedCanvasDir(name, startDir, exists = existsSync3, maxDepth
|
|
|
9639
10542
|
|
|
9640
10543
|
// src/commands/canvas/scaffold-video.ts
|
|
9641
10544
|
var SHIPPED_COMPOSITION_DIR = resolveShippedCanvasDir("video-overlay-composition", import.meta.dirname);
|
|
10545
|
+
var SHIPPED_CAPTIONS_DIR = resolveShippedCanvasDir("tiktok-captions-composition", import.meta.dirname);
|
|
9642
10546
|
function resolveModel2(kind, preferred) {
|
|
9643
10547
|
const ids = Object.keys(MODEL_REGISTRY[kind]);
|
|
9644
10548
|
return ids.includes(preferred) ? preferred : ids[0] ?? preferred;
|
|
@@ -9659,10 +10563,10 @@ DROP one-off background extras and incidental props \u2014 but the shared set/lo
|
|
|
9659
10563
|
|
|
9660
10564
|
ONE PERSON, MULTIPLE LOOKS: if a single individual plays MULTIPLE personas or wardrobes (e.g. a creator as a skeptic in one outfit and a believer in another, or a before/after), emit ONE element PER look (so each outfit gets its own real reference image) and link the extra looks to the first via "same_as" \u2014 they are the SAME person, only the wardrobe/state differs, so the face must stay identical.
|
|
9661
10565
|
|
|
9662
|
-
For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of
|
|
10566
|
+
For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of ONLY the scenes where the element is ACTUALLY VISIBLE ON SCREEN \u2014 judged from that scene's start_frame_prompt / end_frame_prompt subjects and its action_detail, NOT from who is merely speaking. A narrator heard over b-roll is NOT present in that b-roll scene; a dog-running cutaway does NOT contain the couch creator just because she talks across it. Do NOT pad the list \u2014 an element wrongly listed in a scene makes the reproduction render the wrong subject there (e.g. the creator appearing in a pure-dog b-roll). When in doubt, leave a scene OUT. Output ONLY the JSON object.`;
|
|
9663
10567
|
async function loadAssetText2(ref, label) {
|
|
9664
10568
|
const r = ref;
|
|
9665
|
-
if (typeof r?.path === "string") return
|
|
10569
|
+
if (typeof r?.path === "string") return readFile5(r.path, "utf8");
|
|
9666
10570
|
if (typeof r?.url === "string") {
|
|
9667
10571
|
const res = await fetch(r.url);
|
|
9668
10572
|
if (!res.ok) throw new Error(`failed to fetch ${label} (${res.status})`);
|
|
@@ -9670,6 +10574,24 @@ async function loadAssetText2(ref, label) {
|
|
|
9670
10574
|
}
|
|
9671
10575
|
throw new Error(`${label}: output had no readable path or url`);
|
|
9672
10576
|
}
|
|
10577
|
+
async function loadTranscriptBestEffort(ref) {
|
|
10578
|
+
if (!ref) return void 0;
|
|
10579
|
+
try {
|
|
10580
|
+
return await loadAssetText2(ref, "deconstruct transcript");
|
|
10581
|
+
} catch {
|
|
10582
|
+
return void 0;
|
|
10583
|
+
}
|
|
10584
|
+
}
|
|
10585
|
+
async function stageCaptions(outDir, transcript) {
|
|
10586
|
+
const text = transcript?.trim();
|
|
10587
|
+
if (!text || text === "[]") return {};
|
|
10588
|
+
const transcriptPath = path5.join(outDir, "transcript.json");
|
|
10589
|
+
await writeFile2(transcriptPath, `${text}
|
|
10590
|
+
`, "utf8");
|
|
10591
|
+
const compositionPath = path5.join(outDir, "tiktok-captions-composition");
|
|
10592
|
+
await cp(SHIPPED_CAPTIONS_DIR, compositionPath, { recursive: true });
|
|
10593
|
+
return { compositionPath, transcriptPath };
|
|
10594
|
+
}
|
|
9673
10595
|
function parseElements2(raw) {
|
|
9674
10596
|
const parsed = JSON.parse(raw);
|
|
9675
10597
|
if (Array.isArray(parsed)) return parsed;
|
|
@@ -9678,6 +10600,31 @@ function parseElements2(raw) {
|
|
|
9678
10600
|
}
|
|
9679
10601
|
return [];
|
|
9680
10602
|
}
|
|
10603
|
+
async function detectShotCutsBestEffort(videoPath, threshold) {
|
|
10604
|
+
try {
|
|
10605
|
+
const cuts = await detectSceneCutsPySceneDetect(videoPath, threshold ? { threshold } : {});
|
|
10606
|
+
if (cuts.length > 0) {
|
|
10607
|
+
process.stderr.write(`Detected ${cuts.length} shot cut(s) via PySceneDetect: ${cuts.join(", ")}s
|
|
10608
|
+
`);
|
|
10609
|
+
} else {
|
|
10610
|
+
process.stderr.write("PySceneDetect ran but found no hard cuts; using LLM scene boundaries.\n");
|
|
10611
|
+
}
|
|
10612
|
+
return cuts;
|
|
10613
|
+
} catch (e) {
|
|
10614
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
10615
|
+
const code = e?.code;
|
|
10616
|
+
const missing = code === "ENOENT" || /ENOENT|not found|command not found/i.test(msg);
|
|
10617
|
+
if (missing) {
|
|
10618
|
+
process.stderr.write(
|
|
10619
|
+
"WARNING: `scenedetect` (PySceneDetect) is NOT installed \u2014 falling back to LLM-only scene boundaries, which under-segments (coarse 9-15s scenes instead of the real 1-4s cuts). Install it (`pipx install scenedetect[opencv]` or `pip install scenedetect[opencv]`) for accurate shot-cut detection.\n"
|
|
10620
|
+
);
|
|
10621
|
+
} else {
|
|
10622
|
+
process.stderr.write(`Shot-cut detection skipped (${msg}); using LLM boundaries.
|
|
10623
|
+
`);
|
|
10624
|
+
}
|
|
10625
|
+
return [];
|
|
10626
|
+
}
|
|
10627
|
+
}
|
|
9681
10628
|
function fail2(code, message) {
|
|
9682
10629
|
process.stderr.write(`${JSON.stringify({ ok: false, error: { code, message } }, null, 2)}
|
|
9683
10630
|
`);
|
|
@@ -9699,53 +10646,78 @@ function resolveModels2(args) {
|
|
|
9699
10646
|
videoModel: pick("video-model", "video_generate", "bytedance/seedance-2.0")
|
|
9700
10647
|
};
|
|
9701
10648
|
}
|
|
9702
|
-
function
|
|
10649
|
+
function buildDeconstructCanvas(videoPath, deconstructModel, opts) {
|
|
9703
10650
|
const deconstructParams = { model: deconstructModel, mode: "full" };
|
|
9704
10651
|
if (typeof opts.maxScenes === "number") deconstructParams.max_scenes = opts.maxScenes;
|
|
9705
10652
|
if (opts.language) deconstructParams.language = opts.language;
|
|
9706
10653
|
if (opts.focus) deconstructParams.focus = opts.focus;
|
|
10654
|
+
if (opts.shotCuts && opts.shotCuts.length > 0) deconstructParams.shot_cuts = opts.shotCuts;
|
|
10655
|
+
deconstructParams.max_clip_s = SEEDANCE_DURATIONS[SEEDANCE_DURATIONS.length - 1];
|
|
9707
10656
|
return {
|
|
9708
10657
|
schema: "baker-canvas/1",
|
|
9709
10658
|
metadata: { name: "video deconstruct pass" },
|
|
9710
10659
|
nodes: [
|
|
9711
10660
|
{ id: "src", type: "ingest", params: { source: "path", path: videoPath, expect: "video" } },
|
|
9712
|
-
{ id: "deconstruct", type: "video_deconstruct", inputs: { video: "$ref:src.asset" }, params: deconstructParams }
|
|
10661
|
+
{ id: "deconstruct", type: "video_deconstruct", inputs: { video: "$ref:src.asset" }, params: deconstructParams }
|
|
10662
|
+
],
|
|
10663
|
+
output: { node: "deconstruct", output: "analysis" }
|
|
10664
|
+
};
|
|
10665
|
+
}
|
|
10666
|
+
function buildSelectCanvas(selectModel, slimmedBlueprintJson) {
|
|
10667
|
+
return {
|
|
10668
|
+
schema: "baker-canvas/1",
|
|
10669
|
+
metadata: { name: "element selection pass" },
|
|
10670
|
+
nodes: [
|
|
9713
10671
|
{
|
|
9714
10672
|
id: "select",
|
|
9715
10673
|
type: "text_generate",
|
|
9716
|
-
inputs: { blueprint: "$ref:deconstruct.analysis" },
|
|
9717
10674
|
params: {
|
|
9718
10675
|
model: selectModel,
|
|
9719
10676
|
max_tokens: 6e3,
|
|
9720
10677
|
temperature: 0,
|
|
9721
10678
|
response_format: "json_object",
|
|
9722
10679
|
system: SELECT_SYSTEM2,
|
|
9723
|
-
prompt: SELECT_PROMPT2
|
|
10680
|
+
prompt: SELECT_PROMPT2.replace("{{blueprint}}", () => slimmedBlueprintJson)
|
|
9724
10681
|
}
|
|
9725
10682
|
}
|
|
9726
10683
|
],
|
|
9727
10684
|
output: { node: "select", output: "text" }
|
|
9728
10685
|
};
|
|
9729
10686
|
}
|
|
9730
|
-
async function runAnalysisPasses(
|
|
10687
|
+
async function runAnalysisPasses(deconstructCanvas, selectModel) {
|
|
9731
10688
|
const engine = createEngineFromEnv({ log: (line) => process.stderr.write(`${line}
|
|
9732
10689
|
`) });
|
|
9733
|
-
let
|
|
9734
|
-
let
|
|
10690
|
+
let credits = 0;
|
|
10691
|
+
let sawCredits = false;
|
|
10692
|
+
const addCredits = (stats) => {
|
|
10693
|
+
const c = stats?.total_credits;
|
|
10694
|
+
if (typeof c === "number") {
|
|
10695
|
+
credits += c;
|
|
10696
|
+
sawCredits = true;
|
|
10697
|
+
}
|
|
10698
|
+
};
|
|
10699
|
+
let blueprint;
|
|
10700
|
+
let transcript;
|
|
9735
10701
|
try {
|
|
9736
|
-
const
|
|
9737
|
-
|
|
9738
|
-
|
|
10702
|
+
const r1 = await engine.run(deconstructCanvas, {});
|
|
10703
|
+
addCredits(r1.stats);
|
|
10704
|
+
blueprint = JSON.parse(await loadAssetText2(r1.outputs_by_node.deconstruct?.analysis, "deconstruct output"));
|
|
10705
|
+
transcript = await loadTranscriptBestEffort(r1.outputs_by_node.deconstruct?.transcript);
|
|
9739
10706
|
} catch (e) {
|
|
9740
10707
|
if (e instanceof ValidationError) return fail2("validation", JSON.stringify(e.issues));
|
|
10708
|
+
if (e instanceof SyntaxError) return fail2("read_outputs", e.message);
|
|
9741
10709
|
return fail2("deconstruct", e instanceof Error ? e.message : String(e));
|
|
9742
10710
|
}
|
|
10711
|
+
const slimJson = JSON.stringify(slimBlueprintForSelection(blueprint));
|
|
9743
10712
|
try {
|
|
9744
|
-
const
|
|
9745
|
-
|
|
9746
|
-
|
|
10713
|
+
const r2 = await engine.run(buildSelectCanvas(selectModel, slimJson), {});
|
|
10714
|
+
addCredits(r2.stats);
|
|
10715
|
+
const elements = parseElements2(await loadAssetText2(r2.outputs_by_node.select?.text, "selection output"));
|
|
10716
|
+
return { blueprint, elements, transcript, creditsSpent: sawCredits ? credits : void 0 };
|
|
9747
10717
|
} catch (e) {
|
|
9748
|
-
return fail2("
|
|
10718
|
+
if (e instanceof ValidationError) return fail2("validation", JSON.stringify(e.issues));
|
|
10719
|
+
if (e instanceof SyntaxError) return fail2("read_outputs", e.message);
|
|
10720
|
+
return fail2("deconstruct", e instanceof Error ? e.message : String(e));
|
|
9749
10721
|
}
|
|
9750
10722
|
}
|
|
9751
10723
|
var scaffoldVideoCommand = defineCommand76({
|
|
@@ -9761,11 +10733,11 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9761
10733
|
type: "boolean",
|
|
9762
10734
|
description: "Give silent b-roll scenes native diegetic ambient mixed deep under the music bed (off by default)"
|
|
9763
10735
|
},
|
|
9764
|
-
"actor-sheets": {
|
|
9765
|
-
type: "boolean",
|
|
9766
|
-
description: "Lock a recast person/animal that recurs across \u22652 scenes to ONE turnaround sheet grounding every frame"
|
|
9767
|
-
},
|
|
9768
10736
|
"max-scenes": { type: "string", description: "Cap the number of scenes the deconstruct emits" },
|
|
10737
|
+
"shot-threshold": {
|
|
10738
|
+
type: "string",
|
|
10739
|
+
description: "PySceneDetect content threshold. Default is adaptive (18, auto re-checked at 27 when a continuous shot looks over-segmented); pinning a value disables the re-check. Lower = more/softer cuts, higher = fewer."
|
|
10740
|
+
},
|
|
9769
10741
|
language: { type: "string", description: "Transcript/dialogue language hint (e.g. fr, en)" },
|
|
9770
10742
|
focus: { type: "string", description: "Known provenance/emphasis to ground the deconstruct" },
|
|
9771
10743
|
"deconstruct-model": { type: "string", description: "Override the video_deconstruct model id" },
|
|
@@ -9788,12 +10760,15 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9788
10760
|
);
|
|
9789
10761
|
}
|
|
9790
10762
|
const { deconstructModel, selectModel, imageModel, videoModel } = resolveModels2(args);
|
|
9791
|
-
const
|
|
10763
|
+
const shotThreshold = args["shot-threshold"] ? Number(args["shot-threshold"]) : void 0;
|
|
10764
|
+
const shotCuts = await detectShotCutsBestEffort(videoPath, shotThreshold);
|
|
10765
|
+
const deconstructCanvas = buildDeconstructCanvas(videoPath, deconstructModel, {
|
|
9792
10766
|
maxScenes: Number.isFinite(maxScenes) ? maxScenes : void 0,
|
|
9793
10767
|
language: args.language ? String(args.language) : void 0,
|
|
9794
|
-
focus: args.focus ? String(args.focus) : void 0
|
|
10768
|
+
focus: args.focus ? String(args.focus) : void 0,
|
|
10769
|
+
shotCuts
|
|
9795
10770
|
});
|
|
9796
|
-
const { blueprint, elements, creditsSpent } = await runAnalysisPasses(
|
|
10771
|
+
const { blueprint, elements, transcript, creditsSpent } = await runAnalysisPasses(deconstructCanvas, selectModel);
|
|
9797
10772
|
await mkdir(outDir, { recursive: true });
|
|
9798
10773
|
const annotated = annotateBlueprintWithElements(blueprint, elements);
|
|
9799
10774
|
await writeFile2(blueprintPath, `${JSON.stringify(annotated, null, 2)}
|
|
@@ -9802,7 +10777,7 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9802
10777
|
await cp(SHIPPED_COMPOSITION_DIR, compositionDest, { recursive: true });
|
|
9803
10778
|
const indexPath = path5.join(compositionDest, "index.html");
|
|
9804
10779
|
const overlayHtml = buildOverlayHtml(blueprint);
|
|
9805
|
-
const indexHtml = await
|
|
10780
|
+
const indexHtml = await readFile5(indexPath, "utf8");
|
|
9806
10781
|
const injected = indexHtml.replace("<!--OVERLAYS-->", () => overlayHtml);
|
|
9807
10782
|
if (injected === indexHtml && overlayHtml.trim()) {
|
|
9808
10783
|
fail2(
|
|
@@ -9811,14 +10786,16 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9811
10786
|
);
|
|
9812
10787
|
}
|
|
9813
10788
|
await writeFile2(indexPath, injected, "utf8");
|
|
10789
|
+
const captions = await stageCaptions(outDir, transcript);
|
|
9814
10790
|
const opts = {
|
|
9815
10791
|
imageModel,
|
|
9816
10792
|
videoModel,
|
|
9817
10793
|
overlayCompositionPath: compositionDest,
|
|
10794
|
+
captionsCompositionPath: captions.compositionPath,
|
|
10795
|
+
transcriptPath: captions.transcriptPath,
|
|
9818
10796
|
blueprintPath,
|
|
9819
10797
|
frames,
|
|
9820
|
-
ambient: Boolean(args.ambient)
|
|
9821
|
-
actorSheets: Boolean(args["actor-sheets"])
|
|
10798
|
+
ambient: Boolean(args.ambient)
|
|
9822
10799
|
};
|
|
9823
10800
|
let canvas;
|
|
9824
10801
|
let report;
|
|
@@ -9851,7 +10828,7 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9851
10828
|
stats: {
|
|
9852
10829
|
scene_count: report.scene_count,
|
|
9853
10830
|
total_nodes: canvas.nodes.length,
|
|
9854
|
-
|
|
10831
|
+
analysis_credits_spent: creditsSpent,
|
|
9855
10832
|
run_estimated_credits: validation.estimatedCredits
|
|
9856
10833
|
},
|
|
9857
10834
|
checklist: {
|
|
@@ -9879,7 +10856,7 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9879
10856
|
});
|
|
9880
10857
|
|
|
9881
10858
|
// src/commands/canvas/validate.ts
|
|
9882
|
-
import { readFile as
|
|
10859
|
+
import { readFile as readFile6 } from "fs/promises";
|
|
9883
10860
|
import path6 from "path";
|
|
9884
10861
|
import { defineCommand as defineCommand77 } from "citty";
|
|
9885
10862
|
var validateCommand = defineCommand77({
|
|
@@ -9890,7 +10867,7 @@ var validateCommand = defineCommand77({
|
|
|
9890
10867
|
args: { file: { type: "positional", required: true, description: "Path to canvas JSON" } },
|
|
9891
10868
|
async run({ args }) {
|
|
9892
10869
|
const filePath = path6.resolve(String(args.file));
|
|
9893
|
-
const raw = await
|
|
10870
|
+
const raw = await readFile6(filePath, "utf8");
|
|
9894
10871
|
let parsed;
|
|
9895
10872
|
try {
|
|
9896
10873
|
parsed = JSON.parse(raw);
|
|
@@ -10779,8 +11756,8 @@ function cropSprite(input, region) {
|
|
|
10779
11756
|
|
|
10780
11757
|
// src/lib/image/io.ts
|
|
10781
11758
|
import { randomBytes } from "crypto";
|
|
10782
|
-
import { glob as fsGlob, readFile as
|
|
10783
|
-
import { dirname, extname, join as
|
|
11759
|
+
import { glob as fsGlob, readFile as readFile7, rename, stat as stat2, writeFile as writeFile3 } from "fs/promises";
|
|
11760
|
+
import { dirname, extname, join as join3, resolve as resolve4 } from "path";
|
|
10784
11761
|
var REMOTE_RE = /^https?:\/\//i;
|
|
10785
11762
|
var GLOB_RE = /[*?[\]{}]/;
|
|
10786
11763
|
function isRemoteUrl(value) {
|
|
@@ -10815,7 +11792,7 @@ async function readImageBuffer(pathOrUrl) {
|
|
|
10815
11792
|
}
|
|
10816
11793
|
return Buffer.from(await response.arrayBuffer());
|
|
10817
11794
|
}
|
|
10818
|
-
return
|
|
11795
|
+
return readFile7(pathOrUrl);
|
|
10819
11796
|
}
|
|
10820
11797
|
async function isDirectory(path7) {
|
|
10821
11798
|
try {
|
|
@@ -10830,14 +11807,14 @@ async function resolveOutputPath(inputPath, outputArg, options) {
|
|
|
10830
11807
|
if (!outputArg) return base;
|
|
10831
11808
|
if (options.multipleInputs || await isDirectory(outputArg)) {
|
|
10832
11809
|
const filename = base.split("/").pop() ?? "out.png";
|
|
10833
|
-
return
|
|
11810
|
+
return join3(outputArg, filename);
|
|
10834
11811
|
}
|
|
10835
11812
|
return outputArg;
|
|
10836
11813
|
}
|
|
10837
11814
|
async function atomicWrite(targetPath, data) {
|
|
10838
11815
|
const absolute = resolve4(targetPath);
|
|
10839
11816
|
const dir = dirname(absolute);
|
|
10840
|
-
const tmp =
|
|
11817
|
+
const tmp = join3(dir, `.baker-image-${randomBytes(8).toString("hex")}.tmp`);
|
|
10841
11818
|
await writeFile3(tmp, data);
|
|
10842
11819
|
await rename(tmp, absolute);
|
|
10843
11820
|
}
|
|
@@ -11179,7 +12156,7 @@ var findCommand = defineCommand91({
|
|
|
11179
12156
|
});
|
|
11180
12157
|
|
|
11181
12158
|
// src/commands/images/generate.ts
|
|
11182
|
-
import { readFile as
|
|
12159
|
+
import { readFile as readFile8 } from "fs/promises";
|
|
11183
12160
|
import { defineCommand as defineCommand92 } from "citty";
|
|
11184
12161
|
import sharp2 from "sharp";
|
|
11185
12162
|
var GENERATE_TIMEOUT_MS = 18e4;
|
|
@@ -11262,7 +12239,7 @@ async function resolveReferences(spec) {
|
|
|
11262
12239
|
}
|
|
11263
12240
|
let raw;
|
|
11264
12241
|
try {
|
|
11265
|
-
raw = await
|
|
12242
|
+
raw = await readFile8(entry);
|
|
11266
12243
|
} catch {
|
|
11267
12244
|
throw new ApiError("VALIDATION_ERROR", `Reference file not found: ${entry}`);
|
|
11268
12245
|
}
|
|
@@ -12983,7 +13960,7 @@ var stockCommand = defineCommand105({
|
|
|
12983
13960
|
});
|
|
12984
13961
|
|
|
12985
13962
|
// src/commands/images/upload.ts
|
|
12986
|
-
import { readFile as
|
|
13963
|
+
import { readFile as readFile9 } from "fs/promises";
|
|
12987
13964
|
import { extname as extname2 } from "path";
|
|
12988
13965
|
import { defineCommand as defineCommand106 } from "citty";
|
|
12989
13966
|
var MIME_MAP = {
|
|
@@ -13123,7 +14100,7 @@ async function uploadLocal(target, args) {
|
|
|
13123
14100
|
});
|
|
13124
14101
|
return;
|
|
13125
14102
|
}
|
|
13126
|
-
const fileBuffer = await
|
|
14103
|
+
const fileBuffer = await readFile9(target);
|
|
13127
14104
|
const base64 = fileBuffer.toString("base64");
|
|
13128
14105
|
const body = { base64, contentType };
|
|
13129
14106
|
if (args.source) body.source = args.source;
|
|
@@ -15088,7 +16065,7 @@ var searchCommand3 = defineCommand135({
|
|
|
15088
16065
|
});
|
|
15089
16066
|
|
|
15090
16067
|
// src/commands/videos/upload.ts
|
|
15091
|
-
import { readFile as
|
|
16068
|
+
import { readFile as readFile10, stat as stat3 } from "fs/promises";
|
|
15092
16069
|
import { extname as extname3 } from "path";
|
|
15093
16070
|
import { defineCommand as defineCommand136 } from "citty";
|
|
15094
16071
|
var MIME_MAP2 = {
|
|
@@ -15153,7 +16130,7 @@ var uploadCommand2 = defineCommand136({
|
|
|
15153
16130
|
return;
|
|
15154
16131
|
}
|
|
15155
16132
|
const { uploadUrl, videoId } = await apiPost("/api/videos/upload", {});
|
|
15156
|
-
const fileBuffer = await
|
|
16133
|
+
const fileBuffer = await readFile10(filePath);
|
|
15157
16134
|
const uploadResponse = await fetch(uploadUrl, {
|
|
15158
16135
|
method: "PUT",
|
|
15159
16136
|
headers: { "Content-Type": contentType },
|