@koda-sl/baker-cli 0.74.0 → 0.79.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -8
- package/canvas/end-card-composition/index.html +66 -0
- package/canvas/end-card-composition/meta.json +19 -0
- package/canvas/feature-reveal-composition/index.html +83 -0
- package/canvas/feature-reveal-composition/meta.json +18 -0
- package/canvas/lower-third-composition/index.html +75 -0
- package/canvas/lower-third-composition/meta.json +18 -0
- package/canvas/stat-counter-composition/index.html +73 -0
- package/canvas/stat-counter-composition/meta.json +20 -0
- package/canvas/title-card-composition/index.html +90 -0
- package/canvas/title-card-composition/meta.json +20 -0
- package/dist/{chunk-JIDZ37KG.js → chunk-CCO34ACK.js} +507 -307
- package/dist/chunk-CCO34ACK.js.map +1 -0
- package/dist/cli.js +624 -109
- package/dist/cli.js.map +1 -1
- package/dist/engine/index.d.ts +6 -0
- package/dist/engine/index.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-JIDZ37KG.js.map +0 -1
package/dist/cli.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
defaultRegistry,
|
|
10
10
|
generateCatalog,
|
|
11
11
|
validateCanvasDeep
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-CCO34ACK.js";
|
|
13
13
|
|
|
14
14
|
// src/cli.ts
|
|
15
15
|
import { defineCommand as defineCommand141, runMain } from "citty";
|
|
@@ -8280,11 +8280,121 @@ import { defineCommand as defineCommand76 } from "citty";
|
|
|
8280
8280
|
|
|
8281
8281
|
// src/engine/scaffold/video.ts
|
|
8282
8282
|
import { z as z3 } from "zod";
|
|
8283
|
+
|
|
8284
|
+
// src/engine/scaffold/lib/shoot-modes.ts
|
|
8285
|
+
var SHOOT_MODES = [
|
|
8286
|
+
"ugc_selfie",
|
|
8287
|
+
"ugc_broll",
|
|
8288
|
+
"studio_product",
|
|
8289
|
+
"lifestyle_cinematic",
|
|
8290
|
+
"screen_ui"
|
|
8291
|
+
];
|
|
8292
|
+
var SHOOT_MODE_SPECS = {
|
|
8293
|
+
ugc_selfie: {
|
|
8294
|
+
label: "UGC selfie / talking-head",
|
|
8295
|
+
allowsDoF: false,
|
|
8296
|
+
capture: [
|
|
8297
|
+
"CAPTURE \u2014 AUTHENTIC PHONE UGC (front camera):",
|
|
8298
|
+
"Shot on a modern phone front camera \u2014 natural lens, real skin texture and pores,",
|
|
8299
|
+
"catchlights, mixed indoor white balance, faint sensor grain, slight handheld imperfection.",
|
|
8300
|
+
"NO shallow depth of field or background blur \u2014 native phone footage is flat front-to-back;",
|
|
8301
|
+
"blur reads as 'produced', not filmed-on-a-phone. Keep the whole frame in focus."
|
|
8302
|
+
].join("\n"),
|
|
8303
|
+
motion: "Lock the camera at arm's-length selfie distance; only natural handheld micro-movement. Move the camera only if a move is named above.",
|
|
8304
|
+
diegetic: "a quiet room tone with soft fabric and breath under the speaker's own voice"
|
|
8305
|
+
},
|
|
8306
|
+
ugc_broll: {
|
|
8307
|
+
label: "UGC b-roll / handheld",
|
|
8308
|
+
allowsDoF: false,
|
|
8309
|
+
capture: [
|
|
8310
|
+
"CAPTURE \u2014 AUTHENTIC PHONE UGC (rear camera, candid):",
|
|
8311
|
+
"Shot on a modern phone rear camera, handheld and candid \u2014 natural lens, real materials and",
|
|
8312
|
+
"textures, real hands in frame where natural, mixed natural white balance, faint sensor grain.",
|
|
8313
|
+
"NO shallow depth of field or background blur \u2014 native phone footage is flat front-to-back;",
|
|
8314
|
+
"keep the whole frame in focus."
|
|
8315
|
+
].join("\n"),
|
|
8316
|
+
motion: "Handheld, candid framing; keep any move small and motivated. Move the camera only if a move is named above.",
|
|
8317
|
+
diegetic: "the real ambient of the setting \u2014 handling sounds, footsteps, and room or outdoor tone"
|
|
8318
|
+
},
|
|
8319
|
+
studio_product: {
|
|
8320
|
+
label: "Studio / product (pack shot)",
|
|
8321
|
+
allowsDoF: true,
|
|
8322
|
+
capture: [
|
|
8323
|
+
"CAPTURE \u2014 CONTROLLED PRODUCT / STUDIO:",
|
|
8324
|
+
"Photographed on a controlled set \u2014 a clean seamless or one styled surface, a soft key with",
|
|
8325
|
+
"gentle fill and a subtle rim, true-to-life color. Shallow depth of field IS allowed to isolate",
|
|
8326
|
+
"the hero, with crisp specular highlights on the product's real materials.",
|
|
8327
|
+
"Still a real photograph, not CGI \u2014 no plastic or waxy surfaces, no over-render; real material",
|
|
8328
|
+
"texture and weight."
|
|
8329
|
+
].join("\n"),
|
|
8330
|
+
motion: "Lock off, or a slow motivated push-in / settle onto the product; otherwise hold.",
|
|
8331
|
+
diegetic: "minimal \u2014 soft product-handling sounds over a quiet room tone"
|
|
8332
|
+
},
|
|
8333
|
+
lifestyle_cinematic: {
|
|
8334
|
+
label: "Lifestyle / cinematic",
|
|
8335
|
+
allowsDoF: true,
|
|
8336
|
+
capture: [
|
|
8337
|
+
"CAPTURE \u2014 LIFESTYLE / CINEMATIC:",
|
|
8338
|
+
"A real camera in a real location \u2014 natural motivated light, true color, a gentle filmic grade,",
|
|
8339
|
+
"fine grain. A shallow depth of field is allowed when motivated by the moment.",
|
|
8340
|
+
"Photographic, not rendered \u2014 real skin and material texture, no airbrushing, no glossy 3D look."
|
|
8341
|
+
].join("\n"),
|
|
8342
|
+
motion: "A slow, motivated camera move (gentle push-in, drift, or settle) is allowed; otherwise hold.",
|
|
8343
|
+
diegetic: "the location's natural ambience \u2014 wind, traffic, water, or room tone as the setting implies"
|
|
8344
|
+
},
|
|
8345
|
+
screen_ui: {
|
|
8346
|
+
label: "Screen / UI / demo",
|
|
8347
|
+
allowsDoF: true,
|
|
8348
|
+
capture: [
|
|
8349
|
+
"CAPTURE \u2014 SCREEN / UI CAPTURE:",
|
|
8350
|
+
"A clean screen or app capture \u2014 crisp pixels, true on-screen color, optionally framed inside a",
|
|
8351
|
+
"real device held in a real hand. No human-skin realism is needed; the screen content is the subject.",
|
|
8352
|
+
"Do not bake invented UI copy into the plate beyond what the reference shows \u2014 editable text lives",
|
|
8353
|
+
"on the overlay layer."
|
|
8354
|
+
].join("\n"),
|
|
8355
|
+
motion: "Hold on the screen; allow a slow push-in or a UI scroll only if a move is named above.",
|
|
8356
|
+
diegetic: "soft UI taps and device handling over a quiet room tone"
|
|
8357
|
+
}
|
|
8358
|
+
};
|
|
8359
|
+
/**
 * Guard that reports whether `value` is one of the ids listed in SHOOT_MODES.
 * Anything that is not a string is rejected before the list is consulted.
 *
 * @param {unknown} value - candidate shoot-mode id
 * @returns {boolean} true only for a string present in SHOOT_MODES
 */
function isShootMode(value) {
  if (typeof value !== "string") {
    return false;
  }
  return SHOOT_MODES.includes(value);
}
|
|
8362
|
+
/**
 * Looks up the `capture` text of the SHOOT_MODE_SPECS entry for `mode`.
 *
 * @param {string} mode - key into SHOOT_MODE_SPECS
 * @returns {string} that entry's `capture` value
 */
function captureBlockFor(mode) {
  const { capture } = SHOOT_MODE_SPECS[mode];
  return capture;
}
|
|
8365
|
+
/**
 * Looks up the `motion` text of the SHOOT_MODE_SPECS entry for `mode`.
 *
 * @param {string} mode - key into SHOOT_MODE_SPECS
 * @returns {string} that entry's `motion` value
 */
function seedanceMotionFor(mode) {
  const { motion } = SHOOT_MODE_SPECS[mode];
  return motion;
}
|
|
8368
|
+
/**
 * Looks up the `diegetic` (default ambience) text of the SHOOT_MODE_SPECS entry
 * for `mode`.
 *
 * @param {string} mode - key into SHOOT_MODE_SPECS
 * @returns {string} that entry's `diegetic` value
 */
function diegeticFor(mode) {
  const { diegetic } = SHOOT_MODE_SPECS[mode];
  return diegetic;
}
|
|
8371
|
+
/**
 * Chooses the shoot mode for a scene.
 *
 * Precedence: a valid explicit mode wins; otherwise a talking scene is a UGC
 * selfie; otherwise a product with no person is a studio pack shot; everything
 * else falls back to UGC b-roll.
 *
 * @param {{ explicit?: unknown, talking?: unknown, hasProduct?: unknown, hasPerson?: unknown }} opts
 * @returns {string} the selected shoot-mode id
 */
function deriveShootMode(opts) {
  const { explicit } = opts;
  if (isShootMode(explicit)) {
    return explicit;
  }
  if (opts.talking) {
    return "ugc_selfie";
  }
  const productOnly = Boolean(opts.hasProduct) && !opts.hasPerson;
  return productOnly ? "studio_product" : "ugc_broll";
}
|
|
8377
|
+
|
|
8378
|
+
// src/engine/scaffold/video.ts
|
|
8283
8379
|
var FIXED_TTS_MODEL = "elevenlabs/eleven_v3";
|
|
8284
8380
|
var FIXED_SFX_MODEL = "elevenlabs/eleven_text_to_sound_v2";
|
|
8285
8381
|
var FIXED_MUSIC_MODEL = "elevenlabs/music-v1";
|
|
8286
|
-
var
|
|
8382
|
+
var FIXED_VOICE_CONVERT_MODEL = "elevenlabs/eleven_multilingual_sts_v2";
|
|
8287
8383
|
var MUSIC_BED_GAIN_DB = -12;
|
|
8384
|
+
var AMBIENT_BED_GAIN_DB = -20;
|
|
8385
|
+
var TRANSITION_DEFAULT_S = 0.4;
|
|
8386
|
+
var XFADE_BY_TYPE = {
|
|
8387
|
+
fade: "fade",
|
|
8388
|
+
dissolve: "dissolve",
|
|
8389
|
+
whip: "smoothleft",
|
|
8390
|
+
swipe: "wipeleft",
|
|
8391
|
+
zoom: "zoomin"
|
|
8392
|
+
};
|
|
8393
|
+
// Divisor used by estSpeechS to turn a word count into seconds of speech.
var WORDS_PER_SECOND = 2.5;
/**
 * Estimates how long `text` takes to speak, in seconds, by counting its
 * whitespace-separated words and dividing by WORDS_PER_SECOND.
 *
 * @param {string} text - the spoken line
 * @returns {number} estimated duration in seconds (0 for blank text)
 */
function estSpeechS(text) {
  // Each run of non-whitespace characters is one word; no match means no words.
  const wordCount = text.trim().match(/\S+/g)?.length ?? 0;
  return wordCount / WORDS_PER_SECOND;
}
|
|
8288
8398
|
var NARRATOR_SPEAKERS = /* @__PURE__ */ new Set([
|
|
8289
8399
|
"voiceover",
|
|
8290
8400
|
"voice_over",
|
|
@@ -8360,10 +8470,25 @@ var Scene = z3.object({
|
|
|
8360
8470
|
duration_s: z3.number().optional(),
|
|
8361
8471
|
summary: z3.string().optional(),
|
|
8362
8472
|
action_detail: z3.string().optional(),
|
|
8473
|
+
// The capture "look" for this scene — selected from the ad-native shoot-mode
|
|
8474
|
+
// grammar (see lib/shoot-modes.ts). When absent the scaffold auto-derives a
|
|
8475
|
+
// UGC/product mode; a human can override per scene by setting this.
|
|
8476
|
+
shoot_mode: z3.string().optional(),
|
|
8477
|
+
// Diegetic ambient the clip's native audio should carry (no music). When
|
|
8478
|
+
// absent the scene falls back to its shoot mode's default ambience.
|
|
8479
|
+
ambient: z3.string().optional(),
|
|
8363
8480
|
camera_motion: CameraMotion.optional(),
|
|
8364
8481
|
start_frame_prompt: z3.string().optional(),
|
|
8365
8482
|
end_frame_prompt: z3.string().optional(),
|
|
8366
8483
|
motion_prompt: z3.string().optional(),
|
|
8484
|
+
// The scene's role in the ad's persuasion arc (DECON-supplied); drives the
|
|
8485
|
+
// script re-craft checklist. Inferred from position when absent.
|
|
8486
|
+
narrative_role: z3.string().optional(),
|
|
8487
|
+
// How this scene cuts to the next (DECON-supplied). A recognized non-cut type
|
|
8488
|
+
// (fade/whip/zoom/dissolve/swipe) is reproduced as an ffmpeg xfade at the
|
|
8489
|
+
// boundary; cut/match_cut/none/other stay hard cuts. The last scene's value is
|
|
8490
|
+
// ignored (nothing follows it).
|
|
8491
|
+
transition_out: z3.object({ type: z3.string().optional(), description: z3.string().optional() }).loose().optional(),
|
|
8367
8492
|
dialogue: z3.array(DialogueLine).optional(),
|
|
8368
8493
|
sfx: z3.array(Sfx).optional(),
|
|
8369
8494
|
overlays: z3.array(z3.unknown()).optional(),
|
|
@@ -8378,6 +8503,10 @@ var VideoBlueprint = z3.object({
|
|
|
8378
8503
|
music: z3.object({
|
|
8379
8504
|
present: z3.boolean().optional(),
|
|
8380
8505
|
music_prompt: z3.string().optional(),
|
|
8506
|
+
// Absolute second the music enters in the reference (the bed often
|
|
8507
|
+
// kicks in mid-ad, after the hook). We start the regenerated track here
|
|
8508
|
+
// instead of at 0 so the timing matches.
|
|
8509
|
+
starts_at_s: z3.number().optional(),
|
|
8381
8510
|
// Populated by the deconstruct when AudD (Shazam-style) recognizes the
|
|
8382
8511
|
// reference track. We never reuse it — only style the regenerated bed.
|
|
8383
8512
|
identified_track: z3.object({ title: z3.string().optional(), artist: z3.string().optional() }).loose().nullish()
|
|
@@ -8401,6 +8530,11 @@ var RecurringElement = z3.object({
|
|
|
8401
8530
|
expression: z3.string().nullable().optional(),
|
|
8402
8531
|
// When the element maps to a global cast entry, its stable id (for annotation).
|
|
8403
8532
|
cast_id: z3.string().nullable().optional(),
|
|
8533
|
+
// The label of another element that is the SAME individual as this one, shown
|
|
8534
|
+
// in a DIFFERENT wardrobe/persona/state (e.g. one creator playing skeptic in a
|
|
8535
|
+
// pink shirt and believer in a white shirt). Each look gets its own reference
|
|
8536
|
+
// slot, but the face/identity must stay identical across them.
|
|
8537
|
+
same_as: z3.string().nullable().optional(),
|
|
8404
8538
|
// Scenes the element appears in. Either a bare list of scene indices (both
|
|
8405
8539
|
// edges) or per-{scene,edge} entries. Both forms are accepted and merged.
|
|
8406
8540
|
scenes: z3.array(z3.number()).optional(),
|
|
@@ -8476,15 +8610,27 @@ function roleForType2(type) {
|
|
|
8476
8610
|
return "the showcased product; keep this exact product identity consistent across every frame. Ignore any caption text printed on this reference.";
|
|
8477
8611
|
case "person":
|
|
8478
8612
|
case "animal":
|
|
8479
|
-
return "a recurring
|
|
8613
|
+
return "a recurring cast member; render the SAME individual as this reference image and keep them consistent across EVERY frame \u2014 their appearance comes from this reference, never from prose. Ignore any caption text printed on this reference.";
|
|
8614
|
+
case "location":
|
|
8615
|
+
return "the fixed set/location; keep the room, background, and layout identical to this reference across EVERY frame \u2014 do not re-invent the environment. Ignore any caption text printed on this reference.";
|
|
8480
8616
|
default:
|
|
8481
8617
|
return "a recurring identity element; reproduce it faithfully and keep it consistent across every frame. Ignore any caption text printed on it.";
|
|
8482
8618
|
}
|
|
8483
8619
|
}
|
|
8620
|
+
/**
 * Builds the legend text describing what a reference slot is for.
 *
 * A slot flagged as the same individual as another (`sameAs`) gets an
 * identity-lock instruction naming that other slot; any other slot defers to
 * the per-type role text from roleForType2.
 *
 * @param {{ sameAs?: string, description?: string, type: string }} slot
 * @returns {string} role text for the reference legend
 */
function roleForSlot(slot) {
  const { sameAs, description, type } = slot;
  if (!sameAs) {
    return roleForType2(type);
  }
  const detail = description ? ` (${description})` : "";
  return `the SAME individual as ${sameAs}, shown in a DIFFERENT wardrobe/persona/state${detail} \u2014 keep the FACE and identity IDENTICAL to the ${sameAs} references; change ONLY wardrobe, styling, and expression. Ignore any caption text printed on this reference.`;
}
|
|
8484
8627
|
/**
 * Produces the placeholder "[TODO: ...]" string shown where a real source image
 * must be dropped for a recurring element.
 *
 * Optional fragments are appended in a fixed order: description, expression,
 * the reuse note, a "source fresh" note for person/animal/location elements,
 * and a "same individual" note when the element points at another via `same_as`.
 *
 * @param {{ type: string, description?: string, expression?: string|null, same_as?: string|null }} el
 * @param {string} label - display label of the element
 * @returns {string} the bracketed TODO placeholder
 */
function todoPath2(el, label) {
  const FRESH_TYPES = ["person", "animal", "location"];
  const descPart = el.description ? ` \u2014 ${el.description}` : "";
  const exprPart = el.expression ? `, with a ${el.expression} expression` : "";
  const kind = el.type.toLowerCase();
  let freshNote = "";
  if (FRESH_TYPES.includes(kind)) {
    freshNote = " [SOURCE FRESH \u2014 a DIFFERENT person/animal/set than the original ad; do not reuse the source's individual]";
  }
  let sameNote = "";
  if (el.same_as) {
    sameNote = ` [SAME INDIVIDUAL as ${el.same_as} \u2014 a different wardrobe/look of the same person; reuse that cast person, change only the outfit]`;
  }
  return [
    `[TODO: drop one real source image for ${label} (${el.type})`,
    descPart,
    exprPart,
    " \u2014 reused across every frame it appears in",
    freshNote,
    sameNote,
    "]"
  ].join("");
}
|
|
8489
8635
|
function buildElementSlots(elements) {
|
|
8490
8636
|
const usedIds = /* @__PURE__ */ new Set(["prompt", "spine", "overlaid", "audio_mix", "final", "music_bed"]);
|
|
@@ -8499,6 +8645,7 @@ function buildElementSlots(elements) {
|
|
|
8499
8645
|
label,
|
|
8500
8646
|
type: el.type,
|
|
8501
8647
|
description: el.description,
|
|
8648
|
+
sameAs: el.same_as ?? void 0,
|
|
8502
8649
|
presence: presenceOf(el)
|
|
8503
8650
|
});
|
|
8504
8651
|
});
|
|
@@ -8507,38 +8654,90 @@ function buildElementSlots(elements) {
|
|
|
8507
8654
|
/**
 * Selects the slots whose presence map records `edge` for scene `sceneIndex`.
 *
 * @param {Array<{ presence: Map<number, Set<string>> }>} slots
 * @param {number} sceneIndex - scene key into each slot's presence map
 * @param {string} edge - frame edge to look for (e.g. "start")
 * @returns {Array} the matching slots, in their original order
 */
function slotsForFrame(slots, sceneIndex, edge) {
  const appearsOnEdge = ({ presence }) => {
    const edges = presence.get(sceneIndex);
    return edges?.has(edge);
  };
  return slots.filter(appearsOnEdge);
}
|
|
8657
|
+
// Image model requested for every generated reference-sheet node.
var ACTOR_SHEET_MODEL = "google/gemini-3-pro-image-preview";
/**
 * Inserts a reference-sheet node for each person/animal slot that is present in
 * two or more scenes, then repoints the slot at the sheet's output.
 *
 * Mutates both arguments: pushes onto `nodes` and overwrites `slot.ref`.
 *
 * @param {Array<{ id: string, type: string, description?: string, ref: string, presence: Map }>} slots
 * @param {Array<object>} nodes - node list being assembled
 * @returns {void}
 */
function applyActorSheets(slots, nodes) {
  for (const slot of slots) {
    const kind = slot.type.toLowerCase();
    const isCast = kind === "person" || kind === "animal";
    // Only cast members that appear in at least two scenes get a sheet.
    if (!isCast || slot.presence.size < 2) {
      continue;
    }
    const sheetId = `${slot.id}_sheet`;
    const sheetNode = {
      id: sheetId,
      type: "image_reference_sheet",
      // The slot's current ref is the sheet's only source reference.
      inputs: { references: [slot.ref] },
      params: {
        model: ACTOR_SHEET_MODEL,
        subject_description: slot.description ?? `the ${slot.type}`,
        subject_type: kind === "person" ? "person" : "character",
        image_size: "2K"
      }
    };
    nodes.push(sheetNode);
    slot.ref = `$ref:${sheetId}.sheet`;
  }
}
|
|
8510
8679
|
/**
 * Selects the slots whose presence map has an entry for scene `sceneIndex`,
 * regardless of which edges are recorded.
 *
 * @param {Array<{ presence: Map<number, unknown> }>} slots
 * @param {number} sceneIndex - scene key to look up
 * @returns {Array} the matching slots, in their original order
 */
function slotsForScene(slots, sceneIndex) {
  const appearsInScene = ({ presence }) => presence.has(sceneIndex);
  return slots.filter(appearsInScene);
}
|
|
8513
|
-
function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor) {
|
|
8682
|
+
function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mode) {
|
|
8514
8683
|
const EDGE = edge.toUpperCase();
|
|
8515
8684
|
const legend = [
|
|
8516
|
-
...present.map((s) => `- ${s.label} \u2014 ${
|
|
8685
|
+
...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
|
|
8517
8686
|
...hasAnchor ? [
|
|
8518
8687
|
"- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions of THIS frame. IGNORE its overlay text, captions, and any brand that is being swapped."
|
|
8519
8688
|
] : []
|
|
8520
8689
|
].join("\n");
|
|
8521
8690
|
const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
|
|
8691
|
+
const isHookFrame = sceneIndex === 0 && edge === "start";
|
|
8522
8692
|
return [
|
|
8523
8693
|
`Render the ${EDGE} frame of scene ${sceneIndex + 1} as a single still image. This prompt is self-contained and edit-per-frame: change the FRAME DESCRIPTION below to alter ONLY this frame.`,
|
|
8524
8694
|
"",
|
|
8525
|
-
"CRITICAL \u2014 RENDER A CLEAN PLATE WITH ZERO TEXT OR
|
|
8526
|
-
"This frame is a background plate.
|
|
8527
|
-
"
|
|
8528
|
-
"
|
|
8529
|
-
"
|
|
8530
|
-
"
|
|
8531
|
-
"
|
|
8532
|
-
"
|
|
8533
|
-
"
|
|
8695
|
+
"CRITICAL \u2014 RENDER A CLEAN PLATE WITH ZERO TEXT OR GRAPHIC OVERLAYS:",
|
|
8696
|
+
"This frame is a background plate. Every overlay element is composited afterwards as a",
|
|
8697
|
+
"separate hyperframe HTML overlay layer \u2014 NOT painted into this image. Render NONE of:",
|
|
8698
|
+
"words, captions, subtitles, headlines, lower-third bars, news tickers/crawls, chyrons,",
|
|
8699
|
+
"station bugs, watermarks, numbers, prices; and NONE of the graphic overlays layered on",
|
|
8700
|
+
"the picture either \u2014 icons, stickers, emojis, badges, rating/trust seals, progress bars,",
|
|
8701
|
+
"UI chrome/buttons, and arrows. No legible text anywhere, not even in the background, on a",
|
|
8702
|
+
"desk, on screens, or as part of a 'broadcast look'. If a reference image (a logo, a desk,",
|
|
8703
|
+
"a studio) contains any text or graphics, DO NOT reproduce them \u2014 render the subject/scene",
|
|
8704
|
+
"only, leaving the regions where overlays will sit clean. Imperfect/garbled letterforms or",
|
|
8705
|
+
"stray icons are the worst outcome; leave those areas blank.",
|
|
8706
|
+
"",
|
|
8707
|
+
"FRAMING \u2014 ONE UNCUT FRAME:",
|
|
8708
|
+
"Render ONE single uncut photographic frame: NO split screen, NO panels, NO dividing line,",
|
|
8709
|
+
"NO collage, NO before/after. Avoid the AI look \u2014 no waxy/plastic skin, no airbrushing, no",
|
|
8710
|
+
"over-smoothing, no over-saturation, no glossy 3D render. Every descriptor needs a technical",
|
|
8711
|
+
'anchor (a named lens / focal length / color grade) \u2014 no empty adjectives like "cinematic",',
|
|
8712
|
+
`"beautiful", "high quality"; they waste tokens and don't move the model.`,
|
|
8713
|
+
"",
|
|
8714
|
+
// The capture aesthetic + depth-of-field rule are SHOOT-MODE specific: a UGC
|
|
8715
|
+
// selfie is flat phone footage; a pack shot is a controlled studio frame. Only
|
|
8716
|
+
// this block varies by mode — the clean-plate and framing rules above are universal.
|
|
8717
|
+
captureBlockFor(mode),
|
|
8718
|
+
"",
|
|
8719
|
+
// Moderation-safe phrasing — Seedance routes around the real-person filter but
|
|
8720
|
+
// prompts still hit provider moderation; age-blind, role-based descriptions trip
|
|
8721
|
+
// it far less. (The client's own brand assets come from the references, not here.)
|
|
8722
|
+
"Describe any person by role, wardrobe, and build \u2014 never by name and never by age",
|
|
8723
|
+
"(no child/kid/teen/young/elderly); do not invent brand logos or marks.",
|
|
8724
|
+
...isHookFrame ? [
|
|
8725
|
+
"",
|
|
8726
|
+
"HOOK FRAME (scene 1 opens the ad): the feed plays muted, so this frame must read",
|
|
8727
|
+
"INSTANTLY SOUND-OFF \u2014 one clear subject, legible at a glance in ~1 second, no clutter."
|
|
8728
|
+
] : [],
|
|
8534
8729
|
"",
|
|
8535
8730
|
"REFERENCE IMAGES (in the order provided):",
|
|
8536
8731
|
legend,
|
|
8537
8732
|
"",
|
|
8733
|
+
"Identity comes from the reference images, not from this prose \u2014 render each person,",
|
|
8734
|
+
"product, and set to MATCH its reference image, and describe only pose, expression, action,",
|
|
8735
|
+
"and camera in the FRAME DESCRIPTION below.",
|
|
8736
|
+
"",
|
|
8538
8737
|
"FRAME DESCRIPTION (this frame's editable prompt):",
|
|
8539
8738
|
description,
|
|
8540
8739
|
"",
|
|
8541
|
-
"Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared
|
|
8740
|
+
"Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
|
|
8542
8741
|
"",
|
|
8543
8742
|
"GLOBAL STYLE REFERENCE (shared across frames; not this frame's content):",
|
|
8544
8743
|
"{{target_blueprint}}"
|
|
@@ -8552,7 +8751,7 @@ function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
|
|
|
8552
8751
|
const genParams = {
|
|
8553
8752
|
model: ctx.imageModel,
|
|
8554
8753
|
image_size: "2K",
|
|
8555
|
-
prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, Boolean(url))
|
|
8754
|
+
prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, Boolean(url), ctx.shootMode)
|
|
8556
8755
|
};
|
|
8557
8756
|
if (ctx.ar) genParams.aspect_ratio = ctx.ar;
|
|
8558
8757
|
const genNode = {
|
|
@@ -8567,7 +8766,17 @@ function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
|
|
|
8567
8766
|
nodes.push(genNode);
|
|
8568
8767
|
return `$ref:s${ctx.sceneIndex}_${edge}.images#0`;
|
|
8569
8768
|
}
|
|
8570
|
-
function
|
|
8769
|
+
/**
 * Builds the "Audio:" instruction line for a clip prompt.
 *
 * The ambience is the scene's own `ambient` text when non-blank, otherwise the
 * shoot mode's default. A native spoken line yields the voice-over-ambience
 * wording; audio without a native line yields the ambient-only wording; with
 * neither there is no audio line.
 *
 * @param {{ ambient?: string }} scene
 * @param {string} mode - shoot-mode id used for the default ambience
 * @param {unknown} audio - truthy when the clip generates audio
 * @param {string|undefined} nativeLine - the natively spoken line, if any
 * @returns {string|null} the audio instruction, or null when none applies
 */
function seedanceAudioLine(scene, mode, audio, nativeLine) {
  // Resolved up front, before either branch is chosen.
  const bed = scene.ambient?.trim() || diegeticFor(mode);
  if (!nativeLine && !audio) {
    return null;
  }
  return nativeLine
    ? `Audio: diegetic only \u2014 the speaker's own voice over ${bed}; no music, no song, no soundtrack (the music bed is a separate track).`
    : `Audio: diegetic ambient only \u2014 ${bed}; no spoken dialogue, no music, no song, no soundtrack (voice and music are separate tracks).`;
}
|
|
8779
|
+
function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine) {
|
|
8571
8780
|
const parts = [];
|
|
8572
8781
|
const summary = scene.summary?.trim();
|
|
8573
8782
|
parts.push(summary ? `Scene ${sceneIndex + 1}: ${summary}` : `Scene ${sceneIndex + 1}`);
|
|
@@ -8583,18 +8792,109 @@ function buildSeedancePrompt(scene, sceneIndex, present) {
|
|
|
8583
8792
|
`Keep these consistent with their references: ${present.map((s) => `${s.label} (${s.description ?? s.type})`).join("; ")}`
|
|
8584
8793
|
);
|
|
8585
8794
|
}
|
|
8586
|
-
|
|
8587
|
-
|
|
8795
|
+
if (nativeLine) {
|
|
8796
|
+
parts.push(
|
|
8797
|
+
`The person speaks to camera. Lip-sync follows the dialogue verbatim; put delivery/emotion cues in [brackets]. Dialogue: "${nativeLine}"`
|
|
8798
|
+
);
|
|
8799
|
+
} else {
|
|
8800
|
+
const lines = (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l));
|
|
8801
|
+
if (lines.length > 0)
|
|
8802
|
+
parts.push(`Spoken context (do not render as audio): ${lines.map((l) => `"${l}"`).join(" ")}`);
|
|
8803
|
+
}
|
|
8588
8804
|
const transcript = (scene.transcript_slice ?? []).map((w) => w.text?.trim()).filter(Boolean).join(" ").trim();
|
|
8589
8805
|
if (transcript) parts.push(`Transcript: ${transcript}`);
|
|
8806
|
+
const audioLine = seedanceAudioLine(scene, mode, audio, nativeLine);
|
|
8807
|
+
if (audioLine) parts.push(audioLine);
|
|
8808
|
+
parts.push(
|
|
8809
|
+
`Direction: describe MOTION ONLY \u2014 the frames carry the content; keep it short. ${seedanceMotionFor(mode)} Spell choreography move-by-move (not 'she dances' but the actual beats: head nod, shoulder roll, knee dip). One short continuous beat. Real physical weight on any impact (no weightless AI motion). Describe any person by role and wardrobe, never by name or age.`
|
|
8810
|
+
);
|
|
8590
8811
|
return parts.join("\n");
|
|
8591
8812
|
}
|
|
8813
|
+
/**
 * Builds the ffmpeg argument list that pulls the audio track out of a clip as
 * MP3, limited to `durationS` seconds.
 *
 * `{{in.clip}}` and `{{out.audio}}` are left as template placeholders in the
 * returned list.
 *
 * @param {number} durationS - length to keep, formatted to three decimals
 * @returns {string[]} ffmpeg arguments
 */
function audioExtractArgs(durationS) {
  const source = ["-i", "{{in.clip}}"];
  const limit = ["-t", durationS.toFixed(3)];
  // Drop video, encode with LAME at quality level 2.
  const encoding = ["-vn", "-acodec", "libmp3lame", "-q:a", "2"];
  return [...source, ...limit, ...encoding, "{{out.audio}}"];
}
|
|
8827
|
+
/**
 * Resolves how a scene transitions into the next one.
 *
 * The scene's `transition_out.type` is lower-cased and looked up in
 * XFADE_BY_TYPE. A recognized type yields the xfade name plus the default
 * transition duration; anything else (missing, unrecognized, or the last scene)
 * means a hard cut.
 *
 * @param {{ transition_out?: { type?: string } }} scene
 * @param {boolean} isLast - true for the final scene, which has nothing to cut to
 * @returns {{ xfade: string, dur: number } | null} the crossfade to apply, or null for a hard cut
 */
function sceneOutTransition(scene, isLast) {
  if (isLast) return null;
  const type = scene.transition_out?.type?.toLowerCase();
  // `type` is free-form blueprint text. Require an OWN key so values such as
  // "constructor" or "__proto__" cannot resolve to inherited Object members and
  // be emitted as a bogus xfade.
  if (!type || !Object.hasOwn(XFADE_BY_TYPE, type)) return null;
  const xfade = XFADE_BY_TYPE[type];
  return xfade ? { xfade, dur: TRANSITION_DEFAULT_S } : null;
}
|
|
8833
|
+
/**
 * Gathers the facts deriveShootMode needs for one scene and returns its verdict.
 *
 * A scene counts as "talking" when it has a native turn, or when on-camera
 * dialogue is enabled and at least one non-blank dialogue line belongs to an
 * on-camera speaker (a missing speaker is treated as "voiceover").
 *
 * @param {{ shoot_mode?: string, dialogue?: Array<{ line?: string, speaker?: string }> }} scene
 * @param {Array<{ type: string }>} present - slots present in the scene
 * @param {unknown} nativeTurn - the scene's native dialogue turn, if any
 * @param {boolean} cameraOn - whether on-camera dialogue is enabled
 * @param {Set<string>} casts - known cast ids
 * @returns {string} the shoot mode chosen by deriveShootMode
 */
function sceneShootMode(scene, present, nativeTurn, cameraOn, casts) {
  const kindOf = (slot) => slot.type.toLowerCase();
  const speaksOnCamera = (d) => d.line?.trim() && isOnCameraSpeaker(d.speaker ?? "voiceover", casts, cameraOn);
  const talking = Boolean(nativeTurn) || (cameraOn && (scene.dialogue ?? []).some(speaksOnCamera));
  const hasPerson = present.some((slot) => {
    const kind = kindOf(slot);
    return kind === "person" || kind === "animal";
  });
  const hasProduct = present.some((slot) => kindOf(slot) === "product");
  return deriveShootMode({ explicit: scene.shoot_mode, talking, hasPerson, hasProduct });
}
|
|
8844
|
+
/**
 * Emits the nodes and timeline track that carry a scene clip's own audio.
 *
 * - With a native turn: extract the clip's audio, convert it to the turn's
 *   selected voice, and add the converted audio as a "vo" track starting at the
 *   turn's start time.
 * - Otherwise, for an ambient b-roll scene: extract the clip's audio and add it
 *   as an "ambient" track at AMBIENT_BED_GAIN_DB, starting at the scene's start
 *   (0 when the scene has none).
 * - Otherwise: nothing is emitted.
 *
 * Mutates `nodes` and `voTracks` by pushing onto them.
 *
 * @param {number} i - scene index, used to build node ids
 * @param {{ start_s?: number }} scene
 * @param {{ voiceNode: string, start_s: number } | undefined} nativeTurn
 * @param {boolean} ambientBroll - whether this scene is an ambient b-roll beat
 * @param {{ dur: number, speech: number, genDur: number }} lengths - scene, speech and generated-clip durations
 * @param {Array<object>} nodes - node list being assembled
 * @param {Array<object>} voTracks - audio track list being assembled
 * @returns {void}
 */
function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks) {
  const clipRef = `$ref:s${i}_clip.video`;

  if (nativeTurn) {
    // Keep the longer of scene/speech length, capped at the generated clip length.
    const extractLen = Math.min(Math.max(lengths.dur, lengths.speech), lengths.genDur);
    const extractId = `s${i}_voextract`;
    const convertId = `s${i}_voconv`;
    const extractNode = {
      id: extractId,
      type: "ffmpeg",
      inputs: { clip: clipRef },
      params: { args: audioExtractArgs(extractLen), outputs: { audio: { kind: "audio", ext: "mp3" } } }
    };
    const convertNode = {
      id: convertId,
      type: "audio_voice_convert",
      inputs: { audio: `$ref:${extractId}.audio`, voice_ref: `$ref:${nativeTurn.voiceNode}.voice_id` },
      params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
    };
    nodes.push(extractNode);
    nodes.push(convertNode);
    const startS = nativeTurn.start_s;
    voTracks.push({
      slot: convertId,
      ref: `$ref:${convertId}.audio`,
      start_s: startS,
      end_s: startS + extractLen,
      kind: "vo"
    });
    return;
  }

  if (!ambientBroll) {
    return;
  }

  const ambientId = `s${i}_ambient`;
  const ambientStart = scene.start_s ?? 0;
  nodes.push({
    id: ambientId,
    type: "ffmpeg",
    inputs: { clip: clipRef },
    params: { args: audioExtractArgs(lengths.dur), outputs: { audio: { kind: "audio", ext: "mp3" } } }
  });
  voTracks.push({
    slot: ambientId,
    ref: `$ref:${ambientId}.audio`,
    start_s: ambientStart,
    end_s: ambientStart + lengths.dur,
    gain_db: AMBIENT_BED_GAIN_DB,
    kind: "ambient"
  });
}
|
|
8592
8884
|
function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
|
|
8593
8885
|
const ar = aspectRatioParam(blueprint);
|
|
8594
8886
|
const reuse = opts.frames === "reuse";
|
|
8595
|
-
const
|
|
8887
|
+
const clips = [];
|
|
8888
|
+
const voTracks = [];
|
|
8889
|
+
const lastIndex = blueprint.scenes.length - 1;
|
|
8890
|
+
const cameraOn = onCameraDialogue(blueprint);
|
|
8891
|
+
const casts = castIdSet(blueprint);
|
|
8596
8892
|
blueprint.scenes.forEach((scene, i) => {
|
|
8597
|
-
const
|
|
8893
|
+
const nativeTurn = (sceneTurns.get(i) ?? []).find((t) => t.native);
|
|
8894
|
+
const present = slotsForScene(slots, i);
|
|
8895
|
+
const mode = sceneShootMode(scene, present, nativeTurn, cameraOn, casts);
|
|
8896
|
+
const ambientBroll = Boolean(opts.ambient) && !nativeTurn && mode !== "ugc_selfie";
|
|
8897
|
+
const ctx = { sceneIndex: i, ar, reuse, imageModel: opts.imageModel, shootMode: mode };
|
|
8598
8898
|
const firstFrame = buildFrameRef(
|
|
8599
8899
|
"start",
|
|
8600
8900
|
scene.start_frame_asset?.url,
|
|
@@ -8612,10 +8912,22 @@ function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
|
|
|
8612
8912
|
nodes
|
|
8613
8913
|
);
|
|
8614
8914
|
const dur = sceneDurationS(scene);
|
|
8915
|
+
let out = sceneOutTransition(scene, i === lastIndex);
|
|
8916
|
+
let trimTarget = dur + (out?.dur ?? 0);
|
|
8917
|
+
if (out && ceilToSeedance(trimTarget) < trimTarget) {
|
|
8918
|
+
out = null;
|
|
8919
|
+
trimTarget = dur;
|
|
8920
|
+
}
|
|
8921
|
+
const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
|
|
8922
|
+
const genDur = ceilToSeedance(Math.max(trimTarget, speech));
|
|
8615
8923
|
const clipParams = {
|
|
8616
8924
|
model: opts.videoModel,
|
|
8617
|
-
prompt: buildSeedancePrompt(scene, i,
|
|
8618
|
-
duration:
|
|
8925
|
+
prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
|
|
8926
|
+
duration: genDur,
|
|
8927
|
+
// Native talking scene → Seedance generates the spoken audio + lip-sync;
|
|
8928
|
+
// an opt-in ambient b-roll beat generates diegetic ambient only; otherwise the
|
|
8929
|
+
// clip is silent and audio comes from the tts/music timeline.
|
|
8930
|
+
generate_audio: Boolean(nativeTurn) || ambientBroll
|
|
8619
8931
|
};
|
|
8620
8932
|
if (ar) clipParams.aspect_ratio = ar;
|
|
8621
8933
|
nodes.push({
|
|
@@ -8624,31 +8936,21 @@ function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
|
|
|
8624
8936
|
inputs: { first_frame: firstFrame, last_frame: lastFrame },
|
|
8625
8937
|
params: clipParams
|
|
8626
8938
|
});
|
|
8627
|
-
|
|
8628
|
-
const
|
|
8629
|
-
|
|
8630
|
-
|
|
8631
|
-
nodes.push({
|
|
8632
|
-
id: `s${i}_lipsync`,
|
|
8633
|
-
type: "video_lipsync",
|
|
8634
|
-
inputs: { video: base, audio: solo.audioRef },
|
|
8635
|
-
params: { model: FIXED_LIPSYNC_MODEL }
|
|
8636
|
-
});
|
|
8637
|
-
base = `$ref:s${i}_lipsync.video`;
|
|
8638
|
-
}
|
|
8639
|
-
if (ceilToSeedance(dur) === dur) {
|
|
8640
|
-
clipRefs.push(base);
|
|
8939
|
+
emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, { dur, speech, genDur }, nodes, voTracks);
|
|
8940
|
+
const base = `$ref:s${i}_clip.video`;
|
|
8941
|
+
if (genDur === trimTarget) {
|
|
8942
|
+
clips.push({ ref: base, scene_s: dur, out });
|
|
8641
8943
|
} else {
|
|
8642
8944
|
nodes.push({
|
|
8643
8945
|
id: `s${i}_trim`,
|
|
8644
8946
|
type: "ffmpeg",
|
|
8645
8947
|
inputs: { clip: base },
|
|
8646
|
-
params: { args: trimArgs(
|
|
8948
|
+
params: { args: trimArgs(trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
|
|
8647
8949
|
});
|
|
8648
|
-
|
|
8950
|
+
clips.push({ ref: `$ref:s${i}_trim.video`, scene_s: dur, out });
|
|
8649
8951
|
}
|
|
8650
8952
|
});
|
|
8651
|
-
return
|
|
8953
|
+
return { clips, voTracks };
|
|
8652
8954
|
}
|
|
8653
8955
|
function musicBedPrompt(blueprint, musicPrompt) {
|
|
8654
8956
|
const track2 = blueprint.global?.music?.identified_track;
|
|
@@ -8664,6 +8966,33 @@ function onCameraDialogue(blueprint) {
|
|
|
8664
8966
|
return mode !== "voiceover" && mode !== "none";
|
|
8665
8967
|
}
|
|
8666
8968
|
// Gathers the truthy `id` of every entry in `blueprint.global.cast` into a Set;
// a blueprint without a cast list yields an empty Set.
var castIdSet = (blueprint) => {
  const castMembers = blueprint.global?.cast ?? [];
  const ids = castMembers.map((member) => member.id).filter(Boolean);
  return new Set(ids);
};
|
|
8969
|
+
/**
 * Ordered `[pattern, language]` pairs used by `parseVoiceTraits`.
 * Patterns are matched against an already lower-cased description, and the
 * first matching entry wins, so order matters when several languages are named.
 * The French pattern accepts both the masculine and feminine forms, with or
 * without the cedilla (`français`, `française`, `francais`, `francaise`).
 */
var LANGUAGE_WORDS = [
  [/\b(french|fran[çc]aise?)\b/, "french"],
  [/\b(spanish|espa[ñn]ol|castilian)\b/, "spanish"],
  [/\benglish\b/, "english"],
  [/\b(german|deutsch)\b/, "german"],
  [/\b(italian|italiano)\b/, "italian"],
  [/\b(portuguese|portugu[êe]s|brazilian)\b/, "portuguese"],
  [/\b(dutch|nederlands)\b/, "dutch"],
  [/\b(arabic)\b/, "arabic"],
  [/\b(japanese)\b/, "japanese"],
  [/\b(korean)\b/, "korean"],
  [/\b(hindi)\b/, "hindi"],
  [/\b(polish)\b/, "polish"]
];

/**
 * Extracts coarse voice traits from a free-text voice description.
 *
 * @param {string} description - Free-text description of the voice.
 * @returns {{gender?: "female" | "male", language?: string}} Only the traits
 *   that were detected; an empty object when nothing matched.
 */
function parseVoiceTraits(description) {
  const d = description.toLowerCase();
  const out = {};
  // Female keywords are tested first; the male test only runs when none matched.
  if (/\b(female|woman|women|girl|lady)\b/.test(d)) out.gender = "female";
  else if (/\b(male|man|men|guy|boy)\b/.test(d)) out.gender = "male";
  // First language pattern that matches wins (list order).
  const hit = LANGUAGE_WORDS.find(([re]) => re.test(d));
  if (hit) out.language = hit[1];
  return out;
}
|
|
8667
8996
|
function isOnCameraSpeaker(speaker, casts, cameraOn) {
|
|
8668
8997
|
if (!cameraOn) return false;
|
|
8669
8998
|
if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
|
|
@@ -8688,7 +9017,9 @@ function buildDialogue(blueprint, nodes) {
|
|
|
8688
9017
|
const existing = voiceNodeBySpeaker.get(speaker);
|
|
8689
9018
|
if (existing) return existing;
|
|
8690
9019
|
const id = sanitizeId2(`voice_${speaker}`, `voice_${voiceNodeBySpeaker.size}`);
|
|
8691
|
-
|
|
9020
|
+
const description = speakerDescription(speaker);
|
|
9021
|
+
const traits = parseVoiceTraits(description);
|
|
9022
|
+
nodes.push({ id, type: "voice_select", params: { description, ...traits } });
|
|
8692
9023
|
voiceNodeBySpeaker.set(speaker, id);
|
|
8693
9024
|
return id;
|
|
8694
9025
|
};
|
|
@@ -8703,44 +9034,52 @@ function buildDialogue(blueprint, nodes) {
|
|
|
8703
9034
|
if (last && last.speaker === speaker) last.lines.push(line);
|
|
8704
9035
|
else groups.push({ speaker, lines: [line] });
|
|
8705
9036
|
}
|
|
8706
|
-
const
|
|
8707
|
-
groups.forEach((group, gi) => {
|
|
9037
|
+
const shells = groups.map((group) => {
|
|
8708
9038
|
const first = group.lines[0];
|
|
8709
9039
|
const last = group.lines[group.lines.length - 1];
|
|
8710
|
-
if (!first || !last) return;
|
|
8711
|
-
|
|
8712
|
-
|
|
9040
|
+
if (!first || !last) return void 0;
|
|
9041
|
+
return {
|
|
9042
|
+
group,
|
|
9043
|
+
start: first.start_s ?? scene.start_s ?? 0,
|
|
9044
|
+
end: last.end_s ?? last.start_s ?? scene.end_s ?? first.start_s ?? scene.start_s ?? 0,
|
|
9045
|
+
onCamera: isOnCameraSpeaker(group.speaker, casts, cameraOn)
|
|
9046
|
+
};
|
|
9047
|
+
}).filter((s) => Boolean(s));
|
|
9048
|
+
const onCamCount = shells.filter((s) => s.onCamera).length;
|
|
9049
|
+
const list = [];
|
|
9050
|
+
shells.forEach((shell, gi) => {
|
|
9051
|
+
const { group, start, end, onCamera } = shell;
|
|
8713
9052
|
const voiceNode = ensureVoiceNode(group.speaker);
|
|
8714
|
-
|
|
8715
|
-
|
|
8716
|
-
let n = 2;
|
|
8717
|
-
while (usedVoIds.has(`${id}_${n}`)) n++;
|
|
8718
|
-
id = `${id}_${n}`;
|
|
8719
|
-
}
|
|
8720
|
-
usedVoIds.add(id);
|
|
8721
|
-
nodes.push({
|
|
8722
|
-
id,
|
|
8723
|
-
type: "tts",
|
|
8724
|
-
inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
|
|
8725
|
-
// Lines join with a space; each keeps its terminal punctuation so eleven_v3
|
|
8726
|
-
// reads the sentence boundaries (and their pauses) within the one turn.
|
|
8727
|
-
params: {
|
|
8728
|
-
model: FIXED_TTS_MODEL,
|
|
8729
|
-
text: group.lines.map((l) => l.line.trim()).join(" "),
|
|
8730
|
-
voice: "{{voice_ref}}"
|
|
8731
|
-
}
|
|
8732
|
-
});
|
|
9053
|
+
const text = group.lines.map((l) => l.line.trim()).join(" ");
|
|
9054
|
+
const native = onCamera && onCamCount === 1;
|
|
8733
9055
|
const turn = {
|
|
8734
9056
|
sceneIndex,
|
|
8735
9057
|
speaker: group.speaker,
|
|
8736
|
-
onCamera: isOnCameraSpeaker(group.speaker, casts, cameraOn),
|
|
8737
9058
|
start_s: start,
|
|
8738
9059
|
end_s: end,
|
|
8739
|
-
|
|
8740
|
-
|
|
9060
|
+
text,
|
|
9061
|
+
voiceNode,
|
|
9062
|
+
native
|
|
8741
9063
|
};
|
|
9064
|
+
if (!native) {
|
|
9065
|
+
let id = sanitizeId2(`vo_s${sceneIndex}_${group.speaker}`, `vo_${sceneIndex}_${gi}`);
|
|
9066
|
+
if (usedVoIds.has(id)) {
|
|
9067
|
+
let n = 2;
|
|
9068
|
+
while (usedVoIds.has(`${id}_${n}`)) n++;
|
|
9069
|
+
id = `${id}_${n}`;
|
|
9070
|
+
}
|
|
9071
|
+
usedVoIds.add(id);
|
|
9072
|
+
nodes.push({
|
|
9073
|
+
id,
|
|
9074
|
+
type: "tts",
|
|
9075
|
+
inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
|
|
9076
|
+
params: { model: FIXED_TTS_MODEL, text, voice: "{{voice_ref}}" }
|
|
9077
|
+
});
|
|
9078
|
+
turn.ttsId = id;
|
|
9079
|
+
const audioRef = `$ref:${id}.audio`;
|
|
9080
|
+
tracks.push({ slot: id, ref: audioRef, start_s: start, end_s: end, kind: "vo" });
|
|
9081
|
+
}
|
|
8742
9082
|
list.push(turn);
|
|
8743
|
-
tracks.push({ slot: id, ref: turn.audioRef, start_s: start, end_s: end, kind: "vo" });
|
|
8744
9083
|
});
|
|
8745
9084
|
sceneTurns.set(sceneIndex, list);
|
|
8746
9085
|
});
|
|
@@ -8766,14 +9105,22 @@ function buildSfxMusic(blueprint, nodes) {
|
|
|
8766
9105
|
});
|
|
8767
9106
|
const musicPrompt = blueprint.global?.music?.music_prompt;
|
|
8768
9107
|
if (musicPrompt) {
|
|
8769
|
-
const
|
|
9108
|
+
const total = blueprint.source?.duration_s ?? lastSceneEnd(blueprint);
|
|
9109
|
+
const startAt = Math.min(Math.max(blueprint.global?.music?.starts_at_s ?? 0, 0), Math.max(total - 0.5, 0));
|
|
9110
|
+
const totalMs = Math.round((total - startAt) * 1e3);
|
|
8770
9111
|
const musicMs = Math.min(Math.max(totalMs, 3e3), ELEVENLABS_MAX_MUSIC_LENGTH_MS);
|
|
8771
9112
|
nodes.push({
|
|
8772
9113
|
id: "music_bed",
|
|
8773
9114
|
type: "music",
|
|
8774
9115
|
params: { model: FIXED_MUSIC_MODEL, prompt: musicBedPrompt(blueprint, musicPrompt), music_length_ms: musicMs }
|
|
8775
9116
|
});
|
|
8776
|
-
tracks.push({
|
|
9117
|
+
tracks.push({
|
|
9118
|
+
slot: "music",
|
|
9119
|
+
ref: "$ref:music_bed.audio",
|
|
9120
|
+
start_s: startAt,
|
|
9121
|
+
gain_db: MUSIC_BED_GAIN_DB,
|
|
9122
|
+
kind: "music"
|
|
9123
|
+
});
|
|
8777
9124
|
}
|
|
8778
9125
|
return tracks;
|
|
8779
9126
|
}
|
|
@@ -8823,14 +9170,29 @@ function overlayElement(ov, sceneStart) {
|
|
|
8823
9170
|
const detail = ov.animation_detail ? ` data-anim-detail="${escapeHtml(ov.animation_detail)}"` : "";
|
|
8824
9171
|
return `<div class="ov ${positionClass(ov.position)}" data-start="${at}" data-dur="${dur}"${role}${anim}${detail}>${escapeHtml(ov.text.trim())}</div>`;
|
|
8825
9172
|
}
|
|
9173
|
+
/**
 * Returns the `baker images …` command hint suggested for sourcing a floating
 * element, chosen by the element's lower-cased `kind`.
 *
 * @param {object} fe - Floating element; reads `kind`, `brand_name`,
 *   `what_it_represents` and `description`.
 * @returns {string} The hint text. Logos get a fixed hint; every other kind
 *   quotes the first truthy of brand_name / what_it_represents / description /
 *   kind, falling back to "element".
 */
function sourceHint(fe) {
  const subject = fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element";
  const kind = (fe.kind ?? "").toLowerCase();
  if (kind === "logo") {
    return "baker images logo <domain> (or baker images library)";
  }
  if (kind === "emoji" || kind === "sticker") {
    return `baker images sticker "${subject}" (or baker images gif)`;
  }
  if (kind === "product_cutout") {
    return `baker images library "${subject}" (the client's own product)`;
  }
  return `baker images icon "${subject}"`;
}
|
|
8826
9187
|
/**
 * Renders a floating element as a two-line, commented-out `<img>` stub.
 * The first line opens an HTML comment describing the element (kind, label,
 * timing, position class) and the sourcing hint; the second line holds the
 * `<img>` tag and closes the comment.
 *
 * @param {object} fe - Floating element from the blueprint.
 * @param {number} sceneStart - Start time used when `fe.appears_at_s` is nullish.
 * @returns {string} The two stub lines joined with a newline.
 */
function floatingStub(fe, sceneStart) {
  const startAt = fe.appears_at_s ?? sceneStart;
  // 2.5 is the default duration when the element does not carry one.
  const showFor = fe.duration_s ?? 2.5;
  const kindText = commentSafe(fe.kind ?? "element");
  const labelText = commentSafe(fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element");
  const hintText = commentSafe(sourceHint(fe));
  // File-name slug: lower-cased kind with non-alphanumeric runs collapsed to "-"
  // and leading/trailing dashes stripped; "element" if nothing is left.
  const dashed = (fe.kind ?? "element").toLowerCase().replace(/[^a-z0-9]+/g, "-");
  const slug = dashed.replace(/^-+|-+$/g, "") || "element";
  const commentLine = `<!-- ${kindText}: ${labelText} @ ${startAt}s for ${showFor}s (${positionClass(fe.position)}). Source a real asset: ${hintText} \u2014 drop it in this dir and uncomment:`;
  const imgLine = `<img class="ov ${positionClass(fe.position)}" src="your-${slug}.png" data-start="${startAt}" data-dur="${showFor}" alt="" /> -->`;
  return [commentLine, imgLine].join("\n");
}
|
|
@@ -8871,6 +9233,52 @@ function concatArgs(count) {
|
|
|
8871
9233
|
}
|
|
8872
9234
|
return [...inputs, "-filter_complex", `${labels}concat=n=${count}:v=1:a=0[v]`, "-map", "[v]", "{{out.video}}"];
|
|
8873
9235
|
}
|
|
9236
|
+
/**
 * Length of footage a clip contributes as an ffmpeg input: its scene length
 * plus the extra tail (`out.dur`) reserved for its outgoing transition, if any.
 *
 * @param {{scene_s: number, out?: {dur: number}}} c - Clip descriptor.
 * @returns {number} `scene_s` plus `out.dur` (0 when there is no `out`).
 */
function clipInputLen(c) {
  return c.scene_s + (c.out?.dur ?? 0);
}

/**
 * Builds the ffmpeg argument list that joins `clips` into one `[v]` stream.
 * Every input is first normalized (pixel format, fps, SAR, timebase). Then each
 * adjacent pair is joined with `xfade` when the left clip carries an `out`
 * transition, or with a two-input `concat` otherwise.
 *
 * @param {Array<{scene_s: number, out?: {xfade: string, dur: number}}>} clips
 *   Clips in playback order; `clips[k].out` describes the join between clip k
 *   and clip k + 1.
 * @returns {string[]} ffmpeg args: inputs, `-filter_complex`, `-map [v]`, output.
 * @throws {TypeError} When `clips` is empty.
 */
function xfadeSpineArgs(clips) {
  const n = clips.length;
  if (n === 0) {
    throw new TypeError("xfadeSpineArgs: at least one clip is required");
  }
  const inputs = [];
  const filt = [];
  for (let i = 0; i < n; i++) {
    inputs.push("-i", `{{in.c${i}}}`);
    // A lone clip has no join step to produce the final `[v]` label, so its
    // normalized stream takes that label directly.
    const label = n === 1 ? "v" : `c${i}`;
    filt.push(`[${i}:v]format=yuv420p,fps=30,setsar=1,settb=AVTB[${label}]`);
  }
  let cur = "c0";
  // Running length of the stream accumulated so far.
  let accLen = clipInputLen(clips[0]);
  for (let k = 0; k < n - 1; k++) {
    const join = clips[k].out;
    const next = `c${k + 1}`;
    // The last join writes the final `[v]` label; earlier joins write `j<k+1>`.
    const out = k === n - 2 ? "v" : `j${k + 1}`;
    if (join) {
      // The transition starts `dur` before the end of the accumulated stream
      // (never before 0); the two streams overlap for `dur`.
      const offset = Math.max(0, accLen - join.dur);
      filt.push(
        `[${cur}][${next}]xfade=transition=${join.xfade}:duration=${join.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
      );
      accLen = accLen - join.dur + clipInputLen(clips[k + 1]);
    } else {
      filt.push(`[${cur}][${next}]concat=n=2:v=1[${out}]`);
      accLen += clipInputLen(clips[k + 1]);
    }
    cur = out;
  }
  return [...inputs, "-filter_complex", filt.join(";"), "-map", "[v]", "{{out.video}}"];
}
|
|
9267
|
+
/**
 * Appends the `spine` ffmpeg node that joins every scene clip into one video.
 * Uses the xfade argument builder when there is more than one clip and at
 * least one clip carries an `out` transition; otherwise the plain concat
 * argument builder.
 *
 * @param {Array<{ref: string, out?: object}>} clips - Clips in playback order.
 * @param {object[]} nodes - Node list; the spine node is pushed onto it.
 * @returns {string} Reference to the spine node's video output.
 */
function buildSpine(clips, nodes) {
  // Inputs are keyed c0, c1, … in clip order.
  const inputs = clips.reduce((byKey, clip, idx) => {
    byKey[`c${idx}`] = clip.ref;
    return byKey;
  }, {});
  const needsXfade = clips.length > 1 && clips.some((clip) => clip.out);
  const args = needsXfade ? xfadeSpineArgs(clips) : concatArgs(clips.length);
  const spineNode = {
    id: "spine",
    type: "ffmpeg",
    inputs,
    params: { args, outputs: { video: { kind: "video", ext: "mp4" } } }
  };
  nodes.push(spineNode);
  return "$ref:spine.video";
}
|
|
8874
9282
|
function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
8875
9283
|
const blueprint = VideoBlueprint.parse(input);
|
|
8876
9284
|
const elements = RecurringElements.parse(elementsInput);
|
|
@@ -8888,19 +9296,11 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
8888
9296
|
params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
|
|
8889
9297
|
});
|
|
8890
9298
|
});
|
|
8891
|
-
|
|
8892
|
-
const
|
|
8893
|
-
const
|
|
8894
|
-
|
|
8895
|
-
|
|
8896
|
-
});
|
|
8897
|
-
nodes.push({
|
|
8898
|
-
id: "spine",
|
|
8899
|
-
type: "ffmpeg",
|
|
8900
|
-
inputs: concatInputs,
|
|
8901
|
-
params: { args: concatArgs(clipRefs.length), outputs: { video: { kind: "video", ext: "mp4" } } }
|
|
8902
|
-
});
|
|
8903
|
-
let videoRef = "$ref:spine.video";
|
|
9299
|
+
if (opts.actorSheets) applyActorSheets(slots, nodes);
|
|
9300
|
+
const { tracks: ttsTracks, sceneTurns } = buildDialogue(blueprint, nodes);
|
|
9301
|
+
const { clips, voTracks: nativeVoTracks } = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
|
|
9302
|
+
const voTracks = [...ttsTracks, ...nativeVoTracks];
|
|
9303
|
+
let videoRef = buildSpine(clips, nodes);
|
|
8904
9304
|
let videoNode = "spine";
|
|
8905
9305
|
const overlays = blueprint.scenes.flatMap((s) => s.overlays ?? []);
|
|
8906
9306
|
const floating = blueprint.scenes.flatMap((s) => s.floating_elements ?? []);
|
|
@@ -8972,7 +9372,7 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
8972
9372
|
metadata: {
|
|
8973
9373
|
name: "video reproduction",
|
|
8974
9374
|
description: VIDEO_GUIDE,
|
|
8975
|
-
todo: buildVideoTodo(videoReport(input, elementsInput), overlays.length, floating.length, opts),
|
|
9375
|
+
todo: buildVideoTodo(videoReport(input, elementsInput), overlays.length, floating.length, opts, blueprint),
|
|
8976
9376
|
// The timing plan `baker canvas validate` checks before any billed render:
|
|
8977
9377
|
// sequenced voiceover turns (no overlap), audio ≈ video length, and which
|
|
8978
9378
|
// scenes must be lip-synced.
|
|
@@ -8987,60 +9387,162 @@ function buildVideoMeta(blueprint, sceneTurns) {
|
|
|
8987
9387
|
const talking_scenes = [];
|
|
8988
9388
|
for (const [scene, turns] of [...sceneTurns.entries()].sort((a, b) => a[0] - b[0])) {
|
|
8989
9389
|
for (const t of turns) {
|
|
8990
|
-
vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
|
|
8991
|
-
}
|
|
8992
|
-
|
|
8993
|
-
|
|
9390
|
+
if (t.ttsId) vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
|
|
9391
|
+
}
|
|
9392
|
+
const nativeTurn = turns.find((t) => t.native);
|
|
9393
|
+
if (nativeTurn) {
|
|
9394
|
+
const sceneObj = blueprint.scenes[scene];
|
|
9395
|
+
talking_scenes.push({
|
|
9396
|
+
scene,
|
|
9397
|
+
voice_convert_node: `s${scene}_voconv`,
|
|
9398
|
+
scene_s: sceneObj ? Math.round(sceneDurationS(sceneObj) * 100) / 100 : void 0,
|
|
9399
|
+
est_speech_s: Math.round(estSpeechS(nativeTurn.text) * 100) / 100
|
|
9400
|
+
});
|
|
8994
9401
|
}
|
|
8995
9402
|
}
|
|
8996
9403
|
return {
|
|
8997
9404
|
duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
|
|
8998
9405
|
vo_segments,
|
|
8999
|
-
talking_scenes
|
|
9406
|
+
talking_scenes,
|
|
9407
|
+
motion_board: buildMotionBoard(blueprint, sceneTurns)
|
|
9000
9408
|
};
|
|
9001
9409
|
}
|
|
9410
|
+
/**
 * Builds the per-scene "motion board": for each scene its time window, the
 * two storyboard frame ids, the spoken text of its turns, and the overlays and
 * floating elements scheduled in it, sorted by start time.
 *
 * @param {object} blueprint - Parsed blueprint; reads `blueprint.scenes`.
 * @param {Map<number, Array<{text?: string}>>} sceneTurns - Dialogue turns
 *   keyed by scene index.
 * @returns {object[]} One entry per scene with `scene`, `role`, `window_s`,
 *   `storyboard_frames`, `spoken` and `graphics`.
 */
function buildMotionBoard(blueprint, sceneTurns) {
  const toHundredths = (value) => Math.round(value * 100) / 100;
  // End of the previous scene; used as the start when a scene has no start_s.
  let nextStart = 0;
  return blueprint.scenes.map((scene, sceneIndex) => {
    const start_s = scene.start_s ?? nextStart;
    const end_s = scene.end_s ?? start_s + sceneDurationS(scene);
    nextStart = end_s;

    // Non-empty turn texts joined with a space; null when nothing is spoken.
    const turnTexts = (sceneTurns.get(sceneIndex) ?? []).map((turn) => turn.text?.trim());
    const spoken = turnTexts.filter(Boolean).join(" ") || null;

    // Lists that fail schema validation are treated as empty.
    const parsedOverlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
    const parsedFloats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
    const overlayItems = parsedOverlays.success ? parsedOverlays.data : [];
    const floatItems = parsedFloats.success ? parsedFloats.data : [];

    // Overlays without text are dropped; timing and position fall back to the
    // scene start, 2.5 and "bottom_center".
    const textGraphics = overlayItems
      .filter((ov) => ov.text?.trim())
      .map((ov) => ({
        kind: "text",
        at_s: toHundredths(ov.appears_at_s ?? start_s),
        dur_s: toHundredths(ov.duration_s ?? 2.5),
        position: ov.position ?? "bottom_center",
        text: ov.text?.trim()
      }));
    const floatGraphics = floatItems.map((fe) => ({
      kind: "graphic",
      at_s: toHundredths(fe.appears_at_s ?? start_s),
      dur_s: toHundredths(fe.duration_s ?? 2.5),
      position: fe.position ?? "bottom_center",
      label: fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element"
    }));
    const graphics = [...textGraphics, ...floatGraphics].sort((a, b) => a.at_s - b.at_s);

    return {
      scene: sceneIndex,
      role: scene.narrative_role?.trim() || inferNarrativeRole(sceneIndex, blueprint.scenes.length),
      window_s: [toHundredths(start_s), toHundredths(end_s)],
      storyboard_frames: [`s${sceneIndex}_start`, `s${sceneIndex}_end`],
      spoken,
      graphics
    };
  });
}
|
|
9002
9446
|
var VIDEO_GUIDE = [
|
|
9003
|
-
"Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192
|
|
9447
|
+
"Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192 concatenated. Talking heads are voiced NATIVELY by Seedance (lips+voice generated together) then re-voiced to one brand voice; off-camera narration is sequenced tts. On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voiceover + SFX + a ducked music bed, normalized stereo. It is a DRAFT: edit it, supply the real assets, then validate and run.",
|
|
9004
9448
|
"",
|
|
9005
9449
|
"WHAT TO DO NEXT:",
|
|
9450
|
+
"0. RE-CRAFT THE SCRIPT FIRST (don't clone). This reference already won in-market, but copying a video is much harder than a static: the hook is targeting and may not transfer, and the message must become TRUE for our brand. Work the `metadata.todo.script_recraft` checklist \u2014 for each scene judge its role (hook/body/CTA), decide keep/cut/reorder/replace, and re-author every line for OUR customer's pain + OUR offer. See `references/script-craft.md` (hook/body/CTA framework) and the `meta-ads-playbook` skill. Most of the work lives here.",
|
|
9006
9451
|
"1. Edit each frame's prompt IN PLACE. Every `s<i>_start` / `s<i>_end` node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want.",
|
|
9452
|
+
"1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. The boundary frames ARE your storyboard; `metadata.video.motion_board` lays out each scene's frames, time window, spoken line, and the graphics scheduled in it. Lock the frames + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
|
|
9007
9453
|
"2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent. `baker canvas run` REFUSES to start until every `[TODO]` slot holds a real source \u2014 so this is mandatory, not optional.",
|
|
9008
|
-
"3.
|
|
9009
|
-
"4.
|
|
9010
|
-
"5. Overlays are REAL HTML you paint. Open `video-overlay-composition/index.html`: the reference's overlays are seeded inside `#overlay-root` as plain elements (text + a `.pos-*` class + `data-start`/`data-dur`). Restyle the CSS freely \u2014 build lower-thirds, a ticker, whatever the look needs \u2014 and replace a logo placeholder with a real `<img>` you drop in that dir. The runtime only shows/hides by timestamp; it makes no styling decisions. Drop `brand-bold.otf` / `brand-regular.otf` there for on-brand type.",
|
|
9011
|
-
"6. `baker canvas validate` (proves audio
|
|
9454
|
+
"3. Talking heads are NATIVE: a scene with one on-camera speaker is voiced by Seedance itself (the line is in the clip's prompt + `generate_audio`), so lips and voice are generated together \u2014 no separate tts, no post-hoc lip-sync. Edit the line in the scene's `s<i>_clip` prompt to re-author the words TRUE for your brand. Off-camera narration scenes still use a sequenced `tts` per turn.",
|
|
9455
|
+
"4. Voice consistency: every native talking clip's audio is re-voiced to ONE brand voice via `audio_voice_convert` (timing preserved \u2192 lips stay matched). Confirm the `voice_select` casting (one per speaker) \u2014 its `voice_id` is the brand voice; set its gender/language so the voice matches the creator.",
|
|
9456
|
+
"5. Overlays are REAL HTML you paint. Open `video-overlay-composition/index.html`: the reference's overlays are seeded inside `#overlay-root` as plain elements (text + a `.pos-*` class + `data-start`/`data-dur`). Restyle the CSS freely \u2014 build lower-thirds, a ticker, whatever the look needs \u2014 and replace a logo placeholder with a real `<img>` you source (`baker images icon/sticker/gif/logo`) and drop in that dir. The runtime only shows/hides by timestamp; it makes no styling decisions. Drop `brand-bold.otf` / `brand-regular.otf` there for on-brand type.",
|
|
9457
|
+
"6. `baker canvas validate` (proves native-audio + timing for free) then `baker canvas run` (generates many billed image/video/audio assets \u2014 not free).",
|
|
9012
9458
|
"",
|
|
9013
|
-
"
|
|
9459
|
+
"CRAFT \u2014 raises every clip's realism (one-liners; full rationale in references/video-craft.md):",
|
|
9460
|
+
"- Iterate cheaply, do NOT brute-force takes: credits are scarce. Strong inputs (locked reference + a precise move-by-move prompt) make 1\u20132 takes land. When a take misses, change the SMALLEST thing and re-run \u2014 the cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
|
|
9461
|
+
"- Keep clips SHORT (trust the scene-length trim \u2014 don't pad) and LOCK THE CAMERA: unmotivated AI drift is the top tell.",
|
|
9462
|
+
"- One generation = one speaking voice \u2014 Seedance can't voice two on-camera speakers in one clip; split a two-speaker beat into separate scenes (or one stays silent). Lip-sync follows the line verbatim; emotion in [brackets].",
|
|
9463
|
+
"- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it). Match-cut continuous action by setting scene N+1's start frame = scene N's end frame (costs no extra gens).",
|
|
9464
|
+
"- Nail the pack shot (final product hero \u2014 motivated move, matched light). Geometry drifting? drop a high-angle SCHEMATIC still as an extra reference (a map beats a paragraph). Before/after (dry\u2192wet) = two separate locked references, not words mid-scene.",
|
|
9465
|
+
"- The hook reads SOUND-OFF: scene 1's frame + overlay carry it in ~1s \u2014 don't bury the hook in the spoken line (meta-ads-playbook \xA739/\xA748).",
|
|
9466
|
+
"",
|
|
9467
|
+
"Tip: `prompt.json` is the deconstruction provenance + the demoted GLOBAL STYLE REFERENCE each frame reads for shared palette/cast cohesion. It is NOT the per-frame editing surface \u2014 the frame nodes are.",
|
|
9468
|
+
"Production craft: references/video-craft.md. Hook/message/script-structure layer: references/script-craft.md + the meta-ads-playbook skill."
|
|
9014
9469
|
].join("\n");
|
|
9015
|
-
function
|
|
9470
|
+
/**
 * Guesses a scene's narrative role from its position alone.
 *
 * @param {number} index - Zero-based scene index.
 * @param {number} total - Number of scenes.
 * @returns {"hook" | "cta" | "body"} "hook" for the first of several scenes,
 *   "cta" for the last scene (including a lone scene), otherwise "body".
 */
function inferNarrativeRole(index, total) {
  const isFirstOfMany = index === 0 && total > 1;
  if (isFirstOfMany) {
    return "hook";
  }
  return index === total - 1 ? "cta" : "body";
}
|
|
9475
|
+
/**
 * Builds the per-scene script re-craft checklist.
 *
 * @param {object} blueprint - Parsed blueprint; reads `blueprint.scenes`, and
 *   per scene `narrative_role` and `dialogue[].line`.
 * @returns {Array<{scene: number, role: string, original_line: string | null, recraft: string}>}
 *   One entry per scene: its role (the scene's own trimmed `narrative_role`,
 *   else a position-based guess), the reference's spoken lines joined with a
 *   space (null when there are none), and a placeholder instruction.
 */
function buildScriptRecraft(blueprint) {
  const sceneCount = blueprint.scenes.length;
  return blueprint.scenes.map((scene, sceneIndex) => {
    const role = scene.narrative_role?.trim() || inferNarrativeRole(sceneIndex, sceneCount);
    const trimmedLines = (scene.dialogue ?? []).map((entry) => entry.line?.trim());
    const spokenText = trimmedLines.filter(Boolean).join(" ");
    const original_line = spokenText || null;
    const recraft = `[RECRAFT: rewrite this ${role} for OUR brand \u2014 true claims only; do NOT render the reference's words. See references/script-craft.md + meta-ads-playbook.]`;
    return { scene: sceneIndex, role, original_line, recraft };
  });
}
|
|
9488
|
+
function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
|
|
9016
9489
|
return {
|
|
9490
|
+
recraft_the_script_first: "VIDEO IS INSPIRATION, NOT A CLONE. Before rendering, re-craft the script (hook \u2192 body \u2192 CTA) for OUR brand \u2014 the reference already won in-market, but its hook is targeting and may not transfer. Work the per-scene `script_recraft` checklist below; see references/script-craft.md + the meta-ads-playbook skill.",
|
|
9491
|
+
script_recraft: buildScriptRecraft(blueprint),
|
|
9017
9492
|
edit_frames_in_place: "Each s<i>_start / s<i>_end node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is only a shared style reference. Frames are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
|
|
9018
9493
|
frames_mode: opts.frames ?? "generate",
|
|
9494
|
+
review_storyboard_before_clips: "STORYBOARD FIRST. The per-scene boundary frames (s<i>_start / s<i>_end) ARE your storyboard \u2014 align the LOOK on them before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's frames, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
|
|
9495
|
+
motion_board: "`metadata.video.motion_board` maps which overlay/graphic fires on which spoken beat (per scene: window_s, spoken line, each graphic's at_s). Review it so every graphic lands on the word it punctuates \u2014 don't let an overlay drift off its beat. Adjust an overlay's data-start in video-overlay-composition/index.html (or appears_at_s in prompt.json) to retime it.",
|
|
9019
9496
|
assets_required: "MANDATORY: drop a real image at every el_* [TODO] ingest before running \u2014 `baker canvas run` refuses to start (and bills nothing) until each placeholder holds a real source.",
|
|
9497
|
+
sourcing: 'Resolve each el_* [TODO]: (1) the client\'s OWN assets first \u2014 `baker images library "<subject>"` (e.g. the company owner as the actor) and `baker images logo <domain>` for the brand mark; (2) otherwise search a FRESH real reference \u2014 `baker images find "<subject>" --sources pinterest` (Pinterest is photo-real/candid, best for people & sets) or generate one with the image model. el_creator_* and el_location are [SOURCE FRESH]: cast a DIFFERENT person/animal/set than the original ad \u2014 never the source\'s individual.',
|
|
9020
9498
|
recurring_elements_to_supply: report.elements,
|
|
9021
9499
|
text_strategy: "Decide per ad: text is either baked by the generated creative OR painted via the overlay HTML \u2014 not both. Default here is clean text-free frames + the HTML overlay layer (video-overlay-composition/index.html) as the single text source, which you fully control.",
|
|
9022
|
-
timeline: "Automatic: each clip is generated at >= its scene length then trimmed back to the real scene duration, so the concatenated picture stays on the same timeline as the
|
|
9500
|
+
timeline: "Automatic: each clip is generated at >= its scene length then trimmed back to the real scene duration, so the concatenated picture stays on the same timeline as the audio. You don't manage it.",
|
|
9023
9501
|
voices_to_confirm: report.dialogue.map((d) => ({
|
|
9024
9502
|
scene: d.scene,
|
|
9025
9503
|
speaker: d.speaker,
|
|
9026
9504
|
voice_description: d.voice_description,
|
|
9027
9505
|
line: d.line
|
|
9028
9506
|
})),
|
|
9029
|
-
|
|
9030
|
-
|
|
9507
|
+
talking_head_note: "NATIVE: a single-on-camera-speaker scene is voiced by Seedance itself (line in s<i>_clip prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Edit the line in the clip's prompt to re-author it.",
|
|
9508
|
+
voice_note: "Every native talking clip's audio is re-voiced to ONE brand voice via audio_voice_convert (eleven_multilingual_sts_v2), timing preserved so lips stay matched. voice_select.voice_id is that brand voice \u2014 set its gender/language to match the creator. Off-camera narration uses a sequenced tts per turn.",
|
|
9509
|
+
native_timing: "Seedance paces the spoken line to fill the clip, so each native talking clip is generated long enough for the estimated speech and its audio is kept full-length (not hard-trimmed to the visual scene) \u2014 the line is never cut mid-word; the voice may continue a beat past the visual cut (natural VO continuity). `metadata.video.talking_scenes` carries each scene's scene_s vs est_speech_s. If a rendered line still sounds clipped, the line is simply longer than the scene: shorten the line or lengthen the scene in the deconstruct.",
|
|
9510
|
+
craft: {
|
|
9511
|
+
note: "Production-craft principles that raise every clip's realism. Full rationale: references/video-craft.md (production craft); references/script-craft.md + meta-ads-playbook for the hook/message layer.",
|
|
9512
|
+
principles: [
|
|
9513
|
+
"Iterate cheaply \u2014 do NOT brute-force takes. Credits are scarce. Strong inputs (a locked reference + a precise move-by-move prompt) make 1\u20132 takes land; bad input = bad output. When a take misses, change the SMALLEST thing and re-run \u2014 the content-addressed cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
|
|
9514
|
+
"Keep clips SHORT \u2014 the longer a shot holds, the more the eye catches the AI tell. Trust the scene-length trim; don't pad.",
|
|
9515
|
+
"LOCK THE CAMERA \u2014 a first/last-frame clip holds the framing the two frames define; only move when a move is specified. Unmotivated camera drift is the top realism tell.",
|
|
9516
|
+
"One generation = one speaking voice. Seedance can't voice two on-camera speakers in one clip \u2014 split a two-speaker beat into separate scenes (or one stays silent). Fragment long dialogue at a natural pause and stitch via the shared boundary frame. Lip-sync follows the line verbatim; delivery cues go in [brackets].",
|
|
9517
|
+
"Preview cheap \u2192 finalize high-res (credit discipline). Never prompt the aspect ratio in prose ('make it vertical' \u2192 stretch artifacts) \u2014 the pipeline sets aspect_ratio as a param.",
|
|
9518
|
+
"Match-cut continuous action across a cut by reusing the boundary frame: scene N+1's start frame = scene N's end frame (a composition choice, costs no extra gens).",
|
|
9519
|
+
"Nail the PACK SHOT \u2014 the final product hero sells (motivated camera move, light matched to the rest of the ad).",
|
|
9520
|
+
"Geometry/objects drifting frame to frame? A MAP beats a paragraph \u2014 drop a high-angle schematic still (marked positions) as an extra el_*/reference rather than adding more words.",
|
|
9521
|
+
"Before/after (dry\u2192wet, skeptic\u2192believer) = two SEPARATE locked reference images \u2014 don't ask words to transform a subject mid-scene (cheaper and more reliable than re-rolling clips).",
|
|
9522
|
+
"The hook is VISUAL-FIRST: the feed plays muted, so scene 1's opening frame + its overlay text must read sound-off in ~1 second \u2014 don't bury the hook in the spoken line alone (meta-ads-playbook \xA739 visual-hooks-beat-audio, \xA748 the 1-second/feed-native rule)."
|
|
9523
|
+
]
|
|
9524
|
+
},
|
|
9525
|
+
transitions: "Scene-to-scene cuts the deconstruct flagged as fade/whip/zoom/dissolve/swipe are reproduced as an ffmpeg xfade at the boundary (everything else stays a hard cut). The overlap is consumed from extra generated footage, so the picture stays exactly on the audio timeline. To change a transition, edit the scene's `transition_out.type` in prompt.json and re-scaffold, or hand-edit the `spine` node's ffmpeg args.",
|
|
9031
9526
|
text_overlays: {
|
|
9032
9527
|
count: overlayCount,
|
|
9033
9528
|
note: "Seeded as editable HTML inside `#overlay-root` in video-overlay-composition/index.html (text + a .pos-* class + data-start/data-dur). PAINT it: restyle the CSS, build lower-thirds/tickers, drop brand-*.otf for on-brand type. The runtime only shows/hides by timestamp."
|
|
9034
9529
|
},
|
|
9035
9530
|
floating_elements: {
|
|
9036
9531
|
count: floatingCount,
|
|
9037
|
-
note: floatingCount > 0 ? "Seeded as
|
|
9532
|
+
note: floatingCount > 0 ? "Seeded as commented <img> stubs in index.html (each names the `baker images icon/sticker/gif/logo` command to source it) \u2014 source the asset, drop it in video-overlay-composition/, uncomment the <img>." : "none detected by the deconstruct \u2014 see `completeness_check`, the reference may still have icons/stickers it missed."
|
|
9533
|
+
},
|
|
9534
|
+
sound_effects: {
|
|
9535
|
+
count: report.sfx_count,
|
|
9536
|
+
note: report.sfx_count > 0 ? "Seeded as `sound_effect` nodes on `audio_mix` at their timestamps \u2014 edit the prompt or retime." : "none detected by the deconstruct \u2014 see `completeness_check`, the reference may still have sound cues it missed."
|
|
9038
9537
|
},
|
|
9039
|
-
sound_effects: { count: report.sfx_count },
|
|
9040
9538
|
music: {
|
|
9041
9539
|
present: report.has_music,
|
|
9042
|
-
note: report.has_music ? "Original bed regenerated from the deconstruct prompt (styled after the AudD-identified track when available); ducked under the voices." : "no music bed scaffolded"
|
|
9540
|
+
note: report.has_music ? "Original bed regenerated from the deconstruct prompt (styled after the AudD-identified track when available); ducked under the voices." : "no music bed scaffolded \u2014 if the reference has music, see `completeness_check`."
|
|
9043
9541
|
},
|
|
9542
|
+
// ALWAYS-ON safety net: the scaffold can only seed what the deconstruct
|
|
9543
|
+
// cataloged, and it under-detects on-image graphics + sound cues. Never trust
|
|
9544
|
+
// "none detected" — re-watch the reference and fill the gaps with the right tool.
|
|
9545
|
+
completeness_check: 'The scaffold mirrors the deconstruct\'s catalog, which UNDER-DETECTS \u2014 never trust a 0 count. Re-watch the reference frame-by-frame and add anything missing: (1) ON-IMAGE GRAPHICS not in floating_elements (dollar/coin icons, emojis, checkmarks, rating stars, price tags, arrows, progress bars, app UI) \u2192 source each with `baker images icon "<desc>"` / `baker images sticker` / `baker images gif` / `baker images logo <domain>` and add it as an <img class="ov pos-* " data-start data-dur> in video-overlay-composition/index.html (NEVER bake graphics into the frame plates). (2) SOUND CUES not in sound_effects (cha-ching/coin, whoosh, ding, pop, notification, keyboard) \u2192 add a `sound_effect` node (eleven_text_to_sound) and wire it onto `audio_mix` at its timestamp. (3) RECURRING people/animals/products/logos/sets with no el_* slot \u2192 add an `ingest` [TODO] slot and reference it from the frames they appear in. (4) Burned-in captions/text not in text_overlays \u2192 add an <img>-free <div class="ov"> in index.html. (5) ONE person playing MULTIPLE personas/wardrobes (skeptic vs believer, before vs after, two outfits) collapsed into a single el_* slot \u2192 split into one el_* slot PER look, each linked as the SAME individual via `same_as` so every outfit has its own reference image but the face/identity stays identical.',
|
|
9044
9546
|
scenes_clamped_to_15s: report.clamped_scenes,
|
|
9045
9547
|
run_warning: "`baker canvas run` generates many billed image/video/audio assets \u2014 validate first, it is not free."
|
|
9046
9548
|
};
|
|
@@ -9118,10 +9620,13 @@ List ONLY the elements worth keeping consistent across frames \u2014 the ones a
|
|
|
9118
9620
|
- a showcased product, package, card, or device the ad sells or demonstrates -> type "product"
|
|
9119
9621
|
- the advertiser brand logo/wordmark (from global.branding) -> type "logo"
|
|
9120
9622
|
- a recurring trust/rating/certification badge -> type "badge"
|
|
9623
|
+
- the dominant recording set/location the scenes share (e.g. the same living room, car interior, kitchen) -> type "location"
|
|
9121
9624
|
|
|
9122
|
-
DROP one-off background extras
|
|
9625
|
+
DROP one-off background extras and incidental props \u2014 but the shared set/location is NOT generic scenery: pin it as ONE "location" element so the room stays identical across scenes. A person in global.cast is almost always recurring. Keep at most ~8.
|
|
9123
9626
|
|
|
9124
|
-
|
|
9627
|
+
ONE PERSON, MULTIPLE LOOKS: if a single individual plays MULTIPLE personas or wardrobes (e.g. a creator as a skeptic in one outfit and a believer in another, or a before/after), emit ONE element PER look (so each outfit gets its own real reference image) and link the extra looks to the first via "same_as" \u2014 they are the SAME person, only the wardrobe/state differs, so the face must stay identical.
|
|
9628
|
+
|
|
9629
|
+
For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of EVERY scene the element is visually present in \u2014 read each scene's action_detail, its start_frame/end_frame subjects, its dialogue speaker, and its floating_elements. Be generous: if the same person narrates throughout, list every scene index. Output ONLY the JSON object.`;
|
|
9125
9630
|
async function loadAssetText2(ref, label) {
|
|
9126
9631
|
const r = ref;
|
|
9127
9632
|
if (typeof r?.path === "string") return readFile4(r.path, "utf8");
|
|
@@ -9215,6 +9720,14 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9215
9720
|
file: { type: "positional", required: true, description: "Path to the reference video" },
|
|
9216
9721
|
out: { type: "string", description: "Output canvas path (default <video-dir>/<name>.video.canvas.json)" },
|
|
9217
9722
|
frames: { type: "string", description: '"generate" (default, anchored regen) or "reuse" (wire real frames in)' },
|
|
9723
|
+
ambient: {
|
|
9724
|
+
type: "boolean",
|
|
9725
|
+
description: "Give silent b-roll scenes native diegetic ambient mixed deep under the music bed (off by default)"
|
|
9726
|
+
},
|
|
9727
|
+
"actor-sheets": {
|
|
9728
|
+
type: "boolean",
|
|
9729
|
+
description: "Lock a recast person/animal that recurs across \u22652 scenes to ONE turnaround sheet grounding every frame"
|
|
9730
|
+
},
|
|
9218
9731
|
"max-scenes": { type: "string", description: "Cap the number of scenes the deconstruct emits" },
|
|
9219
9732
|
language: { type: "string", description: "Transcript/dialogue language hint (e.g. fr, en)" },
|
|
9220
9733
|
focus: { type: "string", description: "Known provenance/emphasis to ground the deconstruct" },
|
|
@@ -9266,7 +9779,9 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
9266
9779
|
videoModel,
|
|
9267
9780
|
overlayCompositionPath: compositionDest,
|
|
9268
9781
|
blueprintPath,
|
|
9269
|
-
frames
|
|
9782
|
+
frames,
|
|
9783
|
+
ambient: Boolean(args.ambient),
|
|
9784
|
+
actorSheets: Boolean(args["actor-sheets"])
|
|
9270
9785
|
};
|
|
9271
9786
|
let canvas;
|
|
9272
9787
|
let report;
|
|
@@ -10557,7 +11072,7 @@ registerSchema({
|
|
|
10557
11072
|
query: { type: "string", description: "Search query", required: true },
|
|
10558
11073
|
sources: {
|
|
10559
11074
|
type: "string",
|
|
10560
|
-
description: "Comma-separated providers: library,magnific,google,iconify,giphy (brandfetch lives at `baker images logo`)",
|
|
11075
|
+
description: "Comma-separated providers: library,magnific,google,iconify,giphy,pinterest (brandfetch lives at `baker images logo`)",
|
|
10561
11076
|
required: false
|
|
10562
11077
|
},
|
|
10563
11078
|
limit: { type: "number", description: "Max results per group", required: false, default: 20 },
|