@koda-sl/baker-cli 0.94.0 → 0.95.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/dist/cli.js +78 -26
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -3653,7 +3653,7 @@ It then scaffolds the full pipeline like an **editing timeline**: each clip gets
|
|
|
3653
3653
|
|
|
3654
3654
|
**Montage flashes held as stills.** A rapid-cut beat shorter than ~2s with no spoken line is a **flash** — Seedance's shortest clip is 4s, so generating one (then trimming away most of it) burns credits for motion no viewer perceives. The scaffold instead **holds one keyframe as a still** for the scene length (a cheap ffmpeg loop, no billed `video_generate`), same look at a fraction of the cost. Talking/ambient beats keep a real clip (they need motion + native audio).
|
|
3655
3655
|
|
|
3656
|
-
**The phrase model (voice cut at pauses, not at visual cuts).** The voice is grouped into **phrases** — runs of continuous speech with no real pause, which may span several visual scenes. A phrase is voiced ONCE (so a sentence the deconstruct split at a visual cut never breaks mid-word): if the speaker is **shown** anywhere in the phrase it's a single Seedance clip (`s<anchor>_clip`, native lip-sync + audio) re-voiced to the brand voice; if the speaker is **never shown** it's one ElevenLabs `tts` read. The picture is then assembled **scene by scene**: a scene that shows the speaker **slices its window** out of the phrase clip (`s<i>_seg`, an ffmpeg `-ss`/`-t` cut — video and audio come from the *same* clip, so lip-sync holds), and a **b-roll cutaway** gets its own silent clip while the phrase's voice plays underneath. "Shown" is decided by the **presenter element's per-scene presence**, not just who's speaking — a scene where a cast member narrates over b-roll (their element absent) is treated as a cutaway, so the talking head never appears where the original cut away. A presenter run longer than the
|
|
3656
|
+
**The phrase model (voice cut at pauses, not at visual cuts).** The voice is grouped into **phrases** — runs of continuous speech with no real pause, which may span several visual scenes. A phrase is voiced ONCE (so a sentence the deconstruct split at a visual cut never breaks mid-word): if the speaker is **shown** anywhere in the phrase it's a single Seedance clip (`s<anchor>_clip`, native lip-sync + audio) re-voiced to the brand voice; if the speaker is **never shown** it's one ElevenLabs `tts` read. The picture is then assembled **scene by scene**: a scene that shows the speaker **slices its window** out of the phrase clip (`s<i>_seg`, an ffmpeg `-ss`/`-t` cut — video and audio come from the *same* clip, so lip-sync holds), and a **b-roll cutaway** gets its own silent clip while the phrase's voice plays underneath. "Shown" is decided by the **presenter element's per-scene presence**, not just who's speaking — a scene where a cast member narrates over b-roll (their element absent) is treated as a cutaway, so the talking head never appears where the original cut away. A presenter run longer than the **gateway-safe ~10s clip ceiling splits at a scene boundary** into contiguous takes (each its own clip + convert), so a sliced window never reads past its clip. (Seedance's *API* max is 15s, but the generation gateway frequently times out — **HTTP 524** — before it can deliver a clip longer than ~10s, so the scaffold never asks for one that long; 10s is a Seedance-allowed duration, so the split clip still snaps cleanly.) A b-roll cutaway *inside* a phrase lands at an **approximate** time (Seedance exposes no word timing) — nudge the scene boundary if it's off its beat.
|
|
3657
3657
|
|
|
3658
3658
|
**A starting point, not a locked render.** The canvas mirrors the reference's structure to give you a faithful scaffold, but `metadata.todo.full_flexibility` makes explicit that the agent has **full editing freedom**: add / delete / reorder / split / merge scenes, re-prompt any frame or motion brief, change a scene's layout (full-frame ↔ composite), or rewrite any line — the content-addressed cache re-bills only what changes, and `baker canvas validate` re-checks timing/lip-sync after any edit.
|
|
3659
3659
|
|
|
@@ -3663,7 +3663,7 @@ It then scaffolds the full pipeline like an **editing timeline**: each clip gets
|
|
|
3663
3663
|
|
|
3664
3664
|
**Same-shot lip-sync caution.** A single held shot can carry only ONE lip-synced clip (voiceover turns must not overlap, and Seedance generates one clip per shot), so when the on-camera speaker has further turns in that shot (a rapid "3000? … 4000?" with an off-camera "Plus" between), the first turn is native and the rest play as `tts` over the same clip — where the mouth no longer matches those words. This is inherent to reproducing sparse same-shot dialogue, not a wiring fault; the scaffold lists the affected scenes/lines in **`metadata.video.lip_sync_caution`** (advisory, never gated) so you can cut away to b-roll over those lines or rely on the burned-in captions that already show them.
|
|
3665
3665
|
|
|
3666
|
-
**Timing-faithful clip + extract (no overlap).** Each phrase clip is generated to its **coverage window** (the deconstruct's real scene/line timing, capped at
|
|
3666
|
+
**Timing-faithful clip + extract (no overlap).** Each phrase clip is generated to its **coverage window** (the deconstruct's real scene/line timing, capped at the gateway-safe ~10s ceiling) and its converted voice is extracted to the **spoken window** (pause to pause) — *not* padded to a word-count estimate. Padding past the window was what ran the voice the clip's whole length and overlapped the next phrase; trusting the deconstruct's timing keeps consecutive phrases back-to-back and lets Seedance pace the quoted text to fit. `metadata.video.talking_scenes` records each phrase's `scene_s` vs `est_speech_s`; on top of that the scaffold flags any scene whose estimated speech overruns its window by more than ~1.3× as **`metadata.todo.overstuffed_scenes`** (also in the stdout checklist) — a loud advisory to shorten the copy or lengthen the scene before rendering, since an over-stuffed line pushes the picture off the audio timeline. It similarly flags **`oversize_scenes`** — a single scene whose own footage exceeds the gateway-safe ~10s clip ceiling (a b-roll shot or one-shot monologue). The phrase splitter only breaks at scene boundaries, so it can't shrink a single over-long scene; its clip would 524 at the gateway, so the advisory tells you to split that scene into two before rendering.
|
|
3667
3667
|
|
|
3668
3668
|
**Timeline-accurate picture.** Seedance can't render under 4s, so each clip is generated at the smallest allowed duration ≥ the scene length and then **trimmed back to the exact scene duration** before concat. This keeps the concatenated picture on the same timeline as the absolute-timed audio — without it, short scenes balloon to 4s, the spine runs far longer than the soundtrack, and every line plays over the wrong (slowed) scene so the lips never match. Frames are also prompted as **clean text-free plates** (no baked captions/lower-thirds/tickers/logos-as-text) so the overlay layer is the single source of on-screen text.
|
|
3669
3669
|
|
|
@@ -3701,7 +3701,7 @@ baker canvas run ./reference-ad.video.canvas.json
|
|
|
3701
3701
|
|
|
3702
3702
|
Each scene is captured in a **shoot mode** — `ugc_selfie` (talking heads, the default look), `ugc_broll`, `studio_product` (pack shot), `lifestyle_cinematic`, or `screen_ui`. The scaffold derives one per scene (UGC by default; the cinematic and screen lanes are opt-in) and bakes its capture block into the frame and a camera default into the clip; override per scene with a `shoot_mode` field in `prompt.json`. Capture aesthetic + depth-of-field follow the mode (UGC stays flat; studio/lifestyle allow shallow DoF). Clips also carry **diegetic native audio** — the scene's own ambience described in the Seedance prompt, never music (the music bed is a separate, ducked track); set a scene's `ambient` field to steer it.
|
|
3703
3703
|
|
|
3704
|
-
**Automatic by default (no flags).** Every recast **base element — person, pet, product, AND location/set** — is fused into ONE rich multi-view sheet (`image_reference_sheet`, one subject per sheet, **4K**, up to 8 cells) that every frame it appears in grounds on, so the same face/pet/pack/room is rendered from a multi-angle canvas instead of a lone flat snapshot (a one-scene hero element is sheeted too). Each sheet pairs a **full turnaround** (angles, for proportions/wardrobe/layout) with tight **close-ups** so the generator is prepared for ANY framing a scene needs: a **person** gets body cells + face close-ups (front/¾/profile) and a mid-sentence speaking expression (identity pinned, natural skin — no airbrushing); an **animal** gets a body turnaround + head close-ups + an eyes/face macro; a **product** gets a turnaround + label and material detail macros; a **location/set** gets several camera angles of the same room + a key-surface detail. Generated clips are pinned to **1080p** (see `--resolution`) so the video keeps the keyframe's sharpness, and each cast frame keeps the source frame as a **composition anchor** (identity stays on the sheet) so the original framing/camera is reproduced, not re-guessed. An **app/website/chat screen** is never sent to the video model — the scaffold drops the scene to a clean talking-head and seeds a phone-mockup PIP stub to fill with a real `baker images screenshot` or brand HTML block (Seedance garbles UI and a split leaves a seam). The **music bed is instrumental** (the script is never fed to the music model — it would sing over the voice), enters only after the hook, and is **sidechain-ducked** under the voice. **Word-synced TikTok captions** are wired
|
|
3704
|
+
**Automatic by default (no flags).** Every recast **base element — person, pet, product, AND location/set** — is fused into ONE rich multi-view sheet (`image_reference_sheet`, one subject per sheet, **4K**, up to 8 cells) that every frame it appears in grounds on, so the same face/pet/pack/room is rendered from a multi-angle canvas instead of a lone flat snapshot (a one-scene hero element is sheeted too). Each sheet pairs a **full turnaround** (angles, for proportions/wardrobe/layout) with tight **close-ups** so the generator is prepared for ANY framing a scene needs: a **person** gets body cells + face close-ups (front/¾/profile) and a mid-sentence speaking expression (identity pinned, natural skin — no airbrushing); an **animal** gets a body turnaround + head close-ups + an eyes/face macro; a **product** gets a turnaround + label and material detail macros; a **location/set** gets several camera angles of the same room + a key-surface detail. Generated clips are pinned to **1080p** (see `--resolution`) so the video keeps the keyframe's sharpness, and each cast frame keeps the source frame as a **composition anchor** (identity stays on the sheet) so the original framing/camera is reproduced, not re-guessed. An **app/website/chat screen** is never sent to the video model — the scaffold drops the scene to a clean talking-head and seeds a phone-mockup PIP stub to fill with a real `baker images screenshot` or brand HTML block (Seedance garbles UI and a split leaves a seam). The **music bed is instrumental** (the script is never fed to the music model — it would sing over the voice), enters only after the hook, and is **sidechain-ducked** under the voice. **Word-synced TikTok captions** are wired whenever the ad has speech — and they are **transcribed from the rendered audio** (a `video_transcribe` of the actual voice mix), not the deconstruct's original transcript. 
This is a correctness boundary: wiring the source transcript would burn the **competitor's** words (their brand name, a claim we can't make) over the ad once the script is re-authored, whereas transcribing the generated audio can only ever show what is actually spoken, so the captions always track the re-written lines. Seeded overlays are pushed **off the subject's face** (dead-center → bottom band).
|
|
3705
3705
|
|
|
3706
3706
|
The two scaffold passes are billed (the full `video_deconstruct` is the heavy one); **running** the result then generates many image/video/audio assets and is not free. Defaults to vertical 1080×1920 overlays — copy + edit the composition for other aspect ratios. For on-brand overlay type, drop `brand-bold.otf`/`brand-regular.otf` into the copied `video-overlay-composition/` dir (wired via `@font-face`, with a system fallback). Richer transcription (punctuated words + paragraphs) is available via the deconstruct's `transcriber: "deepgram"` param when `DEEPGRAM_API_KEY` is set.
|
|
3707
3707
|
|
package/dist/cli.js
CHANGED
|
@@ -9096,7 +9096,8 @@ function buildElementSlots(elements) {
|
|
|
9096
9096
|
"spine",
|
|
9097
9097
|
"overlaid",
|
|
9098
9098
|
"captions",
|
|
9099
|
-
"
|
|
9099
|
+
"captions_premux",
|
|
9100
|
+
"captions_transcribe",
|
|
9100
9101
|
"audio_mix",
|
|
9101
9102
|
"final",
|
|
9102
9103
|
"music_bed"
|
|
@@ -9762,7 +9763,8 @@ function makePresenterPresent(slots, canonical, opts = {}) {
|
|
|
9762
9763
|
};
|
|
9763
9764
|
}
|
|
9764
9765
|
var PAUSE_GAP_S = 0.6;
|
|
9765
|
-
var
|
|
9766
|
+
var SEEDANCE_SAFE_MAX_S = SEEDANCE_DURATIONS.find((d) => d >= 10) ?? 10;
|
|
9767
|
+
var PHRASE_MAX_S = SEEDANCE_SAFE_MAX_S;
|
|
9766
9768
|
function collapseVoiceover(blueprint) {
|
|
9767
9769
|
const casts = castIdSet(blueprint);
|
|
9768
9770
|
const cameraOn = onCameraDialogue(blueprint);
|
|
@@ -10564,22 +10566,8 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
10564
10566
|
videoRef = "$ref:overlaid.video";
|
|
10565
10567
|
videoNode = "overlaid";
|
|
10566
10568
|
}
|
|
10567
|
-
if (opts.captionsCompositionPath && opts.transcriptPath) {
|
|
10568
|
-
nodes.push({
|
|
10569
|
-
id: "captions_transcript",
|
|
10570
|
-
type: "ingest",
|
|
10571
|
-
params: { source: "path", path: opts.transcriptPath, expect: "json" }
|
|
10572
|
-
});
|
|
10573
|
-
nodes.push({
|
|
10574
|
-
id: "captions",
|
|
10575
|
-
type: "hyperframe_render",
|
|
10576
|
-
inputs: { background: videoRef, transcript: "$ref:captions_transcript.asset" },
|
|
10577
|
-
params: { composition: opts.captionsCompositionPath }
|
|
10578
|
-
});
|
|
10579
|
-
videoRef = "$ref:captions.video";
|
|
10580
|
-
videoNode = "captions";
|
|
10581
|
-
}
|
|
10582
10569
|
const tracks = [...voTracks, ...buildSfxMusic(blueprint, nodes)];
|
|
10570
|
+
let audioMixRef;
|
|
10583
10571
|
if (tracks.length > 0) {
|
|
10584
10572
|
const mixInputs = {};
|
|
10585
10573
|
for (const t of tracks) mixInputs[t.slot] = t.ref;
|
|
@@ -10600,10 +10588,54 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
10600
10588
|
...duck
|
|
10601
10589
|
}
|
|
10602
10590
|
});
|
|
10591
|
+
audioMixRef = "$ref:audio_mix.audio";
|
|
10592
|
+
}
|
|
10593
|
+
if (opts.captionsCompositionPath && audioMixRef) {
|
|
10594
|
+
nodes.push({
|
|
10595
|
+
id: "captions_premux",
|
|
10596
|
+
type: "ffmpeg",
|
|
10597
|
+
inputs: { video: videoRef, audio: audioMixRef },
|
|
10598
|
+
params: {
|
|
10599
|
+
args: [
|
|
10600
|
+
"-i",
|
|
10601
|
+
"{{in.video}}",
|
|
10602
|
+
"-i",
|
|
10603
|
+
"{{in.audio}}",
|
|
10604
|
+
"-map",
|
|
10605
|
+
"0:v:0",
|
|
10606
|
+
"-map",
|
|
10607
|
+
"1:a:0",
|
|
10608
|
+
"-c:v",
|
|
10609
|
+
"copy",
|
|
10610
|
+
"-c:a",
|
|
10611
|
+
"aac",
|
|
10612
|
+
"-shortest",
|
|
10613
|
+
"{{out.video}}"
|
|
10614
|
+
],
|
|
10615
|
+
outputs: { video: { kind: "video", ext: "mp4" } }
|
|
10616
|
+
}
|
|
10617
|
+
});
|
|
10618
|
+
const captionLanguage = deriveTtsLanguageCode(blueprint);
|
|
10619
|
+
nodes.push({
|
|
10620
|
+
id: "captions_transcribe",
|
|
10621
|
+
type: "video_transcribe",
|
|
10622
|
+
inputs: { video: "$ref:captions_premux.video" },
|
|
10623
|
+
params: captionLanguage ? { language: captionLanguage } : {}
|
|
10624
|
+
});
|
|
10625
|
+
nodes.push({
|
|
10626
|
+
id: "captions",
|
|
10627
|
+
type: "hyperframe_render",
|
|
10628
|
+
inputs: { background: videoRef, transcript: "$ref:captions_transcribe.transcript" },
|
|
10629
|
+
params: { composition: opts.captionsCompositionPath }
|
|
10630
|
+
});
|
|
10631
|
+
videoRef = "$ref:captions.video";
|
|
10632
|
+
videoNode = "captions";
|
|
10633
|
+
}
|
|
10634
|
+
if (audioMixRef) {
|
|
10603
10635
|
nodes.push({
|
|
10604
10636
|
id: "final",
|
|
10605
10637
|
type: "ffmpeg",
|
|
10606
|
-
inputs: { video: videoRef, audio:
|
|
10638
|
+
inputs: { video: videoRef, audio: audioMixRef },
|
|
10607
10639
|
params: {
|
|
10608
10640
|
args: [
|
|
10609
10641
|
"-i",
|
|
@@ -10845,14 +10877,36 @@ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
|
|
|
10845
10877
|
// "none detected" — re-watch the reference and fill the gaps with the right tool.
|
|
10846
10878
|
completeness_check: 'The scaffold mirrors the deconstruct\'s catalog, which UNDER-DETECTS \u2014 never trust a 0 count. Re-watch the reference frame-by-frame and add anything missing: (1) ON-IMAGE GRAPHICS not in floating_elements (dollar/coin icons, emojis, checkmarks, rating stars, price tags, arrows, progress bars, app UI) \u2192 source each with `baker images icon "<desc>"` / `baker images sticker` / `baker images gif` / `baker images logo <domain>` and add it as an <img class="ov pos-* " data-start data-dur> in video-overlay-composition/index.html (NEVER bake graphics into the frame plates). (2) SOUND CUES not in sound_effects (cha-ching/coin, whoosh, ding, pop, notification, keyboard) \u2192 add a `sound_effect` node (eleven_text_to_sound) and wire it onto `audio_mix` at its timestamp. (3) RECURRING people/animals/products/logos/sets with no el_* slot \u2192 add an `ingest` [TODO] slot and reference it from the frames they appear in. (4) Burned-in captions/text not in text_overlays \u2192 add an <img>-free <div class="ov"> in index.html. (5) ONE person playing MULTIPLE personas/wardrobes (skeptic vs believer, before vs after, two outfits) collapsed into a single el_* slot \u2192 split into one el_* slot PER look, each linked as the SAME individual via `same_as` so every outfit has its own reference image but the face/identity stays identical.',
|
|
10847
10879
|
scenes_clamped_to_15s: report.clamped_scenes,
|
|
10880
|
+
oversize_scenes: report.oversize_scenes.length > 0 ? {
|
|
10881
|
+
scenes: report.oversize_scenes,
|
|
10882
|
+
fix: "Each listed scene's own footage is longer than the gateway-safe ~10s clip ceiling, so its clip will fail the gateway (HTTP 524) \u2014 and the scaffold can't auto-split a single scene. Split each into two scenes in prompt.json (scene N+1's start frame = scene N's end frame, so the action stays continuous) before rendering."
|
|
10883
|
+
} : "none \u2014 every scene fits within the gateway-safe clip ceiling.",
|
|
10884
|
+
overstuffed_scenes: report.overstuffed_scenes.length > 0 ? {
|
|
10885
|
+
scenes: report.overstuffed_scenes,
|
|
10886
|
+
fix: "Each listed scene's estimated speech runs well past its window \u2014 the line will overrun the scene and push the picture off the audio timeline (lips fall behind). Shorten the copy in prompt.json or lengthen the scene before rendering."
|
|
10887
|
+
} : "none \u2014 every scene's spoken length fits its window.",
|
|
10848
10888
|
run_warning: "`baker canvas run` generates many billed image/video/audio assets \u2014 validate first, it is not free."
|
|
10849
10889
|
};
|
|
10850
10890
|
}
|
|
10891
|
+
var OVERSTUFF_RATIO = 1.3;
|
|
10892
|
+
/**
 * Runs the per-scene clip advisory checks for one blueprint scene and appends
 * any findings to the caller-owned arrays in `out`.
 *
 * Three independent checks are made:
 *  - clamped: the scene's own `duration_s` (5 when null/undefined) is above 15;
 *  - oversize: the window returned by `sceneDurationS` is above
 *    `SEEDANCE_SAFE_MAX_S`;
 *  - overstuffed: the summed `estSpeechS` of every dialogue entry that has a
 *    truthy `line` is above the window multiplied by `OVERSTUFF_RATIO`.
 *
 * @param {object} scene - Blueprint scene; `duration_s` and `dialogue` are read.
 * @param {number} i - Scene index, recorded as `scene` on every advisory entry.
 * @param {{clamped: object[], oversize: object[], overstuffed: object[]}} out
 *   Result arrays, mutated in place.
 * @returns {void}
 */
function collectClipAdvisories(scene, i, out) {
  // Reported figures are rounded to two decimal places.
  const toHundredths = (value) => Math.round(value * 100) / 100;

  // Clamp advisory is keyed on the raw duration field, not the derived window.
  const rawDuration = scene.duration_s ?? 5;
  if (rawDuration > 15) {
    out.clamped.push({ scene: i, original_s: rawDuration, clip_s: snapToSeedance(rawDuration) });
  }

  const sceneWindow = sceneDurationS(scene);
  if (sceneWindow > SEEDANCE_SAFE_MAX_S) {
    out.oversize.push({ scene: i, scene_s: toHundredths(sceneWindow), clip_s: ceilToSeedance(sceneWindow) });
  }

  // Entries without a truthy `line` contribute nothing to the speech estimate.
  let spokenTotal = 0;
  for (const entry of scene.dialogue ?? []) {
    spokenTotal += entry.line ? estSpeechS(entry.line) : 0;
  }
  if (spokenTotal > sceneWindow * OVERSTUFF_RATIO) {
    out.overstuffed.push({ scene: i, scene_s: toHundredths(sceneWindow), est_speech_s: toHundredths(spokenTotal) });
  }
}
|
|
10851
10903
|
function videoReport(input, elementsInput) {
|
|
10852
10904
|
const blueprint = VideoBlueprint.parse(input);
|
|
10853
10905
|
const elements = RecurringElements.parse(elementsInput);
|
|
10854
10906
|
const dialogue = [];
|
|
10855
10907
|
const clamped = [];
|
|
10908
|
+
const oversize = [];
|
|
10909
|
+
const overstuffed = [];
|
|
10856
10910
|
let sfxCount = 0;
|
|
10857
10911
|
let overlayCount = 0;
|
|
10858
10912
|
blueprint.scenes.forEach((scene, i) => {
|
|
@@ -10868,9 +10922,7 @@ function videoReport(input, elementsInput) {
|
|
|
10868
10922
|
}
|
|
10869
10923
|
sfxCount += (scene.sfx ?? []).length;
|
|
10870
10924
|
overlayCount += (scene.overlays ?? []).length;
|
|
10871
|
-
|
|
10872
|
-
const clip = snapToSeedance(original);
|
|
10873
|
-
if (original > 15) clamped.push({ scene: i, original_s: original, clip_s: clip });
|
|
10925
|
+
collectClipAdvisories(scene, i, { clamped, oversize, overstuffed });
|
|
10874
10926
|
});
|
|
10875
10927
|
return {
|
|
10876
10928
|
scene_count: blueprint.scenes.length,
|
|
@@ -10885,6 +10937,8 @@ function videoReport(input, elementsInput) {
|
|
|
10885
10937
|
sfx_count: sfxCount,
|
|
10886
10938
|
overlay_count: overlayCount,
|
|
10887
10939
|
clamped_scenes: clamped,
|
|
10940
|
+
oversize_scenes: oversize,
|
|
10941
|
+
overstuffed_scenes: overstuffed,
|
|
10888
10942
|
has_music: Boolean(blueprint.global?.music?.music_prompt)
|
|
10889
10943
|
};
|
|
10890
10944
|
}
|
|
@@ -10950,12 +11004,9 @@ async function loadTranscriptBestEffort(ref) {
|
|
|
10950
11004
|
async function stageCaptions(outDir, transcript) {
|
|
10951
11005
|
const text = transcript?.trim();
|
|
10952
11006
|
if (!text || text === "[]") return {};
|
|
10953
|
-
const transcriptPath = path5.join(outDir, "transcript.json");
|
|
10954
|
-
await writeFile2(transcriptPath, `${text}
|
|
10955
|
-
`, "utf8");
|
|
10956
11007
|
const compositionPath = path5.join(outDir, "tiktok-captions-composition");
|
|
10957
11008
|
await cp(SHIPPED_CAPTIONS_DIR, compositionPath, { recursive: true });
|
|
10958
|
-
return { compositionPath
|
|
11009
|
+
return { compositionPath };
|
|
10959
11010
|
}
|
|
10960
11011
|
function parseElements2(raw) {
|
|
10961
11012
|
const parsed = JSON.parse(raw);
|
|
@@ -11161,7 +11212,6 @@ var scaffoldVideoCommand = defineCommand78({
|
|
|
11161
11212
|
videoModel,
|
|
11162
11213
|
overlayCompositionPath: compositionDest,
|
|
11163
11214
|
captionsCompositionPath: captions.compositionPath,
|
|
11164
|
-
transcriptPath: captions.transcriptPath,
|
|
11165
11215
|
blueprintPath,
|
|
11166
11216
|
frames,
|
|
11167
11217
|
ambient: Boolean(args.ambient),
|
|
@@ -11214,6 +11264,8 @@ var scaffoldVideoCommand = defineCommand78({
|
|
|
11214
11264
|
overlay_count: report.overlay_count,
|
|
11215
11265
|
has_music: report.has_music,
|
|
11216
11266
|
scenes_clamped_to_15s: report.clamped_scenes,
|
|
11267
|
+
oversize_scenes: report.oversize_scenes,
|
|
11268
|
+
overstuffed_scenes: report.overstuffed_scenes,
|
|
11217
11269
|
note: "Drop ONE real source image at each el_* [TODO] (reused across every frame that element appears in), confirm each voice_select casting, then `baker canvas validate` and `baker canvas run`. Running generates many billed image/video/audio assets \u2014 it is not free."
|
|
11218
11270
|
}
|
|
11219
11271
|
},
|