@koda-sl/baker-cli 0.93.0 → 0.95.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -49
- package/dist/{chunk-LMVDA3EZ.js → chunk-RCPMJKI7.js} +13 -6
- package/dist/chunk-RCPMJKI7.js.map +1 -0
- package/dist/cli.js +792 -676
- package/dist/cli.js.map +1 -1
- package/dist/engine/index.d.ts +5 -0
- package/dist/engine/index.js +1 -1
- package/package.json +2 -1
- package/dist/chunk-LMVDA3EZ.js.map +0 -1
package/README.md
CHANGED
|
@@ -882,9 +882,13 @@ baker ads linkedin analytics
|
|
|
882
882
|
baker ads linkedin analytics --level campaign --campaign-id 1234 \
|
|
883
883
|
--pivot job-title --intent baseline --last-days 30
|
|
884
884
|
|
|
885
|
-
# "Top companies seeing this campaign" (ABM feedback loop)
|
|
885
|
+
# "Top companies seeing this campaign" (ABM feedback loop) — org names auto-resolved
|
|
886
886
|
baker ads linkedin top-companies --campaign-id 1234 --last-days 30
|
|
887
887
|
|
|
888
|
+
# "Who are these org URNs?" — resolve org URNs to company names (probe leakage)
|
|
889
|
+
baker ads linkedin resolve --urns urn:li:organization:17719,urn:li:organization:19022
|
|
890
|
+
baker ads linkedin resolve --ids 17719,19022 --output csv
|
|
891
|
+
|
|
888
892
|
# "Who is in our audience by industry/seniority/function/title — all at once?"
|
|
889
893
|
baker ads linkedin demographics --campaign-id 1234 --last-days 30
|
|
890
894
|
|
|
@@ -2311,8 +2315,11 @@ Manage action items for the current chat. Most write operations stage on the cha
|
|
|
2311
2315
|
`BAKER_CHAT_ID` must be set for any command that stages or claims against a chat.
|
|
2312
2316
|
|
|
2313
2317
|
```bash
|
|
2314
|
-
baker actions list # default: bucketed (claimable, myClaims, blocked, claimedByOthers, completed, discarded)
|
|
2318
|
+
baker actions list # default: bucketed (claimable, myClaims, blocked, claimedByOthers, completed, discarded, draftCreates)
|
|
2315
2319
|
baker actions list --bucketed=false --status pending
|
|
2320
|
+
# With BAKER_CHAT_ID, the bucketed list folds in THIS chat's draft: staged creates appear in
|
|
2321
|
+
# `draftCreates`; published actions being completed/discarded/updated carry a `draftStatus` marker.
|
|
2322
|
+
# Only the caller's own chat draft is reflected — never another chat's staged work.
|
|
2316
2323
|
|
|
2317
2324
|
baker actions get <action-id>
|
|
2318
2325
|
baker actions status temp_hero jx123 # batch resolve real IDs and temp_* refs
|
|
@@ -2322,17 +2329,25 @@ baker actions release <action-id>
|
|
|
2322
2329
|
|
|
2323
2330
|
baker actions create --name "Build hero" --description "..."
|
|
2324
2331
|
# → returns { ok, data: { tempId }, hints: [...] } — check hints for dependencies/description reminders
|
|
2325
|
-
|
|
2326
|
-
#
|
|
2332
|
+
# If the name/description reads as recurring or future-dated (e.g. "weekly", "every Monday", "schedule"),
|
|
2333
|
+
# create emits a hint to use `baker scheduled-actions create` instead of a Work Action.
|
|
2327
2334
|
baker actions update <action-id> --name "Better name"
|
|
2328
2335
|
baker actions complete <action-id-or-tempId> --note "What was done" # stages — applies on chat publish
|
|
2329
2336
|
baker actions discard <action-id> --reason "obsolete"
|
|
2330
2337
|
|
|
2331
2338
|
baker actions link --blocker <id-or-tempId> --blocked <id-or-tempId>
|
|
2332
2339
|
baker actions unlink --blocker <id> --blocked <id>
|
|
2340
|
+
|
|
2341
|
+
# Review and edit the ops staged in THIS chat before publish
|
|
2342
|
+
baker actions draft # show every staged op + footgun warnings
|
|
2343
|
+
baker actions draft remove temp_hero # drop a staged create (cascades its complete/link ops)
|
|
2344
|
+
baker actions draft remove <id> --op complete # drop a staged op on a real action (--op update|complete|discard|tagAdd|tagRemove)
|
|
2345
|
+
baker actions draft clear # drop everything staged in this chat
|
|
2333
2346
|
```
|
|
2334
2347
|
|
|
2335
|
-
`baker actions
|
|
2348
|
+
`baker actions draft` is the detailed, op-by-op view of staged (pre-publish) ops plus footgun `warnings`, and the place to edit the draft (`draft remove`/`clear`). The bucketed `list` and a chat-scoped `status` also surface this chat's draft (staged creates, `draftStatus` markers, `draft` ref status), so `draft` is mainly for the full changelog and edits. Do not stage `complete` to cancel an unwanted action — that publishes it as an already-completed item; remove the staged create (or `discard` a published action) instead.
|
|
2349
|
+
|
|
2350
|
+
`baker actions status <ref...>` resolves refs in one request to `/api/actions/status`. It works without `BAKER_CHAT_ID`, but when the CLI has a chat id it includes it so a temp still staged in **that** chat resolves to `status: "draft"` instead of `not_found` (only the caller's own chat draft is consulted). It preserves the backend JSON envelope:
|
|
2336
2351
|
|
|
2337
2352
|
```json
|
|
2338
2353
|
{
|
|
@@ -2344,10 +2359,16 @@ baker actions unlink --blocker <id> --blocked <id>
|
|
|
2344
2359
|
"status": "completed",
|
|
2345
2360
|
"name": "Build hero section"
|
|
2346
2361
|
},
|
|
2362
|
+
{
|
|
2363
|
+
"ref": "temp_staged",
|
|
2364
|
+
"status": "draft",
|
|
2365
|
+
"name": "Brand new task",
|
|
2366
|
+
"hint": "Created in this chat's draft, not yet published. It applies when this chat publishes."
|
|
2367
|
+
},
|
|
2347
2368
|
{
|
|
2348
2369
|
"ref": "temp_old",
|
|
2349
2370
|
"status": "not_found",
|
|
2350
|
-
"hint": "Temp ref was not published, was discarded before publish, or belongs to another company."
|
|
2371
|
+
"hint": "Temp ref was not published, was discarded before publish, does not exist, or belongs to another company."
|
|
2351
2372
|
}
|
|
2352
2373
|
]
|
|
2353
2374
|
}
|
|
@@ -2363,44 +2384,6 @@ Permissions enforced server-side:
|
|
|
2363
2384
|
|
|
2364
2385
|
---
|
|
2365
2386
|
|
|
2366
|
-
### Missions (`baker missions`)
|
|
2367
|
-
|
|
2368
|
-
A **Mission** groups an ordered set of actions (its "steps") under one goal, with a markdown **overview** (the human-readable plan). Use a mission whenever a request decomposes into 2+ ordered actions — audits, campaigns, multi-step plans. A single one-off capture stays a loose action.
|
|
2369
|
-
|
|
2370
|
-
Mission write ops stage on the **same chat draft as actions** and apply atomically on the existing publish — there is no separate apply. `BAKER_CHAT_ID` must be set for every command except `list`/`get`.
|
|
2371
|
-
|
|
2372
|
-
```bash
|
|
2373
|
-
# 1. Open the mission FIRST with the detailed plan (the overview is the point).
|
|
2374
|
-
# Returns { ok, data: { missionTempId }, hints }.
|
|
2375
|
-
baker missions create --title "Audit Google Ads" --overview "## Phase 1 — Pull data\n..."
|
|
2376
|
-
|
|
2377
|
-
# 2. Create each step already attached to the mission, in order. This is the
|
|
2378
|
-
# preferred path: create+attach is one atomic op, so a step never appears as
|
|
2379
|
-
# a loose action between create and attach.
|
|
2380
|
-
baker actions create --name "Pull data" --description "..." --mission <missionTempId> --order 0
|
|
2381
|
-
baker actions create --name "Analyze" --description "..." --mission <missionTempId> --order 1
|
|
2382
|
-
|
|
2383
|
-
# Attach an already-existing action to a mission (when it wasn't created with --mission).
|
|
2384
|
-
# Both --mission and --action accept a real id or a temp_* ref from the same draft.
|
|
2385
|
-
baker missions add-action --mission <id-or-missionTempId> --action <id-or-tempId> --order 0
|
|
2386
|
-
|
|
2387
|
-
baker missions update <mission-id> --title "..." --overview "..." # real id only
|
|
2388
|
-
baker missions set-status <mission-id> --status accomplished # active | accomplished | aborted
|
|
2389
|
-
|
|
2390
|
-
baker missions list [--include-aborted] # per-mission progress (done/total) + ordered steps
|
|
2391
|
-
baker missions get <mission-id>
|
|
2392
|
-
```
|
|
2393
|
-
|
|
2394
|
-
Rules:
|
|
2395
|
-
|
|
2396
|
-
- **The overview is forward planning** — write it as numbered Phases (goal / what it produces / what unlocks next), with **no calendar framing** (dates, deadlines, ETAs) and **no effort framing** (quick-win, MVP, t-shirt sizes).
|
|
2397
|
-
- **Publishing the chat = the user approving the plan.** The mission and its ordered steps apply atomically; the dashboard then shows the mission grouped, ticking off as steps complete.
|
|
2398
|
-
- **Mark `accomplished` only when the goal is genuinely met**, `aborted` if abandoned — never auto-conclude from step status.
|
|
2399
|
-
- Hard dependencies between steps still go through `baker actions link`; missions (ordering) and dependencies (blocking) are orthogonal.
|
|
2400
|
-
- All staged mission ops revert automatically if the chat is discarded.
|
|
2401
|
-
|
|
2402
|
-
---
|
|
2403
|
-
|
|
2404
2387
|
### `baker schema [command]`
|
|
2405
2388
|
|
|
2406
2389
|
Inspect command argument schemas for AI agent introspection.
|
|
@@ -3664,21 +3647,23 @@ Before the deconstruct it runs a **local shot-cut pass** on the source file with
|
|
|
3664
3647
|
|
|
3665
3648
|
A shot longer than the video model's per-clip ceiling (Seedance's 15s, passed as `video_deconstruct`'s `max_clip_s`) is split into equal **continuation sub-scenes** that share their splice boundary exactly — so a long shot is reproduced in **full** (no truncation) and joins seamlessly. Each sub-scene carries `continues_previous`.
|
|
3666
3649
|
|
|
3667
|
-
It then scaffolds the full pipeline like an **editing timeline**: each clip gets a **static-ad-grade start AND end keyframe** (`image_generate`, each with its **own self-contained `params.prompt`** — edit a frame node to change only that frame; `prompt.json` wired as the **authoritative shared `target_blueprint`**, plus a per-element reference legend). Each keyframe is **fully recast** to the dropped `el_*` reference images.
|
|
3650
|
+
It then scaffolds the full pipeline like an **editing timeline**: each clip gets a **static-ad-grade start AND end keyframe** (`image_generate`, each with its **own self-contained `params.prompt`** — edit a frame node to change only that frame; `prompt.json` wired as the **authoritative shared `target_blueprint`**, plus a per-element reference legend). Each keyframe is **fully recast** to the dropped `el_*` reference images. The original extracted frame is kept LAST as a **pure composition anchor** (framing / camera angle / shot size / pose) whenever identity is safely locked — i.e. a frame with no person/animal, OR every cast member present is **sheet-backed** (a multi-view turnaround owns identity, so the anchor can reproduce the source's framing without dictating the face). Since every base element is now sheet-backed by default, cast frames keep their framing anchor too — this is what reproduces the source's composition (a side-profile stays a side-profile, the camera angle holds scene to scene) instead of drifting to a fresh guess. The anchor's legend forbids taking identity/text/palette from it. It is dropped only when a cast member rests on a weak lone-snapshot reference (e.g. a `same_as` second-look slot), where the original frame could re-leak the source actor. Both keyframes feed `video_generate` (`first_frame`+`last_frame`, so Seedance interpolates real in-shot motion; ultra-detailed motion brief; duration snapped to the nearest allowed clip length). Every keyframe grounds **only on its own extracted frame + `el_*` slots** — no reference to any other generated frame — so all images render **in parallel** (no cascade). Source-frame URLs are **deduped** (each ingested once). `--frames reuse` wires the real source frame straight in.
|
|
3668
3651
|
|
|
3669
3652
|
**Composited scenes (split-screen / picture-in-picture / keyed presenter).** Real ads aren't always one full-frame shot — a frame can be **persistently divided** (b-roll on top, a presenter talking on the bottom) or **layer a presenter** over background footage (boxed in a corner, or green-screen keyed). The deconstruct now reports this per scene as `scene.composition` (`layout: split_screen | pip | keyed_overlay`, with one `region` per stream — each its own clean-plate frame + motion brief, the talking-head region flagged `is_presenter`). The scaffold reproduces a composited scene by building **one clip per region** (`s<i>_r0_*`, `s<i>_r1_*`, …) and compositing them with ffmpeg: a split-screen `vstack`/`hstack` (stack direction read from the region **panels**, so a top/bottom split always stacks vertically), or a picture-in-picture `overlay` of the presenter inset at its corner. A **keyed** presenter is first cut to transparency by `video_background_remove` (`s<i>_key`), then overlaid. The presenter region carries the native lip-synced voice; b-roll/render panels stay silent. To change a layout, edit `composition` in `prompt.json` and re-scaffold, or hand-edit the `s<i>_composite` ffmpeg args. Plain full-frame scenes (the default) are unaffected.
|
|
3670
3653
|
|
|
3671
3654
|
**Montage flashes held as stills.** A rapid-cut beat shorter than ~2s with no spoken line is a **flash** — Seedance's shortest clip is 4s, so generating one (then trimming away most of it) burns credits for motion no viewer perceives. The scaffold instead **holds one keyframe as a still** for the scene length (a cheap ffmpeg loop, no billed `video_generate`), same look at a fraction of the cost. Talking/ambient beats keep a real clip (they need motion + native audio).
|
|
3672
3655
|
|
|
3673
|
-
**The phrase model (voice cut at pauses, not at visual cuts).** The voice is grouped into **phrases** — runs of continuous speech with no real pause, which may span several visual scenes. A phrase is voiced ONCE (so a sentence the deconstruct split at a visual cut never breaks mid-word): if the speaker is **shown** anywhere in the phrase it's a single Seedance clip (`s<anchor>_clip`, native lip-sync + audio) re-voiced to the brand voice; if the speaker is **never shown** it's one ElevenLabs `tts` read. The picture is then assembled **scene by scene**: a scene that shows the speaker **slices its window** out of the phrase clip (`s<i>_seg`, an ffmpeg `-ss`/`-t` cut — video and audio come from the *same* clip, so lip-sync holds), and a **b-roll cutaway** gets its own silent clip while the phrase's voice plays underneath. "Shown" is decided by the **presenter element's per-scene presence**, not just who's speaking — a scene where a cast member narrates over b-roll (their element absent) is treated as a cutaway, so the talking head never appears where the original cut away. A presenter run longer than the
|
|
3656
|
+
**The phrase model (voice cut at pauses, not at visual cuts).** The voice is grouped into **phrases** — runs of continuous speech with no real pause, which may span several visual scenes. A phrase is voiced ONCE (so a sentence the deconstruct split at a visual cut never breaks mid-word): if the speaker is **shown** anywhere in the phrase it's a single Seedance clip (`s<anchor>_clip`, native lip-sync + audio) re-voiced to the brand voice; if the speaker is **never shown** it's one ElevenLabs `tts` read. The picture is then assembled **scene by scene**: a scene that shows the speaker **slices its window** out of the phrase clip (`s<i>_seg`, an ffmpeg `-ss`/`-t` cut — video and audio come from the *same* clip, so lip-sync holds), and a **b-roll cutaway** gets its own silent clip while the phrase's voice plays underneath. "Shown" is decided by the **presenter element's per-scene presence**, not just who's speaking — a scene where a cast member narrates over b-roll (their element absent) is treated as a cutaway, so the talking head never appears where the original cut away. A presenter run longer than the **gateway-safe ~10s clip ceiling splits at a scene boundary** into contiguous takes (each its own clip + convert), so a sliced window never reads past its clip. (Seedance's *API* max is 15s, but the generation gateway frequently times out — **HTTP 524** — before it can deliver a clip longer than ~10s, so the scaffold never asks for one that long; 10s is a Seedance-allowed duration, so the split clip still snaps cleanly.) A b-roll cutaway *inside* a phrase lands at an **approximate** time (Seedance exposes no word timing) — nudge the scene boundary if it's off its beat.
|
|
3674
3657
|
|
|
3675
3658
|
**A starting point, not a locked render.** The canvas mirrors the reference's structure to give you a faithful scaffold, but `metadata.todo.full_flexibility` makes explicit that the agent has **full editing freedom**: add / delete / reorder / split / merge scenes, re-prompt any frame or motion brief, change a scene's layout (full-frame ↔ composite), or rewrite any line — the content-addressed cache re-bills only what changes, and `baker canvas validate` re-checks timing/lip-sync after any edit.
|
|
3676
3659
|
|
|
3677
3660
|
**Sequenced audio.** Dialogue is a back-and-forth on one absolute timeline, so each **contiguous same-speaker turn** becomes its own `tts` placed at its real `start_s` — turns alternate and never stack (the earlier design concatenated each speaker's whole monologue at their earliest timestamp, so two voices played in parallel for the entire video). Each speaker is locked to one shared `voice_select` voice; a `sound_effect` per SFX and a `music` bed (conditioned on the **ad's own script + emotional arc** so the bed supports the message, styled after the AudD-identified track when available, ducked under the voices, and started at the reference's `music.starts_at_s` rather than always at 0) round out the mix (`audio_timeline`). The final mux normalizes the soundtrack to **−14 LUFS (stereo)** so the output plays loud in every player — the raw mix is quiet mono, which reads as "no sound."
|
|
3678
3661
|
|
|
3679
|
-
**Native talking heads + one voice per person (no post-hoc lip-sync).** Seedance 2.0 generates lip-synced speech **natively** — a presenter phrase puts the full phrase in the clip's prompt with `generate_audio`, so lips and voice are generated together (no `video_lipsync`/veed). Each presenter phrase's audio is extracted and re-voiced through a **per-phrase** `audio_voice_convert` (ElevenLabs Voice Changer; one per phrase keeps each ≤15s clip under the converter's length cap) to the brand voice — timing preserved so the lips stay matched. There is **ONE voice per person**: a single `voice_select` is reused for all that person's phrases, and the deconstruct's `voiceover` label folds into the sole on-camera presenter (so on-camera and off-camera narration are the same voice, not two). A scene with **two on
|
|
3662
|
+
**Native talking heads + one voice per person (no post-hoc lip-sync).** Seedance 2.0 generates lip-synced speech **natively** — a presenter phrase puts the full phrase in the clip's prompt with `generate_audio`, so lips and voice are generated together (no `video_lipsync`/veed). Each presenter phrase's audio is extracted and re-voiced through a **per-phrase** `audio_voice_convert` (ElevenLabs Voice Changer; one converter per phrase keeps each clip, itself capped at the gateway-safe ~10s ceiling, under the converter's length cap) to the brand voice — timing preserved so the lips stay matched. There is **ONE voice per person**: a single `voice_select` is reused for all that person's phrases, and the deconstruct's `voiceover` label folds into the sole on-camera presenter (so on-camera and off-camera narration are the same voice, not two). A scene with **two speakers both on screen** can't be one clip — both lines become `tts` over a plain scene clip. But a scene with **one on-camera speaker trading lines with an OFF-camera voice** (an interviewer, a heard-but-not-shown assistant) keeps the on-camera speaker **native** (lip-synced) and reads the off-camera line as `tts` — "on screen" is decided by the speaker's element presence, so a heard-but-unshown voice no longer drops the whole scene to a silent clip. Every `tts` node is stamped with the spoken track's **`language_code`** when the blueprint states a language (cast localization note / voiceover persona / voice description), so numbers and units are read in the target tongue instead of ElevenLabs' English default (the "6900 read in English" bug). For **NATIVE (Seedance) lines** — which carry no language tag — the scaffold additionally **spells numerals into target-language words** across every part of the clip prompt Seedance can vocalize (the spoken line, the scene summary/action/motion, the transcript), so a French "6930 ?" becomes "six mille neuf cent trente ?" and is never read as English digits. 
Spelling covers **every language the blueprint can resolve** (fr, es, en, de, it, pt, nl, pl, ar, ja, ko, hi — via `n2words`); a language outside that set leaves digits (the `tts` path still localizes them via `language_code`).
|
|
3663
|
+
|
|
3664
|
+
**Same-shot lip-sync caution.** A single held shot can carry only ONE lip-synced clip (voiceover turns must not overlap, and Seedance generates one clip per shot), so when the on-camera speaker has further turns in that shot (a rapid "3000? … 4000?" with an off-camera "Plus" between), the first turn is native and the rest play as `tts` over the same clip — where the mouth no longer matches those words. This is inherent to reproducing sparse same-shot dialogue, not a wiring fault; the scaffold lists the affected scenes/lines in **`metadata.video.lip_sync_caution`** (advisory, never gated) so you can cut away to b-roll over those lines or rely on the burned-in captions that already show them.
|
|
3680
3665
|
|
|
3681
|
-
**Timing-faithful clip + extract (no overlap).** Each phrase clip is generated to its **coverage window** (the deconstruct's real scene/line timing, capped at
|
|
3666
|
+
**Timing-faithful clip + extract (no overlap).** Each phrase clip is generated to its **coverage window** (the deconstruct's real scene/line timing, capped at the gateway-safe ~10s ceiling) and its converted voice is extracted to the **spoken window** (pause to pause) — *not* padded to a word-count estimate. Padding past the window was what ran the voice the clip's whole length and overlapped the next phrase; trusting the deconstruct's timing keeps consecutive phrases back-to-back and lets Seedance pace the quoted text to fit. `metadata.video.talking_scenes` records each phrase's `scene_s` vs `est_speech_s`; on top of that the scaffold flags any scene whose estimated speech overruns its window by more than ~1.3× as **`metadata.todo.overstuffed_scenes`** (also in the stdout checklist) — a loud advisory to shorten the copy or lengthen the scene before rendering, since an over-stuffed line pushes the picture off the audio timeline. It similarly flags **`oversize_scenes`** — a single scene whose own footage exceeds the gateway-safe ~10s clip ceiling (a b-roll shot or one-shot monologue). The phrase splitter only breaks at scene boundaries, so it can't shrink a single over-long scene; its clip would time out at the gateway (HTTP 524), so the advisory tells you to split that scene into two before rendering.
|
|
3682
3667
|
|
|
3683
3668
|
**Timeline-accurate picture.** Seedance can't render under 4s, so each clip is generated at the smallest allowed duration ≥ the scene length and then **trimmed back to the exact scene duration** before concat. This keeps the concatenated picture on the same timeline as the absolute-timed audio — without it, short scenes balloon to 4s, the spine runs far longer than the soundtrack, and every line plays over the wrong (slowed) scene so the lips never match. Frames are also prompted as **clean text-free plates** (no baked captions/lower-thirds/tickers/logos-as-text) so the overlay layer is the single source of on-screen text.
|
|
3684
3669
|
|
|
@@ -3712,10 +3697,11 @@ baker canvas run ./reference-ad.video.canvas.json
|
|
|
3712
3697
|
| `--select-model <id>` | `~google/gemini-flash-latest` | Override the element-selection `text_generate` model. |
|
|
3713
3698
|
| `--image-model <id>` | `openai/gpt-5.4-image-2` | Override the per-frame `image_generate` model (defaults to the strongest, matching `scaffold-static-ad`). |
|
|
3714
3699
|
| `--video-model <id>` | `bytedance/seedance-2.0` | Override the `video_generate` model. |
|
|
3700
|
+
| `--resolution <res>` | `1080p` | Output resolution for every generated clip (`480p`/`720p`/`1080p` for Seedance). The model defaults to a LOW tier when unset, which downscales the 2K keyframes — pinning the top tier keeps the clip as sharp as its frames. |
|
|
3715
3701
|
|
|
3716
3702
|
Each scene is captured in a **shoot mode** — `ugc_selfie` (talking heads, the default look), `ugc_broll`, `studio_product` (pack shot), `lifestyle_cinematic`, or `screen_ui`. The scaffold derives one per scene (UGC by default; the cinematic and screen lanes are opt-in) and bakes its capture block into the frame and a camera default into the clip; override per scene with a `shoot_mode` field in `prompt.json`. Capture aesthetic + depth-of-field follow the mode (UGC stays flat; studio/lifestyle allow shallow DoF). Clips also carry **diegetic native audio** — the scene's own ambience described in the Seedance prompt, never music (the music bed is a separate, ducked track); set a scene's `ambient` field to steer it.
|
|
3717
3703
|
|
|
3718
|
-
**Automatic by default (no flags).**
|
|
3704
|
+
**Automatic by default (no flags).** Every recast **base element — person, pet, product, AND location/set** — is fused into ONE rich multi-view sheet (`image_reference_sheet`, one subject per sheet, **4K**, up to 8 cells) that every frame it appears in grounds on, so the same face/pet/pack/room is rendered from a multi-angle canvas instead of a lone flat snapshot (a one-scene hero element is sheeted too). Each sheet pairs a **full turnaround** (angles, for proportions/wardrobe/layout) with tight **close-ups** so the generator is prepared for ANY framing a scene needs: a **person** gets body cells + face close-ups (front/¾/profile) and a mid-sentence speaking expression (identity pinned, natural skin — no airbrushing); an **animal** gets a body turnaround + head close-ups + an eyes/face macro; a **product** gets a turnaround + label and material detail macros; a **location/set** gets several camera angles of the same room + a key-surface detail. Generated clips are pinned to **1080p** (see `--resolution`) so the video keeps the keyframe's sharpness, and each cast frame keeps the source frame as a **composition anchor** (identity stays on the sheet) so the original framing/camera is reproduced, not re-guessed. An **app/website/chat screen** is never sent to the video model — the scaffold drops the scene to a clean talking-head and seeds a phone-mockup PIP stub to fill with a real `baker images screenshot` or brand HTML block (Seedance garbles UI and a split leaves a seam). The **music bed is instrumental** (the script is never fed to the music model — it would sing over the voice), enters only after the hook, and is **sidechain-ducked** under the voice. **Word-synced TikTok captions** are wired whenever the ad has speech — and they are **transcribed from the rendered audio** (a `video_transcribe` of the actual voice mix), not the deconstruct's original transcript. 
This is a correctness boundary: wiring the source transcript would burn the **competitor's** words (their brand name, a claim we can't make) over the ad once the script is re-authored, whereas transcribing the generated audio can only ever show what is actually spoken, so the captions always track the re-written lines. Seeded overlays are pushed **off the subject's face** (dead-center → bottom band).
|
|
3719
3705
|
|
|
3720
3706
|
The two scaffold passes are billed (the full `video_deconstruct` is the heavy one); **running** the result then generates many image/video/audio assets and is not free. Defaults to vertical 1080×1920 overlays — copy + edit the composition for other aspect ratios. For on-brand overlay type, drop `brand-bold.otf`/`brand-regular.otf` into the copied `video-overlay-composition/` dir (wired via `@font-face`, with a system fallback). Richer transcription (punctuated words + paragraphs) is available via the deconstruct's `transcriber: "deepgram"` param when `DEEPGRAM_API_KEY` is set.
|
|
3721
3707
|
|
|
@@ -1117,7 +1117,7 @@ var MODEL_REGISTRY = {
|
|
|
1117
1117
|
required: ["subject_description", "subject_type"],
|
|
1118
1118
|
params: {
|
|
1119
1119
|
subject_description: { kind: "string" },
|
|
1120
|
-
subject_type: { kind: "string", enum: ["character", "person", "product"] },
|
|
1120
|
+
subject_type: { kind: "string", enum: ["character", "person", "product", "location"] },
|
|
1121
1121
|
views: { kind: "json" },
|
|
1122
1122
|
style: { kind: "string" },
|
|
1123
1123
|
prompt_override: { kind: "string" },
|
|
@@ -1131,7 +1131,7 @@ var MODEL_REGISTRY = {
|
|
|
1131
1131
|
required: ["subject_description", "subject_type"],
|
|
1132
1132
|
params: {
|
|
1133
1133
|
subject_description: { kind: "string" },
|
|
1134
|
-
subject_type: { kind: "string", enum: ["character", "person", "product"] },
|
|
1134
|
+
subject_type: { kind: "string", enum: ["character", "person", "product", "location"] },
|
|
1135
1135
|
views: { kind: "json" },
|
|
1136
1136
|
style: { kind: "string" },
|
|
1137
1137
|
prompt_override: { kind: "string" },
|
|
@@ -1589,6 +1589,11 @@ var VideoMeta = z.object({
|
|
|
1589
1589
|
z.object({ scene: z.number(), lipsync_node: z.string() })
|
|
1590
1590
|
])
|
|
1591
1591
|
).default([]),
|
|
1592
|
+
// Advisory, NOT gated: scenes where a presenter's later same-shot turns play as
|
|
1593
|
+
// `tts` over their one native clip (a held shot can hold only ONE lip-synced clip,
|
|
1594
|
+
// and voiceover turns can't overlap), so the lips may not match those words. The
|
|
1595
|
+
// scaffold surfaces it so the agent can cut away or rely on the burned-in captions.
|
|
1596
|
+
lip_sync_caution: z.array(z.object({ scene: z.number(), speaker: z.string(), tts_over_native: z.array(z.string()) })).optional(),
|
|
1592
1597
|
// Advisory, NOT gated by the validator: the reviewable "which graphic fires
|
|
1593
1598
|
// on which spoken beat" map emitted by scaffold-video (per-scene window,
|
|
1594
1599
|
// spoken line, storyboard frames, scheduled graphics). Free-form rows so the
|
|
@@ -5398,8 +5403,10 @@ var REFERENCE_SHEET_MODELS = ["google/gemini-3-pro-image-preview", "google/gemin
|
|
|
5398
5403
|
var ImageReferenceSheetParams = z20.object({
|
|
5399
5404
|
model: z20.enum(REFERENCE_SHEET_MODELS),
|
|
5400
5405
|
subject_description: z20.string().min(1),
|
|
5401
|
-
|
|
5402
|
-
|
|
5406
|
+
// `location` = a set/room shown from several camera ANGLES (not a rotated subject),
|
|
5407
|
+
// so a multi-scene shoot keeps one consistent set.
|
|
5408
|
+
subject_type: z20.enum(["character", "person", "product", "location"]),
|
|
5409
|
+
views: z20.array(z20.string().min(1)).min(2).max(8).optional(),
|
|
5403
5410
|
style: z20.string().optional(),
|
|
5404
5411
|
prompt_override: z20.string().min(1).optional(),
|
|
5405
5412
|
aspect_ratio: z20.enum(["1:1", "16:9", "9:16", "4:3", "3:4", "3:2", "2:3", "4:5", "5:4", "21:9", "1:4", "4:1", "1:8", "8:1"]).optional(),
|
|
@@ -5409,7 +5416,7 @@ var imageReferenceSheetNode = delegated({
|
|
|
5409
5416
|
id: "image_reference_sheet",
|
|
5410
5417
|
version: "1.0.0",
|
|
5411
5418
|
category: "image",
|
|
5412
|
-
summary: "Fuse 1\u20136 images of a single subject (person, character, or
|
|
5419
|
+
summary: "Fuse 1\u20136 images of a single subject (person, character, product, or location/set) into ONE multi-view reference sheet \u2014 a labeled grid in consistent style and lighting: a turnaround (FRONT / SIDE / BACK\u2026) for a person/character/product, or several camera angles of the same room (WIDE / REVERSE / DETAIL\u2026) for a location. Curated models: Gemini 3 Pro Image (best fusion + labels), Gemini 3.1 Flash Image (cheap iteration).",
|
|
5413
5420
|
when_to_use: "Use before image_generate / video_generate when a subject must stay consistent across many creatives \u2014 wire the `sheet` output into their `reference` input instead of re-describing the subject per prompt. `subject_description` should be the exact wording you reuse downstream. Pick `google/gemini-3-pro-image-preview` for final 6-view sheets at 2K+, `google/gemini-3.1-flash-image-preview` while iterating.",
|
|
5414
5421
|
inputs: z20.object({ references: z20.array(ImageRef).min(1).max(6) }).loose(),
|
|
5415
5422
|
params: ImageReferenceSheetParams,
|
|
@@ -6139,4 +6146,4 @@ export {
|
|
|
6139
6146
|
defaultRegistry,
|
|
6140
6147
|
createEngineFromEnv
|
|
6141
6148
|
};
|
|
6142
|
-
//# sourceMappingURL=chunk-
|
|
6149
|
+
//# sourceMappingURL=chunk-RCPMJKI7.js.map
|