@koda-sl/baker-cli 0.68.0 → 0.71.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -3
- package/canvas/video-overlay-composition/index.html +65 -191
- package/canvas/video-overlay-composition/meta.json +4 -15
- package/dist/{chunk-K6LHXCKD.js → chunk-JIDZ37KG.js} +77 -5
- package/dist/chunk-JIDZ37KG.js.map +1 -0
- package/dist/cli.js +714 -195
- package/dist/cli.js.map +1 -1
- package/dist/engine/index.d.ts +14 -0
- package/dist/engine/index.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-K6LHXCKD.js.map +0 -1
package/README.md
CHANGED
|
@@ -1587,6 +1587,8 @@ Each external source is its own subcommand. Pick the verb that matches the sourc
|
|
|
1587
1587
|
| `baker images find <q>` | Fanout: library + opted-in providers | off |
|
|
1588
1588
|
| `baker images stock <q> [--type photo\|vector\|psd]` | Magnific (Freepik's dev API) — photos, vectors, illustrations, PSDs (~250M assets) | off |
|
|
1589
1589
|
| `baker images google <q>` | Google Images via the official Custom Search JSON API | off |
|
|
1590
|
+
| `baker images pinterest <q>` | Pinterest reference imagery via ScrapeCreators (photo-real mood boards) | off |
|
|
1591
|
+
| `baker images generate <prompt>` | AI image generation via OpenRouter image models (cost-tracked) | **always on** (the bytes are the artifact) |
|
|
1590
1592
|
| `baker images logo <domain>` | Brand logo via Brandfetch CDN | **on** (top 1) |
|
|
1591
1593
|
| `baker images icon <name>` | Iconify (200+ icon sets, no API key) | off (CDN URL is stable) |
|
|
1592
1594
|
| `baker images gif <q>` | Reaction GIFs / memes via Giphy (paid-social creative) | off |
|
|
@@ -1726,6 +1728,69 @@ Requires both `GOOGLE_CUSTOM_SEARCH_API_KEY` and `GOOGLE_CUSTOM_SEARCH_ENGINE_ID
|
|
|
1726
1728
|
| `--auto-ingest` | Ingest top N (0–20, default 0) and return Baker-owned URLs on ingested hits |
|
|
1727
1729
|
| `--context` | Free-text hint passed to Gemini describe to bias the generated description and tags (overrides provider-derived context on auto-ingest paths) |
|
|
1728
1730
|
|
|
1731
|
+
### `baker images pinterest <query>`
|
|
1732
|
+
|
|
1733
|
+
Pinterest image search via ScrapeCreators. The best source for **photo-real reference and mood imagery** — interiors, fashion, food, product styling, lifestyle. ⚠ Unverified, trademark-bearing web content — use for *reference*, not republishing. Strongest paired with `generate --reference` (find the look on Pinterest, then generate an owned, on-brand image from it). Auto-ingest is opt-in.
|
|
1734
|
+
|
|
1735
|
+
```bash
|
|
1736
|
+
baker images pinterest "scandinavian living room"
|
|
1737
|
+
baker images pinterest "minimalist skincare product photography" --limit 20
|
|
1738
|
+
baker images pinterest "cozy coffee shop interior" --auto-ingest 2 --context "Mood reference for hero photography"
|
|
1739
|
+
```
|
|
1740
|
+
|
|
1741
|
+
**Flags:**
|
|
1742
|
+
|
|
1743
|
+
| Flag | Description |
|
|
1744
|
+
|-----------------|----------------------------------------------------------------------------|
|
|
1745
|
+
| `--limit` | Max results (1–20, default 5) |
|
|
1746
|
+
| `--auto-ingest` | Ingest top N (0–20, default 0) and return Baker-owned URLs on ingested hits |
|
|
1747
|
+
| `--context` | Free-text hint passed to Gemini describe on auto-ingest (overrides the pin title) |
|
|
1748
|
+
|
|
1749
|
+
Uses the `api.scrapecreators.com/v1/pinterest/search` endpoint (requires `SCRAPE_CREATORS_API_KEY`). Cost: $0.00188 per request; results are cached for 1 day.
|
|
1750
|
+
|
|
1751
|
+
### `baker images generate <prompt>`
|
|
1752
|
+
|
|
1753
|
+
Generate an image with AI (OpenRouter image models) and ingest it into the library. The generated bytes **are** the artifact, so the result is always auto-ingested (described + embedded) — the next `baker images library` query finds it. Cost is tracked per request via OpenRouter's reported usage. Models and defaults mirror the Baker canvas.
|
|
1754
|
+
|
|
1755
|
+
```bash
|
|
1756
|
+
baker images generate "a friendly golden retriever in a bright modern living room" --aspect-ratio 16:9
|
|
1757
|
+
baker images generate "hero shot of a matte black water bottle on wet marble, studio light" \
|
|
1758
|
+
--model google/gemini-3-pro-image-preview --image-size 2K --aspect-ratio 4:5
|
|
1759
|
+
baker images generate "home office hero, warm minimalist, natural light, 35mm photo" \
|
|
1760
|
+
--reference "https://i.pinimg.com/…/pin1.jpg,https://…/brand-product.png"
|
|
1761
|
+
baker images generate "put this product on a marble countertop in soft daylight" \
|
|
1762
|
+
--reference "./src/brand/logos/product.png,./refs/kitchen-mood.jpg" # local sandbox files
|
|
1763
|
+
baker images generate "flat geometric mascot, brand palette" \
|
|
1764
|
+
--model recraft/recraft-v4.1-pro-vector --rgb-colors "[[10,10,10],[255,80,0]]" --bg-rgb "[255,255,255]"
|
|
1765
|
+
```
|
|
1766
|
+
|
|
1767
|
+
**Models** (`--model`, default `openai/gpt-5.4-image-2`):
|
|
1768
|
+
|
|
1769
|
+
| Model | Best for | Aspect ratios | Sizes |
|
|
1770
|
+
|---|---|---|---|
|
|
1771
|
+
| `openai/gpt-5.4-image-2` **(default)** | Photoreal + cleanest in-image text — ad/landing reproduction | standard set | `1K` `2K` `4K` |
|
|
1772
|
+
| `google/gemini-3-pro-image-preview` | Highest fidelity (Nano Banana Pro) | standard set | `1K` `2K` `4K` |
|
|
1773
|
+
| `google/gemini-3.5-flash` | Fast; extreme aspect ratios | standard **+** `1:4` `4:1` `1:8` `8:1` | `0.5K`–`4K` |
|
|
1774
|
+
| `google/gemini-3.1-flash-image-preview` | Same as 3.5 flash (preview) | standard **+** `1:4` `4:1` `1:8` `8:1` | `0.5K`–`4K` |
|
|
1775
|
+
| `recraft/recraft-v4.1-pro-vector` | Vector/flat/SVG-style with palette control | standard set | `1K` `2K` `4K` |
|
|
1776
|
+
|
|
1777
|
+
Standard aspect ratios: `1:1` `2:3` `3:2` `3:4` `4:3` `4:5` `5:4` `9:16` `16:9` `21:9`.
|
|
1778
|
+
|
|
1779
|
+
**Flags:**
|
|
1780
|
+
|
|
1781
|
+
| Flag | Description |
|
|
1782
|
+
|---|---|
|
|
1783
|
+
| `--model` | Model id (default `openai/gpt-5.4-image-2`) |
|
|
1784
|
+
| `--aspect-ratio` | Output aspect ratio (default `1:1`) |
|
|
1785
|
+
| `--image-size` | Resolution: `1K` (default) `2K` `4K` (Gemini flash also `0.5K`) |
|
|
1786
|
+
| `--reference` | Comma-separated visual references, each either a **public image URL** (Pinterest / stock / library `imageUrl`) **or a local file path** (a sandbox image — brand logo, product shot, cropped photo, screenshot). Local files are downscaled (≤1536px) and inlined automatically — no manual upload. Applied in order; the biggest quality lever for photographed, on-brand output. Split is on `,`, so a URL containing a literal comma in its query string would be torn in two (rare for image CDNs — pass it alone if it occurs); a single `data:` URL is taken whole. |
|
|
1787
|
+
| `--strength` | Recraft only: vectorization strength 0–1 |
|
|
1788
|
+
| `--rgb-colors` | Recraft only: JSON palette `[[r,g,b],…]` |
|
|
1789
|
+
| `--bg-rgb` | Recraft only: JSON background `[r,g,b]` |
|
|
1790
|
+
| `--context` | Describe-hint override for the ingested row (defaults to the prompt) |
|
|
1791
|
+
|
|
1792
|
+
Returns `{ images: [{ imageId, imageUrl, deduped, width, height }], model, costUsd }`. `imageUrl` is library-owned and ready to place. Requires `OPENROUTER_API_KEY` on the Convex deployment. There is no `seed` flag (OpenRouter's `image_config` has no seed slot). Identical re-generations are deduplicated by content hash.
|
|
1793
|
+
|
|
1729
1794
|
### `baker images logo <domain>`
|
|
1730
1795
|
|
|
1731
1796
|
Brand logo via Brandfetch CDN (`fallback/404`). Probes all 5 logo variants in parallel and returns whichever ones the domain actually publishes. Auto-ingests the first by default.
|
|
@@ -3493,9 +3558,17 @@ Turn a reference video into a **runnable, self-validated reproduction canvas** i
|
|
|
3493
3558
|
1. **`video_deconstruct`** (`~google/gemini-pro-latest`, full mode) — reverse-engineers the video into a scene-by-scene blueprint + word-level transcript, written next to the canvas as **`prompt.json`**. Each scene's `start_frame_prompt`/`end_frame_prompt` are inlined into the frame nodes (see below); `prompt.json` then rides along as the shared **global style reference** (palette, cast cohesion) and as provenance.
|
|
3494
3559
|
2. **recurring-element selection** (`~google/gemini-flash-latest`) — picks only the **recurring, identity-critical** elements (each `global.cast` person, a recurring animal, a showcased product, the brand logo) and the scene indices each appears in. One real reference image grounds each element across **every** frame it appears in, so the same actor stays consistent the whole video.
|
|
3495
3560
|
|
|
3496
|
-
It then scaffolds the full pipeline: per scene, two **static-ad-grade frames** (`image_generate` with its **own self-contained `params.prompt`** — edit a frame node to change only that frame; `prompt.json` is wired as a demoted shared-style `target_blueprint`, a per-element reference legend, the real extracted frame as a composition anchor) → `video_generate` (Seedance first/last-frame, fed an ultra-detailed motion brief composed from the scene's action, camera, dialogue, and transcript; duration snapped to the nearest allowed clip length).
|
|
3561
|
+
It then scaffolds the full pipeline: per scene, two **static-ad-grade frames** (`image_generate` with its **own self-contained `params.prompt`** — edit a frame node to change only that frame; `prompt.json` is wired as a demoted shared-style `target_blueprint`, a per-element reference legend, the real extracted frame as a composition anchor) → `video_generate` (Seedance first/last-frame, fed an ultra-detailed motion brief composed from the scene's action, camera, dialogue, and transcript; duration snapped to the nearest allowed clip length).
|
|
3562
|
+
|
|
3563
|
+
**Sequenced audio.** Dialogue is a back-and-forth on one absolute timeline, so each **contiguous same-speaker turn** becomes its own `tts` placed at its real `start_s` — turns alternate and never stack (the earlier design concatenated each speaker's whole monologue at their earliest timestamp, so two voices played in parallel for the entire video). Each speaker is locked to one shared `voice_select` voice; a `sound_effect` per SFX and a `music` bed (styled after the AudD-identified track when available, ducked under the voices) round out the mix (`audio_timeline`). The final mux normalizes the soundtrack to **−14 LUFS (stereo)** so the output plays loud in every player — the raw mix is quiet mono, which reads as "no sound."
|
|
3564
|
+
|
|
3565
|
+
**Lip-sync by default.** Seedance animates a generic mouth that won't track the separate voiceover, so any scene with a **single on-camera speaker** (a `dialogue.speaker` that maps to a `global.cast` member, when `global.voiceover.mode` isn't pure `voiceover`/`none`) routes its clip through `video_lipsync` against that scene's own turn audio (~20 cr/scene). Scenes with two on-camera speakers are left un-synced on purpose (one lip-sync track can't drive two faces) and flagged for you to split or pick a primary.
|
|
3566
|
+
|
|
3567
|
+
**Timeline-accurate picture.** Seedance can't render under 4s, so each clip is generated at the smallest allowed duration ≥ the scene length and then **trimmed back to the exact scene duration** before concat. This keeps the concatenated picture on the same timeline as the absolute-timed audio — without it, short scenes balloon to 4s, the spine runs far longer than the soundtrack, and every line plays over the wrong (slowed) scene so the lips never match. Frames are also prompted as **clean text-free plates** (no baked captions/lower-thirds/tickers/logos-as-text) so the overlay layer is the single source of on-screen text.
|
|
3568
|
+
|
|
3569
|
+
**Overlays are agent-painted HTML, not props.** The clips are concatenated, then the `video-overlay` composition (copied next to the canvas) composites the overlay layer. The scaffold **bakes the reference's overlays into that composition's `index.html` as real, editable HTML** (each overlay is a plain element with its text, a `.pos-*` position class, and `data-start`/`data-dur` timing); a tiny generic runtime only shows/hides each element at its timestamp (with an optional `data-anim` entrance). It makes **no styling decisions** — bars, tickers, colors, fonts, and a real logo `<img>` you drop into the dir all live in the HTML/CSS you edit. Floating elements (logo bugs) are seeded as commented `<img>` stubs so an un-edited render stays clean. Drop `brand-bold.otf`/`brand-regular.otf` for on-brand type.
|
|
3497
3570
|
|
|
3498
|
-
The emitted canvas is validated (`validateCanvasDeep`) before it's written, so it always runs. The full editable checklist is embedded
|
|
3571
|
+
The emitted canvas is validated (`validateCanvasDeep`) before it's written, so it always runs. It also carries a **`metadata.video`** timing plan that `baker canvas validate` proves **statically, before any billed render**: no two voiceover turns overlap, the audio length ≈ the video length, and every single-on-camera-speaker scene has a lip-sync node. The full editable checklist is embedded as **`metadata.todo`** (with a step-by-step guide in `metadata.description`). stdout returns `{ ok, canvas_path, prompt_path, models, stats, checklist }`.
|
|
3499
3572
|
|
|
3500
3573
|
```bash
|
|
3501
3574
|
baker canvas scaffold-video ./reference-ad.mp4 --focus "competitor UGC ad for <brand>"
|
|
@@ -3509,7 +3582,7 @@ baker canvas run ./reference-ad.video.canvas.json
|
|
|
3509
3582
|
|---|---|---|
|
|
3510
3583
|
| `--out <path>` | `<video-dir>/<name>.video.canvas.json` | Where to write the canvas (composition is copied alongside). |
|
|
3511
3584
|
| `--frames <mode>` | `generate` | `generate` regenerates frames anchored on the originals; `reuse` wires the real extracted frames straight into the clips (faithful, cheaper). |
|
|
3512
|
-
| `--max-scenes <n>` |
|
|
3585
|
+
| `--max-scenes <n>` | all source scenes | **Cost lever that reduces fidelity** — caps the deconstruct, MERGING away every scene beyond the cap (fewer cuts, lost beats). Prints a warning when set; omit it to reproduce every scene. |
|
|
3513
3586
|
| `--language <code>` | auto | Transcript/dialogue language hint (e.g. `fr`, `en`). |
|
|
3514
3587
|
| `--focus <text>` | — | Known provenance/emphasis to ground the deconstruct. |
|
|
3515
3588
|
| `--deconstruct-model <id>` | `~google/gemini-pro-latest` | Override the `video_deconstruct` model. |
|
|
@@ -5,10 +5,22 @@
|
|
|
5
5
|
<meta name="viewport" content="width=1080, height=1920" />
|
|
6
6
|
<script src="./gsap.min.js"></script>
|
|
7
7
|
<style>
|
|
8
|
-
/*
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
8
|
+
/* ═══════════════════════════════════════════════════════════════════════
|
|
9
|
+
OVERLAY LAYER — PAINT THIS YOURSELF.
|
|
10
|
+
|
|
11
|
+
The scaffold seeds #overlay-root (below) with the reference ad's overlays
|
|
12
|
+
as plain HTML elements — real text, a position class, and data-start/
|
|
13
|
+
data-dur timing. This file is YOURS: restyle every rule here, add classes,
|
|
14
|
+
build a lower-third bar or a scrolling ticker, drop a logo image into this
|
|
15
|
+
dir and reference it with <img>. The only contract is:
|
|
16
|
+
|
|
17
|
+
• keep each overlay element inside #overlay-root
|
|
18
|
+
• keep its data-start / data-dur (seconds) — the runtime shows/hides by them
|
|
19
|
+
• (optional) data-anim="fade|slide_up|slide_down|pop" for a canned entrance
|
|
20
|
+
|
|
21
|
+
The runtime makes NO styling decisions. How it looks is 100% your CSS/markup.
|
|
22
|
+
For on-brand type, drop brand-bold.otf / brand-regular.otf into this dir.
|
|
23
|
+
═══════════════════════════════════════════════════════════════════════ */
|
|
12
24
|
@font-face {
|
|
13
25
|
font-family: 'BrandFont';
|
|
14
26
|
src: url('./brand-bold.otf') format('opentype'), url('./brand-bold.ttf') format('truetype');
|
|
@@ -25,211 +37,73 @@
|
|
|
25
37
|
* { margin: 0; padding: 0; box-sizing: border-box; }
|
|
26
38
|
html, body { width: 1080px; height: 1920px; overflow: hidden; background: #000; }
|
|
27
39
|
video#bg { position: absolute; inset: 0; width: 100%; height: 100%; object-fit: cover; }
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
.
|
|
41
|
-
|
|
42
|
-
.
|
|
43
|
-
|
|
44
|
-
.
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
.
|
|
40
|
+
#overlay-root { position: absolute; inset: 0; font-family: 'BrandFont', 'Helvetica Neue', Arial, sans-serif; }
|
|
41
|
+
|
|
42
|
+
/* Starter style for a seeded overlay — deliberately plain. Override freely. */
|
|
43
|
+
.ov {
|
|
44
|
+
position: absolute; visibility: hidden; max-width: 86%;
|
|
45
|
+
font-size: 64px; font-weight: 800; line-height: 1.15; color: #fff; text-align: center;
|
|
46
|
+
white-space: normal; overflow-wrap: break-word;
|
|
47
|
+
text-shadow: 0 2px 8px rgba(0,0,0,0.6), 0 0 2px rgba(0,0,0,0.9);
|
|
48
|
+
}
|
|
49
|
+
.ov.fe { font-size: 30px; font-weight: 600; opacity: 0.9; }
|
|
50
|
+
|
|
51
|
+
/* 9-grid position helpers (absolute). Tweak the insets or add your own. */
|
|
52
|
+
.pos-top-left { top: 90px; left: 56px; text-align: left; }
|
|
53
|
+
.pos-top-center { top: 90px; left: 50%; transform: translateX(-50%); }
|
|
54
|
+
.pos-top-right { top: 90px; right: 56px; text-align: right; }
|
|
55
|
+
.pos-mid-left,
|
|
56
|
+
.pos-center-left { top: 50%; left: 56px; transform: translateY(-50%); text-align: left; }
|
|
57
|
+
.pos-center,
|
|
58
|
+
.pos-mid-center { top: 50%; left: 50%; transform: translate(-50%,-50%); }
|
|
59
|
+
.pos-mid-right,
|
|
60
|
+
.pos-center-right { top: 50%; right: 56px; transform: translateY(-50%); text-align: right; }
|
|
61
|
+
.pos-bottom-left { bottom: 150px; left: 56px; text-align: left; }
|
|
62
|
+
.pos-bottom-center { bottom: 150px; left: 50%; transform: translateX(-50%); width: 88%; }
|
|
63
|
+
.pos-bottom-right { bottom: 150px; right: 56px; text-align: right; }
|
|
48
64
|
</style>
|
|
49
65
|
</head>
|
|
50
66
|
<body>
|
|
51
67
|
<div id="root" data-composition-id="main" data-start="0" data-duration="{{duration}}" data-width="1080" data-height="1920">
|
|
52
68
|
<video id="bg" src="background.mp4" muted></video>
|
|
53
|
-
<div id="
|
|
69
|
+
<div id="overlay-root">
|
|
70
|
+
<!--OVERLAYS-->
|
|
71
|
+
</div>
|
|
54
72
|
</div>
|
|
55
73
|
|
|
56
74
|
<script>
|
|
57
75
|
(() => {
|
|
58
|
-
const OVERLAYS = {{overlays}};
|
|
59
|
-
const FLOATING = {{floating_elements}};
|
|
60
76
|
const DURATION = parseFloat('{{duration}}');
|
|
61
|
-
const stage = document.getElementById('stage');
|
|
62
77
|
const tl = gsap.timeline({ paused: true });
|
|
78
|
+
const els = Array.from(document.querySelectorAll('#overlay-root [data-start]'));
|
|
79
|
+
|
|
80
|
+
// Generic timeline: show each element at data-start, hide at start+data-dur,
|
|
81
|
+
// with an optional canned entrance from data-anim. No styling decisions here —
|
|
82
|
+
// the look lives entirely in the CSS/markup above.
|
|
83
|
+
for (const el of els) {
|
|
84
|
+
const at = parseFloat(el.getAttribute('data-start') || '0') || 0;
|
|
85
|
+
const dur = parseFloat(el.getAttribute('data-dur') || '2.5') || 2.5;
|
|
86
|
+
const anim = el.getAttribute('data-anim') || '';
|
|
87
|
+
// Preserve any positioning transform the CSS set (translate(...)).
|
|
88
|
+
const baseTransform = getComputedStyle(el).transform;
|
|
89
|
+
const tx = baseTransform && baseTransform !== 'none' ? baseTransform : '';
|
|
63
90
|
|
|
64
|
-
const ROWS = { top: 'top', mid: 'mid', center: 'mid', bottom: 'bottom' };
|
|
65
|
-
const COLS = { left: 'left', center: 'center', right: 'right' };
|
|
66
|
-
// 3x3 cells indexed by "row_col"; build them once.
|
|
67
|
-
const cells = {};
|
|
68
|
-
for (const r of ['top', 'mid', 'bottom']) {
|
|
69
|
-
for (const c of ['left', 'center', 'right']) {
|
|
70
|
-
const el = document.createElement('div');
|
|
71
|
-
el.className = `cell ${r} ${c}`;
|
|
72
|
-
stage.appendChild(el);
|
|
73
|
-
cells[`${r}_${c}`] = el;
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
function cellFor(position) {
|
|
77
|
-
// position like "bottom_center" / "mid_left" / "center" (legacy mid_center alias)
|
|
78
|
-
const p = String(position || 'bottom_center');
|
|
79
|
-
if (p === 'center') return cells.mid_center;
|
|
80
|
-
const [rawRow, rawCol] = p.split('_');
|
|
81
|
-
const row = ROWS[rawRow] || 'bottom';
|
|
82
|
-
const col = COLS[rawCol] || 'center';
|
|
83
|
-
return cells[`${row}_${col}`];
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
function applyTextStyle(el, style) {
|
|
87
|
-
const s = style || {};
|
|
88
|
-
el.classList.add('size-' + (s.size || 'medium'));
|
|
89
|
-
if (s.color_hex) el.style.color = s.color_hex; else el.style.color = '#fff';
|
|
90
|
-
if (s.casing === 'upper') el.style.textTransform = 'uppercase';
|
|
91
|
-
else if (s.casing === 'lower') el.style.textTransform = 'lowercase';
|
|
92
|
-
else if (s.casing === 'title') el.style.textTransform = 'capitalize';
|
|
93
|
-
// Background heuristic from a free-text descriptor like "white rounded rectangle".
|
|
94
|
-
const bg = (s.background || '').toLowerCase();
|
|
95
|
-
const span = document.createElement('span');
|
|
96
|
-
span.textContent = '';
|
|
97
|
-
if (bg && bg !== 'none' && bg !== 'transparent') {
|
|
98
|
-
span.className = 'pill';
|
|
99
|
-
if (bg.includes('white')) { span.style.background = '#fff'; if (!s.color_hex) el.style.color = '#111'; }
|
|
100
|
-
else if (bg.includes('black')) { span.style.background = 'rgba(0,0,0,0.85)'; }
|
|
101
|
-
else { span.style.background = 'rgba(0,0,0,0.6)'; }
|
|
102
|
-
}
|
|
103
|
-
// Stroke/shadow: default a strong outline for legibility unless a pill bg is used.
|
|
104
|
-
if (!span.className) {
|
|
105
|
-
el.style.textShadow = s.stroke_or_shadow
|
|
106
|
-
? '2px 2px 0 rgba(0,0,0,0.9), -2px -2px 0 rgba(0,0,0,0.9), 2px -2px 0 rgba(0,0,0,0.9), -2px 2px 0 rgba(0,0,0,0.9)'
|
|
107
|
-
: '0 3px 10px rgba(0,0,0,0.6)';
|
|
108
|
-
}
|
|
109
|
-
return span;
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
// Read the free-text `animation_detail` for speed / easing / distance hints
|
|
113
|
-
// so two overlays with the same coarse `animation` can still differ.
|
|
114
|
-
function tuneFromDetail(detail) {
|
|
115
|
-
const d = String(detail || '').toLowerCase();
|
|
116
|
-
let durMul = 1, ease = null, dist = 1;
|
|
117
|
-
if (/(fast|quick|snappy|rapid|instant|sharp)/.test(d)) durMul = 0.6;
|
|
118
|
-
if (/(slow|gentle|smooth|gradual|soft|ease)/.test(d)) durMul = 1.5;
|
|
119
|
-
if (/(bounce|spring|elastic|jelly)/.test(d)) ease = 'elastic.out(1,0.5)';
|
|
120
|
-
if (/(overshoot|back|punch)/.test(d)) ease = 'back.out(1.7)';
|
|
121
|
-
if (/(big|large|dramatic|hard|strong)/.test(d)) dist = 1.6;
|
|
122
|
-
return { durMul, ease, dist };
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
function animateIn(el, animation, at, detail) {
|
|
126
|
-
const a = String(animation || 'fade');
|
|
127
|
-
const t = tuneFromDetail(detail);
|
|
128
|
-
const base = 0.25 * t.durMul;
|
|
129
|
-
if (a === 'pop' || a === 'bounce') {
|
|
130
|
-
tl.fromTo(el, { scale: 0.6, opacity: 0 }, { scale: 1, opacity: 1, duration: base, ease: t.ease || (a === 'bounce' ? 'elastic.out(1,0.5)' : 'back.out(1.7)') }, at);
|
|
131
|
-
} else if (a === 'slide_up') {
|
|
132
|
-
tl.fromTo(el, { y: 60 * t.dist, opacity: 0 }, { y: 0, opacity: 1, duration: base, ease: t.ease || 'power2.out' }, at);
|
|
133
|
-
} else if (a === 'slide_down') {
|
|
134
|
-
tl.fromTo(el, { y: -60 * t.dist, opacity: 0 }, { y: 0, opacity: 1, duration: base, ease: t.ease || 'power2.out' }, at);
|
|
135
|
-
} else if (a === 'shake') {
|
|
136
|
-
tl.fromTo(el, { opacity: 0 }, { opacity: 1, duration: 0.1 }, at);
|
|
137
|
-
tl.to(el, { x: '+=' + (12 * t.dist), duration: 0.05, repeat: 5, yoyo: true }, at);
|
|
138
|
-
} else if (a === 'none' || a === 'other') {
|
|
139
|
-
tl.set(el, { opacity: 1 }, at);
|
|
140
|
-
} else {
|
|
141
|
-
// fade
|
|
142
|
-
tl.fromTo(el, { opacity: 0 }, { opacity: 1, duration: 0.2 * t.durMul, ease: 'power1.out' }, at);
|
|
143
|
-
}
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
// Build the visible content. For typewriter/karaoke we split the text into
|
|
147
|
-
// per-glyph / per-word tokens so the reveal can be driven across the
|
|
148
|
-
// overlay's own on-screen duration (independent of the voiceover track).
|
|
149
|
-
function buildContent(container, ov) {
|
|
150
|
-
const a = String(ov.animation || 'fade');
|
|
151
|
-
const text = String(ov.text == null ? '' : ov.text);
|
|
152
|
-
if (a === 'typewriter') {
|
|
153
|
-
const spans = [];
|
|
154
|
-
for (const ch of Array.from(text)) {
|
|
155
|
-
const s = document.createElement('span');
|
|
156
|
-
s.className = 'tok';
|
|
157
|
-
s.textContent = ch;
|
|
158
|
-
s.style.opacity = '0';
|
|
159
|
-
container.appendChild(s);
|
|
160
|
-
if (ch.trim() !== '') spans.push(s);
|
|
161
|
-
else s.style.opacity = '1';
|
|
162
|
-
}
|
|
163
|
-
return { mode: 'typewriter', spans };
|
|
164
|
-
}
|
|
165
|
-
if (a === 'karaoke') {
|
|
166
|
-
const spans = [];
|
|
167
|
-
for (const part of text.split(/(\s+)/)) {
|
|
168
|
-
if (part === '') continue;
|
|
169
|
-
if (/^\s+$/.test(part)) { container.appendChild(document.createTextNode(part)); continue; }
|
|
170
|
-
const s = document.createElement('span');
|
|
171
|
-
s.className = 'tok';
|
|
172
|
-
s.textContent = part;
|
|
173
|
-
s.style.opacity = '0.4';
|
|
174
|
-
container.appendChild(s);
|
|
175
|
-
spans.push(s);
|
|
176
|
-
}
|
|
177
|
-
return { mode: 'karaoke', spans };
|
|
178
|
-
}
|
|
179
|
-
container.textContent = text;
|
|
180
|
-
return { mode: 'plain', spans: [] };
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
(OVERLAYS || []).forEach((ov, i) => {
|
|
184
|
-
if (!ov || !ov.text) return;
|
|
185
|
-
const el = document.createElement('div');
|
|
186
|
-
el.className = 'overlay';
|
|
187
|
-
el.id = 'ov_' + i;
|
|
188
|
-
const span = applyTextStyle(el, ov.style);
|
|
189
|
-
const content = buildContent(span, ov);
|
|
190
|
-
el.appendChild(span);
|
|
191
|
-
cellFor(ov.position).appendChild(el);
|
|
192
|
-
|
|
193
|
-
const at = typeof ov.appears_at_s === 'number' ? ov.appears_at_s : 0;
|
|
194
|
-
const dur = typeof ov.duration_s === 'number' ? ov.duration_s : 2.5;
|
|
195
91
|
tl.set(el, { visibility: 'visible' }, at);
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
tl.
|
|
202
|
-
} else if (content.mode === 'karaoke' && content.spans.length) {
|
|
203
|
-
// Light up each word in sequence across the overlay's duration.
|
|
204
|
-
const reveal = Math.min(Math.max(dur * 0.85, 0.4), Math.max(dur - 0.2, 0.4));
|
|
205
|
-
tl.set(el, { opacity: 1 }, at);
|
|
206
|
-
tl.to(content.spans, { opacity: 1, duration: 0.12, ease: 'power1.out', stagger: reveal / content.spans.length }, at);
|
|
92
|
+
if (anim === 'pop') {
|
|
93
|
+
tl.fromTo(el, { opacity: 0, scale: 0.7 }, { opacity: 1, scale: 1, duration: 0.3, ease: 'back.out(1.7)' }, at);
|
|
94
|
+
} else if (anim === 'slide_up') {
|
|
95
|
+
tl.fromTo(el, { opacity: 0, yPercent: 30 }, { opacity: 1, yPercent: 0, duration: 0.3, ease: 'power2.out' }, at);
|
|
96
|
+
} else if (anim === 'slide_down') {
|
|
97
|
+
tl.fromTo(el, { opacity: 0, yPercent: -30 }, { opacity: 1, yPercent: 0, duration: 0.3, ease: 'power2.out' }, at);
|
|
207
98
|
} else {
|
|
208
|
-
|
|
99
|
+
// Default / any unrecognized data-anim value: a plain fade.
|
|
100
|
+
tl.fromTo(el, { opacity: 0 }, { opacity: 1, duration: 0.25, ease: 'power1.out' }, at);
|
|
209
101
|
}
|
|
210
|
-
|
|
211
102
|
tl.to(el, { opacity: 0, duration: 0.2 }, Math.max(at + 0.2, at + dur));
|
|
212
103
|
tl.set(el, { visibility: 'hidden' }, at + dur + 0.21);
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
(FLOATING || []).forEach((fe, i) => {
|
|
216
|
-
if (!fe) return;
|
|
217
|
-
const el = document.createElement('div');
|
|
218
|
-
el.className = 'floating';
|
|
219
|
-
el.id = 'fe_' + i;
|
|
220
|
-
el.innerHTML = '<span class="kind">' + (fe.kind || 'element') + '</span>';
|
|
221
|
-
el.appendChild(document.createTextNode(fe.description || ''));
|
|
222
|
-
cellFor(fe.position).appendChild(el);
|
|
223
|
-
|
|
224
|
-
const at = typeof fe.appears_at_s === 'number' ? fe.appears_at_s : 0;
|
|
225
|
-
const dur = typeof fe.duration_s === 'number' ? fe.duration_s : 2.5;
|
|
226
|
-
tl.set(el, { visibility: 'visible' }, at);
|
|
227
|
-
tl.fromTo(el, { opacity: 0 }, { opacity: 1, duration: 0.2 }, at);
|
|
228
|
-
tl.to(el, { opacity: 0, duration: 0.2 }, at + dur);
|
|
229
|
-
tl.set(el, { visibility: 'hidden' }, at + dur + 0.21);
|
|
230
|
-
});
|
|
104
|
+
if (tx) el.style.transform = tx;
|
|
105
|
+
}
|
|
231
106
|
|
|
232
|
-
// Keep the timeline alive to the full video duration.
|
|
233
107
|
tl.to({}, { duration: 0.01 }, Math.max(0.01, DURATION - 0.01));
|
|
234
108
|
window.__timelines = window.__timelines || {};
|
|
235
109
|
window.__timelines.main = tl;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"id": "video-overlay",
|
|
3
|
-
"title": "
|
|
4
|
-
"description": "
|
|
3
|
+
"title": "Overlay layer: agent-authored HTML/CSS composited over a video",
|
|
4
|
+
"description": "A paint-it-yourself overlay layer, NOT a prop renderer. The scaffold seeds #overlay-root in index.html with the reference ad's overlays as real, editable HTML elements (text + a position class + data-start/data-dur timing); a tiny generic runtime only shows/hides each element at its timestamp (with an optional data-anim entrance). Every styling decision — bars, tickers, colors, fonts, a real logo <img> dropped into this dir — lives in the HTML/CSS you edit, not in props. Drop brand-bold.otf / brand-regular.otf for on-brand type. Fixed 1080x1920; copy + edit width/height for other ratios.",
|
|
5
5
|
"width": 1080,
|
|
6
6
|
"height": 1920,
|
|
7
7
|
"fps": 30,
|
|
@@ -11,19 +11,8 @@
|
|
|
11
11
|
"kind": "video",
|
|
12
12
|
"required": true,
|
|
13
13
|
"staged_as": "background.mp4",
|
|
14
|
-
"description": "The base video to
|
|
14
|
+
"description": "The base video to composite overlays onto (e.g. the concatenated reproduction clips)."
|
|
15
15
|
}
|
|
16
16
|
},
|
|
17
|
-
"params": {
|
|
18
|
-
"overlays": {
|
|
19
|
-
"kind": "json",
|
|
20
|
-
"required": true,
|
|
21
|
-
"description": "Array of overlay objects from the blueprint: {text, appears_at_s, duration_s, position(9-grid: top_left..bottom_center), animation(none|fade|pop|slide_up|slide_down|typewriter|bounce|karaoke|shake|other), style{font_style, casing, color_hex, background, stroke_or_shadow, size(small|medium|large|huge)}}."
|
|
22
|
-
},
|
|
23
|
-
"floating_elements": {
|
|
24
|
-
"kind": "json",
|
|
25
|
-
"default": [],
|
|
26
|
-
"description": "Array of non-text floating elements: {kind, description, appears_at_s, duration_s, position}. Rendered as labeled dashed placeholders for the agent to replace with real image overlays."
|
|
27
|
-
}
|
|
28
|
-
}
|
|
17
|
+
"params": {}
|
|
29
18
|
}
|
|
@@ -917,6 +917,13 @@ var FAL_IMAGE_MIMES = ["image/png", "image/jpeg", "image/webp"];
|
|
|
917
917
|
var FAL_VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"];
|
|
918
918
|
var DECONSTRUCT_VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"];
|
|
919
919
|
var FAL_AUDIO_MIMES = ["audio/wav", "audio/mpeg", "audio/mp3"];
|
|
920
|
+
var IMAGE_GENERATE_MODELS = [
|
|
921
|
+
"openai/gpt-5.4-image-2",
|
|
922
|
+
"google/gemini-3.5-flash",
|
|
923
|
+
"google/gemini-3.1-flash-image-preview",
|
|
924
|
+
"google/gemini-3-pro-image-preview",
|
|
925
|
+
"recraft/recraft-v4.1-pro-vector"
|
|
926
|
+
];
|
|
920
927
|
var MODEL_REGISTRY = {
|
|
921
928
|
text_generate: {
|
|
922
929
|
"~google/gemini-flash-latest": {
|
|
@@ -1493,13 +1500,32 @@ var OutputRef = z.object({
|
|
|
1493
1500
|
node: z.string(),
|
|
1494
1501
|
output: z.string()
|
|
1495
1502
|
}).strict();
|
|
1503
|
+
var VideoMeta = z.object({
|
|
1504
|
+
duration_s: z.number(),
|
|
1505
|
+
// Each sequenced voiceover turn on the absolute timeline.
|
|
1506
|
+
vo_segments: z.array(
|
|
1507
|
+
z.object({
|
|
1508
|
+
slot: z.string(),
|
|
1509
|
+
start_s: z.number(),
|
|
1510
|
+
end_s: z.number(),
|
|
1511
|
+
scene: z.number().optional(),
|
|
1512
|
+
speaker: z.string().optional()
|
|
1513
|
+
})
|
|
1514
|
+
).default([]),
|
|
1515
|
+
// Scenes with a single on-camera speaker — each MUST be lip-synced. Either a
|
|
1516
|
+
// bare scene index (validator falls back to the scaffold naming convention) or
|
|
1517
|
+
// `{ scene, lipsync_node }`, which names the exact node to look for so a
|
|
1518
|
+
// hand-authored canvas with differently-named clip nodes isn't a false miss.
|
|
1519
|
+
talking_scenes: z.array(z.union([z.number(), z.object({ scene: z.number(), lipsync_node: z.string() })])).default([])
|
|
1520
|
+
}).strict().optional();
|
|
1496
1521
|
var CanvasMetadata = z.object({
|
|
1497
1522
|
name: z.string().optional(),
|
|
1498
1523
|
description: z.string().optional(),
|
|
1499
1524
|
// Free-form, human-facing next-steps guide embedded by scaffolds (e.g.
|
|
1500
1525
|
// `scaffold-video`) so the editable checklist travels inside the canvas
|
|
1501
1526
|
// instead of only printing to stdout. Ignored by the engine.
|
|
1502
|
-
todo: z.unknown().optional()
|
|
1527
|
+
todo: z.unknown().optional(),
|
|
1528
|
+
video: VideoMeta
|
|
1503
1529
|
}).strict().optional();
|
|
1504
1530
|
var CanvasSchema = z.object({
|
|
1505
1531
|
schema: z.literal("baker-canvas/1"),
|
|
@@ -1964,8 +1990,12 @@ var STAGE_CODES = {
|
|
|
1964
1990
|
DUPLICATE_ID: "DUPLICATE_NODE_ID",
|
|
1965
1991
|
CYCLE: "GRAPH_CYCLE",
|
|
1966
1992
|
SLOT: "PROMPT_SLOT_UNRESOLVED",
|
|
1967
|
-
OUTPUT: "OUTPUT_REF_INVALID"
|
|
1993
|
+
OUTPUT: "OUTPUT_REF_INVALID",
|
|
1994
|
+
VO_OVERLAP: "VIDEO_VO_OVERLAP",
|
|
1995
|
+
AUDIO_DURATION: "VIDEO_AUDIO_DURATION",
|
|
1996
|
+
LIPSYNC_MISSING: "VIDEO_LIPSYNC_MISSING"
|
|
1968
1997
|
};
|
|
1998
|
+
var VIDEO_TIME_SLACK_S = 0.75;
|
|
1969
1999
|
function validateCanvas(input, registry) {
|
|
1970
2000
|
const issues = [];
|
|
1971
2001
|
const shape = CanvasSchema.safeParse(input);
|
|
@@ -1987,6 +2017,7 @@ function validateCanvas(input, registry) {
|
|
|
1987
2017
|
checkAllSlots(ctx);
|
|
1988
2018
|
const estimatedCredits = estimateCredits(ctx);
|
|
1989
2019
|
checkOutputRef(ctx);
|
|
2020
|
+
checkVideoInvariants(ctx);
|
|
1990
2021
|
if (issues.length > 0) return { ok: false, issues };
|
|
1991
2022
|
return { ok: true, canvas, estimatedCredits };
|
|
1992
2023
|
}
|
|
@@ -2255,6 +2286,46 @@ function estimateCredits(ctx) {
|
|
|
2255
2286
|
}
|
|
2256
2287
|
return total;
|
|
2257
2288
|
}
|
|
2289
|
+
function checkVideoInvariants(ctx) {
|
|
2290
|
+
const meta = ctx.canvas.metadata?.video;
|
|
2291
|
+
if (!meta) return;
|
|
2292
|
+
const segments = [...meta.vo_segments].sort((a, b) => a.start_s - b.start_s);
|
|
2293
|
+
for (let i = 1; i < segments.length; i++) {
|
|
2294
|
+
const prev = segments[i - 1];
|
|
2295
|
+
const cur = segments[i];
|
|
2296
|
+
if (!prev || !cur) continue;
|
|
2297
|
+
if (cur.start_s < prev.end_s - VIDEO_TIME_SLACK_S) {
|
|
2298
|
+
ctx.issues.push({
|
|
2299
|
+
path: "metadata.video.vo_segments",
|
|
2300
|
+
code: STAGE_CODES.VO_OVERLAP,
|
|
2301
|
+
message: `voiceover turns overlap: "${prev.slot}" runs to ${prev.end_s}s but "${cur.slot}" starts at ${cur.start_s}s \u2014 sequence them so dialogue alternates instead of stacking`
|
|
2302
|
+
});
|
|
2303
|
+
}
|
|
2304
|
+
}
|
|
2305
|
+
const audioEnd = segments.reduce((m, s) => Math.max(m, s.end_s), 0);
|
|
2306
|
+
if (audioEnd > meta.duration_s + VIDEO_TIME_SLACK_S) {
|
|
2307
|
+
ctx.issues.push({
|
|
2308
|
+
path: "metadata.video.duration_s",
|
|
2309
|
+
code: STAGE_CODES.AUDIO_DURATION,
|
|
2310
|
+
message: `audio runs to ${audioEnd}s but the video is ${meta.duration_s}s \u2014 trim the voiceover or extend the video so audio \u2248 video length`
|
|
2311
|
+
});
|
|
2312
|
+
}
|
|
2313
|
+
for (const entry of meta.talking_scenes) {
|
|
2314
|
+
const scene = typeof entry === "number" ? entry : entry.scene;
|
|
2315
|
+
const synced = typeof entry === "number" ? ctx.canvas.nodes.some((n) => {
|
|
2316
|
+
if (n.type !== "video_lipsync") return false;
|
|
2317
|
+
const video = n.inputs?.video;
|
|
2318
|
+
return video === `$ref:s${scene}_trim.video` || video === `$ref:s${scene}_clip.video`;
|
|
2319
|
+
}) : ctx.canvas.nodes.some((n) => n.id === entry.lipsync_node && n.type === "video_lipsync");
|
|
2320
|
+
if (!synced) {
|
|
2321
|
+
ctx.issues.push({
|
|
2322
|
+
path: "metadata.video.talking_scenes",
|
|
2323
|
+
code: STAGE_CODES.LIPSYNC_MISSING,
|
|
2324
|
+
message: `scene ${scene} has a single on-camera speaker but no video_lipsync on s${scene}_clip \u2014 its mouth will drift out of sync with the voiceover`
|
|
2325
|
+
});
|
|
2326
|
+
}
|
|
2327
|
+
}
|
|
2328
|
+
}
|
|
2258
2329
|
function checkOutputRef(ctx) {
|
|
2259
2330
|
const out = ctx.canvas.output;
|
|
2260
2331
|
if (!out) return;
|
|
@@ -4813,7 +4884,7 @@ var dialogueNode = delegated({
|
|
|
4813
4884
|
|
|
4814
4885
|
// src/engine/nodes/remote/image.ts
|
|
4815
4886
|
import { z as z15 } from "zod";
|
|
4816
|
-
var
|
|
4887
|
+
var IMAGE_GENERATE_MODELS2 = [
|
|
4817
4888
|
"openai/gpt-5.4-image-2",
|
|
4818
4889
|
"google/gemini-3.5-flash",
|
|
4819
4890
|
"google/gemini-3.1-flash-image-preview",
|
|
@@ -4821,7 +4892,7 @@ var IMAGE_GENERATE_MODELS = [
|
|
|
4821
4892
|
"recraft/recraft-v4.1-pro-vector"
|
|
4822
4893
|
];
|
|
4823
4894
|
var ImageGenerateParams = z15.object({
|
|
4824
|
-
model: z15.enum(
|
|
4895
|
+
model: z15.enum(IMAGE_GENERATE_MODELS2),
|
|
4825
4896
|
prompt: z15.string().min(1),
|
|
4826
4897
|
aspect_ratio: z15.enum(["1:1", "16:9", "9:16", "4:3", "3:4", "3:2", "2:3", "4:5", "5:4", "21:9", "1:4", "4:1", "1:8", "8:1"]).optional(),
|
|
4827
4898
|
image_size: z15.enum(["0.5K", "1K", "2K", "4K"]).optional(),
|
|
@@ -5648,6 +5719,7 @@ function createEngineFromEnv(opts = {}) {
|
|
|
5648
5719
|
export {
|
|
5649
5720
|
SEEDANCE_DURATIONS,
|
|
5650
5721
|
ELEVENLABS_MAX_MUSIC_LENGTH_MS,
|
|
5722
|
+
IMAGE_GENERATE_MODELS,
|
|
5651
5723
|
MODEL_REGISTRY,
|
|
5652
5724
|
BackendClient2 as BackendClient,
|
|
5653
5725
|
Engine2 as Engine,
|
|
@@ -5659,4 +5731,4 @@ export {
|
|
|
5659
5731
|
defaultRegistry,
|
|
5660
5732
|
createEngineFromEnv
|
|
5661
5733
|
};
|
|
5662
|
-
//# sourceMappingURL=chunk-
|
|
5734
|
+
//# sourceMappingURL=chunk-JIDZ37KG.js.map
|