npm - @koda-sl/baker-cli - Versions diffs - 0.68.0 → 0.71.2 - Mend

@koda-sl/baker-cli 0.68.0 → 0.71.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/README.md +76 -3
package/canvas/video-overlay-composition/index.html +65 -191
package/canvas/video-overlay-composition/meta.json +4 -15
package/dist/{chunk-K6LHXCKD.js → chunk-JIDZ37KG.js} +77 -5
package/dist/chunk-JIDZ37KG.js.map +1 -0
package/dist/cli.js +714 -195
package/dist/cli.js.map +1 -1
package/dist/engine/index.d.ts +14 -0
package/dist/engine/index.js +1 -1
package/package.json +1 -1
package/dist/chunk-K6LHXCKD.js.map +0 -1

package/README.md CHANGED Viewed

@@ -1587,6 +1587,8 @@ Each external source is its own subcommand. Pick the verb that matches the sourc
 | `baker images find <q>` | Fanout: library + opted-in providers | off |
 | `baker images stock <q> [--type photo\|vector\|psd]` | Magnific (Freepik's dev API) — photos, vectors, illustrations, PSDs (~250M assets) | off |
 | `baker images google <q>` | Google Images via the official Custom Search JSON API | off |
+| `baker images pinterest <q>` | Pinterest reference imagery via ScrapeCreators (photo-real mood boards) | off |
+| `baker images generate <prompt>` | AI image generation via OpenRouter image models (cost-tracked) | **always on** (the bytes are the artifact) |
 | `baker images logo <domain>` | Brand logo via Brandfetch CDN | **on** (top 1) |
 | `baker images icon <name>` | Iconify (200+ icon sets, no API key) | off (CDN URL is stable) |
 | `baker images gif <q>` | Reaction GIFs / memes via Giphy (paid-social creative) | off |
@@ -1726,6 +1728,69 @@ Requires both `GOOGLE_CUSTOM_SEARCH_API_KEY` and `GOOGLE_CUSTOM_SEARCH_ENGINE_ID
 | `--auto-ingest`  | Ingest top N (0–20, default 0) and return Baker-owned URLs on ingested hits                                      |
 | `--context`      | Free-text hint passed to Gemini describe to bias the generated description and tags (overrides provider-derived context on auto-ingest paths) |
+### `baker images pinterest <query>`
+Pinterest image search via ScrapeCreators. The best source for **photo-real reference and mood imagery** — interiors, fashion, food, product styling, lifestyle. ⚠ Unverified, trademark-bearing web content — use for *reference*, not republishing. Strongest paired with `generate --reference` (find the look on Pinterest, then generate an owned, on-brand image from it). Auto-ingest is opt-in.
+```bash
+baker images pinterest "scandinavian living room"
+baker images pinterest "minimalist skincare product photography" --limit 20
+baker images pinterest "cozy coffee shop interior" --auto-ingest 2 --context "Mood reference for hero photography"
+```
+**Flags:**
+| Flag            | Description                                                                 |
+|-----------------|----------------------------------------------------------------------------|
+| `--limit`       | Max results (1–20, default 5)                                              |
+| `--auto-ingest` | Ingest top N (0–20, default 0) and return Baker-owned URLs on ingested hits |
+| `--context`     | Free-text hint passed to Gemini describe on auto-ingest (overrides the pin title) |
+Uses the endpoint `api.scrapecreators.com/v1/pinterest/search` (requires `SCRAPE_CREATORS_API_KEY`). Each request costs $0.00188, and results are cached for 1 day.
+### `baker images generate <prompt>`
+Generate an image with AI (OpenRouter image models) and ingest it into the library. The generated bytes **are** the artifact, so the result is always auto-ingested (described + embedded) — the next `baker images library` query finds it. Cost is tracked per request via OpenRouter's reported usage. Models and defaults mirror the Baker canvas.
+```bash
+baker images generate "a friendly golden retriever in a bright modern living room" --aspect-ratio 16:9
+baker images generate "hero shot of a matte black water bottle on wet marble, studio light" \
+  --model google/gemini-3-pro-image-preview --image-size 2K --aspect-ratio 4:5
+baker images generate "home office hero, warm minimalist, natural light, 35mm photo" \
+  --reference "https://i.pinimg.com/…/pin1.jpg,https://…/brand-product.png"
+baker images generate "put this product on a marble countertop in soft daylight" \
+  --reference "./src/brand/logos/product.png,./refs/kitchen-mood.jpg"   # local sandbox files
+baker images generate "flat geometric mascot, brand palette" \
+  --model recraft/recraft-v4.1-pro-vector --rgb-colors "[[10,10,10],[255,80,0]]" --bg-rgb "[255,255,255]"
+```
+**Models** (`--model`, default `openai/gpt-5.4-image-2`):
+| Model | Best for | Aspect ratios | Sizes |
+|---|---|---|---|
+| `openai/gpt-5.4-image-2` **(default)** | Photoreal + cleanest in-image text — ad/landing reproduction | standard set | `1K` `2K` `4K` |
+| `google/gemini-3-pro-image-preview` | Highest fidelity (Nano Banana Pro) | standard set | `1K` `2K` `4K` |
+| `google/gemini-3.5-flash` | Fast; extreme aspect ratios | standard **+** `1:4` `4:1` `1:8` `8:1` | `0.5K`–`4K` |
+| `google/gemini-3.1-flash-image-preview` | Same as 3.5 flash (preview) | standard **+** `1:4` `4:1` `1:8` `8:1` | `0.5K`–`4K` |
+| `recraft/recraft-v4.1-pro-vector` | Vector/flat/SVG-style with palette control | standard set | `1K` `2K` `4K` |
+Standard aspect ratios: `1:1` `2:3` `3:2` `3:4` `4:3` `4:5` `5:4` `9:16` `16:9` `21:9`.
+**Flags:**
+| Flag | Description |
+|---|---|
+| `--model` | Model id (default `openai/gpt-5.4-image-2`) |
+| `--aspect-ratio` | Output aspect ratio (default `1:1`) |
+| `--image-size` | Resolution: `1K` (default), `2K`, or `4K` (the Gemini flash models also accept `0.5K`) |
+| `--reference` | Comma-separated visual references, each either a **public image URL** (Pinterest / stock / library `imageUrl`) **or a local file path** (a sandbox image — brand logo, product shot, cropped photo, screenshot). Local files are downscaled (≤1536px) and inlined automatically — no manual upload. Applied in order; the biggest quality lever for photographed, on-brand output. Split is on `,`, so a URL containing a literal comma in its query string would be torn in two (rare for image CDNs — pass it alone if it occurs); a single `data:` URL is taken whole. |
+| `--strength` | Recraft only: vectorization strength 0–1 |
+| `--rgb-colors` | Recraft only: JSON palette `[[r,g,b],…]` |
+| `--bg-rgb` | Recraft only: JSON background `[r,g,b]` |
+| `--context` | Describe-hint override for the ingested row (defaults to the prompt) |
+Returns `{ images: [{ imageId, imageUrl, deduped, width, height }], model, costUsd }`. `imageUrl` is library-owned and ready to place. Requires `OPENROUTER_API_KEY` on the Convex deployment. No `seed` (OpenRouter image_config has no seed slot). Identical re-generations dedup by content hash.
 ### `baker images logo <domain>`
 Brand logo via Brandfetch CDN (`fallback/404`). Probes all 5 logo variants in parallel and returns whichever ones the domain actually publishes. Auto-ingests the first by default.
@@ -3493,9 +3558,17 @@ Turn a reference video into a **runnable, self-validated reproduction canvas** i
 1. **`video_deconstruct`** (`~google/gemini-pro-latest`, full mode) — reverse-engineers the video into a scene-by-scene blueprint + word-level transcript, written next to the canvas as **`prompt.json`**. Each scene's `start_frame_prompt`/`end_frame_prompt` are inlined into the frame nodes (see below); `prompt.json` then rides along as the shared **global style reference** (palette, cast cohesion) and as provenance.
 2. **recurring-element selection** (`~google/gemini-flash-latest`) — picks only the **recurring, identity-critical** elements (each `global.cast` person, a recurring animal, a showcased product, the brand logo) and the scene indices each appears in. One real reference image grounds each element across **every** frame it appears in, so the same actor stays consistent the whole video.
-It then scaffolds the full pipeline: per scene, two **static-ad-grade frames** (`image_generate` with its **own self-contained `params.prompt`** — edit a frame node to change only that frame; `prompt.json` is wired as a demoted shared-style `target_blueprint`, a per-element reference legend, the real extracted frame as a composition anchor) → `video_generate` (Seedance first/last-frame, fed an ultra-detailed motion brief composed from the scene's action, camera, dialogue, and transcript; duration snapped to the nearest allowed clip length). Each recurring element gets **one shared `[TODO]` ingest slot** wired into every frame it appears in. Globally it casts a `voice_select` per speaker, **one continuous `tts` per speaker** (every line that speaker says concatenated into a single read, voice locked via `voice_ref`), a `sound_effect` per SFX, a `music` bed (styled after the AudD-identified track when available), then concatenates the clips, burns on the animated overlays (the `video-overlay` composition, copied next to the canvas — brand fonts via `@font-face`, plus `typewriter`/`karaoke` reveals), mixes the audio (`audio_timeline`), and muxes it under the video.
+It then scaffolds the full pipeline: per scene, two **static-ad-grade frames** (`image_generate` with its **own self-contained `params.prompt`** — edit a frame node to change only that frame; `prompt.json` is wired as a demoted shared-style `target_blueprint`, a per-element reference legend, the real extracted frame as a composition anchor) → `video_generate` (Seedance first/last-frame, fed an ultra-detailed motion brief composed from the scene's action, camera, dialogue, and transcript; duration snapped to the nearest allowed clip length).
+**Sequenced audio.** Dialogue is a back-and-forth on one absolute timeline, so each **contiguous same-speaker turn** becomes its own `tts` placed at its real `start_s` — turns alternate and never stack (the earlier design concatenated each speaker's whole monologue at their earliest timestamp, so two voices played in parallel for the entire video). Each speaker is locked to one shared `voice_select` voice; a `sound_effect` per SFX and a `music` bed (styled after the AudD-identified track when available, ducked under the voices) round out the mix (`audio_timeline`). The final mux normalizes the soundtrack to **−14 LUFS (stereo)** so the output plays loud in every player — the raw mix is quiet mono, which reads as "no sound."
+**Lip-sync by default.** Seedance animates a generic mouth that won't track the separate voiceover, so any scene with a **single on-camera speaker** (a `dialogue.speaker` that maps to a `global.cast` member, when `global.voiceover.mode` isn't pure `voiceover`/`none`) routes its clip through `video_lipsync` against that scene's own turn audio (~20 cr/scene). Scenes with two on-camera speakers are left un-synced on purpose (one lip-sync track can't drive two faces) and flagged for you to split or pick a primary.
+**Timeline-accurate picture.** Seedance can't render under 4s, so each clip is generated at the smallest allowed duration ≥ the scene length and then **trimmed back to the exact scene duration** before concat. This keeps the concatenated picture on the same timeline as the absolute-timed audio — without it, short scenes balloon to 4s, the spine runs far longer than the soundtrack, and every line plays over the wrong (slowed) scene so the lips never match. Frames are also prompted as **clean text-free plates** (no baked captions/lower-thirds/tickers/logos-as-text) so the overlay layer is the single source of on-screen text.
+**Overlays are agent-painted HTML, not props.** The clips are concatenated, then the `video-overlay` composition (copied next to the canvas) composites the overlay layer. The scaffold **bakes the reference's overlays into that composition's `index.html` as real, editable HTML** (each overlay is a plain element with its text, a `.pos-*` position class, and `data-start`/`data-dur` timing); a tiny generic runtime only shows/hides each element at its timestamp (with an optional `data-anim` entrance). It makes **no styling decisions** — bars, tickers, colors, fonts, and a real logo `<img>` you drop into the dir all live in the HTML/CSS you edit. Floating elements (logo bugs) are seeded as commented `<img>` stubs so an un-edited render stays clean. Drop `brand-bold.otf`/`brand-regular.otf` for on-brand type.
-The emitted canvas is validated (`validateCanvasDeep`) before it's written, so it always runs. The full editable checklist is embedded in the canvas as **`metadata.todo`** (and a step-by-step guide in `metadata.description`). stdout returns `{ ok, canvas_path, prompt_path, models, stats, checklist }` — the **checklist** lists the recurring elements (with the scenes each spans and the real source image to supply), voices to confirm, SFX/overlay counts, music status, and any scenes clamped to the 15s clip ceiling.
+The emitted canvas is validated (`validateCanvasDeep`) before it's written, so it always runs. It also carries a **`metadata.video`** timing plan that `baker canvas validate` proves **statically, before any billed render**: no two voiceover turns overlap, the audio length ≈ the video length, and every single-on-camera-speaker scene has a lip-sync node. The full editable checklist is embedded as **`metadata.todo`** (with a step-by-step guide in `metadata.description`). stdout returns `{ ok, canvas_path, prompt_path, models, stats, checklist }`.
 ```bash
 baker canvas scaffold-video ./reference-ad.mp4 --focus "competitor UGC ad for <brand>"
@@ -3509,7 +3582,7 @@ baker canvas run ./reference-ad.video.canvas.json
 |---|---|---|
 | `--out <path>` | `<video-dir>/<name>.video.canvas.json` | Where to write the canvas (composition is copied alongside). |
 | `--frames <mode>` | `generate` | `generate` regenerates frames anchored on the originals; `reuse` wires the real extracted frames straight into the clips (faithful, cheaper). |
-| `--max-scenes <n>` | provider default | Cap the number of scenes the deconstruct emits. |
+| `--max-scenes <n>` | all source scenes | **Cost lever that reduces fidelity** — caps the deconstruct, MERGING away every scene beyond the cap (fewer cuts, lost beats). Prints a warning when set; omit it to reproduce every scene. |
 | `--language <code>` | auto | Transcript/dialogue language hint (e.g. `fr`, `en`). |
 | `--focus <text>` | — | Known provenance/emphasis to ground the deconstruct. |
 | `--deconstruct-model <id>` | `~google/gemini-pro-latest` | Override the `video_deconstruct` model. |

package/canvas/video-overlay-composition/index.html CHANGED Viewed

@@ -5,10 +5,22 @@
     <meta name="viewport" content="width=1080, height=1920" />
     <script src="./gsap.min.js"></script>
     <style>
-      /* Brand font: drop `brand-bold.otf` / `brand-regular.otf` (or .ttf) into
-         this composition dir to render overlays in the real brand typeface. If
-         the files are absent the @font-face simply fails and the system stack
-         below is used — so the composition still renders either way. */
+      /* ═══════════════════════════════════════════════════════════════════════
+         OVERLAY LAYER — PAINT THIS YOURSELF.
+         The scaffold seeds #overlay-root (below) with the reference ad's overlays
+         as plain HTML elements — real text, a position class, and data-start/
+         data-dur timing. This file is YOURS: restyle every rule here, add classes,
+         build a lower-third bar or a scrolling ticker, drop a logo image into this
+         dir and reference it with <img>. The only contract is:
+           • keep each overlay element inside #overlay-root
+           • keep its data-start / data-dur (seconds) — the runtime shows/hides by them
+           • (optional) data-anim="fade|slide_up|slide_down|pop" for a canned entrance
+         The runtime makes NO styling decisions. How it looks is 100% your CSS/markup.
+         For on-brand type, drop brand-bold.otf / brand-regular.otf into this dir.
+         ═══════════════════════════════════════════════════════════════════════ */
       @font-face {
         font-family: 'BrandFont';
         src: url('./brand-bold.otf') format('opentype'), url('./brand-bold.ttf') format('truetype');
@@ -25,211 +37,73 @@
       * { margin: 0; padding: 0; box-sizing: border-box; }
       html, body { width: 1080px; height: 1920px; overflow: hidden; background: #000; }
       video#bg { position: absolute; inset: 0; width: 100%; height: 100%; object-fit: cover; }
-      /* 9-grid: the stage is a 3x3 flex grid; each cell aligns its overlay. */
-      #stage { position: absolute; inset: 0; display: grid;
-        grid-template-rows: 1fr 1fr 1fr; grid-template-columns: 1fr 1fr 1fr; padding: 80px 48px; }
-      .cell { display: flex; padding: 8px; }
-      .cell.top { align-items: flex-start; } .cell.mid { align-items: center; } .cell.bottom { align-items: flex-end; }
-      .cell.left { justify-content: flex-start; } .cell.center { justify-content: center; } .cell.right { justify-content: flex-end; }
-      .overlay { visibility: hidden; max-width: 100%; font-family: 'BrandFont', 'Arial Black', 'Helvetica Neue', sans-serif;
-        font-weight: 900; line-height: 1.15; text-align: center; word-break: break-word; }
-      .overlay.size-small { font-size: 40px; } .overlay.size-medium { font-size: 64px; }
-      .overlay.size-large { font-size: 92px; } .overlay.size-huge { font-size: 128px; }
-      .overlay .pill { display: inline-block; padding: 0.12em 0.4em; border-radius: 0.22em; }
-      /* Per-glyph/word spans for typewriter & karaoke must stay inline. */
-      .overlay .tok { display: inline; }
-      .floating { visibility: hidden; font-family: 'Helvetica Neue', sans-serif; font-size: 30px; color: #fff;
-        border: 2px dashed rgba(255,255,255,0.85); border-radius: 12px; padding: 10px 16px;
-        background: rgba(0,0,0,0.45); max-width: 90%; text-align: center; }
-      .floating .kind { display: block; font-size: 20px; opacity: 0.7; text-transform: uppercase; letter-spacing: 1px; }
+      #overlay-root { position: absolute; inset: 0; font-family: 'BrandFont', 'Helvetica Neue', Arial, sans-serif; }
+      /* Starter style for a seeded overlay — deliberately plain. Override freely. */
+      .ov {
+        position: absolute; visibility: hidden; max-width: 86%;
+        font-size: 64px; font-weight: 800; line-height: 1.15; color: #fff; text-align: center;
+        white-space: normal; overflow-wrap: break-word;
+        text-shadow: 0 2px 8px rgba(0,0,0,0.6), 0 0 2px rgba(0,0,0,0.9);
+      }
+      .ov.fe { font-size: 30px; font-weight: 600; opacity: 0.9; }
+      /* 9-grid position helpers (absolute). Tweak the insets or add your own. */
+      .pos-top-left      { top: 90px; left: 56px; text-align: left; }
+      .pos-top-center    { top: 90px; left: 50%; transform: translateX(-50%); }
+      .pos-top-right     { top: 90px; right: 56px; text-align: right; }
+      .pos-mid-left,
+      .pos-center-left   { top: 50%; left: 56px; transform: translateY(-50%); text-align: left; }
+      .pos-center,
+      .pos-mid-center    { top: 50%; left: 50%; transform: translate(-50%,-50%); }
+      .pos-mid-right,
+      .pos-center-right  { top: 50%; right: 56px; transform: translateY(-50%); text-align: right; }
+      .pos-bottom-left   { bottom: 150px; left: 56px; text-align: left; }
+      .pos-bottom-center { bottom: 150px; left: 50%; transform: translateX(-50%); width: 88%; }
+      .pos-bottom-right  { bottom: 150px; right: 56px; text-align: right; }
     </style>
   </head>
   <body>
     <div id="root" data-composition-id="main" data-start="0" data-duration="{{duration}}" data-width="1080" data-height="1920">
       <video id="bg" src="background.mp4" muted></video>
-      <div id="stage"></div>
+      <div id="overlay-root">
+<!--OVERLAYS-->
+      </div>
     </div>
     <script>
       (() => {
-        const OVERLAYS = {{overlays}};
-        const FLOATING = {{floating_elements}};
         const DURATION = parseFloat('{{duration}}');
-        const stage = document.getElementById('stage');
         const tl = gsap.timeline({ paused: true });
+        const els = Array.from(document.querySelectorAll('#overlay-root [data-start]'));
+        // Generic timeline: show each element at data-start, hide at start+data-dur,
+        // with an optional canned entrance from data-anim. No styling decisions here —
+        // the look lives entirely in the CSS/markup above.
+        for (const el of els) {
+          const at = parseFloat(el.getAttribute('data-start') || '0') || 0;
+          const dur = parseFloat(el.getAttribute('data-dur') || '2.5') || 2.5;
+          const anim = el.getAttribute('data-anim') || '';
+          // Preserve any positioning transform the CSS set (translate(...)).
+          const baseTransform = getComputedStyle(el).transform;
+          const tx = baseTransform && baseTransform !== 'none' ? baseTransform : '';
-        const ROWS = { top: 'top', mid: 'mid', center: 'mid', bottom: 'bottom' };
-        const COLS = { left: 'left', center: 'center', right: 'right' };
-        // 3x3 cells indexed by "row_col"; build them once.
-        const cells = {};
-        for (const r of ['top', 'mid', 'bottom']) {
-          for (const c of ['left', 'center', 'right']) {
-            const el = document.createElement('div');
-            el.className = `cell ${r} ${c}`;
-            stage.appendChild(el);
-            cells[`${r}_${c}`] = el;
-          }
-        }
-        function cellFor(position) {
-          // position like "bottom_center" / "mid_left" / "center" (legacy mid_center alias)
-          const p = String(position || 'bottom_center');
-          if (p === 'center') return cells.mid_center;
-          const [rawRow, rawCol] = p.split('_');
-          const row = ROWS[rawRow] || 'bottom';
-          const col = COLS[rawCol] || 'center';
-          return cells[`${row}_${col}`];
-        }
-        function applyTextStyle(el, style) {
-          const s = style || {};
-          el.classList.add('size-' + (s.size || 'medium'));
-          if (s.color_hex) el.style.color = s.color_hex; else el.style.color = '#fff';
-          if (s.casing === 'upper') el.style.textTransform = 'uppercase';
-          else if (s.casing === 'lower') el.style.textTransform = 'lowercase';
-          else if (s.casing === 'title') el.style.textTransform = 'capitalize';
-          // Background heuristic from a free-text descriptor like "white rounded rectangle".
-          const bg = (s.background || '').toLowerCase();
-          const span = document.createElement('span');
-          span.textContent = '';
-          if (bg && bg !== 'none' && bg !== 'transparent') {
-            span.className = 'pill';
-            if (bg.includes('white')) { span.style.background = '#fff'; if (!s.color_hex) el.style.color = '#111'; }
-            else if (bg.includes('black')) { span.style.background = 'rgba(0,0,0,0.85)'; }
-            else { span.style.background = 'rgba(0,0,0,0.6)'; }
-          }
-          // Stroke/shadow: default a strong outline for legibility unless a pill bg is used.
-          if (!span.className) {
-            el.style.textShadow = s.stroke_or_shadow
-              ? '2px 2px 0 rgba(0,0,0,0.9), -2px -2px 0 rgba(0,0,0,0.9), 2px -2px 0 rgba(0,0,0,0.9), -2px 2px 0 rgba(0,0,0,0.9)'
-              : '0 3px 10px rgba(0,0,0,0.6)';
-          }
-          return span;
-        }
-        // Read the free-text `animation_detail` for speed / easing / distance hints
-        // so two overlays with the same coarse `animation` can still differ.
-        function tuneFromDetail(detail) {
-          const d = String(detail || '').toLowerCase();
-          let durMul = 1, ease = null, dist = 1;
-          if (/(fast|quick|snappy|rapid|instant|sharp)/.test(d)) durMul = 0.6;
-          if (/(slow|gentle|smooth|gradual|soft|ease)/.test(d)) durMul = 1.5;
-          if (/(bounce|spring|elastic|jelly)/.test(d)) ease = 'elastic.out(1,0.5)';
-          if (/(overshoot|back|punch)/.test(d)) ease = 'back.out(1.7)';
-          if (/(big|large|dramatic|hard|strong)/.test(d)) dist = 1.6;
-          return { durMul, ease, dist };
-        }
-        function animateIn(el, animation, at, detail) {
-          const a = String(animation || 'fade');
-          const t = tuneFromDetail(detail);
-          const base = 0.25 * t.durMul;
-          if (a === 'pop' || a === 'bounce') {
-            tl.fromTo(el, { scale: 0.6, opacity: 0 }, { scale: 1, opacity: 1, duration: base, ease: t.ease || (a === 'bounce' ? 'elastic.out(1,0.5)' : 'back.out(1.7)') }, at);
-          } else if (a === 'slide_up') {
-            tl.fromTo(el, { y: 60 * t.dist, opacity: 0 }, { y: 0, opacity: 1, duration: base, ease: t.ease || 'power2.out' }, at);
-          } else if (a === 'slide_down') {
-            tl.fromTo(el, { y: -60 * t.dist, opacity: 0 }, { y: 0, opacity: 1, duration: base, ease: t.ease || 'power2.out' }, at);
-          } else if (a === 'shake') {
-            tl.fromTo(el, { opacity: 0 }, { opacity: 1, duration: 0.1 }, at);
-            tl.to(el, { x: '+=' + (12 * t.dist), duration: 0.05, repeat: 5, yoyo: true }, at);
-          } else if (a === 'none' || a === 'other') {
-            tl.set(el, { opacity: 1 }, at);
-          } else {
-            // fade
-            tl.fromTo(el, { opacity: 0 }, { opacity: 1, duration: 0.2 * t.durMul, ease: 'power1.out' }, at);
-          }
-        }
-        // Build the visible content. For typewriter/karaoke we split the text into
-        // per-glyph / per-word tokens so the reveal can be driven across the
-        // overlay's own on-screen duration (independent of the voiceover track).
-        function buildContent(container, ov) {
-          const a = String(ov.animation || 'fade');
-          const text = String(ov.text == null ? '' : ov.text);
-          if (a === 'typewriter') {
-            const spans = [];
-            for (const ch of Array.from(text)) {
-              const s = document.createElement('span');
-              s.className = 'tok';
-              s.textContent = ch;
-              s.style.opacity = '0';
-              container.appendChild(s);
-              if (ch.trim() !== '') spans.push(s);
-              else s.style.opacity = '1';
-            }
-            return { mode: 'typewriter', spans };
-          }
-          if (a === 'karaoke') {
-            const spans = [];
-            for (const part of text.split(/(\s+)/)) {
-              if (part === '') continue;
-              if (/^\s+$/.test(part)) { container.appendChild(document.createTextNode(part)); continue; }
-              const s = document.createElement('span');
-              s.className = 'tok';
-              s.textContent = part;
-              s.style.opacity = '0.4';
-              container.appendChild(s);
-              spans.push(s);
-            }
-            return { mode: 'karaoke', spans };
-          }
-          container.textContent = text;
-          return { mode: 'plain', spans: [] };
-        }
-        (OVERLAYS || []).forEach((ov, i) => {
-          if (!ov || !ov.text) return;
-          const el = document.createElement('div');
-          el.className = 'overlay';
-          el.id = 'ov_' + i;
-          const span = applyTextStyle(el, ov.style);
-          const content = buildContent(span, ov);
-          el.appendChild(span);
-          cellFor(ov.position).appendChild(el);
-          const at = typeof ov.appears_at_s === 'number' ? ov.appears_at_s : 0;
-          const dur = typeof ov.duration_s === 'number' ? ov.duration_s : 2.5;
           tl.set(el, { visibility: 'visible' }, at);
-          if (content.mode === 'typewriter' && content.spans.length) {
-            // Type the glyphs across the first ~70% of the overlay's life.
-            const reveal = Math.min(Math.max(dur * 0.7, 0.3), 1.8);
-            tl.set(el, { opacity: 1 }, at);
-            tl.to(content.spans, { opacity: 1, duration: 0.01, ease: 'none', stagger: reveal / content.spans.length }, at);
-          } else if (content.mode === 'karaoke' && content.spans.length) {
-            // Light up each word in sequence across the overlay's duration.
-            const reveal = Math.min(Math.max(dur * 0.85, 0.4), Math.max(dur - 0.2, 0.4));
-            tl.set(el, { opacity: 1 }, at);
-            tl.to(content.spans, { opacity: 1, duration: 0.12, ease: 'power1.out', stagger: reveal / content.spans.length }, at);
+          if (anim === 'pop') {
+            tl.fromTo(el, { opacity: 0, scale: 0.7 }, { opacity: 1, scale: 1, duration: 0.3, ease: 'back.out(1.7)' }, at);
+          } else if (anim === 'slide_up') {
+            tl.fromTo(el, { opacity: 0, yPercent: 30 }, { opacity: 1, yPercent: 0, duration: 0.3, ease: 'power2.out' }, at);
+          } else if (anim === 'slide_down') {
+            tl.fromTo(el, { opacity: 0, yPercent: -30 }, { opacity: 1, yPercent: 0, duration: 0.3, ease: 'power2.out' }, at);
           } else {
-            animateIn(el, ov.animation, at, ov.animation_detail);
+            // Default / any unrecognized data-anim value: a plain fade.
+            tl.fromTo(el, { opacity: 0 }, { opacity: 1, duration: 0.25, ease: 'power1.out' }, at);
           }
           tl.to(el, { opacity: 0, duration: 0.2 }, Math.max(at + 0.2, at + dur));
           tl.set(el, { visibility: 'hidden' }, at + dur + 0.21);
-        });
-        (FLOATING || []).forEach((fe, i) => {
-          if (!fe) return;
-          const el = document.createElement('div');
-          el.className = 'floating';
-          el.id = 'fe_' + i;
-          el.innerHTML = '<span class="kind">' + (fe.kind || 'element') + '</span>';
-          el.appendChild(document.createTextNode(fe.description || ''));
-          cellFor(fe.position).appendChild(el);
-          const at = typeof fe.appears_at_s === 'number' ? fe.appears_at_s : 0;
-          const dur = typeof fe.duration_s === 'number' ? fe.duration_s : 2.5;
-          tl.set(el, { visibility: 'visible' }, at);
-          tl.fromTo(el, { opacity: 0 }, { opacity: 1, duration: 0.2 }, at);
-          tl.to(el, { opacity: 0, duration: 0.2 }, at + dur);
-          tl.set(el, { visibility: 'hidden' }, at + dur + 0.21);
-        });
+          if (tx) el.style.transform = tx;
+        }
-        // Keep the timeline alive to the full video duration.
         tl.to({}, { duration: 0.01 }, Math.max(0.01, DURATION - 0.01));
         window.__timelines = window.__timelines || {};
         window.__timelines.main = tl;

package/canvas/video-overlay-composition/meta.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "id": "video-overlay",
-  "title": "Burn timed text overlays + floating-element placeholders over a video",
-  "description": "Generic overlay renderer for video reproduction. Takes the `overlays` (and optional `floating_elements`) arrays straight from a video_deconstruct blueprint and renders each one over a background video at its absolute timestamp, 9-grid position, animation, and typographic style. Honors fade/pop/bounce/slide_up/slide_down/shake plus glyph-by-glyph `typewriter` and word-by-word `karaoke` reveals, and tunes easing/speed from each overlay's free-text `animation_detail`. For on-brand type, drop `brand-bold.otf`/`brand-regular.otf` (or .ttf) into this dir — they are wired via @font-face and fall back to a system stack when absent. Fixed 1080x1920 (vertical); copy + edit width/height for other aspect ratios.",
+  "title": "Overlay layer: agent-authored HTML/CSS composited over a video",
+  "description": "A paint-it-yourself overlay layer, NOT a prop renderer. The scaffold seeds #overlay-root in index.html with the reference ad's overlays as real, editable HTML elements (text + a position class + data-start/data-dur timing); a tiny generic runtime only shows/hides each element at its timestamp (with an optional data-anim entrance). Every styling decision — bars, tickers, colors, fonts, a real logo <img> dropped into this dir — lives in the HTML/CSS you edit, not in props. Drop brand-bold.otf / brand-regular.otf for on-brand type. Fixed 1080x1920; copy + edit width/height for other ratios.",
   "width": 1080,
   "height": 1920,
   "fps": 30,
@@ -11,19 +11,8 @@
       "kind": "video",
       "required": true,
       "staged_as": "background.mp4",
-      "description": "The base video to burn overlays onto (e.g. the concatenated reproduction clips)."
+      "description": "The base video to composite overlays onto (e.g. the concatenated reproduction clips)."
     }
   },
-  "params": {
-    "overlays": {
-      "kind": "json",
-      "required": true,
-      "description": "Array of overlay objects from the blueprint: {text, appears_at_s, duration_s, position(9-grid: top_left..bottom_center), animation(none|fade|pop|slide_up|slide_down|typewriter|bounce|karaoke|shake|other), style{font_style, casing, color_hex, background, stroke_or_shadow, size(small|medium|large|huge)}}."
-    },
-    "floating_elements": {
-      "kind": "json",
-      "default": [],
-      "description": "Array of non-text floating elements: {kind, description, appears_at_s, duration_s, position}. Rendered as labeled dashed placeholders for the agent to replace with real image overlays."
-    }
-  }
+  "params": {}
 }

package/dist/{chunk-K6LHXCKD.js → chunk-JIDZ37KG.js} RENAMED Viewed

@@ -917,6 +917,13 @@ var FAL_IMAGE_MIMES = ["image/png", "image/jpeg", "image/webp"];
 var FAL_VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"];
 var DECONSTRUCT_VIDEO_MIMES = ["video/mp4", "video/webm", "video/quicktime"];
 var FAL_AUDIO_MIMES = ["audio/wav", "audio/mpeg", "audio/mp3"];
+var IMAGE_GENERATE_MODELS = [
+  "openai/gpt-5.4-image-2",
+  "google/gemini-3.5-flash",
+  "google/gemini-3.1-flash-image-preview",
+  "google/gemini-3-pro-image-preview",
+  "recraft/recraft-v4.1-pro-vector"
+];
 var MODEL_REGISTRY = {
   text_generate: {
     "~google/gemini-flash-latest": {
@@ -1493,13 +1500,32 @@ var OutputRef = z.object({
   node: z.string(),
   output: z.string()
 }).strict();
+var VideoMeta = z.object({
+  duration_s: z.number(),
+  // Each sequenced voiceover turn on the absolute timeline.
+  vo_segments: z.array(
+    z.object({
+      slot: z.string(),
+      start_s: z.number(),
+      end_s: z.number(),
+      scene: z.number().optional(),
+      speaker: z.string().optional()
+    })
+  ).default([]),
+  // Scenes with a single on-camera speaker — each MUST be lip-synced. Either a
+  // bare scene index (validator falls back to the scaffold naming convention) or
+  // `{ scene, lipsync_node }`, which names the exact node to look for so a
+  // hand-authored canvas with differently-named clip nodes isn't a false miss.
+  talking_scenes: z.array(z.union([z.number(), z.object({ scene: z.number(), lipsync_node: z.string() })])).default([])
+}).strict().optional();
 var CanvasMetadata = z.object({
   name: z.string().optional(),
   description: z.string().optional(),
   // Free-form, human-facing next-steps guide embedded by scaffolds (e.g.
   // `scaffold-video`) so the editable checklist travels inside the canvas
   // instead of only printing to stdout. Ignored by the engine.
-  todo: z.unknown().optional()
+  todo: z.unknown().optional(),
+  video: VideoMeta
 }).strict().optional();
 var CanvasSchema = z.object({
   schema: z.literal("baker-canvas/1"),
@@ -1964,8 +1990,12 @@ var STAGE_CODES = {
   DUPLICATE_ID: "DUPLICATE_NODE_ID",
   CYCLE: "GRAPH_CYCLE",
   SLOT: "PROMPT_SLOT_UNRESOLVED",
-  OUTPUT: "OUTPUT_REF_INVALID"
+  OUTPUT: "OUTPUT_REF_INVALID",
+  VO_OVERLAP: "VIDEO_VO_OVERLAP",
+  AUDIO_DURATION: "VIDEO_AUDIO_DURATION",
+  LIPSYNC_MISSING: "VIDEO_LIPSYNC_MISSING"
 };
+var VIDEO_TIME_SLACK_S = 0.75;
 function validateCanvas(input, registry) {
   const issues = [];
   const shape = CanvasSchema.safeParse(input);
@@ -1987,6 +2017,7 @@ function validateCanvas(input, registry) {
   checkAllSlots(ctx);
   const estimatedCredits = estimateCredits(ctx);
   checkOutputRef(ctx);
+  checkVideoInvariants(ctx);
   if (issues.length > 0) return { ok: false, issues };
   return { ok: true, canvas, estimatedCredits };
 }
@@ -2255,6 +2286,46 @@ function estimateCredits(ctx) {
   }
   return total;
 }
+function checkVideoInvariants(ctx) {
+  const meta = ctx.canvas.metadata?.video;
+  if (!meta) return;
+  const segments = [...meta.vo_segments].sort((a, b) => a.start_s - b.start_s);
+  for (let i = 1; i < segments.length; i++) {
+    const prev = segments[i - 1];
+    const cur = segments[i];
+    if (!prev || !cur) continue;
+    if (cur.start_s < prev.end_s - VIDEO_TIME_SLACK_S) {
+      ctx.issues.push({
+        path: "metadata.video.vo_segments",
+        code: STAGE_CODES.VO_OVERLAP,
+        message: `voiceover turns overlap: "${prev.slot}" runs to ${prev.end_s}s but "${cur.slot}" starts at ${cur.start_s}s \u2014 sequence them so dialogue alternates instead of stacking`
+      });
+    }
+  }
+  const audioEnd = segments.reduce((m, s) => Math.max(m, s.end_s), 0);
+  if (audioEnd > meta.duration_s + VIDEO_TIME_SLACK_S) {
+    ctx.issues.push({
+      path: "metadata.video.duration_s",
+      code: STAGE_CODES.AUDIO_DURATION,
+      message: `audio runs to ${audioEnd}s but the video is ${meta.duration_s}s \u2014 trim the voiceover or extend the video so audio \u2248 video length`
+    });
+  }
+  for (const entry of meta.talking_scenes) {
+    const scene = typeof entry === "number" ? entry : entry.scene;
+    const synced = typeof entry === "number" ? ctx.canvas.nodes.some((n) => {
+      if (n.type !== "video_lipsync") return false;
+      const video = n.inputs?.video;
+      return video === `$ref:s${scene}_trim.video` || video === `$ref:s${scene}_clip.video`;
+    }) : ctx.canvas.nodes.some((n) => n.id === entry.lipsync_node && n.type === "video_lipsync");
+    if (!synced) {
+      ctx.issues.push({
+        path: "metadata.video.talking_scenes",
+        code: STAGE_CODES.LIPSYNC_MISSING,
+        message: `scene ${scene} has a single on-camera speaker but no video_lipsync on s${scene}_clip \u2014 its mouth will drift out of sync with the voiceover`
+      });
+    }
+  }
+}
 function checkOutputRef(ctx) {
   const out = ctx.canvas.output;
   if (!out) return;
@@ -4813,7 +4884,7 @@ var dialogueNode = delegated({
 // src/engine/nodes/remote/image.ts
 import { z as z15 } from "zod";
-var IMAGE_GENERATE_MODELS = [
+var IMAGE_GENERATE_MODELS2 = [
   "openai/gpt-5.4-image-2",
   "google/gemini-3.5-flash",
   "google/gemini-3.1-flash-image-preview",
@@ -4821,7 +4892,7 @@ var IMAGE_GENERATE_MODELS = [
   "recraft/recraft-v4.1-pro-vector"
 ];
 var ImageGenerateParams = z15.object({
-  model: z15.enum(IMAGE_GENERATE_MODELS),
+  model: z15.enum(IMAGE_GENERATE_MODELS2),
   prompt: z15.string().min(1),
   aspect_ratio: z15.enum(["1:1", "16:9", "9:16", "4:3", "3:4", "3:2", "2:3", "4:5", "5:4", "21:9", "1:4", "4:1", "1:8", "8:1"]).optional(),
   image_size: z15.enum(["0.5K", "1K", "2K", "4K"]).optional(),
@@ -5648,6 +5719,7 @@ function createEngineFromEnv(opts = {}) {
 export {
   SEEDANCE_DURATIONS,
   ELEVENLABS_MAX_MUSIC_LENGTH_MS,
+  IMAGE_GENERATE_MODELS,
   MODEL_REGISTRY,
   BackendClient2 as BackendClient,
   Engine2 as Engine,
@@ -5659,4 +5731,4 @@ export {
   defaultRegistry,
   createEngineFromEnv
 };
-//# sourceMappingURL=chunk-K6LHXCKD.js.map
+//# sourceMappingURL=chunk-JIDZ37KG.js.map