@koda-sl/baker-cli 0.39.29-dev.994437bd → 0.66.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1754 -7
- package/canvas/avocado-tutorial.json +89 -0
- package/canvas/hello-world-composition/index.html +83 -0
- package/canvas/hello-world-composition/meta.json +41 -0
- package/canvas/hello-world-overlay.json +37 -0
- package/canvas/phone-scroll-composition/index.html +141 -0
- package/canvas/phone-scroll-composition/meta.json +39 -0
- package/canvas/tiktok-captions-composition/index.html +126 -0
- package/canvas/tiktok-captions-composition/meta.json +46 -0
- package/canvas/video-overlay-composition/index.html +239 -0
- package/canvas/video-overlay-composition/meta.json +29 -0
- package/dist/cli.js +2 -0
- package/dist/cli.js.map +1 -1
- package/dist/commands/ads/meta/creatives.js +1 -1
- package/dist/commands/canvas/catalog.d.ts +2 -0
- package/dist/commands/canvas/catalog.d.ts.map +1 -0
- package/dist/commands/canvas/catalog.js +13 -0
- package/dist/commands/canvas/catalog.js.map +1 -0
- package/dist/commands/canvas/index.d.ts +2 -0
- package/dist/commands/canvas/index.d.ts.map +1 -0
- package/dist/commands/canvas/index.js +32 -0
- package/dist/commands/canvas/index.js.map +1 -0
- package/dist/commands/canvas/inspect.d.ts +16 -0
- package/dist/commands/canvas/inspect.d.ts.map +1 -0
- package/dist/commands/canvas/inspect.js +115 -0
- package/dist/commands/canvas/inspect.js.map +1 -0
- package/dist/commands/canvas/run.d.ts +24 -0
- package/dist/commands/canvas/run.d.ts.map +1 -0
- package/dist/commands/canvas/run.js +56 -0
- package/dist/commands/canvas/run.js.map +1 -0
- package/dist/commands/canvas/scaffold-static-ad.d.ts +40 -0
- package/dist/commands/canvas/scaffold-static-ad.d.ts.map +1 -0
- package/dist/commands/canvas/scaffold-static-ad.js +265 -0
- package/dist/commands/canvas/scaffold-static-ad.js.map +1 -0
- package/dist/commands/canvas/scaffold-video.d.ts +44 -0
- package/dist/commands/canvas/scaffold-video.d.ts.map +1 -0
- package/dist/commands/canvas/scaffold-video.js +235 -0
- package/dist/commands/canvas/scaffold-video.js.map +1 -0
- package/dist/commands/canvas/validate.d.ts +8 -0
- package/dist/commands/canvas/validate.d.ts.map +1 -0
- package/dist/commands/canvas/validate.js +37 -0
- package/dist/commands/canvas/validate.js.map +1 -0
- package/dist/commands/images/find.d.ts +1 -1
- package/dist/commands/images/find.d.ts.map +1 -1
- package/dist/commands/images/find.js +12 -3
- package/dist/commands/images/find.js.map +1 -1
- package/dist/commands/images/google.d.ts +1 -1
- package/dist/commands/images/google.d.ts.map +1 -1
- package/dist/commands/images/google.js +12 -3
- package/dist/commands/images/google.js.map +1 -1
- package/dist/commands/images/stock.d.ts +1 -1
- package/dist/commands/images/stock.d.ts.map +1 -1
- package/dist/commands/images/stock.js +7 -3
- package/dist/commands/images/stock.js.map +1 -1
- package/dist/commands/testimonials/list.d.ts.map +1 -1
- package/dist/commands/testimonials/list.js +21 -6
- package/dist/commands/testimonials/list.js.map +1 -1
- package/dist/commands/testimonials/search.d.ts.map +1 -1
- package/dist/commands/testimonials/search.js +21 -6
- package/dist/commands/testimonials/search.js.map +1 -1
- package/dist/commands/testimonials/search.test.js +3 -3
- package/dist/commands/testimonials/search.test.js.map +1 -1
- package/dist/engine/client/backend-client.d.ts +50 -0
- package/dist/engine/client/backend-client.d.ts.map +1 -0
- package/dist/engine/client/backend-client.js +20 -0
- package/dist/engine/client/backend-client.js.map +1 -0
- package/dist/engine/client/env.d.ts +7 -0
- package/dist/engine/client/env.d.ts.map +1 -0
- package/dist/engine/client/env.js +18 -0
- package/dist/engine/client/env.js.map +1 -0
- package/dist/engine/client/http.d.ts +56 -0
- package/dist/engine/client/http.d.ts.map +1 -0
- package/dist/engine/client/http.js +168 -0
- package/dist/engine/client/http.js.map +1 -0
- package/dist/engine/engine/cache-key.d.ts +19 -0
- package/dist/engine/engine/cache-key.d.ts.map +1 -0
- package/dist/engine/engine/cache-key.js +33 -0
- package/dist/engine/engine/cache-key.js.map +1 -0
- package/dist/engine/engine/canonical.d.ts +14 -0
- package/dist/engine/engine/canonical.d.ts.map +1 -0
- package/dist/engine/engine/canonical.js +53 -0
- package/dist/engine/engine/canonical.js.map +1 -0
- package/dist/engine/engine/composition-hash.d.ts +14 -0
- package/dist/engine/engine/composition-hash.d.ts.map +1 -0
- package/dist/engine/engine/composition-hash.js +51 -0
- package/dist/engine/engine/composition-hash.js.map +1 -0
- package/dist/engine/engine/composition-meta.d.ts +77 -0
- package/dist/engine/engine/composition-meta.d.ts.map +1 -0
- package/dist/engine/engine/composition-meta.js +199 -0
- package/dist/engine/engine/composition-meta.js.map +1 -0
- package/dist/engine/engine/context.d.ts +16 -0
- package/dist/engine/engine/context.d.ts.map +1 -0
- package/dist/engine/engine/context.js +2 -0
- package/dist/engine/engine/context.js.map +1 -0
- package/dist/engine/engine/define.d.ts +69 -0
- package/dist/engine/engine/define.d.ts.map +1 -0
- package/dist/engine/engine/define.js +9 -0
- package/dist/engine/engine/define.js.map +1 -0
- package/dist/engine/engine/didyoumean.d.ts +2 -0
- package/dist/engine/engine/didyoumean.d.ts.map +1 -0
- package/dist/engine/engine/didyoumean.js +58 -0
- package/dist/engine/engine/didyoumean.js.map +1 -0
- package/dist/engine/engine/errors.d.ts +41 -0
- package/dist/engine/engine/errors.d.ts.map +1 -0
- package/dist/engine/engine/errors.js +45 -0
- package/dist/engine/engine/errors.js.map +1 -0
- package/dist/engine/engine/executor.d.ts +72 -0
- package/dist/engine/engine/executor.d.ts.map +1 -0
- package/dist/engine/engine/executor.js +445 -0
- package/dist/engine/engine/executor.js.map +1 -0
- package/dist/engine/engine/refs.d.ts +12 -0
- package/dist/engine/engine/refs.d.ts.map +1 -0
- package/dist/engine/engine/refs.js +48 -0
- package/dist/engine/engine/refs.js.map +1 -0
- package/dist/engine/engine/registry.d.ts +11 -0
- package/dist/engine/engine/registry.d.ts.map +1 -0
- package/dist/engine/engine/registry.js +26 -0
- package/dist/engine/engine/registry.js.map +1 -0
- package/dist/engine/engine/scheduler.d.ts +10 -0
- package/dist/engine/engine/scheduler.d.ts.map +1 -0
- package/dist/engine/engine/scheduler.js +89 -0
- package/dist/engine/engine/scheduler.js.map +1 -0
- package/dist/engine/engine/validator.d.ts +31 -0
- package/dist/engine/engine/validator.d.ts.map +1 -0
- package/dist/engine/engine/validator.js +501 -0
- package/dist/engine/engine/validator.js.map +1 -0
- package/dist/engine/index.d.ts +33 -0
- package/dist/engine/index.d.ts.map +1 -0
- package/dist/engine/index.js +104 -0
- package/dist/engine/index.js.map +1 -0
- package/dist/engine/lib/ulid.d.ts +3 -0
- package/dist/engine/lib/ulid.d.ts.map +1 -0
- package/dist/engine/lib/ulid.js +36 -0
- package/dist/engine/lib/ulid.js.map +1 -0
- package/dist/engine/models/canvas-ad-params.test.d.ts +2 -0
- package/dist/engine/models/canvas-ad-params.test.d.ts.map +1 -0
- package/dist/engine/models/canvas-ad-params.test.js +60 -0
- package/dist/engine/models/canvas-ad-params.test.js.map +1 -0
- package/dist/engine/models/registry.d.ts +63 -0
- package/dist/engine/models/registry.d.ts.map +1 -0
- package/dist/engine/models/registry.js +432 -0
- package/dist/engine/models/registry.js.map +1 -0
- package/dist/engine/models/validateParams.d.ts +38 -0
- package/dist/engine/models/validateParams.d.ts.map +1 -0
- package/dist/engine/models/validateParams.js +166 -0
- package/dist/engine/models/validateParams.js.map +1 -0
- package/dist/engine/nodes/ingest.d.ts +113 -0
- package/dist/engine/nodes/ingest.d.ts.map +1 -0
- package/dist/engine/nodes/ingest.js +555 -0
- package/dist/engine/nodes/ingest.js.map +1 -0
- package/dist/engine/nodes/ingest.svg.test.d.ts +2 -0
- package/dist/engine/nodes/ingest.svg.test.d.ts.map +1 -0
- package/dist/engine/nodes/ingest.svg.test.js +30 -0
- package/dist/engine/nodes/ingest.svg.test.js.map +1 -0
- package/dist/engine/nodes/local/audioTimeline.d.ts +82 -0
- package/dist/engine/nodes/local/audioTimeline.d.ts.map +1 -0
- package/dist/engine/nodes/local/audioTimeline.js +97 -0
- package/dist/engine/nodes/local/audioTimeline.js.map +1 -0
- package/dist/engine/nodes/local/ffmpeg.d.ts +56 -0
- package/dist/engine/nodes/local/ffmpeg.d.ts.map +1 -0
- package/dist/engine/nodes/local/ffmpeg.js +50 -0
- package/dist/engine/nodes/local/ffmpeg.js.map +1 -0
- package/dist/engine/nodes/local/fontSpecimen.d.ts +50 -0
- package/dist/engine/nodes/local/fontSpecimen.d.ts.map +1 -0
- package/dist/engine/nodes/local/fontSpecimen.js +198 -0
- package/dist/engine/nodes/local/fontSpecimen.js.map +1 -0
- package/dist/engine/nodes/local/hyperframe-snapshot.d.ts +116 -0
- package/dist/engine/nodes/local/hyperframe-snapshot.d.ts.map +1 -0
- package/dist/engine/nodes/local/hyperframe-snapshot.js +230 -0
- package/dist/engine/nodes/local/hyperframe-snapshot.js.map +1 -0
- package/dist/engine/nodes/local/hyperframe.d.ts +123 -0
- package/dist/engine/nodes/local/hyperframe.d.ts.map +1 -0
- package/dist/engine/nodes/local/hyperframe.js +367 -0
- package/dist/engine/nodes/local/hyperframe.js.map +1 -0
- package/dist/engine/nodes/local/imagemagick.d.ts +56 -0
- package/dist/engine/nodes/local/imagemagick.d.ts.map +1 -0
- package/dist/engine/nodes/local/imagemagick.js +71 -0
- package/dist/engine/nodes/local/imagemagick.js.map +1 -0
- package/dist/engine/nodes/local/lib/assets.d.ts +20 -0
- package/dist/engine/nodes/local/lib/assets.d.ts.map +1 -0
- package/dist/engine/nodes/local/lib/assets.js +40 -0
- package/dist/engine/nodes/local/lib/assets.js.map +1 -0
- package/dist/engine/nodes/local/lib/cli-runner.d.ts +78 -0
- package/dist/engine/nodes/local/lib/cli-runner.d.ts.map +1 -0
- package/dist/engine/nodes/local/lib/cli-runner.js +254 -0
- package/dist/engine/nodes/local/lib/cli-runner.js.map +1 -0
- package/dist/engine/nodes/local/lib/ffmpeg.d.ts +23 -0
- package/dist/engine/nodes/local/lib/ffmpeg.d.ts.map +1 -0
- package/dist/engine/nodes/local/lib/ffmpeg.js +75 -0
- package/dist/engine/nodes/local/lib/ffmpeg.js.map +1 -0
- package/dist/engine/nodes/local/lib/hyperframe-errors.d.ts +2 -0
- package/dist/engine/nodes/local/lib/hyperframe-errors.d.ts.map +1 -0
- package/dist/engine/nodes/local/lib/hyperframe-errors.js +47 -0
- package/dist/engine/nodes/local/lib/hyperframe-errors.js.map +1 -0
- package/dist/engine/nodes/local/lib/templating.d.ts +22 -0
- package/dist/engine/nodes/local/lib/templating.d.ts.map +1 -0
- package/dist/engine/nodes/local/lib/templating.js +85 -0
- package/dist/engine/nodes/local/lib/templating.js.map +1 -0
- package/dist/engine/nodes/local/text.d.ts +6 -0
- package/dist/engine/nodes/local/text.d.ts.map +1 -0
- package/dist/engine/nodes/local/text.js +15 -0
- package/dist/engine/nodes/local/text.js.map +1 -0
- package/dist/engine/nodes/remote/delegate.d.ts +25 -0
- package/dist/engine/nodes/remote/delegate.d.ts.map +1 -0
- package/dist/engine/nodes/remote/delegate.js +160 -0
- package/dist/engine/nodes/remote/delegate.js.map +1 -0
- package/dist/engine/nodes/remote/dialogue.d.ts +34 -0
- package/dist/engine/nodes/remote/dialogue.d.ts.map +1 -0
- package/dist/engine/nodes/remote/dialogue.js +54 -0
- package/dist/engine/nodes/remote/dialogue.js.map +1 -0
- package/dist/engine/nodes/remote/image.d.ts +42 -0
- package/dist/engine/nodes/remote/image.d.ts.map +1 -0
- package/dist/engine/nodes/remote/image.js +43 -0
- package/dist/engine/nodes/remote/image.js.map +1 -0
- package/dist/engine/nodes/remote/imageAspectAdapt.d.ts +30 -0
- package/dist/engine/nodes/remote/imageAspectAdapt.d.ts.map +1 -0
- package/dist/engine/nodes/remote/imageAspectAdapt.js +42 -0
- package/dist/engine/nodes/remote/imageAspectAdapt.js.map +1 -0
- package/dist/engine/nodes/remote/imageBackgroundRemove.d.ts +39 -0
- package/dist/engine/nodes/remote/imageBackgroundRemove.d.ts.map +1 -0
- package/dist/engine/nodes/remote/imageBackgroundRemove.js +37 -0
- package/dist/engine/nodes/remote/imageBackgroundRemove.js.map +1 -0
- package/dist/engine/nodes/remote/imageDescribe.d.ts +29 -0
- package/dist/engine/nodes/remote/imageDescribe.d.ts.map +1 -0
- package/dist/engine/nodes/remote/imageDescribe.js +25 -0
- package/dist/engine/nodes/remote/imageDescribe.js.map +1 -0
- package/dist/engine/nodes/remote/imageReferenceSheet.d.ts +34 -0
- package/dist/engine/nodes/remote/imageReferenceSheet.d.ts.map +1 -0
- package/dist/engine/nodes/remote/imageReferenceSheet.js +38 -0
- package/dist/engine/nodes/remote/imageReferenceSheet.js.map +1 -0
- package/dist/engine/nodes/remote/imageSearch.d.ts +18 -0
- package/dist/engine/nodes/remote/imageSearch.d.ts.map +1 -0
- package/dist/engine/nodes/remote/imageSearch.js +22 -0
- package/dist/engine/nodes/remote/imageSearch.js.map +1 -0
- package/dist/engine/nodes/remote/imageSelect.d.ts +39 -0
- package/dist/engine/nodes/remote/imageSelect.d.ts.map +1 -0
- package/dist/engine/nodes/remote/imageSelect.js +45 -0
- package/dist/engine/nodes/remote/imageSelect.js.map +1 -0
- package/dist/engine/nodes/remote/music.d.ts +45 -0
- package/dist/engine/nodes/remote/music.d.ts.map +1 -0
- package/dist/engine/nodes/remote/music.js +73 -0
- package/dist/engine/nodes/remote/music.js.map +1 -0
- package/dist/engine/nodes/remote/soundEffect.d.ts +21 -0
- package/dist/engine/nodes/remote/soundEffect.d.ts.map +1 -0
- package/dist/engine/nodes/remote/soundEffect.js +41 -0
- package/dist/engine/nodes/remote/soundEffect.js.map +1 -0
- package/dist/engine/nodes/remote/textGenerate.d.ts +21 -0
- package/dist/engine/nodes/remote/textGenerate.d.ts.map +1 -0
- package/dist/engine/nodes/remote/textGenerate.js +27 -0
- package/dist/engine/nodes/remote/textGenerate.js.map +1 -0
- package/dist/engine/nodes/remote/tts.d.ts +45 -0
- package/dist/engine/nodes/remote/tts.d.ts.map +1 -0
- package/dist/engine/nodes/remote/tts.js +66 -0
- package/dist/engine/nodes/remote/tts.js.map +1 -0
- package/dist/engine/nodes/remote/video.d.ts +58 -0
- package/dist/engine/nodes/remote/video.d.ts.map +1 -0
- package/dist/engine/nodes/remote/video.js +44 -0
- package/dist/engine/nodes/remote/video.js.map +1 -0
- package/dist/engine/nodes/remote/videoBackgroundRemove.d.ts +30 -0
- package/dist/engine/nodes/remote/videoBackgroundRemove.d.ts.map +1 -0
- package/dist/engine/nodes/remote/videoBackgroundRemove.js +29 -0
- package/dist/engine/nodes/remote/videoBackgroundRemove.js.map +1 -0
- package/dist/engine/nodes/remote/videoDeconstruct.d.ts +61 -0
- package/dist/engine/nodes/remote/videoDeconstruct.d.ts.map +1 -0
- package/dist/engine/nodes/remote/videoDeconstruct.js +40 -0
- package/dist/engine/nodes/remote/videoDeconstruct.js.map +1 -0
- package/dist/engine/nodes/remote/videoLipsync.d.ts +37 -0
- package/dist/engine/nodes/remote/videoLipsync.d.ts.map +1 -0
- package/dist/engine/nodes/remote/videoLipsync.js +26 -0
- package/dist/engine/nodes/remote/videoLipsync.js.map +1 -0
- package/dist/engine/nodes/remote/videoTranscribe.d.ts +116 -0
- package/dist/engine/nodes/remote/videoTranscribe.d.ts.map +1 -0
- package/dist/engine/nodes/remote/videoTranscribe.js +123 -0
- package/dist/engine/nodes/remote/videoTranscribe.js.map +1 -0
- package/dist/engine/nodes/remote/voiceSelect.d.ts +28 -0
- package/dist/engine/nodes/remote/voiceSelect.d.ts.map +1 -0
- package/dist/engine/nodes/remote/voiceSelect.js +25 -0
- package/dist/engine/nodes/remote/voiceSelect.js.map +1 -0
- package/dist/engine/scaffold/staticAd.d.ts +44 -0
- package/dist/engine/scaffold/staticAd.d.ts.map +1 -0
- package/dist/engine/scaffold/staticAd.js +243 -0
- package/dist/engine/scaffold/staticAd.js.map +1 -0
- package/dist/engine/scaffold/video.d.ts +56 -0
- package/dist/engine/scaffold/video.d.ts.map +1 -0
- package/dist/engine/scaffold/video.js +709 -0
- package/dist/engine/scaffold/video.js.map +1 -0
- package/dist/engine/schema/canvas.d.ts +41 -0
- package/dist/engine/schema/canvas.d.ts.map +1 -0
- package/dist/engine/schema/canvas.js +67 -0
- package/dist/engine/schema/canvas.js.map +1 -0
- package/dist/engine/schema/catalog.d.ts +25 -0
- package/dist/engine/schema/catalog.d.ts.map +1 -0
- package/dist/engine/schema/catalog.js +48 -0
- package/dist/engine/schema/catalog.js.map +1 -0
- package/dist/engine/schema/primitives.d.ts +6 -0
- package/dist/engine/schema/primitives.d.ts.map +1 -0
- package/dist/engine/schema/primitives.js +4 -0
- package/dist/engine/schema/primitives.js.map +1 -0
- package/dist/engine/schema/prompts.d.ts +4 -0
- package/dist/engine/schema/prompts.d.ts.map +1 -0
- package/dist/engine/schema/prompts.js +23 -0
- package/dist/engine/schema/prompts.js.map +1 -0
- package/dist/engine/schema/refs.d.ts +113 -0
- package/dist/engine/schema/refs.d.ts.map +1 -0
- package/dist/engine/schema/refs.js +35 -0
- package/dist/engine/schema/refs.js.map +1 -0
- package/dist/engine/storage/asset-store.d.ts +48 -0
- package/dist/engine/storage/asset-store.d.ts.map +1 -0
- package/dist/engine/storage/asset-store.js +166 -0
- package/dist/engine/storage/asset-store.js.map +1 -0
- package/dist/engine/storage/cache-store.d.ts +21 -0
- package/dist/engine/storage/cache-store.d.ts.map +1 -0
- package/dist/engine/storage/cache-store.js +31 -0
- package/dist/engine/storage/cache-store.js.map +1 -0
- package/dist/engine/storage/output-writer.d.ts +18 -0
- package/dist/engine/storage/output-writer.d.ts.map +1 -0
- package/dist/engine/storage/output-writer.js +52 -0
- package/dist/engine/storage/output-writer.js.map +1 -0
- package/dist/engine/storage/sha256.d.ts +2 -0
- package/dist/engine/storage/sha256.d.ts.map +1 -0
- package/dist/engine/storage/sha256.js +7 -0
- package/dist/engine/storage/sha256.js.map +1 -0
- package/package.json +15 -3
package/README.md
CHANGED
|
@@ -1603,7 +1603,7 @@ Each external source is its own subcommand. Pick the verb that matches the sourc
|
|
|
1603
1603
|
| `baker images crop <file>` | Coordinate-based rectangular extract — local file or URL | n/a |
|
|
1604
1604
|
| `baker images dimensions <file\|url>` | Read width / height / aspect / format without decoding | n/a |
|
|
1605
1605
|
|
|
1606
|
-
**Auto-ingest** runs the full `processImage` pipeline (Gemini describe + Voyage multimodal embed + OpenRouter text embed) on every hit. Override with `--auto-ingest N` (turn on) or `--no-auto-ingest` (turn off where default is on). After auto-ingest the next `baker images library` query for the same concept hits the local row.
|
|
1606
|
+
**Auto-ingest** runs the full `processImage` pipeline (Gemini describe + Voyage multimodal embed + OpenRouter text embed) on every hit. Override with `--auto-ingest N` (turn on) or `--no-auto-ingest` (turn off where default is on). When auto-ingest succeeds, the matching returned hit uses the Baker-owned URL and keeps the original provider URL as `sourceUrl`. After auto-ingest the next `baker images library` query for the same concept hits the local row.
|
|
1607
1607
|
|
|
1608
1608
|
**Provider context biases the description.** When a hit carries human-readable text (Giphy `alt_text` / `title`, Magnific `title`, Brandfetch type+theme, etc.) it's surfaced on the hit as `descriptionContext` and forwarded to Gemini during auto-ingest as a hint — Gemini still trusts the pixels (e.g. it'll correct a wrong botanical name) but uses the hint for cultural/brand/scene awareness it can't infer from bytes alone.
|
|
1609
1609
|
|
|
@@ -1659,7 +1659,7 @@ baker images find "office" --sources library,magnific --fallback --threshold 0.4
|
|
|
1659
1659
|
baker images find "celebration" --sources library,giphy --auto-ingest 3
|
|
1660
1660
|
```
|
|
1661
1661
|
|
|
1662
|
-
Providers: `library`, `magnific`, `google`, `iconify`, `giphy`. Brandfetch is not part of the fanout — it takes a domain, not a query, so it lives at `baker images logo <domain>`. Response shape: `{ groups: { library, external }, meta: { counts, errors } }`. Partial failures (one provider throws) return the successful providers plus a `meta.errors` array; the whole call never fails on a single provider error.
|
|
1662
|
+
Providers: `library`, `magnific`, `google`, `iconify`, `giphy`. Brandfetch is not part of the fanout — it takes a domain, not a query, so it lives at `baker images logo <domain>`. Response shape: `{ groups: { library, external }, ingested, meta: { counts, errors } }`. When `--auto-ingest` is used, `ingested[]` preserves successful external ingest order and matching external hits are enriched with the Baker-owned URL plus source provenance. Partial failures (one provider throws) return the successful providers plus a `meta.errors` array; the whole call never fails on a single provider error.
|
|
1663
1663
|
|
|
1664
1664
|
**Flags:**
|
|
1665
1665
|
|
|
@@ -1669,7 +1669,7 @@ Providers: `library`, `magnific`, `google`, `iconify`, `giphy`. Brandfetch is no
|
|
|
1669
1669
|
| `--limit` | Max results per group (default 20, max 50) |
|
|
1670
1670
|
| `--fallback` | Skip external providers when the top library score ≥ `--threshold` |
|
|
1671
1671
|
| `--threshold` | Library score floor for `--fallback` (default `0.4`) |
|
|
1672
|
-
| `--auto-ingest` | Ingest top N external hits into the library (0–20, default 0)
|
|
1672
|
+
| `--auto-ingest` | Ingest top N external hits into the library (0–20, default 0) and return Baker-owned URLs on ingested hits |
|
|
1673
1673
|
| `--context` | Free-text hint passed to Gemini describe to bias the generated description and tags (overrides provider-derived context on auto-ingest paths) |
|
|
1674
1674
|
|
|
1675
1675
|
### `baker images stock <query>`
|
|
@@ -1699,12 +1699,12 @@ Free tier exists but watermarks previews — pass `--license freemium` to filter
|
|
|
1699
1699
|
| `--order` | `relevance` (default) or `recent` |
|
|
1700
1700
|
| `--limit` | Max results (1–50, default 10) |
|
|
1701
1701
|
| `--page` | Page number for pagination |
|
|
1702
|
-
| `--auto-ingest` | Ingest top N hits (0–20, default 0)
|
|
1702
|
+
| `--auto-ingest` | Ingest top N hits (0–20, default 0) and return Baker-owned URLs on ingested hits |
|
|
1703
1703
|
| `--context` | Free-text hint passed to Gemini describe to bias the generated description and tags (overrides provider-derived context on auto-ingest paths) |
|
|
1704
1704
|
|
|
1705
1705
|
### `baker images google <query>`
|
|
1706
1706
|
|
|
1707
|
-
Google Images via the official Custom Search JSON API. ⚠ Source is unverified web content
|
|
1707
|
+
Google Images via the official Custom Search JSON API. ⚠ Source is unverified web content. With `--auto-ingest`, ingested hits return Baker-owned URLs.
|
|
1708
1708
|
|
|
1709
1709
|
```bash
|
|
1710
1710
|
baker images google "industrial workshop" --type photo --size large --limit 20
|
|
@@ -1723,7 +1723,7 @@ Requires both `GOOGLE_CUSTOM_SEARCH_API_KEY` and `GOOGLE_CUSTOM_SEARCH_ENGINE_ID
|
|
|
1723
1723
|
| `--color` | `imgColorType`: `color \| gray \| mono \| trans` |
|
|
1724
1724
|
| `--safe` | `off \| active` |
|
|
1725
1725
|
| `--limit` | Max results (1–50, paginated 10 per call) |
|
|
1726
|
-
| `--auto-ingest` | Ingest top N (0–20, default 0)
|
|
1726
|
+
| `--auto-ingest` | Ingest top N (0–20, default 0) and return Baker-owned URLs on ingested hits |
|
|
1727
1727
|
| `--context` | Free-text hint passed to Gemini describe to bias the generated description and tags (overrides provider-derived context on auto-ingest paths) |
|
|
1728
1728
|
|
|
1729
1729
|
### `baker images logo <domain>`
|
|
@@ -1776,7 +1776,7 @@ Variants that don't exist for a domain are silently skipped — `linear.app` ret
|
|
|
1776
1776
|
{ "providerMeta": { "type": "symbol", "theme": "light" }, "…": "…" },
|
|
1777
1777
|
{ "providerMeta": { "type": "symbol", "theme": "dark" }, "…": "…" }
|
|
1778
1778
|
],
|
|
1779
|
-
"ingested": [{ "imageId": "…", "deduped": false }]
|
|
1779
|
+
"ingested": [{ "imageId": "…", "deduped": false, "imageUrl": "https://media.withbaker.com/…", "sourceUrl": "https://cdn.brandfetch.io/…" }]
|
|
1780
1780
|
}
|
|
1781
1781
|
}
|
|
1782
1782
|
```
|
|
@@ -2260,6 +2260,1753 @@ baker schema ads.google.query # Get query command schema
|
|
|
2260
2260
|
baker schema images.search # Get image search schema
|
|
2261
2261
|
```
|
|
2262
2262
|
|
|
2263
|
+
## Creative Canvas
|
|
2264
|
+
|
|
2265
|
+
The creative canvas runs declarative JSON pipelines. You author a graph of nodes (text, image, video, audio, composition, control-flow), `baker canvas run` it, and every node's output drops to disk under `canvas/<run_id>/` so you can watch progress and consume the artifacts.
|
|
2266
|
+
|
|
2267
|
+
Auth: same `BAKER_API_KEY` + `BAKER_API_URL` as the rest of the CLI.
|
|
2268
|
+
|
|
2269
|
+
### Quick start
|
|
2270
|
+
|
|
2271
|
+
```bash
|
|
2272
|
+
# 1. Validate (free, never calls the API)
|
|
2273
|
+
baker canvas validate my-canvas.json
|
|
2274
|
+
|
|
2275
|
+
# 2. Run (executes nodes, writes files to ./canvas/<run_id>/)
|
|
2276
|
+
baker canvas run my-canvas.json
|
|
2277
|
+
|
|
2278
|
+
# 3. Inspect a finished run (per-node timing, file list, optional video thumbs)
|
|
2279
|
+
baker canvas inspect <run_id>
|
|
2280
|
+
|
|
2281
|
+
# 4. Discover what's available (all node types, all models, all compositions)
|
|
2282
|
+
baker canvas catalog | jq
|
|
2283
|
+
```
|
|
2284
|
+
|
|
2285
|
+
A re-run with no changes hits the cache for every node — total runtime drops to sub-second.
|
|
2286
|
+
|
|
2287
|
+
---
|
|
2288
|
+
|
|
2289
|
+
### How it works
|
|
2290
|
+
|
|
2291
|
+
1. **You write a JSON file** describing nodes and how they wire together (`$ref:other_node.output`).
|
|
2292
|
+
2. **The validator** checks the schema, checks every node's params against the model registry, and confirms that every `{{slot}}` in a prompt has a wired input.
|
|
2293
|
+
3. **The engine** runs nodes in topological order. Local nodes (text, ffmpeg, imagemagick, hyperframe_*, font_specimen) execute in-process. Remote nodes (text_generate, image_generate, image_describe, image_search, video_generate, tts, music…) POST to the Convex backend.
|
|
2294
|
+
4. **Outputs land on disk** the moment each node finishes — assets are downloaded from S3 and named `<node_id>__<slot>[__<index>].<ext>` inside the run dir.
|
|
2295
|
+
5. **The cache** is content-addressed by node params + input hashes. Edit a prompt, and only the dependent nodes re-run.
|
|
2296
|
+
|
|
2297
|
+
---
|
|
2298
|
+
|
|
2299
|
+
### Canvas file structure
|
|
2300
|
+
|
|
2301
|
+
```jsonc
|
|
2302
|
+
{
|
|
2303
|
+
"schema": "baker-canvas/1",
|
|
2304
|
+
"cache_salt": "v2", // optional, bumps the cache key for every node
|
|
2305
|
+
"metadata": { "name": "my-canvas" },
|
|
2306
|
+
"nodes": [
|
|
2307
|
+
{ "id": "topic", "type": "text", "params": { "value": "espresso machines" } },
|
|
2308
|
+
{
|
|
2309
|
+
"id": "headline",
|
|
2310
|
+
"type": "text_generate",
|
|
2311
|
+
"inputs": { "topic": "$ref:topic.text" },
|
|
2312
|
+
"params": {
|
|
2313
|
+
"model": "google/gemini-3.5-flash",
|
|
2314
|
+
"prompt": "Write a 6-word tagline for {{topic}}."
|
|
2315
|
+
}
|
|
2316
|
+
}
|
|
2317
|
+
],
|
|
2318
|
+
"output": { "node": "headline", "output": "text" }
|
|
2319
|
+
}
|
|
2320
|
+
```
|
|
2321
|
+
|
|
2322
|
+
**Wiring nodes together:**
|
|
2323
|
+
|
|
2324
|
+
| Token | Meaning |
|
|
2325
|
+
|---|---|
|
|
2326
|
+
| `$ref:node_id.output` | Read the named output slot of another node. |
|
|
2327
|
+
| `$ref:node_id.output#0` | Read element 0 of a list output (e.g. `images#0`). |
|
|
2328
|
+
| `{{slot}}` | Inside a string param, substitute the value of `inputs.<slot>`. |
|
|
2329
|
+
|
|
2330
|
+
`{{slot}}` substitution by wired value kind:
|
|
2331
|
+
|
|
2332
|
+
- Plain strings (e.g. the local `text` node's output) are spliced in as-is.
|
|
2333
|
+
- **text/json assets** (e.g. `text_generate.text`, `image_describe.description`) are spliced in as their full content, read from the local asset store. Capped at 256 KB per asset — a larger asset fails the node rather than silently truncating.
|
|
2334
|
+
- image/video/audio assets render as compact placeholders (e.g. `[image: 1024x1024]`); use `inputs` to pass the asset itself to nodes that accept it.
|
|
2335
|
+
|
|
2336
|
+
The top-level `output` declares the canvas's final result — the engine returns its value on stdout when the run completes.
|
|
2337
|
+
|
|
2338
|
+
---
|
|
2339
|
+
|
|
2340
|
+
### Node types
|
|
2341
|
+
|
|
2342
|
+
Every node shares the envelope `{ id, type, inputs?, params? }`. `inputs` wires other nodes' output slots in; `params` configures the node. The validator enforces required input kinds and per-model param shape **before** anything is billed.
|
|
2343
|
+
|
|
2344
|
+
`baker canvas catalog` prints the always-current machine-readable schema for every node and every model. The tables below mirror the catalog at the time of writing.
|
|
2345
|
+
|
|
2346
|
+
In every table:
|
|
2347
|
+
|
|
2348
|
+
- "Required" on inputs means the validator blocks the run if the slot is unwired.
|
|
2349
|
+
- MIME lists are what the provider's transport accepts; `ingest` outputs flow through unchanged, so the source's MIME is what arrives.
|
|
2350
|
+
|
|
2351
|
+
---
|
|
2352
|
+
|
|
2353
|
+
#### Local nodes — run in-process, zero credits
|
|
2354
|
+
|
|
2355
|
+
##### `text`
|
|
2356
|
+
|
|
2357
|
+
A literal string value. Use for prompts, descriptions, copy.
|
|
2358
|
+
|
|
2359
|
+
**Inputs:** none.
|
|
2360
|
+
|
|
2361
|
+
**Params:** `value` (string, required).
|
|
2362
|
+
|
|
2363
|
+
**Outputs:** `text` → `text` / `text/plain`.
|
|
2364
|
+
|
|
2365
|
+
---
|
|
2366
|
+
|
|
2367
|
+
##### `ffmpeg`
|
|
2368
|
+
|
|
2369
|
+
Local ffmpeg passthrough. Write the argv you'd type and declare the outputs; the engine stages inputs and ingests results. See [Local CLI nodes](#local-cli-nodes) for the placeholder safety contract.
|
|
2370
|
+
|
|
2371
|
+
**Inputs:** open record. Each slot accepts an `AssetRef` or `AssetRef[]`. Reference inside `args` as `{{in.<slot>}}` / `{{in.<slot>.<index>}}`.
|
|
2372
|
+
|
|
2373
|
+
**Params:**
|
|
2374
|
+
|
|
2375
|
+
| Name | Type | Required | Constraint |
|
|
2376
|
+
|---|---|---|---|
|
|
2377
|
+
| `args` | string[] | yes | min 1; only `{{in.…}}` / `{{out.…}}` placeholders, no raw paths or URLs |
|
|
2378
|
+
| `outputs` | record | yes | `<name> → { kind: "image" \| "video" \| "audio", ext: string }` |
|
|
2379
|
+
|
|
2380
|
+
**Outputs:** open record keyed by `params.outputs` entries; kind + ext per declaration.
|
|
2381
|
+
|
|
2382
|
+
---
|
|
2383
|
+
|
|
2384
|
+
##### `imagemagick`
|
|
2385
|
+
|
|
2386
|
+
Local ImageMagick passthrough. Identical schema and safety contract to `ffmpeg`; uses `magick` (v7+) or `convert` (v6) on PATH.
|
|
2387
|
+
|
|
2388
|
+
---
|
|
2389
|
+
|
|
2390
|
+
##### `hyperframe_render`
|
|
2391
|
+
|
|
2392
|
+
Render an HTML/CSS/GSAP composition to **mp4**. Quality, format, and worker count are fixed by the engine for ad-creative delivery — the canvas owns only the composition path and composition vars.
|
|
2393
|
+
|
|
2394
|
+
**Inputs**
|
|
2395
|
+
|
|
2396
|
+
Open record keyed by the composition's `meta.json` inputs (declared per composition).
|
|
2397
|
+
|
|
2398
|
+
**Params**
|
|
2399
|
+
|
|
2400
|
+
| Name | Type | Required | Default | Notes |
|
|
2401
|
+
|---|---|---|---|---|
|
|
2402
|
+
| `composition` | string | yes | — | path to composition dir |
|
|
2403
|
+
| `timeout_ms` | number | no | 600000 | positive |
|
|
2404
|
+
| **composition vars** | per `meta.json` | per `meta.json` | per `meta.json` | substituted into `{{var}}` |
|
|
2405
|
+
|
|
2406
|
+
**Outputs**
|
|
2407
|
+
|
|
2408
|
+
| Slot | Kind | MIME |
|
|
2409
|
+
|---|---|---|
|
|
2410
|
+
| `video` | video | `video/mp4` |
|
|
2411
|
+
|
|
2412
|
+
See [authoring compositions](#authoring-compositions) for the composition file format.
|
|
2413
|
+
|
|
2414
|
+
---
|
|
2415
|
+
|
|
2416
|
+
##### `hyperframe_snapshot`
|
|
2417
|
+
|
|
2418
|
+
Render the same composition format to a **PNG** still at **2× device-scale** (retina). Output size = `(2 × width) × (2 × height)` pixels, where `width` and `height` come from the composition's `meta.json`.
|
|
2419
|
+
|
|
2420
|
+
**Inputs**
|
|
2421
|
+
|
|
2422
|
+
Open record keyed by the composition's `meta.json` inputs.
|
|
2423
|
+
|
|
2424
|
+
**Params**
|
|
2425
|
+
|
|
2426
|
+
| Name | Type | Required | Default | Notes |
|
|
2427
|
+
|---|---|---|---|---|
|
|
2428
|
+
| `composition` | string | yes | — | path to composition dir |
|
|
2429
|
+
| `wait_for` | `auto \| selector \| function \| timeout` | no | `{ kind: "auto" }` | `auto` waits for `load` + image decode + fonts ready |
|
|
2430
|
+
| `timeout_ms` | number | no | 60000 | positive |
|
|
2431
|
+
| **composition vars** | per `meta.json` | per `meta.json` | per `meta.json` | substituted into `{{var}}` |
|
|
2432
|
+
|
|
2433
|
+
**Outputs**
|
|
2434
|
+
|
|
2435
|
+
| Slot | Kind | MIME |
|
|
2436
|
+
|---|---|---|
|
|
2437
|
+
| `image` | image | `image/png` |
|
|
2438
|
+
|
|
2439
|
+
---
|
|
2440
|
+
|
|
2441
|
+
##### `font_specimen`
|
|
2442
|
+
|
|
2443
|
+
Render specimen text in a given font file to a **PNG** — black `#000` text on a white `#fff` background — via headless Chromium at 2× device-scale. Produces a **typeface reference image**: wire `image` into `image_generate`'s `reference` input so the model replicates the font when generating images that contain text.
|
|
2444
|
+
|
|
2445
|
+
Browser rendering means full text-shaping fidelity: kerning, ligatures, WOFF2, non-Latin scripts. If Chromium cannot parse the font file the node fails with a clear error instead of silently falling back to a system font.
|
|
2446
|
+
|
|
2447
|
+
**Inputs**
|
|
2448
|
+
|
|
2449
|
+
| Slot | Kind | Required | Notes |
|
|
2450
|
+
|---|---|---|---|
|
|
2451
|
+
| `font` | font | yes | ttf / otf / woff / woff2 — get one via `ingest` with `expect: "font"` |
|
|
2452
|
+
| _(any)_ | text/json | no | Extra slots are allowed and substitute into `text` via `{{slot}}` — wire upstream copy (e.g. a `text_generate` that extracts the exact on-image text) so the specimen renders the real headline, not a placeholder. |
|
|
2453
|
+
|
|
2454
|
+
**Params**
|
|
2455
|
+
|
|
2456
|
+
| Name | Type | Required | Default | Notes |
|
|
2457
|
+
|---|---|---|---|---|
|
|
2458
|
+
| `text` | string | no | pangram + A–Z + a–z + digits + punctuation | 1–2000 chars; `\n` for line breaks. Set to the exact headline, or `"{{slot}}"` wired to an upstream node. |
|
|
2459
|
+
| `font_size` | number | no | 72 | 8–512, integer (CSS px) |
|
|
2460
|
+
| `padding` | number | no | 64 | 0–512, integer (CSS px) |
|
|
2461
|
+
| `line_height` | number | no | 1.35 | 0.8–3 |
|
|
2462
|
+
| `max_width` | number | no | — (no wrap, auto width) | 256–4096, integer; wraps long lines when set |
|
|
2463
|
+
|
|
2464
|
+
**Outputs**
|
|
2465
|
+
|
|
2466
|
+
| Slot | Kind | MIME |
|
|
2467
|
+
|---|---|---|
|
|
2468
|
+
| `image` | image | `image/png` |
|
|
2469
|
+
|
|
2470
|
+
**Cost:** 0 engine credits (deterministic — cached by font sha + params).
|
|
2471
|
+
|
|
2472
|
+
Example pipeline:
|
|
2473
|
+
|
|
2474
|
+
```jsonc
|
|
2475
|
+
{ "id": "font", "type": "ingest", "params": { "source": "url", "url": "https://cdn.example.com/brand.woff2", "expect": "font" } }
|
|
2476
|
+
{ "id": "specimen", "type": "font_specimen", "inputs": { "font": "$ref:font.asset" }, "params": {} }
|
|
2477
|
+
{ "id": "gen", "type": "image_generate", "inputs": { "reference": "$ref:specimen.image" },
|
|
2478
|
+
"params": { "model": "google/gemini-3.1-flash-image-preview", "prompt": "Launch poster headlined 'SALE ENDS FRIDAY' using the exact typeface from the reference image" } }
|
|
2479
|
+
```
|
|
2480
|
+
|
|
2481
|
+
---
|
|
2482
|
+
|
|
2483
|
+
#### `ingest` — URL or local file → `AssetRef`
|
|
2484
|
+
|
|
2485
|
+
Pick a `source` discriminator and declare the kind you expect. See [Ingestion](#ingestion-ingest-node) for the strategy table.
|
|
2486
|
+
|
|
2487
|
+
**Inputs:** none.
|
|
2488
|
+
|
|
2489
|
+
**Params** — discriminated on `source`:
|
|
2490
|
+
|
|
2491
|
+
`source: "url"` (yt-dlp / direct fetch / Handinger):
|
|
2492
|
+
|
|
2493
|
+
| Name | Type | Required | Constraint |
|
|
2494
|
+
|---|---|---|---|
|
|
2495
|
+
| `source` | literal | yes | `"url"` |
|
|
2496
|
+
| `url` | string | yes | valid URL |
|
|
2497
|
+
| `expect` | enum | yes | `image \| video \| audio \| text \| json \| font` |
|
|
2498
|
+
|
|
2499
|
+
`source: "path"` (local filesystem → R2):
|
|
2500
|
+
|
|
2501
|
+
| Name | Type | Required | Constraint |
|
|
2502
|
+
|---|---|---|---|
|
|
2503
|
+
| `source` | literal | yes | `"path"` |
|
|
2504
|
+
| `path` | string | yes | absolute or cwd-relative; `~` not expanded |
|
|
2505
|
+
| `expect` | enum | yes | `image \| video \| audio \| text \| json \| font` |
|
|
2506
|
+
|
|
2507
|
+
**Outputs:** `asset` → `<params.expect>` / content-determined (URL strategy table) or extension-inferred (path).
|
|
2508
|
+
|
|
2509
|
+
**Path-source notes:** the canvas is **not portable** to another machine without the file. The cache key folds in the file's `mtime:size`, so editing the file invalidates the cache automatically. Supported extensions: `png`, `jpg`/`jpeg`, `webp`, `gif`, `avif`, `svg`, `mp4`, `webm`, `mov`, `m4v`, `mp3`, `wav`, `m4a`, `ogg`, `flac`, `json`, `txt`, `md`, `markdown`, `html`/`htm`, `csv`, `ttf`, `otf`, `woff`, `woff2`. Unknown extensions fall back to magic-byte sniffing for common image formats (and an SVG content sniff), else `kind_mismatch`. **SVG (`expect: "image"`) is rasterized to a transparent PNG on ingest** — brand logos are usually SVG, and image-generation models can't read SVG markup, so the SVG is upscaled (longest edge near 2048px) with transparency preserved, and the resulting asset carries `metadata.rasterized_from: "svg"`.
|
|
2510
|
+
|
|
2511
|
+
**Cost:** 0 engine credits for direct fetch + yt-dlp + local file. Handinger charges per scrape.
|
|
2512
|
+
|
|
2513
|
+
---
|
|
2514
|
+
|
|
2515
|
+
#### Remote registry-managed nodes — billed in credits, curated model set
|
|
2516
|
+
|
|
2517
|
+
Every generative node has a **curated** model enum. The registry (`packages/cli/src/engine/models/registry.ts`) is the source of truth and the validator runs per-model checks on params **and** required input kinds before any API call. The canvas-engine runs inside the controlled E2B sandbox (`packages/e2b-template/src/template.ts`), so models, MIME sets, and per-model param shapes are intentionally narrow.
|
|
2518
|
+
|
|
2519
|
+
---
|
|
2520
|
+
|
|
2521
|
+
##### `text_generate`
|
|
2522
|
+
|
|
2523
|
+
Single-turn LLM text generation via OpenRouter.
|
|
2524
|
+
|
|
2525
|
+
**Inputs**
|
|
2526
|
+
|
|
2527
|
+
None.
|
|
2528
|
+
|
|
2529
|
+
**Params**
|
|
2530
|
+
|
|
2531
|
+
| Name | Type | Required | Notes |
|
|
2532
|
+
|---|---|---|---|
|
|
2533
|
+
| `model` | enum | yes | one of `~google/gemini-flash-latest`, `~google/gemini-pro-latest` |
|
|
2534
|
+
| `prompt` | string | yes | non-empty |
|
|
2535
|
+
| `system` | string | no | system prompt |
|
|
2536
|
+
| `response_format` | enum | no | `text` (default) or `json_object`. Set `json_object` when a downstream `{{slot}}` consumes the output as JSON (e.g. the ad-blueprint transform) so the model returns clean JSON with no markdown fences or prose. |
|
|
2537
|
+
| `web_search` | boolean | no | When `true`, routes through OpenRouter's `:online` web plugin so the model searches the live web before answering. Use it on the ad-blueprint transform so it adapts copy to the target brand's real facts (current pricing, the trust signals it actually has) instead of guessing. Adds web-search cost. |
|
|
2538
|
+
| `temperature` | number | no | 0–2 |
|
|
2539
|
+
| `max_tokens` | number | no | positive int |
|
|
2540
|
+
|
|
2541
|
+
**Outputs**
|
|
2542
|
+
|
|
2543
|
+
| Slot | Kind | MIME |
|
|
2544
|
+
|---|---|---|
|
|
2545
|
+
| `text` | text | `text/plain` |
|
|
2546
|
+
|
|
2547
|
+
**Cost** — 1 credit per call (flat estimate; usage-based pricing is a follow-up).
|
|
2548
|
+
|
|
2549
|
+
**Example**
|
|
2550
|
+
|
|
2551
|
+
```json
|
|
2552
|
+
{
|
|
2553
|
+
"id": "headline",
|
|
2554
|
+
"type": "text_generate",
|
|
2555
|
+
"params": {
|
|
2556
|
+
"model": "~google/gemini-flash-latest",
|
|
2557
|
+
"prompt": "Write a 6-word hero headline for a dog-insurance ad."
|
|
2558
|
+
}
|
|
2559
|
+
}
|
|
2560
|
+
```
|
|
2561
|
+
|
|
2562
|
+
---
|
|
2563
|
+
|
|
2564
|
+
##### `image_generate`
|
|
2565
|
+
|
|
2566
|
+
Generate images via OpenRouter. The Zod schema is a permissive union; per-model param/aspect-ratio support is registry-validated.
|
|
2567
|
+
|
|
2568
|
+
**Inputs**
|
|
2569
|
+
|
|
2570
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
2571
|
+
|---|---|---|---|
|
|
2572
|
+
| `reference` | image **or** image[] | no | `image/png`, `image/jpeg`, `image/webp`, `image/gif` |
|
|
2573
|
+
|
|
2574
|
+
`reference` accepts a single image or an **array** of images. Wire several to combine references in one
|
|
2575
|
+
generation — e.g. a subject reference sheet + a font specimen + the original ad. Each image is
|
|
2576
|
+
forwarded to the model as a separate input, in array order (the provider accepts multiple). Wire an
|
|
2577
|
+
array as a literal list: `"reference": ["$ref:subject.sheet", "$ref:type.image"]`.
|
|
2578
|
+
|
|
2579
|
+
**Outputs**
|
|
2580
|
+
|
|
2581
|
+
| Slot | Kind | MIME |
|
|
2582
|
+
|---|---|---|
|
|
2583
|
+
| `images` | image[] | `image/png \| image/jpeg \| image/webp` (provider-determined) |
|
|
2584
|
+
|
|
2585
|
+
**Cost** — 5 credits estimate per call (backend charges actual OpenRouter usage).
|
|
2586
|
+
|
|
2587
|
+
> `seed` is not supported — OpenRouter's `image_config` schema has no seed slot for image-gen.
|
|
2588
|
+
|
|
2589
|
+
Aspect-ratio sets used below:
|
|
2590
|
+
- **STD AR** = `1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9`
|
|
2591
|
+
- **EXTREME AR** = STD AR + `1:4, 4:1, 1:8, 8:1`
|
|
2592
|
+
|
|
2593
|
+
---
|
|
2594
|
+
|
|
2595
|
+
###### Model: `openai/gpt-5.4-image-2`
|
|
2596
|
+
|
|
2597
|
+
Photorealistic generalist. Optional `reference` image.
|
|
2598
|
+
|
|
2599
|
+
| Name | Type | Required | Notes |
|
|
2600
|
+
|---|---|---|---|
|
|
2601
|
+
| `model` | literal | yes | `"openai/gpt-5.4-image-2"` |
|
|
2602
|
+
| `prompt` | string | yes | non-empty |
|
|
2603
|
+
| `aspect_ratio` | enum | no | STD AR |
|
|
2604
|
+
| `image_size` | enum | no | `1K \| 2K \| 4K` |
|
|
2605
|
+
|
|
2606
|
+
```json
|
|
2607
|
+
{ "id": "hero", "type": "image_generate",
|
|
2608
|
+
"params": { "model": "openai/gpt-5.4-image-2", "prompt": "Photo of …", "aspect_ratio": "16:9", "image_size": "2K" } }
|
|
2609
|
+
```
|
|
2610
|
+
|
|
2611
|
+
###### Model: `google/gemini-3.5-flash`
|
|
2612
|
+
|
|
2613
|
+
Newest Gemini Flash image model. Always returns an image output. Best for fast iteration; supports extreme aspect ratios and the `0.5K` size for cheap previews.
|
|
2614
|
+
|
|
2615
|
+
| Name | Type | Required | Notes |
|
|
2616
|
+
|---|---|---|---|
|
|
2617
|
+
| `model` | literal | yes | `"google/gemini-3.5-flash"` |
|
|
2618
|
+
| `prompt` | string | yes | non-empty |
|
|
2619
|
+
| `aspect_ratio` | enum | no | EXTREME AR |
|
|
2620
|
+
| `image_size` | enum | no | `0.5K \| 1K \| 2K \| 4K` |
|
|
2621
|
+
|
|
2622
|
+
###### Model: `google/gemini-3.1-flash-image-preview`
|
|
2623
|
+
|
|
2624
|
+
Preview-channel Gemini flash. Same param surface as `3.5-flash`.
|
|
2625
|
+
|
|
2626
|
+
| Name | Type | Required | Notes |
|
|
2627
|
+
|---|---|---|---|
|
|
2628
|
+
| `model` | literal | yes | `"google/gemini-3.1-flash-image-preview"` |
|
|
2629
|
+
| `prompt` | string | yes | non-empty |
|
|
2630
|
+
| `aspect_ratio` | enum | no | EXTREME AR |
|
|
2631
|
+
| `image_size` | enum | no | `0.5K \| 1K \| 2K \| 4K` |
|
|
2632
|
+
|
|
2633
|
+
###### Model: `google/gemini-3-pro-image-preview`
|
|
2634
|
+
|
|
2635
|
+
"Nano Banana Pro" — heavier, higher-quality Gemini.
|
|
2636
|
+
|
|
2637
|
+
| Name | Type | Required | Notes |
|
|
2638
|
+
|---|---|---|---|
|
|
2639
|
+
| `model` | literal | yes | `"google/gemini-3-pro-image-preview"` |
|
|
2640
|
+
| `prompt` | string | yes | non-empty |
|
|
2641
|
+
| `aspect_ratio` | enum | no | STD AR |
|
|
2642
|
+
| `image_size` | enum | no | `1K \| 2K \| 4K` |
|
|
2643
|
+
|
|
2644
|
+
###### Model: `recraft/recraft-v4.1-pro-vector`
|
|
2645
|
+
|
|
2646
|
+
Crisp vector / logo generator. Adds palette controls forwarded to Recraft via `image_config`.
|
|
2647
|
+
|
|
2648
|
+
| Name | Type | Required | Notes |
|
|
2649
|
+
|---|---|---|---|
|
|
2650
|
+
| `model` | literal | yes | `"recraft/recraft-v4.1-pro-vector"` |
|
|
2651
|
+
| `prompt` | string | yes | non-empty |
|
|
2652
|
+
| `aspect_ratio` | enum | no | STD AR |
|
|
2653
|
+
| `image_size` | enum | no | `1K \| 2K \| 4K` |
|
|
2654
|
+
| `strength` | number | no | 0–1 |
|
|
2655
|
+
| `rgb_colors` | `[r,g,b][]` | no | palette, each component 0–255 |
|
|
2656
|
+
| `background_rgb_color` | `[r,g,b]` | no | background, each component 0–255 |
|
|
2657
|
+
|
|
2658
|
+
```json
|
|
2659
|
+
{ "id": "logo", "type": "image_generate",
|
|
2660
|
+
"params": { "model": "recraft/recraft-v4.1-pro-vector", "prompt": "Bold sans-serif wordmark 'OFFUGO'",
|
|
2661
|
+
"aspect_ratio": "16:9", "image_size": "2K",
|
|
2662
|
+
"rgb_colors": [[255, 98, 89]], "background_rgb_color": [249, 250, 251] } }
|
|
2663
|
+
```
|
|
2664
|
+
|
|
2665
|
+
---
|
|
2666
|
+
|
|
2667
|
+
##### `image_reference_sheet`
|
|
2668
|
+
|
|
2669
|
+
Fuse 1–6 images of a single subject (person, character, or product) into ONE multi-view reference sheet — a labeled turnaround grid (FRONT / SIDE / BACK…) in consistent style and lighting. Identity drift is the default behavior of image models; a sheet converts subject consistency from a prompting problem into a pipeline input. Wire the `sheet` output into the `reference` input of downstream `image_generate` / `video_generate` nodes to keep the subject consistent across many creatives.
|
|
2670
|
+
|
|
2671
|
+
The sheet prompt is baked into the backend — there is no `prompt` param. `subject_description` is spliced verbatim into the template; reuse the exact same wording in downstream prompts. `prompt_override` bypasses the template entirely.
|
|
2672
|
+
|
|
2673
|
+
**Inputs**
|
|
2674
|
+
|
|
2675
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
2676
|
+
|---|---|---|---|
|
|
2677
|
+
| `references` | image[] | yes (1–6) | `image/png`, `image/jpeg`, `image/webp`, `image/gif` |
|
|
2678
|
+
|
|
2679
|
+
**Params**
|
|
2680
|
+
|
|
2681
|
+
| Name | Type | Required | Notes |
|
|
2682
|
+
|---|---|---|---|
|
|
2683
|
+
| `model` | enum | yes | one of `google/gemini-3-pro-image-preview`, `google/gemini-3.1-flash-image-preview` |
|
|
2684
|
+
| `subject_description` | string | yes | exact subject wording — reuse verbatim downstream |
|
|
2685
|
+
| `subject_type` | enum | yes | `character \| person \| product` — picks the template + default views |
|
|
2686
|
+
| `views` | string[] | no | 2–6 view labels; defaults per `subject_type` (below) |
|
|
2687
|
+
| `style` | string | no | style/lighting notes, e.g. `"32-bit pixel art"`, `"matte black studio background"` |
|
|
2688
|
+
| `prompt_override` | string | no | full template bypass |
|
|
2689
|
+
| `aspect_ratio` | enum | no | STD AR on `3-pro`, EXTREME AR on `3.1-flash` |
|
|
2690
|
+
| `image_size` | enum | no | `1K \| 2K \| 4K` on `3-pro`, `0.5K \| 1K \| 2K \| 4K` on `3.1-flash` |
|
|
2691
|
+
|
|
2692
|
+
Default views per `subject_type`:
|
|
2693
|
+
|
|
2694
|
+
| `subject_type` | Default views |
|
|
2695
|
+
|---|---|
|
|
2696
|
+
| `character` | FRONT, THREE-QUARTER LEFT, SIDE LEFT, BACK |
|
|
2697
|
+
| `person` | FRONT, THREE-QUARTER LEFT, SIDE LEFT, THREE-QUARTER RIGHT, SIDE RIGHT, BACK |
|
|
2698
|
+
| `product` | FRONT, BACK, LEFT SIDE, RIGHT SIDE, TOP, THREE-QUARTER |
|
|
2699
|
+
|
|
2700
|
+
**Outputs**
|
|
2701
|
+
|
|
2702
|
+
| Slot | Kind | MIME |
|
|
2703
|
+
|---|---|---|
|
|
2704
|
+
| `sheet` | image | `image/png \| image/jpeg \| image/webp` (provider-determined) |
|
|
2705
|
+
|
|
2706
|
+
**Cost** — 20 credits estimate on `3-pro`, 5 on `3.1-flash` (backend charges actual OpenRouter usage).
|
|
2707
|
+
|
|
2708
|
+
Guidance:
|
|
2709
|
+
- 3–4 full-body views at `16:9` is the sweet spot for characters; more views shrink each figure and lose face detail.
|
|
2710
|
+
- Use `google/gemini-3.1-flash-image-preview` while iterating; switch to `google/gemini-3-pro-image-preview` at `2K`+ for final 6-view sheets.
|
|
2711
|
+
- Feed the whole sheet downstream as one `reference` input — don't crop individual views.
|
|
2712
|
+
|
|
2713
|
+
**Example** — two photos of a sneaker → product sheet → consistent hero shot:
|
|
2714
|
+
|
|
2715
|
+
```json
|
|
2716
|
+
{
|
|
2717
|
+
"schema": "baker-canvas/1",
|
|
2718
|
+
"nodes": [
|
|
2719
|
+
{ "id": "a", "type": "ingest", "params": { "source": "url", "url": "https://example.com/sneaker-front.jpg", "expect": "image" } },
|
|
2720
|
+
{ "id": "b", "type": "ingest", "params": { "source": "url", "url": "https://example.com/sneaker-side.jpg", "expect": "image" } },
|
|
2721
|
+
{ "id": "sheet", "type": "image_reference_sheet",
|
|
2722
|
+
"inputs": { "references": ["$ref:a.asset", "$ref:b.asset"] },
|
|
2723
|
+
"params": { "model": "google/gemini-3-pro-image-preview",
|
|
2724
|
+
"subject_description": "navy leather low-top sneaker with white sole and gold logo",
|
|
2725
|
+
"subject_type": "product",
|
|
2726
|
+
"style": "matte black studio background, soft even key light",
|
|
2727
|
+
"aspect_ratio": "16:9", "image_size": "2K" } },
|
|
2728
|
+
{ "id": "hero", "type": "image_generate",
|
|
2729
|
+
"inputs": { "reference": "$ref:sheet.sheet" },
|
|
2730
|
+
"params": { "model": "google/gemini-3-pro-image-preview",
|
|
2731
|
+
"prompt": "The navy leather low-top sneaker with white sole and gold logo from the reference sheet, hero shot on a rain-soaked neon street at night" } }
|
|
2732
|
+
],
|
|
2733
|
+
"output": { "node": "hero", "output": "images" }
|
|
2734
|
+
}
|
|
2735
|
+
```
|
|
2736
|
+
|
|
2737
|
+
---
|
|
2738
|
+
|
|
2739
|
+
##### `image_aspect_adapt`
|
|
2740
|
+
|
|
2741
|
+
Adapt ONE creative into multiple aspect ratios in a single step — the standard fan-out when a hero creative must ship to every placement (Meta: 9:16 stories, 1:1 feed, 4:5, 16:9…). The AI recomposes the layout per format: identical subject, faces, products, text (verbatim, same typography), logos, colors, and style; the scene is extended/restructured for the new canvas — never stretched, cropped, or letterboxed.
|
|
2742
|
+
|
|
2743
|
+
The adaptation prompt is baked into the backend and is direction-aware (a wider target gets horizontal recomposition guidance, a taller one vertical). Formats that already match the source ratio (within ~2%) pass the source asset through unchanged — no model call, no cost.
|
|
2744
|
+
|
|
2745
|
+
**Inputs**
|
|
2746
|
+
|
|
2747
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
2748
|
+
|---|---|---|---|
|
|
2749
|
+
| `source` | image | yes | `image/png`, `image/jpeg`, `image/webp`, `image/gif` |
|
|
2750
|
+
|
|
2751
|
+
**Params**
|
|
2752
|
+
|
|
2753
|
+
| Name | Type | Required | Notes |
|
|
2754
|
+
|---|---|---|---|
|
|
2755
|
+
| `model` | enum | yes | one of `google/gemini-3-pro-image-preview`, `google/gemini-3.1-flash-image-preview` |
|
|
2756
|
+
| `formats` | enum[] | yes | 1–6 unique target ratios from `1:1 \| 2:3 \| 3:2 \| 3:4 \| 4:3 \| 4:5 \| 5:4 \| 9:16 \| 16:9 \| 21:9` — output order follows this list |
|
|
2757
|
+
| `guidance` | string | no | hints appended to the baked prompt, e.g. `"keep the CTA button fully visible"` |
|
|
2758
|
+
| `image_size` | enum | no | `1K \| 2K \| 4K` on `3-pro`, `0.5K \| 1K \| 2K \| 4K` on `3.1-flash` |
|
|
2759
|
+
|
|
2760
|
+
**Outputs**
|
|
2761
|
+
|
|
2762
|
+
| Slot | Kind | MIME |
|
|
2763
|
+
|---|---|---|
|
|
2764
|
+
| `images` | image[] | one per requested format, in `formats` order; each carries `metadata.aspect_ratio` and `metadata.adapted` (`false` = source passed through) |
|
|
2765
|
+
|
|
2766
|
+
**Cost** — per-format estimate: 20 credits on `3-pro`, 5 on `3.1-flash`, × `formats.length` (backend charges actual OpenRouter usage; pass-through formats are free).
|
|
2767
|
+
|
|
2768
|
+
Guidance:
|
|
2769
|
+
- List the source's own ratio in `formats` too when you want the full set in one output — it passes through for free.
|
|
2770
|
+
- Use `google/gemini-3.1-flash-image-preview` while iterating; switch to `google/gemini-3-pro-image-preview` for final-quality adaptation.
|
|
2771
|
+
- Indexes are positional: `$ref:adapt.images#1` is always the second entry of `formats`.
|
|
2772
|
+
|
|
2773
|
+
**Example** — one 9:16 story creative fanned out to feed and landscape:
|
|
2774
|
+
|
|
2775
|
+
```json
|
|
2776
|
+
{
|
|
2777
|
+
"schema": "baker-canvas/1",
|
|
2778
|
+
"nodes": [
|
|
2779
|
+
{ "id": "hero", "type": "image_generate",
|
|
2780
|
+
"params": { "model": "google/gemini-3-pro-image-preview",
|
|
2781
|
+
"prompt": "Story-format ad for a citrus energy drink: can centered, bold OFFER text top, logo bottom",
|
|
2782
|
+
"aspect_ratio": "9:16", "image_size": "2K" } },
|
|
2783
|
+
{ "id": "adapt", "type": "image_aspect_adapt",
|
|
2784
|
+
"inputs": { "source": "$ref:hero.images#0" },
|
|
2785
|
+
"params": { "model": "google/gemini-3-pro-image-preview",
|
|
2786
|
+
"formats": ["9:16", "1:1", "16:9"],
|
|
2787
|
+
"guidance": "keep the offer text and logo fully visible" } }
|
|
2788
|
+
],
|
|
2789
|
+
"output": { "node": "adapt", "output": "images" }
|
|
2790
|
+
}
|
|
2791
|
+
```
|
|
2792
|
+
|
|
2793
|
+
---
|
|
2794
|
+
|
|
2795
|
+
##### `image_describe`
|
|
2796
|
+
|
|
2797
|
+
Reverse-engineer an image into an exhaustive, replication-grade JSON blueprint via an OpenRouter vision model. The extraction prompt ("Visual Architect") is baked into the backend and returns one JSON object covering not just WHAT is in the image but HOW it is rendered and WHY: `source_context` (who the advertiser is, what they sell, market, and their inferred brand palette), `meta`, `composition`, `subjects` (non-person objects/graphics, each with `expression`, `emotion_conveyed`, `gaze`, `treatment`, and `role_in_intent`), `people` (deep per-person detail plus expression/treatment/role-in-intent), `environment`, `camera` (estimated focal length, aperture, DoF), `lighting`, `color` (hex palette where each color is tagged `brand_ownership` — brand vs borrowed-functional — plus `purpose` and a `purpose_note`), `materials_textures`, `text_content` (all visible text verbatim), `brands_logos` (each mark identified by brand — "Trustpilot", never "green star"), `ad_signals` (proof badges, CTA, price/offer; null when not an ad), `ad_intent` (the persuasion engine: primary emotion, mechanism, arc, viewer takeaway; null when not an ad), `style`, and `post_processing`. The model runs at temperature 0 with JSON mode enforced.
|
|
2798
|
+
|
|
2799
|
+
Built for **market adaptation**: logos are named by brand, people and animals carry expression/emotion/intent, and each color is tagged brand vs borrowed-functional — so a downstream transform can keep, localize, or swap each element, preserve the emotional intent, and selectively keep the reds/yellows that do a job (a red "without insurance" column) rather than blindly recoloring to the new brand. Pass `context` with what you already know (advertiser, category, market) to ground `source_context` and color ownership. Because the output is a json asset, wiring it into a downstream `{{slot}}` splices the full blueprint into that prompt — restyle a reference onto a new subject, lock a look across a series, or feed exact palette/lighting into `image_generate`. The output is rich; for dense ads raise `max_tokens` (e.g. 8000+) so the JSON isn't truncated.
|
|
2800
|
+
|
|
2801
|
+
**Inputs**
|
|
2802
|
+
|
|
2803
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
2804
|
+
|---|---|---|---|
|
|
2805
|
+
| `image` | image | yes | `image/png`, `image/jpeg`, `image/webp`, `image/gif` |
|
|
2806
|
+
|
|
2807
|
+
**Params**
|
|
2808
|
+
|
|
2809
|
+
| Name | Type | Required | Notes |
|
|
2810
|
+
|---|---|---|---|
|
|
2811
|
+
| `model` | enum | yes | one of `~google/gemini-pro-latest`, `~google/gemini-flash-latest` (`pro` for the densest extraction) |
|
|
2812
|
+
| `focus` | string | no | aspects to emphasise, e.g. `"typography and color palette"` |
|
|
2813
|
+
| `context` | string | no | known provenance to ground `source_context` and color ownership, e.g. `"competitor ad for AssurOpoil, Italian pet insurance"` |
|
|
2814
|
+
| `temperature` | number | no | 0–2, defaults to 0 |
|
|
2815
|
+
| `max_tokens` | number | no | positive int |
|
|
2816
|
+
|
|
2817
|
+
There is no `prompt` param — the extraction schema is the node. Use `focus` to steer emphasis and `context` to supply provenance.
|
|
2818
|
+
|
|
2819
|
+
**Outputs**
|
|
2820
|
+
|
|
2821
|
+
| Slot | Kind | MIME |
|
|
2822
|
+
|---|---|---|
|
|
2823
|
+
| `description` | json | `application/json` |
|
|
2824
|
+
|
|
2825
|
+
**Cost** — 2 credits estimate per call (backend charges actual OpenRouter usage).
|
|
2826
|
+
|
|
2827
|
+
**Example** — extract a blueprint from a reference and restyle it onto a new subject:
|
|
2828
|
+
|
|
2829
|
+
```json
|
|
2830
|
+
{
|
|
2831
|
+
"schema": "baker-canvas/1",
|
|
2832
|
+
"nodes": [
|
|
2833
|
+
{ "id": "ref", "type": "ingest", "params": { "source": "url", "url": "https://example.com/competitor-ad.png", "expect": "image" } },
|
|
2834
|
+
{ "id": "blueprint", "type": "image_describe",
|
|
2835
|
+
"inputs": { "image": "$ref:ref.asset" },
|
|
2836
|
+
"params": { "model": "~google/gemini-pro-latest" } },
|
|
2837
|
+
{ "id": "remix", "type": "image_generate",
|
|
2838
|
+
"inputs": { "blueprint": "$ref:blueprint.description" },
|
|
2839
|
+
"params": { "model": "openai/gpt-5.4-image-2",
|
|
2840
|
+
"prompt": "Recreate an image matching this exact blueprint, but swap the product for a ceramic mug:\n{{blueprint}}" } }
|
|
2841
|
+
],
|
|
2842
|
+
"output": { "node": "remix", "output": "images" }
|
|
2843
|
+
}
|
|
2844
|
+
```
|
|
2845
|
+
|
|
2846
|
+
---
|
|
2847
|
+
|
|
2848
|
+
##### `image_search`
|
|
2849
|
+
|
|
2850
|
+
Search the web for real images instead of generating them. A backend LLM agent picks among Google Images, stock photography (Freepik), and Pinterest, refines queries, and selects the best matches; the chosen images are then downloaded into canvas assets. Use it to gather references or candidates (e.g. five photos of an Australian Shepherd) for a later pick-the-best step.
|
|
2851
|
+
|
|
2852
|
+
**Inputs** — none.
|
|
2853
|
+
|
|
2854
|
+
**Params**
|
|
2855
|
+
|
|
2856
|
+
| Name | Type | Required | Notes |
|
|
2857
|
+
|---|---|---|---|
|
|
2858
|
+
| `prompt` | string | yes | What to find, non-empty |
|
|
2859
|
+
| `count` | int | no | Images to return, 1–20 (default 5) |
|
|
2860
|
+
|
|
2861
|
+
**Outputs**
|
|
2862
|
+
|
|
2863
|
+
| Slot | Kind | MIME |
|
|
2864
|
+
|---|---|---|
|
|
2865
|
+
| `images` | image[] | `image/png \| image/jpeg \| image/webp \| image/gif \| …` (source-determined) |
|
|
2866
|
+
|
|
2867
|
+
**Cost** — `2 + count/2` credits estimate per call (backend charges actual LLM + search-API usage). May return fewer than `count` images when sources run dry or downloads fail; fails only when nothing is retrievable.
|
|
2868
|
+
|
|
2869
|
+
**Example**
|
|
2870
|
+
|
|
2871
|
+
```json
|
|
2872
|
+
{
|
|
2873
|
+
"id": "refs",
|
|
2874
|
+
"type": "image_search",
|
|
2875
|
+
"params": { "prompt": "australian shepherd running on a beach", "count": 5 }
|
|
2876
|
+
}
|
|
2877
|
+
```
|
|
2878
|
+
|
|
2879
|
+
---
|
|
2880
|
+
|
|
2881
|
+
##### `image_select`
|
|
2882
|
+
|
|
2883
|
+
Pick the best `count` images out of 2+ candidates with a vision LLM, judged against a prompt. The output is a passthrough subset of the input refs — no new pixels are generated, so selection is cheap. Use it after fanning out several `image_generate` variants (or an `image_search` batch) to keep only the strongest before expensive downstream steps.
|
|
2884
|
+
|
|
2885
|
+
**Inputs**
|
|
2886
|
+
|
|
2887
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
2888
|
+
|---|---|---|---|
|
|
2889
|
+
| `images` | image[] | yes — at least 2 | `image/png`, `image/jpeg`, `image/webp`, `image/gif` |
|
|
2890
|
+
|
|
2891
|
+
Wire either a whole upstream array (`"$ref:gen.images"`) or a literal list collecting refs from several nodes (`["$ref:a.images#0", "$ref:b.images#0"]`).
|
|
2892
|
+
|
|
2893
|
+
**Params**
|
|
2894
|
+
|
|
2895
|
+
| Name | Type | Required | Notes |
|
|
2896
|
+
|---|---|---|---|
|
|
2897
|
+
| `model` | enum | yes | one of `~google/gemini-flash-latest`, `~google/gemini-pro-latest` (`pro` for harder aesthetic judgement) |
|
|
2898
|
+
| `prompt` | string | yes | selection criteria, e.g. `"most professional product shot, clean background"` |
|
|
2899
|
+
| `count` | int | no | images to select, ≥ 1 (default 1); must not exceed the wired image count |
|
|
2900
|
+
| `temperature` | number | no | 0–2, defaults to 0 |
|
|
2901
|
+
| `max_tokens` | number | no | positive int |
|
|
2902
|
+
|
|
2903
|
+
**Outputs**
|
|
2904
|
+
|
|
2905
|
+
| Slot | Kind | MIME |
|
|
2906
|
+
|---|---|---|
|
|
2907
|
+
| `images` | image[] | per input — exactly `count` refs, in the model's preference order, so `images#0`…`images#count-1` are always safe to wire |
|
|
2908
|
+
| `reasoning` | text | `text/plain` — why the winners won, compared against the rejected candidates |
|
|
2909
|
+
|
|
2910
|
+
**Cost** — 1 credit estimate per call (backend charges actual OpenRouter usage).
|
|
2911
|
+
|
|
2912
|
+
Fail-fast: an invalid selection returned by the model (wrong count, duplicate or out-of-range picks, non-JSON response) is a retryable provider error — the node never pads or repairs a selection. Passthrough refs keep `kind`/`url`/`sha256`/`mime`; upstream `width`/`height` metadata is not carried through.
|
|
2913
|
+
|
|
2914
|
+
**Example** — fan out 3 variants, keep the best one:
|
|
2915
|
+
|
|
2916
|
+
```json
|
|
2917
|
+
{
|
|
2918
|
+
"schema": "baker-canvas/1",
|
|
2919
|
+
"nodes": [
|
|
2920
|
+
{ "id": "a", "type": "image_generate", "params": { "model": "google/gemini-3.5-flash", "prompt": "hero shot of a ceramic mug, warm morning light" } },
|
|
2921
|
+
{ "id": "b", "type": "image_generate", "params": { "model": "google/gemini-3.5-flash", "prompt": "hero shot of a ceramic mug, studio softbox" } },
|
|
2922
|
+
{ "id": "c", "type": "image_generate", "params": { "model": "google/gemini-3.5-flash", "prompt": "hero shot of a ceramic mug, dramatic side light" } },
|
|
2923
|
+
{ "id": "best", "type": "image_select",
|
|
2924
|
+
"inputs": { "images": ["$ref:a.images#0", "$ref:b.images#0", "$ref:c.images#0"] },
|
|
2925
|
+
"params": { "model": "~google/gemini-pro-latest", "prompt": "most premium-feeling lighting, no harsh shadows" } }
|
|
2926
|
+
],
|
|
2927
|
+
"output": { "node": "best", "output": "images" }
|
|
2928
|
+
}
|
|
2929
|
+
```
|
|
2930
|
+
|
|
2931
|
+
---
|
|
2932
|
+
|
|
2933
|
+
##### `video_generate`
|
|
2934
|
+
|
|
2935
|
+
Generate video. Async with polling. Two curated models.
|
|
2936
|
+
|
|
2937
|
+
**Inputs**
|
|
2938
|
+
|
|
2939
|
+
| Slot | Kind | Required | Notes |
|
|
2940
|
+
|---|---|---|---|
|
|
2941
|
+
| `first_frame` | image | no | Starting keyframe (image-to-video) |
|
|
2942
|
+
| `last_frame` | image | no | Ending keyframe |
|
|
2943
|
+
| `reference` | image | no | Alternative to `first_frame` when only one image is given |
|
|
2944
|
+
|
|
2945
|
+
Accepted ref-image MIMEs vary by model — see per-model sections below.
|
|
2946
|
+
|
|
2947
|
+
**Outputs**
|
|
2948
|
+
|
|
2949
|
+
| Slot | Kind | MIME |
|
|
2950
|
+
|---|---|---|
|
|
2951
|
+
| `video` | video | `video/mp4` |
|
|
2952
|
+
|
|
2953
|
+
**Cost** — 50 credits estimate per call.
|
|
2954
|
+
|
|
2955
|
+
---
|
|
2956
|
+
|
|
2957
|
+
###### Model: `bytedance/seedance-2.0`
|
|
2958
|
+
|
|
2959
|
+
Production-quality ad-creative model. Routed via **fal.ai** (not OpenRouter) because OpenRouter's Seedance passthrough rejects photorealistic human reference frames via ByteDance's "real person" safety filter.
|
|
2960
|
+
|
|
2961
|
+
Ref-image MIMEs: `image/png`, `image/jpeg`, `image/webp` (via fal.ai).
|
|
2962
|
+
|
|
2963
|
+
| Name | Type | Required | Notes |
|
|
2964
|
+
|---|---|---|---|
|
|
2965
|
+
| `model` | literal | yes | `"bytedance/seedance-2.0"` |
|
|
2966
|
+
| `prompt` | string | yes | non-empty |
|
|
2967
|
+
| `duration` | enum | no | `4, 5, 6, 8, 10, 12, 15` (gaps trigger 400s) |
|
|
2968
|
+
| `resolution` | enum | no | `480p \| 720p \| 1080p` |
|
|
2969
|
+
| `aspect_ratio` | enum | no | `1:1, 3:4, 9:16, 4:3, 16:9, 21:9, 9:21` |
|
|
2970
|
+
| `generate_audio` | boolean | no | native audio track |
|
|
2971
|
+
| `seed` | number | no | nonneg int |
|
|
2972
|
+
|
|
2973
|
+
> Seedance does **not** accept `negative_prompt` — its OpenRouter passthrough is `[watermark, req_key]` only. Use Veo if you need negative prompting.
|
|
2974
|
+
|
|
2975
|
+
```json
|
|
2976
|
+
{ "id": "clip", "type": "video_generate",
|
|
2977
|
+
"inputs": { "first_frame": "$ref:start.images#0", "last_frame": "$ref:end.images#0" },
|
|
2978
|
+
"params": { "model": "bytedance/seedance-2.0", "prompt": "Same person, talking head, …",
|
|
2979
|
+
"duration": 8, "resolution": "720p", "aspect_ratio": "9:16", "generate_audio": false } }
|
|
2980
|
+
```
|
|
2981
|
+
|
|
2982
|
+
###### Model: `google/veo-3.1-fast`
|
|
2983
|
+
|
|
2984
|
+
Cheap/fast model — intended for iteration and tests. Routed via OpenRouter with passthroughs under `provider.options.google-vertex.parameters`.
|
|
2985
|
+
|
|
2986
|
+
Ref-image MIMEs: `image/png`, `image/jpeg`, `image/webp`, `image/gif` (via OpenRouter).
|
|
2987
|
+
|
|
2988
|
+
| Name | Type | Required | Notes |
|
|
2989
|
+
|---|---|---|---|
|
|
2990
|
+
| `model` | literal | yes | `"google/veo-3.1-fast"` |
|
|
2991
|
+
| `prompt` | string | yes | non-empty |
|
|
2992
|
+
| `duration` | enum | no | `4, 6, 8` |
|
|
2993
|
+
| `resolution` | enum | no | `720p \| 1080p` |
|
|
2994
|
+
| `aspect_ratio` | enum | no | `16:9 \| 9:16` |
|
|
2995
|
+
| `generate_audio` | boolean | no | native audio track |
|
|
2996
|
+
| `seed` | number | no | nonneg int |
|
|
2997
|
+
| `negative_prompt` | string | no | Veo passthrough |
|
|
2998
|
+
| `person_generation` | enum | no | only `allow_all` is currently verified |
|
|
2999
|
+
| `enhance_prompt` | boolean | no | Veo passthrough |
|
|
3000
|
+
| `conditioning_scale` | number | no | Veo passthrough |
|
|
3001
|
+
|
|
3002
|
+
```json
|
|
3003
|
+
{ "id": "iter", "type": "video_generate",
|
|
3004
|
+
"params": { "model": "google/veo-3.1-fast", "prompt": "Cinematic dolly-in on a logo",
|
|
3005
|
+
"duration": 6, "resolution": "1080p", "aspect_ratio": "16:9", "enhance_prompt": true } }
|
|
3006
|
+
```
|
|
3007
|
+
|
|
3008
|
+
---
|
|
3009
|
+
|
|
3010
|
+
##### `tts`
|
|
3011
|
+
|
|
3012
|
+
Single-voice text-to-speech via ElevenLabs.
|
|
3013
|
+
|
|
3014
|
+
**Inputs**
|
|
3015
|
+
|
|
3016
|
+
None.
|
|
3017
|
+
|
|
3018
|
+
**Outputs**
|
|
3019
|
+
|
|
3020
|
+
| Slot | Kind | MIME |
|
|
3021
|
+
|---|---|---|
|
|
3022
|
+
| `audio` | audio | provider-determined (mp3 by default) |
|
|
3023
|
+
| `timestamps` | json | `application/json` — present only when `with_timestamps: true` |
|
|
3024
|
+
|
|
3025
|
+
**Cost** — `max(1, ceil(text.length × 0.0015))` credits.
|
|
3026
|
+
|
|
3027
|
+
###### Model: `elevenlabs/eleven_v3`
|
|
3028
|
+
|
|
3029
|
+
| Name | Type | Required | Notes |
|
|
3030
|
+
|---|---|---|---|
|
|
3031
|
+
| `model` | literal | yes | `"elevenlabs/eleven_v3"` |
|
|
3032
|
+
| `text` | string | yes | non-empty, ≤45454 chars (the $10 per-node cost cap) |
|
|
3033
|
+
| `voice` | string | yes | ElevenLabs voice id |
|
|
3034
|
+
| `language_code` | string | no | ISO 639-1 hint |
|
|
3035
|
+
| `stability` | number | no | 0–1 (top-level shortcut) |
|
|
3036
|
+
| `similarity_boost` | number | no | 0–1 (top-level shortcut) |
|
|
3037
|
+
| `voice_settings` | json | no | `{ stability, similarity_boost, style, use_speaker_boost, speed }` |
|
|
3038
|
+
| `seed` | number | no | 0–4294967295 |
|
|
3039
|
+
| `pronunciation_dictionary_locators` | json | no | up to 3 dicts |
|
|
3040
|
+
| `apply_text_normalization` | enum | no | `auto \| on \| off` |
|
|
3041
|
+
| `apply_language_text_normalization` | boolean | no | Japanese-only, adds latency |
|
|
3042
|
+
| `with_timestamps` | boolean | no | character-level alignment in `timestamps` output |
|
|
3043
|
+
| `output_format` | enum | no | `mp3_22050_32 … mp3_44100_192` (mp3 family only — assets are stored as `audio/mpeg`) |
|
|
3044
|
+
|
|
3045
|
+
> eleven_v3 does **not** support context stitching (`previous_text`/`next_text`/`*_request_ids`) — ElevenLabs returns `unsupported_model` if you pass them, so the registry rejects them at validation time.
|
|
3046
|
+
|
|
3047
|
+
```json
|
|
3048
|
+
{ "id": "vo", "type": "tts",
|
|
3049
|
+
"params": { "model": "elevenlabs/eleven_v3", "text": "Hey! Quick story —", "voice": "JBFqnCBsd6RMkjVDRZzb",
|
|
3050
|
+
"language_code": "en", "with_timestamps": true } }
|
|
3051
|
+
```
|
|
3052
|
+
|
|
3053
|
+
---
|
|
3054
|
+
|
|
3055
|
+
##### `video_lipsync`
|
|
3056
|
+
|
|
3057
|
+
Lip-sync a video to an audio track via VEED (fal.ai).
|
|
3058
|
+
|
|
3059
|
+
**Inputs**
|
|
3060
|
+
|
|
3061
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
3062
|
+
|---|---|---|---|
|
|
3063
|
+
| `video` | video | yes | `video/mp4`, `video/webm`, `video/quicktime` |
|
|
3064
|
+
| `audio` | audio | yes | `audio/wav`, `audio/mpeg`, `audio/mp3` |
|
|
3065
|
+
|
|
3066
|
+
**Params**
|
|
3067
|
+
|
|
3068
|
+
| Name | Type | Required | Notes |
|
|
3069
|
+
|---|---|---|---|
|
|
3070
|
+
| `model` | literal | yes | `"fal/veed-lipsync"` |
|
|
3071
|
+
|
|
3072
|
+
**Outputs**
|
|
3073
|
+
|
|
3074
|
+
| Slot | Kind | MIME |
|
|
3075
|
+
|---|---|---|
|
|
3076
|
+
| `video` | video | `video/mp4` |
|
|
3077
|
+
|
|
3078
|
+
**Cost** — 20 credits per call.
|
|
3079
|
+
|
|
3080
|
+
---
|
|
3081
|
+
|
|
3082
|
+
##### `video_background_remove`
|
|
3083
|
+
|
|
3084
|
+
Strip a video's background → alpha WebM/H264. Powered by fal.ai VEED.
|
|
3085
|
+
|
|
3086
|
+
**Inputs**
|
|
3087
|
+
|
|
3088
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
3089
|
+
|---|---|---|---|
|
|
3090
|
+
| `video` | video | yes | `video/mp4`, `video/webm`, `video/quicktime` |
|
|
3091
|
+
|
|
3092
|
+
**Params**
|
|
3093
|
+
|
|
3094
|
+
| Name | Type | Required | Default | Notes |
|
|
3095
|
+
|---|---|---|---|---|
|
|
3096
|
+
| `model` | literal | no | `"fal/veed-video-background-removal"` | — |
|
|
3097
|
+
| `edge_refinement` | boolean | no | `true` | — |
|
|
3098
|
+
| `output_codec` | enum | no | `"vp9"` | `vp9 \| h264` |
|
|
3099
|
+
|
|
3100
|
+
**Outputs**
|
|
3101
|
+
|
|
3102
|
+
| Slot | Kind | MIME |
|
|
3103
|
+
|---|---|---|
|
|
3104
|
+
| `video` | video | `video/webm` (VP9 + alpha) or `video/mp4` (H264 RGB + alpha) |
|
|
3105
|
+
|
|
3106
|
+
**Cost** — 50 credits per call.
|
|
3107
|
+
|
|
3108
|
+
---
|
|
3109
|
+
|
|
3110
|
+
##### `video_transcribe`
|
|
3111
|
+
|
|
3112
|
+
Word-level transcription. Default `transcriber:"groq"` uses Groq Whisper Large v3 Turbo; `transcriber:"deepgram"` uses Deepgram Nova-3, which additionally emits a `rich` output with punctuated words + paragraph/sentence grouping (and speaker indices). Auto-extracts audio locally (mono 16 kHz, 64 kbps MP3) before uploading — payload shrinks ~100× vs. sending the full video.
|
|
3113
|
+
|
|
3114
|
+
**Inputs**
|
|
3115
|
+
|
|
3116
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
3117
|
+
|---|---|---|---|
|
|
3118
|
+
| `video` | video | yes | `video/mp4`, `video/webm`, `video/quicktime` |
|
|
3119
|
+
|
|
3120
|
+
**Params**
|
|
3121
|
+
|
|
3122
|
+
| Name | Type | Required | Default | Notes |
|
|
3123
|
+
|---|---|---|---|---|
|
|
3124
|
+
| `language` | string | no | auto | ISO 639-1, 2–8 chars (e.g. `en`, `es`) |
|
|
3125
|
+
| `transcriber` | enum | no | `groq` | `groq` (Whisper) \| `deepgram` (Nova-3, adds the `rich` output) |
|
|
3126
|
+
|
|
3127
|
+
**Outputs**
|
|
3128
|
+
|
|
3129
|
+
| Slot | Kind | MIME |
|
|
3130
|
+
|---|---|---|
|
|
3131
|
+
| `transcript` | json | `application/json` — array of `{text, start, end}` (Deepgram prefers the punctuated word form) |
|
|
3132
|
+
| `rich` | json | `application/json` — Deepgram only: `{text, words[], paragraphs[], duration_sec, language?}` with punctuated words, sentences, and speaker indices |
|
|
3133
|
+
|
|
3134
|
+
**Cost** — 2 credits estimate; backend reports actual based on duration (Groq: $0.04/hr, 10s minimum; Deepgram Nova-3: $0.0043/min).
|
|
3135
|
+
|
|
3136
|
+
**Requirements** — `ffmpeg` on PATH for audio extraction (already in the E2B sandbox image). Without it the node still works but uploads the full video and Groq's 100 MB cap applies.
|
|
3137
|
+
|
|
3138
|
+
---
|
|
3139
|
+
|
|
3140
|
+
##### `video_deconstruct`
|
|
3141
|
+
|
|
3142
|
+
Reverse-engineer a video into a replication-grade blueprint: scene boundaries, the **real start/end frame of every scene** (extracted from the video as PNG images), and an exhaustive JSON analysis — per-scene action detail, camera motion, generation-ready frame/motion prompts, overlay text with full typographic style, floating elements, deeply detailed `global.cast` (perceived demographics, ethnicity/skin-tone, styling, market-recasting notes), brand-identified logos in `global.branding` (named by brand and what they signal — "Trustpilot", never "green star" — with on-screen timestamps), dialogue with castable voice descriptions, music spec, SFX list — plus a word-level transcript. Built to feed reproduction/remix and market-adaptation workflows. When the backend has an `AUDD_API_TOKEN` configured and the blueprint reports music, an AudD recognition pass identifies the backing track at `analysis.global.music.identified_track` (title/artist/album + Spotify/Apple links); a recognition miss never fails the run.
|
|
3143
|
+
|
|
3144
|
+
**Agent workflow** (structure first, then extract, then author):
|
|
3145
|
+
|
|
3146
|
+
1. *(optional, cheap)* `mode:"index"` — one LLM call, no frames: global blueprint + scene boundaries + transcript. Use it to see how many scenes exist before planning.
|
|
3147
|
+
2. Full deconstruct — short videos (≤8 min) in one node; longer videos as **several parallel nodes**, each with a `start_s`/`end_s` window of ≤480s (snap window edges to scene boundaries from the index pass). Merge the per-window analyses by concatenating `scenes` (each carries an absolute-time `window` marker).
|
|
3148
|
+
3. For a full reproduction, you normally don't run this node directly — `baker canvas scaffold-video <video>` runs the deconstruct for you and scaffolds the canvas. Use this node standalone when you need the raw `analysis`/`transcript` JSON (e.g. windowed long-video analysis) for a custom workflow.
|
|
3149
|
+
|
|
3150
|
+
Over-length runs fail with a message that includes ready-to-use suggested windows, so the loop is self-correcting.
|
|
3151
|
+
|
|
3152
|
+
**Inputs**
|
|
3153
|
+
|
|
3154
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
3155
|
+
|---|---|---|---|
|
|
3156
|
+
| `video` | video | yes | `video/mp4`, `video/webm`, `video/quicktime` |
|
|
3157
|
+
|
|
3158
|
+
**Params**
|
|
3159
|
+
|
|
3160
|
+
| Name | Type | Required | Default | Notes |
|
|
3161
|
+
|---|---|---|---|---|
|
|
3162
|
+
| `model` | enum | yes | — | `~google/gemini-flash-latest` (cheap/fast) \| `~google/gemini-pro-latest` (densest extraction) |
|
|
3163
|
+
| `mode` | enum | no | `full` | `index` = structure-first pass: boundaries + global blueprint + transcript, no frames, ~60s |
|
|
3164
|
+
| `language` | string | no | auto | ISO 639-1 transcription hint (e.g. `en`, `es`) |
|
|
3165
|
+
| `transcriber` | enum | no | `groq` | transcript provider: `groq` (Whisper) \| `deepgram` (Nova-3, punctuated words) |
|
|
3166
|
+
| `max_scenes` | number | no | `20` (full ≤3 min) / `40` (longer) / `60` (index) | 1–60; extra cuts merge into the last scene |
|
|
3167
|
+
| `focus` | string | no | — | extra extraction emphasis (e.g. `"overlay typography"`) |
|
|
3168
|
+
| `start_s` / `end_s` | number | no | whole video | analysis window (absolute seconds) for chunking long videos; window ≤480s |
|
|
3169
|
+
|
|
3170
|
+
**Outputs**
|
|
3171
|
+
|
|
3172
|
+
| Slot | Kind | MIME |
|
|
3173
|
+
|---|---|---|
|
|
3174
|
+
| `analysis` | json | `application/json` — `video-deconstruct/1` blueprint (or `video-deconstruct-index/1` in index mode) |
|
|
3175
|
+
| `start_frames` | image[] | `image/png` — real first frame of each scene, native resolution (absent in `mode:"index"`) |
|
|
3176
|
+
| `end_frames` | image[] | `image/png` — real last frame of each scene (absent in `mode:"index"`) |
|
|
3177
|
+
| `transcript` | json | `application/json` — array of `{text, start, end}` (same contract as `video_transcribe`; window-scoped on windowed runs) |
|
|
3178
|
+
|
|
3179
|
+
`analysis.scenes[i]` aligns positionally with `start_frames#i` / `end_frames#i`, and each scene also embeds its frames' `{url, sha256}` so the JSON is self-sufficient.
|
|
3180
|
+
|
|
3181
|
+
**Reproduction recipe** — the blueprint maps 1:1 onto generation nodes:
|
|
3182
|
+
|
|
3183
|
+
| Blueprint field | Feed into |
|
|
3184
|
+
|---|---|
|
|
3185
|
+
| `scenes[i].start_frame_prompt` / `end_frame_prompt` | `image_generate` (overlay text is excluded by contract — recomposite from `overlays`) |
|
|
3186
|
+
| `scenes[i].motion_prompt` + `start_frames#i` / `end_frames#i` | `video_generate` (`first_frame` / `last_frame`, e.g. Seedance) |
|
|
3187
|
+
| `scenes[i].dialogue[].line` + `voice_description` | `tts` / `dialogue` |
|
|
3188
|
+
| `global.music.music_prompt` | `music` |
|
|
3189
|
+
| `scenes[i].sfx[].sound_effect_prompt` | `sound_effect` |
|
|
3190
|
+
| `scenes[i].overlays[]` (text, timing, 9-grid position, animation, font style) + `floating_elements[]` | `ffmpeg` / `hyperframe_render` overlay pass |
|
|
3191
|
+
|
|
3192
|
+
```jsonc
|
|
3193
|
+
{
|
|
3194
|
+
"schema": "baker-canvas/1",
|
|
3195
|
+
"nodes": [
|
|
3196
|
+
{ "id": "ref", "type": "ingest", "params": { "source": "url", "url": "https://…/competitor-ad.mp4", "expect": "video" } },
|
|
3197
|
+
{ "id": "dec", "type": "video_deconstruct",
|
|
3198
|
+
"inputs": { "video": "$ref:ref.asset" },
|
|
3199
|
+
"params": { "model": "~google/gemini-flash-latest", "focus": "overlay typography and pacing" } },
|
|
3200
|
+
// Remake scene 1 conditioned on its real boundary frames:
|
|
3201
|
+
{ "id": "s1", "type": "video_generate",
|
|
3202
|
+
"inputs": { "first_frame": "$ref:dec.start_frames#0", "last_frame": "$ref:dec.end_frames#0" },
|
|
3203
|
+
"params": { "model": "bytedance/seedance-2.0", "prompt": "{{motion}}" } }
|
|
3204
|
+
],
|
|
3205
|
+
"output": { "node": "dec", "output": "analysis" }
|
|
3206
|
+
}
|
|
3207
|
+
```
|
|
3208
|
+
|
|
3209
|
+
**Cost** — 15 credits estimate (3 for `mode:"index"`); backend reports actual OpenRouter usage across `scene_count + 1` Gemini calls (~$0.05–0.40 for a 45s ad on flash) plus Groq transcription ($0.04/hr, 10s min). For videos over 3 minutes, per-scene calls automatically switch from full-video attachment to frame strips (boundary + mid-scene thumbnails) so cost stays flat per scene.
|
|
3210
|
+
|
|
3211
|
+
**Limits** — video must have a public URL (use `ingest` first for local files); ≤8 min in one node, ≤30 min total via parallel `start_s`/`end_s` windows. Transcription routes through a Mux audio-only rendition above 5 min (Groq caps uploads at 100 MB). Frame extraction rides on a throwaway Mux asset (created, thumbnailed, deleted per run), so expect ~1–3 min wall clock for a 45s video.
|
|
3212
|
+
|
|
3213
|
+
**Reproduction** — for the full-pipeline canvas, use `baker canvas scaffold-video <video>` (below): it runs this deconstruct for you, detects the recurring identity elements, and scaffolds the runnable canvas in one command.
|
|
3214
|
+
|
|
3215
|
+
---
|
|
3216
|
+
|
|
3217
|
+
##### `voice_select`
|
|
3218
|
+
|
|
3219
|
+
Cast an ElevenLabs voice from a natural-language description (e.g. a blueprint's `voice_description`). Lists the account's voices (`GET /v2/voices`, free) and ranks them against the brief.
|
|
3220
|
+
|
|
3221
|
+
**Params**
|
|
3222
|
+
|
|
3223
|
+
| Name | Type | Required | Notes |
|
|
3224
|
+
|---|---|---|---|
|
|
3225
|
+
| `description` | string | yes | the casting brief |
|
|
3226
|
+
| `gender` / `age` / `accent` / `language` | string | no | structured hints that sharpen the ranking |
|
|
3227
|
+
| `limit` | number | no | how many `candidates` to return (default 5) |
|
|
3228
|
+
|
|
3229
|
+
**Outputs** — `voice_id` (text, bare id) + `candidates` (json, ranked shortlist). Wire `voice_id` into a `tts` node: set `inputs.voice_ref: $ref:<this>.voice_id` and `params.voice: "{{voice_ref}}"` — the engine splices the id in at run time.
|
|
3230
|
+
|
|
3231
|
+
**Cost** — free (voice listing). Generation is billed under `tts`/`dialogue`.
|
|
3232
|
+
|
|
3233
|
+
---
|
|
3234
|
+
|
|
3235
|
+
##### `audio_timeline`
|
|
3236
|
+
|
|
3237
|
+
Place and mix several audio clips onto one timeline — a music bed plus timed voiceover and SFX — into a single track. Local ffmpeg.
|
|
3238
|
+
|
|
3239
|
+
**Inputs** — open audio slots (one per clip), wired as `inputs.<slot>`.
|
|
3240
|
+
|
|
3241
|
+
**Params**
|
|
3242
|
+
|
|
3243
|
+
| Name | Type | Required | Notes |
|
|
3244
|
+
|---|---|---|---|
|
|
3245
|
+
| `tracks` | array | yes | `[{slot, start_s, gain_db?}]` — `slot` matches a wired input; `start_s` is absolute seconds; `gain_db` ducks (e.g. `-12` for a music bed) |
|
|
3246
|
+
| `total_ms` | number | no | pins the final length (pad/trim) |
|
|
3247
|
+
| `output_format` | enum | no | `mp3` (default) / `wav` / `m4a` |
|
|
3248
|
+
|
|
3249
|
+
**Outputs** — `audio`. **Cost** — free (local). Requires `ffmpeg`.
|
|
3250
|
+
|
|
3251
|
+
---
|
|
3252
|
+
|
|
3253
|
+
##### `image_background_remove`
|
|
3254
|
+
|
|
3255
|
+
Strip background → transparent PNG (or mask). Powered by fal.ai BiRefNet v2.
|
|
3256
|
+
|
|
3257
|
+
**Inputs**
|
|
3258
|
+
|
|
3259
|
+
| Slot | Kind | Required | Accepted MIMEs |
|
|
3260
|
+
|---|---|---|---|
|
|
3261
|
+
| `image` | image | yes | `image/png`, `image/jpeg`, `image/webp` |
|
|
3262
|
+
|
|
3263
|
+
**Params**
|
|
3264
|
+
|
|
3265
|
+
| Name | Type | Required | Default | Notes |
|
|
3266
|
+
|---|---|---|---|---|
|
|
3267
|
+
| `model` | literal | no | `"fal/birefnet-v2"` | — |
|
|
3268
|
+
| `model_variant` | enum | no | `"General Use (Light)"` | `General Use (Light) \| General Use (Heavy) \| Matting \| Portrait \| DIS \| HRSOD \| COD` |
|
|
3269
|
+
| `operating_resolution` | enum | no | — | `1024x1024 \| 2048x2048 \| 2304x2304` |
|
|
3270
|
+
| `mask_only` | boolean | no | `false` | return binary mask instead of alpha-cut |
|
|
3271
|
+
|
|
3272
|
+
**Outputs**
|
|
3273
|
+
|
|
3274
|
+
| Slot | Kind | MIME |
|
|
3275
|
+
|---|---|---|
|
|
3276
|
+
| `image` | image | `image/png` (with alpha) |
|
|
3277
|
+
| `mask` | image | `image/png` (only when `mask_only: true`) |
|
|
3278
|
+
|
|
3279
|
+
**Cost** — 1 credit per call.
|
|
3280
|
+
|
|
3281
|
+
---
|
|
3282
|
+
|
|
3283
|
+
##### `music`
|
|
3284
|
+
|
|
3285
|
+
Generate music from prompt OR score an existing video clip.
|
|
3286
|
+
|
|
3287
|
+
**Inputs**
|
|
3288
|
+
|
|
3289
|
+
| Slot | Kind | Required | Notes |
|
|
3290
|
+
|---|---|---|---|
|
|
3291
|
+
| `video` | video | required for `video-background-music-v1` only | `video/mp4`, `video/webm`, `video/quicktime` |
|
|
3292
|
+
|
|
3293
|
+
**Outputs**
|
|
3294
|
+
|
|
3295
|
+
| Slot | Kind | MIME |
|
|
3296
|
+
|---|---|---|
|
|
3297
|
+
| `audio` | audio | provider-determined |
|
|
3298
|
+
| `timestamps` | json | `application/json` (optional) |
|
|
3299
|
+
|
|
3300
|
+
**Cost** — `max(1, ceil(seconds × 2.2))` credits (defaults to 30s if `music_length_ms` unset).
|
|
3301
|
+
|
|
3302
|
+
###### Model: `elevenlabs/music-v1`
|
|
3303
|
+
|
|
3304
|
+
Compose music from a free-form prompt or a structured composition plan.
|
|
3305
|
+
|
|
3306
|
+
| Name | Type | Required | Notes |
|
|
3307
|
+
|---|---|---|---|
|
|
3308
|
+
| `model` | literal | yes | `"elevenlabs/music-v1"` |
|
|
3309
|
+
| `prompt` | string | no | free-form brief (exactly one of `prompt`/`composition_plan` is required — validated) |
|
|
3310
|
+
| `composition_plan` | json | no | structured `{ sections: [...] }` |
|
|
3311
|
+
| `music_length_ms` | number | no | 3000–454545 (upper bound keeps the estimate under the $10 per-node cost cap) |
|
|
3312
|
+
| `seed` | number | no | — |
|
|
3313
|
+
| `force_instrumental` | boolean | no | prompt mode only |
|
|
3314
|
+
| `respect_sections_durations` | boolean | no | composition_plan mode only |
|
|
3315
|
+
| `with_timestamps` | boolean | no | word-level timestamps |
|
|
3316
|
+
| `output_format` | enum | no | mp3 family only (see `tts`) |
|
|
3317
|
+
|
|
3318
|
+
###### Model: `elevenlabs/video-background-music-v1`
|
|
3319
|
+
|
|
3320
|
+
Score an existing video. Requires `inputs.video`.
|
|
3321
|
+
|
|
3322
|
+
| Name | Type | Required | Notes |
|
|
3323
|
+
|---|---|---|---|
|
|
3324
|
+
| `model` | literal | yes | `"elevenlabs/video-background-music-v1"` |
|
|
3325
|
+
| `description` | string | no | max 1000 chars |
|
|
3326
|
+
| `tags` | string[] | no | up to 10 style tags |
|
|
3327
|
+
| `output_format` | enum | no | mp3 family only (see `tts`) |
|
|
3328
|
+
|
|
3329
|
+
---
|
|
3330
|
+
|
|
3331
|
+
##### `dialogue`
|
|
3332
|
+
|
|
3333
|
+
Multi-voice scripted dialogue via ElevenLabs Eleven v3.
|
|
3334
|
+
|
|
3335
|
+
**Inputs**
|
|
3336
|
+
|
|
3337
|
+
None.
|
|
3338
|
+
|
|
3339
|
+
**Params**
|
|
3340
|
+
|
|
3341
|
+
| Name | Type | Required | Notes |
|
|
3342
|
+
|---|---|---|---|
|
|
3343
|
+
| `model` | literal | yes | `"elevenlabs/eleven_v3"` |
|
|
3344
|
+
| `inputs` | json | yes | `{ text, voice_id }[]`, 1–50 items, ≤45454 total chars (the $10 per-node cost cap) |
|
|
3345
|
+
| `language_code` | string | no | — |
|
|
3346
|
+
| `settings` | json | no | provider settings passthrough |
|
|
3347
|
+
| `seed` | number | no | 0–4294967295 |
|
|
3348
|
+
| `apply_text_normalization` | enum | no | `auto \| on \| off` |
|
|
3349
|
+
| `with_timestamps` | boolean | no | per-voice segment markers in `timestamps` output |
|
|
3350
|
+
| `output_format` | enum | no | mp3 family only (see `tts`) |
|
|
3351
|
+
|
|
3352
|
+
**Outputs**
|
|
3353
|
+
|
|
3354
|
+
Same as `tts`.
|
|
3355
|
+
|
|
3356
|
+
**Cost** — `max(1, ceil(total_chars × 0.0015))` credits.
|
|
3357
|
+
|
|
3358
|
+
---
|
|
3359
|
+
|
|
3360
|
+
##### `sound_effect`
|
|
3361
|
+
|
|
3362
|
+
Short SFX from a text prompt. ElevenLabs Text-to-Sound v2.
|
|
3363
|
+
|
|
3364
|
+
**Inputs**
|
|
3365
|
+
|
|
3366
|
+
None.
|
|
3367
|
+
|
|
3368
|
+
**Params**
|
|
3369
|
+
|
|
3370
|
+
| Name | Type | Required | Notes |
|
|
3371
|
+
|---|---|---|---|
|
|
3372
|
+
| `model` | literal | yes | `"elevenlabs/eleven_text_to_sound_v2"` |
|
|
3373
|
+
| `text` | string | yes | SFX description |
|
|
3374
|
+
| `duration_seconds` | number | no | 0.5–30 |
|
|
3375
|
+
| `prompt_influence` | number | no | 0–1 |
|
|
3376
|
+
| `loop` | boolean | no | seamless loop |
|
|
3377
|
+
| `output_format` | enum | no | mp3 family only (see `tts`) |
|
|
3378
|
+
|
|
3379
|
+
**Outputs**
|
|
3380
|
+
|
|
3381
|
+
| Slot | Kind | MIME |
|
|
3382
|
+
|---|---|---|
|
|
3383
|
+
| `audio` | audio | provider-determined |
|
|
3384
|
+
|
|
3385
|
+
**Cost** — `max(1, ceil(seconds × 2.2))` credits.
|
|
3386
|
+
|
|
3387
|
+
---
|
|
3388
|
+
|
|
3389
|
+
### Live discovery
|
|
3390
|
+
|
|
3391
|
+
The tables above mirror `packages/cli/src/engine/models/registry.ts` as of the time this README was written. For the always-current machine-readable schema, query the catalog:
|
|
3392
|
+
|
|
3393
|
+
```bash
|
|
3394
|
+
baker canvas catalog | jq '.categories | keys'
|
|
3395
|
+
baker canvas catalog | jq '.categories.image_generate[].id'
|
|
3396
|
+
baker canvas catalog | jq '.categories.video_generate[]'
|
|
3397
|
+
```
|
|
3398
|
+
|
|
3399
|
+
The validator surfaces unknown models / illegal aspect ratios / out-of-range params / missing required inputs with `did_you_mean` hints **before** the run starts.
|
|
3400
|
+
|
|
3401
|
+
### Roadmap
|
|
3402
|
+
|
|
3403
|
+
- **Multimodal `text_generate`** — Gemini Flash + Pro support image / video / audio input through OpenRouter's `messages[].content[]` array. The engine schema currently exposes text-only; a follow-up adds optional `image` / `video` / `audio` input slots and a backend handler that builds the multimodal request.
|
|
3404
|
+
- **`{{slot}}`-as-attachment** — once multimodal lands, `{{slot}}` in a prompt where the wired input is an `AssetRef` will splice the asset into the request as a content part (in the prompt's positional order), instead of stringifying. Today's text/`TextRef` substitution stays unchanged.
|
|
3405
|
+
- **Pre-flight MIME enforcement** — Stage 3 will declare an `outputMimes` set on every node and reject set-disjoint wiring before any API call. The backend already validates MIMEs at dispatch as defence-in-depth.
|
|
3406
|
+
|
|
3407
|
+
---
|
|
3408
|
+
|
|
3409
|
+
### Outputs on disk
|
|
3410
|
+
|
|
3411
|
+
Every run gets its own directory:
|
|
3412
|
+
|
|
3413
|
+
```
|
|
3414
|
+
./canvas/
|
|
3415
|
+
└── r_01KS0F8FZQ3MSBE9QNXH8E5S78/
|
|
3416
|
+
├── manifest.json # per-node timing, cache status, asset refs
|
|
3417
|
+
├── host_still__images__0.jpg # image_generate, slot=images, index=0
|
|
3418
|
+
├── host_video__video.mp4 # video_generate, slot=video
|
|
3419
|
+
├── app_screenshot__images__0.jpg
|
|
3420
|
+
├── composite__video.mp4
|
|
3421
|
+
└── _final.mp4 # the canvas's declared `output` (if a file)
|
|
3422
|
+
```
|
|
3423
|
+
|
|
3424
|
+
- Naming: `<node_id>__<slot>[__<index>].<ext>`.
|
|
3425
|
+
- `_final.<ext>` is a copy of whichever node the top-level `output` points at.
|
|
3426
|
+
- `manifest.json` is what `baker canvas inspect` reads — duration, cache hit/miss, asset refs, errors.
|
|
3427
|
+
- Default root is `./canvas/`. Override with `--outputs-dir`.
|
|
3428
|
+
|
|
3429
|
+
---
|
|
3430
|
+
|
|
3431
|
+
### Caching
|
|
3432
|
+
|
|
3433
|
+
Cache key:
|
|
3434
|
+
|
|
3435
|
+
```
|
|
3436
|
+
sha256(canonical({ node_id, node_version, params, input_hashes, cache_salt, extras }))
|
|
3437
|
+
```
|
|
3438
|
+
|
|
3439
|
+
- Entries live in `./canvas/.cache/` (index + content-addressed blobs).
|
|
3440
|
+
- Cache hits re-materialize asset files into the new run dir instantly; misses execute the node and write a new entry atomically.
|
|
3441
|
+
- `hyperframe_*` folds a recursive sha256 of the composition directory into `extras` — edit any file in `my-composition/` and the cache busts automatically. No version bump needed.
|
|
3442
|
+
- If computing `extras` fails (e.g. the composition directory is unreadable), the node fails with that error — it never falls back to a partial cache key, so stale results can't be served silently.
|
|
3443
|
+
- Wipe a single entry: `rm canvas/.cache/index/<key>.json`. Wipe everything (the cache and all run outputs under the default root): `rm -rf canvas/`.
|
|
3444
|
+
|
|
3445
|
+
Cache policies (via `--cache-policy`):
|
|
3446
|
+
|
|
3447
|
+
| Policy | Reads | Writes |
|
|
3448
|
+
|---|---|---|
|
|
3449
|
+
| `read_write` (default) | yes | yes |
|
|
3450
|
+
| `read_only` | yes | no |
|
|
3451
|
+
| `bypass` | no | no |
|
|
3452
|
+
|
|
3453
|
+
`read_only` executes cache misses normally but leaves the cache untouched — use it to re-run a canvas without polluting a shared or committed cache directory.
|
|
3454
|
+
|
|
3455
|
+
---
|
|
3456
|
+
|
|
3457
|
+
### Safety limits
|
|
3458
|
+
|
|
3459
|
+
- **Asset size cap: 2 GiB per asset.** Applies to every download (remote node outputs, `ingest` fetches, yt-dlp output) and to local files ingested via `source: "path"`. Oversized assets fail the node with a `file_too_large` / byte-limit error.
|
|
3460
|
+
- **Per-request cost cap: $10 of estimated provider cost.** The backend pre-estimates cost for billable nodes (`video_generate` on Seedance, `tts`, `dialogue`, `music`, `sound_effect`) and rejects a single execution above the cap with `cost_limit_exceeded`. Split long renders or long scripts into smaller nodes.
|
|
3461
|
+
- **Retries never double-charge.** Every remote node execution carries a run-scoped idempotency key; if a request is retried after a transient failure, the backend replays the stored response instead of dispatching (and billing) the provider again. A fresh run (including `--cache-policy bypass`) always generates fresh.
|
|
3462
|
+
|
|
3463
|
+
---
|
|
3464
|
+
|
|
3465
|
+
### CLI commands
|
|
3466
|
+
|
|
3467
|
+
#### `baker canvas validate <file.json>`
|
|
3468
|
+
|
|
3469
|
+
Parse + schema-check + per-node deep validation (composition `meta.json` is consulted for `hyperframe_*`). Prints a structured result with a per-node cost preview. Exits non-zero on issues. Never calls Convex, never spends credits.
|
|
3470
|
+
|
|
3471
|
+
```bash
|
|
3472
|
+
$ baker canvas validate my-canvas.json
|
|
3473
|
+
{
|
|
3474
|
+
"ok": true,
|
|
3475
|
+
"total_nodes": 4,
|
|
3476
|
+
"estimated_credits": 8,
|
|
3477
|
+
"cost_preview": [
|
|
3478
|
+
{ "node_id": "topic", "node_type": "text", "credits": 0 },
|
|
3479
|
+
{ "node_id": "headline", "node_type": "text_generate", "credits": 8 }
|
|
3480
|
+
]
|
|
3481
|
+
}
|
|
3482
|
+
```
|
|
3483
|
+
|
|
3484
|
+
On error you get an array of `ValidationIssue`s, each with `path`, `code`, `message`, and optional `did_you_mean`.
|
|
3485
|
+
|
|
3486
|
+
#### `baker canvas run <file.json> [flags]`
|
|
3487
|
+
|
|
3488
|
+
Validate, then execute the graph. Blocks until done. Logs one line per node. Returns `{ ok, run_id, outputs_dir, output, stats }` on stdout.
|
|
3489
|
+
|
|
3490
|
+
| Flag | Default | Effect |
|
|
3491
|
+
|---|---|---|
|
|
3492
|
+
| `--cache-dir <path>` | `./canvas/.cache` | Where content-addressed bytes + cache index live. |
|
|
3493
|
+
| `--outputs-dir <path>` | `./canvas` | Root for per-run output directories. |
|
|
3494
|
+
| `--run-id <id>` | auto ULID | Override the generated run id. |
|
|
3495
|
+
| `--cache-policy <policy>` | `read_write` | `read_write`, `bypass`, or `read_only`. |
|
|
3496
|
+
|
|
3497
|
+
#### `baker canvas scaffold-video <video> [flags]`
|
|
3498
|
+
|
|
3499
|
+
Turn a reference video into a **runnable, self-validated reproduction canvas** in one command — the video counterpart of `scaffold-static-ad`. It runs two **billed passes** up front:
|
|
3500
|
+
|
|
3501
|
+
1. **`video_deconstruct`** (`~google/gemini-pro-latest`, full mode) — reverse-engineers the video into a scene-by-scene blueprint + word-level transcript, written next to the canvas as **`prompt.json`**. Each scene's `start_frame_prompt`/`end_frame_prompt` are inlined into the frame nodes (see below); `prompt.json` then rides along as the shared **global style reference** (palette, cast cohesion) and as provenance.
|
|
3502
|
+
2. **recurring-element selection** (`~google/gemini-flash-latest`) — picks only the **recurring, identity-critical** elements (each `global.cast` person, a recurring animal, a showcased product, the brand logo) and the scene indices each appears in. One real reference image grounds each element across **every** frame it appears in, so the same actor stays consistent the whole video.
|
|
3503
|
+
|
|
3504
|
+
It then scaffolds the full pipeline: per scene, two **static-ad-grade frames** (`image_generate` with its **own self-contained `params.prompt`** — edit a frame node to change only that frame; `prompt.json` is wired as a demoted shared-style `target_blueprint`, a per-element reference legend, the real extracted frame as a composition anchor) → `video_generate` (Seedance first/last-frame, fed an ultra-detailed motion brief composed from the scene's action, camera, dialogue, and transcript; duration snapped to the nearest allowed clip length). Each recurring element gets **one shared `[TODO]` ingest slot** wired into every frame it appears in. Globally it casts a `voice_select` per speaker, **one continuous `tts` per speaker** (every line that speaker says concatenated into a single read, voice locked via `voice_ref`), a `sound_effect` per SFX, a `music` bed (styled after the AudD-identified track when available), then concatenates the clips, burns on the animated overlays (the `video-overlay` composition, copied next to the canvas — brand fonts via `@font-face`, plus `typewriter`/`karaoke` reveals), mixes the audio (`audio_timeline`), and muxes it under the video.
|
|
3505
|
+
|
|
3506
|
+
The emitted canvas is validated (`validateCanvasDeep`) before it's written, so it always runs. The full editable checklist is embedded in the canvas as **`metadata.todo`** (and a step-by-step guide in `metadata.description`). stdout returns `{ ok, canvas_path, prompt_path, models, stats, checklist }` — the **checklist** lists the recurring elements (with the scenes each spans and the real source image to supply), voices to confirm, SFX/overlay counts, music status, and any scenes clamped to the 15s clip ceiling.
|
|
3507
|
+
|
|
3508
|
+
```bash
|
|
3509
|
+
baker canvas scaffold-video ./reference-ad.mp4 --focus "competitor UGC ad for <brand>"
|
|
3510
|
+
# → writes ./reference-ad.video.canvas.json + ./prompt.json (+ video-overlay-composition/)
|
|
3511
|
+
# edit each frame's own params.prompt + drop ONE real source image at each el_* [TODO], confirm voices, then:
|
|
3512
|
+
baker canvas validate ./reference-ad.video.canvas.json
|
|
3513
|
+
baker canvas run ./reference-ad.video.canvas.json
|
|
3514
|
+
```
|
|
3515
|
+
|
|
3516
|
+
| Flag | Default | Effect |
|
|
3517
|
+
|---|---|---|
|
|
3518
|
+
| `--out <path>` | `<video-dir>/<name>.video.canvas.json` | Where to write the canvas (composition is copied alongside). |
|
|
3519
|
+
| `--frames <mode>` | `generate` | `generate` regenerates frames anchored on the originals; `reuse` wires the real extracted frames straight into the clips (faithful, cheaper). |
|
|
3520
|
+
| `--max-scenes <n>` | provider default | Cap the number of scenes the deconstruct emits. |
|
|
3521
|
+
| `--language <code>` | auto | Transcript/dialogue language hint (e.g. `fr`, `en`). |
|
|
3522
|
+
| `--focus <text>` | — | Known provenance/emphasis to ground the deconstruct. |
|
|
3523
|
+
| `--deconstruct-model <id>` | `~google/gemini-pro-latest` | Override the `video_deconstruct` model. |
|
|
3524
|
+
| `--select-model <id>` | `~google/gemini-flash-latest` | Override the element-selection `text_generate` model. |
|
|
3525
|
+
| `--image-model <id>` | `openai/gpt-5.4-image-2` | Override the per-frame `image_generate` model (defaults to the strongest, matching `scaffold-static-ad`). |
|
|
3526
|
+
| `--video-model <id>` | `bytedance/seedance-2.0` | Override the `video_generate` model. |
|
|
3527
|
+
|
|
3528
|
+
The two scaffold passes are billed (the full `video_deconstruct` is the heavy one); **running** the result then generates many image/video/audio assets and is not free. Defaults to vertical 1080×1920 overlays — copy + edit the composition for other aspect ratios. For on-brand overlay type, drop `brand-bold.otf`/`brand-regular.otf` into the copied `video-overlay-composition/` dir (wired via `@font-face`, with a system fallback). Richer transcription (punctuated words + paragraphs) is available via the deconstruct's `transcriber: "deepgram"` param when `DEEPGRAM_API_KEY` is set.
|
|
3529
|
+
|
|
3530
|
+
#### `baker canvas scaffold-static-ad <image> [flags]`
|
|
3531
|
+
|
|
3532
|
+
Turn a source/inspiration image into a **runnable, self-validated static-ad canvas** — the static counterpart of `scaffold-video`. Like the video scaffold, this runs **billed Gemini passes** up front:
|
|
3533
|
+
|
|
3534
|
+
1. **`image_describe`** (`~google/gemini-pro-latest`) — reverse-engineers the image into a blueprint JSON, written next to the canvas as **`prompt.json`**. This is the editable "prompt": you rewrite it by hand into the ad you want (palette, copy, claims, subjects). It feeds the generator directly — there is **no automatic brand-transform step**.
|
|
3535
|
+
2. **element selection** (`~google/gemini-flash-latest`) — picks only the **main, identity-critical** elements (the brand logo, a showcased product, a foreground person/animal, a trust badge); background extras are dropped. Each is stamped back onto its blueprint entry as a `reference_image` label so the JSON self-documents which slot grounds which subject.
|
|
3536
|
+
3. **global layout** (`~google/gemini-flash-latest`) — produces a structured `layout` block in `prompt.json`: the column/row grid, each region's `x_pct`/`y_pct` bounds, panel splits, background/shape, and every text block's relative size/weight/case/alignment. This is what gives the generator a precise composition to rebuild.
|
|
3537
|
+
|
|
3538
|
+
It then scaffolds a canvas that ingests `prompt.json`, wires **one `[TODO]` ingest slot per detected element** (plus an optional brand-font → type-specimen) into `image_generate`, and wires the original image in for composition only. The canvas is validated before it's written. stdout returns `{ ok, canvas_path, prompt_path, models, layout_regions, stats, checklist }` — the **checklist** lists every real asset to drop in.
|
|
3539
|
+
|
|
3540
|
+
```bash
|
|
3541
|
+
baker canvas scaffold-static-ad ./reference-ad.png --context "competitor ad for <brand>, <category>, <market>"
|
|
3542
|
+
# → writes ./static-ad.canvas.json + ./prompt.json
|
|
3543
|
+
# edit prompt.json into your ad + replace each [TODO] ingest path with a real file, then:
|
|
3544
|
+
baker canvas validate ./static-ad.canvas.json
|
|
3545
|
+
baker canvas run ./static-ad.canvas.json
|
|
3546
|
+
```
|
|
3547
|
+
|
|
3548
|
+
| Flag | Default | Effect |
|
|
3549
|
+
|---|---|---|
|
|
3550
|
+
| `--context <text>` | — | Known provenance (advertiser, category, market) to ground the describe. |
|
|
3551
|
+
| `--out <path>` | `<image-dir>/static-ad.canvas.json` | Where to write the canvas (`prompt.json` is written alongside). |
|
|
3552
|
+
| `--describe-model <id>` | registry default (`~google/gemini-pro-latest`) | Override the `image_describe` model. |
|
|
3553
|
+
| `--select-model <id>` | registry default (`~google/gemini-flash-latest`) | Override the element-selection `text_generate` model. |
|
|
3554
|
+
| `--layout-model <id>` | registry default (`~google/gemini-flash-latest`) | Override the global-layout `text_generate` model. |
|
|
3555
|
+
| `--gen-model <id>` | registry default (`openai/gpt-5.4-image-2`) | Override the `image_generate` model. |
|
|
3556
|
+
| `--aspect <ratio>` | inferred from the image, else `9:16` | Force the output aspect ratio. |
|
|
3557
|
+
| `--skip-font` | off | Skip the brand-font → type-specimen slot. |
|
|
3558
|
+
|
|
3559
|
+
Scaffolding runs (and bills) the three Gemini passes listed above; **running** the result generates a billed image. `baker canvas validate` does not check that the `[TODO]` paths exist — supply the real files before `run`.
|
|
3560
|
+
|
|
3561
|
+
#### `baker canvas inspect <run_id> [--thumbnails]`
|
|
3562
|
+
|
|
3563
|
+
One-page summary of a completed run: per-node duration + cache status, list of files in the run dir, optional video thumbnails (start/middle/end frames extracted via ffmpeg).
|
|
3564
|
+
|
|
3565
|
+
```bash
|
|
3566
|
+
baker canvas inspect r_01JXYZ...
|
|
3567
|
+
baker canvas inspect r_01JXYZ... --thumbnails
|
|
3568
|
+
```
|
|
3569
|
+
|
|
3570
|
+
Resolves the run dir against `--outputs-dir` (default `./canvas`) or accepts an absolute path. Reads `manifest.json` for per-node stats.
|
|
3571
|
+
|
|
3572
|
+
#### `baker canvas catalog`
|
|
3573
|
+
|
|
3574
|
+
Prints the agent-facing node + composition catalog as JSON Schema. Every node has `id`, `version`, `location`, `category`, `summary`, `inputs`/`params`/`outputs` schemas, `cost_estimate_credits`, `runtime_estimate_seconds`, and (when present) `when_to_use`. Grouped by `category`. The `compositions` array enumerates every `@baker/*` composition with its full `meta.json`.
|
|
3575
|
+
|
|
3576
|
+
```bash
|
|
3577
|
+
baker canvas catalog | jq '.categories | keys'
|
|
3578
|
+
baker canvas catalog | jq '.compositions[].id'
|
|
3579
|
+
```
|
|
3580
|
+
|
|
3581
|
+
---
|
|
3582
|
+
|
|
3583
|
+
### Ingestion (`ingest` node)
|
|
3584
|
+
|
|
3585
|
+
`ingest` is the on-ramp for any external asset a canvas needs. Pick a `source` (URL or local file path) and declare what kind of asset you want — `image`, `video`, `audio`, `text`, `json`, or `font`. The node figures out the right strategy:
|
|
3586
|
+
|
|
3587
|
+
```jsonc
|
|
3588
|
+
// URL source — direct CDN image
|
|
3589
|
+
{ "id": "src", "type": "ingest", "params": { "source": "url", "url": "https://cdn.example.com/cat.png", "expect": "image" } }
|
|
3590
|
+
|
|
3591
|
+
// URL source — YouTube video (yt-dlp)
|
|
3592
|
+
{ "id": "src", "type": "ingest", "params": { "source": "url", "url": "https://youtu.be/dQw4w9WgXcQ", "expect": "video" } }
|
|
3593
|
+
|
|
3594
|
+
// URL source — YouTube audio-only (yt-dlp -x)
|
|
3595
|
+
{ "id": "src", "type": "ingest", "params": { "source": "url", "url": "https://youtu.be/dQw4w9WgXcQ", "expect": "audio" } }
|
|
3596
|
+
|
|
3597
|
+
// URL source — blog post → clean markdown (Handinger)
|
|
3598
|
+
{ "id": "src", "type": "ingest", "params": { "source": "url", "url": "https://example.com/blog/post", "expect": "text" } }
|
|
3599
|
+
|
|
3600
|
+
// URL source — PDF → markdown (Handinger handles PDFs)
|
|
3601
|
+
{ "id": "src", "type": "ingest", "params": { "source": "url", "url": "https://example.com/whitepaper.pdf", "expect": "text" } }
|
|
3602
|
+
|
|
3603
|
+
// URL source — raw markdown file (direct fetch)
|
|
3604
|
+
{ "id": "src", "type": "ingest", "params": { "source": "url", "url": "https://raw.githubusercontent.com/foo/bar/README.md", "expect": "text" } }
|
|
3605
|
+
|
|
3606
|
+
// URL source — JSON endpoint (direct fetch)
|
|
3607
|
+
{ "id": "src", "type": "ingest", "params": { "source": "url", "url": "https://api.example.com/data.json", "expect": "json" } }
|
|
3608
|
+
|
|
3609
|
+
// URL source — font file for font_specimen (direct fetch; magic-byte sniffed, so octet-stream CDNs work)
|
|
3610
|
+
{ "id": "src", "type": "ingest", "params": { "source": "url", "url": "https://cdn.example.com/brand.ttf", "expect": "font" } }
|
|
3611
|
+
|
|
3612
|
+
// Path source — relative path resolved against process.cwd()
|
|
3613
|
+
{ "id": "src", "type": "ingest", "params": { "source": "path", "path": "./inputs/headshot.png", "expect": "image" } }
|
|
3614
|
+
|
|
3615
|
+
// Path source — absolute path used as-is
|
|
3616
|
+
{ "id": "src", "type": "ingest", "params": { "source": "path", "path": "/abs/work/clip.mp4", "expect": "video" } }
|
|
3617
|
+
```
|
|
3618
|
+
|
|
3619
|
+
**Strategy resolution** (deterministic from `source` + `expect` + URL extension for text):
|
|
3620
|
+
|
|
3621
|
+
| `source` | `expect` | URL / path | Strategy |
|
|
3622
|
+
|----------|---------------|----------------------------------|-----------------|
|
|
3623
|
+
| path | any | any | `local_file` — read from disk, sniff mime, upload to R2 via presign |
|
|
3624
|
+
| url | video / audio | any | `yt_dlp` (works on platform URLs *and* direct file URLs via the generic extractor) |
|
|
3625
|
+
| url | image / json / font | any | `direct_fetch` (HTTP GET) |
|
|
3626
|
+
| url | text | ends in `.txt` or `.md` | `direct_fetch` |
|
|
3627
|
+
| url | text | anything else (HTML, PDF, …) | `handinger` `/markdown` (extracted markdown — HTML/PDF made readable) |
|
|
3628
|
+
|
|
3629
|
+
If the bytes' mime contradicts `expect` (e.g. `expect: "image"` but the URL serves `text/html`, or a `.pdf` file is passed as `expect: "image"`), the node fails with a clear `kind_mismatch` error instead of forwarding bad bytes downstream.
|
|
3630
|
+
|
|
3631
|
+
**Params** — discriminated union on `source`. The agent picks one of two shapes:
|
|
3632
|
+
|
|
3633
|
+
`source: "url"`:
|
|
3634
|
+
|
|
3635
|
+
| Field | Type | Required | Notes |
|
|
3636
|
+
|----------|-------------------------------------------------------|----------|-------|
|
|
3637
|
+
| `source` | `"url"` (literal) | yes | Discriminator. |
|
|
3638
|
+
| `url` | string (must parse as `URL`) | yes | `http://` or `https://`. `file://` is rejected — use `source: "path"` for local files. |
|
|
3639
|
+
| `expect` | `"image" \| "video" \| "audio" \| "text" \| "json" \| "font"` | yes | Declares the output port's kind. Drives strategy selection and enforces a runtime kind check. |
|
|
3640
|
+
|
|
3641
|
+
`source: "path"`:
|
|
3642
|
+
|
|
3643
|
+
| Field | Type | Required | Notes |
|
|
3644
|
+
|----------|-------------------------------------------------------|----------|-------|
|
|
3645
|
+
| `source` | `"path"` (literal) | yes | Discriminator. |
|
|
3646
|
+
| `path` | non-empty string | yes | Absolute or `process.cwd()`-relative filesystem path. `~/…` is rejected (no shell expansion). |
|
|
3647
|
+
| `expect` | `"image" \| "video" \| "audio" \| "text" \| "json" \| "font"` | yes | Same as the url arm. |
|
|
3648
|
+
|
|
3649
|
+
**Inputs:** none (`{}` — strict). `ingest` is a source node; it has no upstream slots.
|
|
3650
|
+
|
|
3651
|
+
**Output:** `{ asset: AssetRef }`. `asset.kind === params.expect` always — guaranteed statically (the engine's `outputKinds` resolver exposes this to validators / dashboard before the node runs) and at runtime (any mismatch throws `kind_mismatch`).
|
|
3652
|
+
|
|
3653
|
+
`AssetRef` shape (returned for every successful run):
|
|
3654
|
+
|
|
3655
|
+
```ts
|
|
3656
|
+
{
|
|
3657
|
+
kind: "image" | "video" | "audio" | "text" | "json" | "font",
|
|
3658
|
+
sha256: string, // content hash; identical bytes always yield the same sha
|
|
3659
|
+
mime: string, // e.g. "image/jpeg", "video/mp4", "text/markdown"
|
|
3660
|
+
url: string, // public R2 URL (stable, content-addressed)
|
|
3661
|
+
path: string, // local CLI cache path — downstream local nodes read from this
|
|
3662
|
+
metadata: {
|
|
3663
|
+
// url-sourced assets:
|
|
3664
|
+
source_url?: string, // the URL the agent passed in
|
|
3665
|
+
// path-sourced assets:
|
|
3666
|
+
source_path?: string, // absolute resolved path
|
|
3667
|
+
file_size?: number, // bytes
|
|
3668
|
+
original_filename?: string, // basename of the source path
|
|
3669
|
+
|
|
3670
|
+
strategy: "direct_fetch" | "handinger" | "yt_dlp" | "local_file",
|
|
3671
|
+
ingested_at: string, // ISO 8601 timestamp
|
|
3672
|
+
|
|
3673
|
+
// yt-dlp adds (when available from --dump-json):
|
|
3674
|
+
original_title?: string,
|
|
3675
|
+
original_uploader?: string,
|
|
3676
|
+
original_duration_s?: number,
|
|
3677
|
+
webpage_url?: string,
|
|
3678
|
+
width?: number,
|
|
3679
|
+
height?: number,
|
|
3680
|
+
|
|
3681
|
+
// Handinger adds:
|
|
3682
|
+
word_count?: number,
|
|
3683
|
+
},
|
|
3684
|
+
|
|
3685
|
+
// Kind-specific dimensions (when known, populated by downstream metadata sniffing):
|
|
3686
|
+
width?: number, // image, video
|
|
3687
|
+
height?: number, // image, video
|
|
3688
|
+
duration_ms?: number, // video, audio
|
|
3689
|
+
}
|
|
3690
|
+
```
|
|
3691
|
+
|
|
3692
|
+
The `url` is a stable R2 URL — remote downstream nodes (e.g. `video_lipsync`) fetch from it. The `path` is the local cache file — local downstream nodes (e.g. `ffmpeg`, `video_transcribe`) read from it directly. Both are populated for every `ingest` output.
|
|
3693
|
+
|
|
3694
|
+
**Errors:**
|
|
3695
|
+
|
|
3696
|
+
| Code | Source | When |
|
|
3697
|
+
|-----------------------|----------------|------------------------------------------------------------------------------|
|
|
3698
|
+
| `INVALID_PARAMS` | engine validator | Missing/malformed fields, mixing `url` + `path`, unknown `source`, unknown `expect`. |
|
|
3699
|
+
| `kind_mismatch` | url + direct_fetch | URL returned bytes whose mime doesn't match `expect`. Message: `ingest expect=<X> but <url> returned <mime> (kind=<Y>)`. |
|
|
3700
|
+
| `kind_mismatch` | path + local_file | File mime (from extension or magic-byte sniff) doesn't match `expect`, *or* mime doesn't map to any of our kinds (e.g. local PDFs aren't supported as `text` — host them via URL for Handinger to extract). |
|
|
3701
|
+
| `provider_error` | url + handinger / direct_fetch | Handinger `/markdown` non-2xx, or upstream `fetch` non-ok. |
|
|
3702
|
+
| `file_not_found` | path + local_file | `stat()` returned `ENOENT`. Error names the absolute resolved path. |
|
|
3703
|
+
| `not_a_file` | path + local_file | Path exists but is a directory / block device / socket. |
|
|
3704
|
+
| `permission_denied` | path + local_file | `stat()` or `readFile()` returned `EACCES`. |
|
|
3705
|
+
| local exec error | url + yt_dlp | `yt-dlp` exited non-zero — full stderr surfaced verbatim (last ~40 lines). |
|
|
3706
|
+
| `network` | url | HTTP timeout, DNS failure, presigned R2 PUT failure. |
|
|
3707
|
+
|
|
3708
|
+
All execution-time failures flow through `NodeExecutionError` so they appear in the run output's structured error envelope. The yt-dlp stderr passthrough is important: bot-checks, geo-blocks, age-gates, and outdated-binary errors all come through as the original yt-dlp message. If a YouTube ingest fails with "No video formats found" or "Sign in to confirm you're not a bot", run `yt-dlp -U` to update.
|
|
3709
|
+
|
|
3710
|
+
**Validation note** — `baker canvas validate` does **not** check that path-sourced files exist on disk. Validation runs without filesystem state (e.g. on CI, in dashboard previews). Missing files surface as `file_not_found` at execute time with the full resolved path in the message.
|
|
3711
|
+
|
|
3712
|
+
**Caching:**
|
|
3713
|
+
|
|
3714
|
+
- **Content-addressed dedup is automatic.** Two `ingest` runs that produce identical bytes share the same R2 object and the same local cache shard (keyed by sha256).
|
|
3715
|
+
- **Engine-level cache** (`canvas/.cache/index/`) keys by node params + node version + node-supplied cache extras. Re-running the same canvas with the same params is a cache hit and skips the network/yt-dlp/local-read entirely.
|
|
3716
|
+
- **yt-dlp version is folded into the cache key** for `source: "url"` with `expect: video | audio`. Upgrading yt-dlp busts the cache automatically, since new versions can produce different bitstreams.
|
|
3717
|
+
- **Local file `mtime + size` is folded into the cache key** for `source: "path"`. Editing the file (any save bumps mtime) invalidates the cache so you get fresh ingestion. mtime+size is cheap to read when the cache key is computed; for cases where mtime is unreliable (`cp -p`, content-only changes), force-bypass with `--cache-policy bypass` or change the canvas's `cache_salt`.
|
|
3718
|
+
- **No TTL.** Handinger and direct fetch don't cache-bust on page changes — if the source page updates, use `--cache-policy bypass` or change `cache_salt`.
|
|
3719
|
+
|
|
3720
|
+
**Local files (`source: "path"`):**
|
|
3721
|
+
|
|
3722
|
+
- **Path resolution:** absolute paths are used as-is; relative paths resolve against `process.cwd()` (the directory you ran `baker canvas run` from). `~/…` is **not** expanded — use an absolute path instead.
|
|
3723
|
+
- **Reproducibility caveat:** canvas JSONs using `source: "path"` are **not portable** — moving the canvas to a different machine without the same files at the same paths will fail at execute time. Pick `source: "url"` for portable canvases; pick `source: "path"` for local convenience.
|
|
3724
|
+
- **Mime / kind inference:** primary signal is the file extension (`.png`, `.mp4`, `.m4a`, `.mp3`, `.wav`, `.json`, `.md`, `.txt`, `.csv`, `.html`, …). For images without an extension, a magic-byte sniff catches PNG/JPEG/GIF/WebP. Unknown mimes (or mimes that don't map to a kind, like `application/pdf`) fail with `kind_mismatch`.
|
|
3725
|
+
- **R2 upload:** local files are uploaded to R2 via the same presign flow yt-dlp uses, so remote downstream nodes (e.g. `image_background_remove`) can fetch by URL just like with URL-sourced assets. `asset.url` is always populated.
|
|
3726
|
+
- **Big files:** no explicit size cap. Files >500 MB are slow due to presign + PUT bandwidth — if that's a concern, host them on a CDN and use `source: "url"`.
|
|
3727
|
+
|
|
3728
|
+
**Composition** — `ingest` produces bytes and stops there. For richer pipelines, chain it:
|
|
3729
|
+
|
|
3730
|
+
```
|
|
3731
|
+
ingest { source: "url", expect: "video" } → video_transcribe # YouTube → word-level Whisper transcript
|
|
3732
|
+
ingest { source: "url", expect: "text" } → text_generate # Blog post / PDF → summary
|
|
3733
|
+
ingest { source: "url", expect: "image" } → image_background_remove # CDN image → transparent PNG
|
|
3734
|
+
ingest { source: "path", expect: "image" } → image_background_remove # Local PNG → transparent PNG
|
|
3735
|
+
ingest { source: "path", expect: "video" } → video_transcribe # Local mp4 → transcript
|
|
3736
|
+
```
|
|
3737
|
+
|
|
3738
|
+
**Requirements:**
|
|
3739
|
+
|
|
3740
|
+
- `yt-dlp` and `ffmpeg` on PATH for `source: "url"` with `expect: "video" | "audio"` (already in the E2B sandbox image; install locally with `pip install yt-dlp` and `brew install ffmpeg` / equivalent).
|
|
3741
|
+
- `HANDINGER_API_KEY` on the Convex backend for `source: "url"` with `expect: "text"` on non-`.md`/`.txt` URLs.
|
|
3742
|
+
- No additional dependencies for `source: "path"` — just filesystem access.
|
|
3743
|
+
|
|
3744
|
+
**Cost:** zero engine credits for direct_fetch + yt_dlp. Handinger charges per scrape on your Handinger account (configured outside Baker).
|
|
3745
|
+
|
|
3746
|
+
---
|
|
3747
|
+
|
|
3748
|
+
### Local CLI nodes
|
|
3749
|
+
|
|
3750
|
+
`ffmpeg` and `imagemagick` are pure CLI passthroughs. You write the argv you'd type at the terminal, declare what files come in and what files go out, and the engine handles staging, execution, and ingestion. No `op` discriminator, no flag modeling — anything the binary can do, the node can do.
|
|
3751
|
+
|
|
3752
|
+
**Shape (identical for both nodes):**
|
|
3753
|
+
|
|
3754
|
+
```jsonc
|
|
3755
|
+
{
|
|
3756
|
+
"id": "thumb",
|
|
3757
|
+
"type": "ffmpeg",
|
|
3758
|
+
"inputs": { "video": "$ref:clip.video" },
|
|
3759
|
+
"params": {
|
|
3760
|
+
"args": ["-y", "-i", "{{in.video}}", "-ss", "1", "-frames:v", "1", "{{out.thumb}}"],
|
|
3761
|
+
"outputs": { "thumb": { "kind": "image", "ext": "png" } }
|
|
3762
|
+
}
|
|
3763
|
+
}
|
|
3764
|
+
```
|
|
3765
|
+
|
|
3766
|
+
**How the engine runs it (in order):**
|
|
3767
|
+
|
|
3768
|
+
1. **Stages inputs.** Every entry of `inputs` is copied into a per-node tmp dir. Single AssetRef → `{{in.<slot>}}`. AssetRef array → `{{in.<slot>.0}}`, `{{in.<slot>.1}}`, …
|
|
3769
|
+
2. **Allocates outputs.** Each `params.outputs` entry gets a tmp path with the declared `ext`. Reference it as `{{out.<name>}}` in `args`.
|
|
3770
|
+
3. **Spawns argv.** `spawn(bin, substitutedArgs)` — no shell, no string interpolation outside the placeholder system.
|
|
3771
|
+
4. **Captures stderr.** On non-zero exit, throws with the last ~40 lines of stderr for debugging.
|
|
3772
|
+
5. **Validates outputs.** Every declared output file must exist and be non-empty.
|
|
3773
|
+
6. **Ingests outputs.** Each declared output is added to the asset store with its `kind` (`"image" | "video" | "audio"`) and materialized into the run dir like any other node's outputs.
|
|
3774
|
+
|
|
3775
|
+
**Placeholder rules (the safety contract):**
|
|
3776
|
+
|
|
3777
|
+
| Allowed in `args` | Rejected at execute time |
|
|
3778
|
+
|---|---|
|
|
3779
|
+
| `{{in.<slot>}}` / `{{in.<slot>.<index>}}` / `{{out.<name>}}` | Raw absolute paths outside the staging tmp dir (e.g. `/etc/passwd`) |
|
|
3780
|
+
| Flags, values, filter graphs (`-i`, `scale=1080:1920`, `[0:v][1:v]vstack`, `30000/1001`) | URL-style inputs (`http://…`, `file://…`, `data:…`) |
|
|
3781
|
+
| Substituted placeholders resolving under the staging dir | Home-relative (`~/…`) or relative (`./…`, `../…`) paths |
|
|
3782
|
+
|
|
3783
|
+
Why: with raw paths allowed, two runs with the same `args` but different contents at `/local/file.png` would share a cache key yet produce different outputs. Placeholders force every input through the engine's content-addressed asset model, which the cache key actually sees.
|
|
3784
|
+
|
|
3785
|
+
**Cache key includes** node params, every input's hash, and the tool's `-version` first line (so upgrading `ffmpeg`/`magick` busts the cache automatically).
|
|
3786
|
+
|
|
3787
|
+
**Worked examples:**
|
|
3788
|
+
|
|
3789
|
+
```jsonc
|
|
3790
|
+
// ffmpeg — extract a thumbnail from a video at t=1s
|
|
3791
|
+
{
|
|
3792
|
+
"type": "ffmpeg",
|
|
3793
|
+
"inputs": { "video": "$ref:clip.video" },
|
|
3794
|
+
"params": {
|
|
3795
|
+
"args": ["-y", "-i", "{{in.video}}", "-ss", "1", "-frames:v", "1", "{{out.thumb}}"],
|
|
3796
|
+
"outputs": { "thumb": { "kind": "image", "ext": "png" } }
|
|
3797
|
+
}
|
|
3798
|
+
}
|
|
3799
|
+
|
|
3800
|
+
// ffmpeg — concat two clips via the concat filter (-filter_complex), not the concat demuxer
|
|
3801
|
+
{
|
|
3802
|
+
"type": "ffmpeg",
|
|
3803
|
+
"inputs": { "clips": ["$ref:v1.video", "$ref:v2.video"] },
|
|
3804
|
+
"params": {
|
|
3805
|
+
"args": [
|
|
3806
|
+
"-y",
|
|
3807
|
+
"-i", "{{in.clips.0}}",
|
|
3808
|
+
"-i", "{{in.clips.1}}",
|
|
3809
|
+
"-filter_complex", "[0:v][0:a][1:v][1:a]concat=n=2:v=1:a=1[v][a]",
|
|
3810
|
+
"-map", "[v]", "-map", "[a]",
|
|
3811
|
+
"{{out.video}}"
|
|
3812
|
+
],
|
|
3813
|
+
"outputs": { "video": { "kind": "video", "ext": "mp4" } }
|
|
3814
|
+
}
|
|
3815
|
+
}
|
|
3816
|
+
|
|
3817
|
+
// imagemagick — resize an image to fit-cover 1080x1920
|
|
3818
|
+
{
|
|
3819
|
+
"type": "imagemagick",
|
|
3820
|
+
"inputs": { "image": "$ref:photo.image" },
|
|
3821
|
+
"params": {
|
|
3822
|
+
"args": [
|
|
3823
|
+
"{{in.image}}",
|
|
3824
|
+
"-resize", "1080x1920^",
|
|
3825
|
+
"-gravity", "center",
|
|
3826
|
+
"-extent", "1080x1920",
|
|
3827
|
+
"{{out.result}}"
|
|
3828
|
+
],
|
|
3829
|
+
"outputs": { "result": { "kind": "image", "ext": "jpg" } }
|
|
3830
|
+
}
|
|
3831
|
+
}
|
|
3832
|
+
```
|
|
3833
|
+
|
|
3834
|
+
**Requirements:** `ffmpeg` on PATH for the `ffmpeg` node; `magick` (v7+) or `convert` (v6) on PATH for the `imagemagick` node. The node fails with a clear message if neither is available.
|
|
3835
|
+
|
|
3836
|
+
**Cost:** zero credits (local execution).
|
|
3837
|
+
|
|
3838
|
+
---
|
|
3839
|
+
|
|
3840
|
+
### Authoring compositions
|
|
3841
|
+
|
|
3842
|
+
`hyperframe_render` and `hyperframe_snapshot` render custom HTML/CSS compositions to video or PNG. Point `params.composition` at a directory containing two files:
|
|
3843
|
+
|
|
3844
|
+
#### Directory structure
|
|
3845
|
+
|
|
3846
|
+
```
|
|
3847
|
+
my-composition/
|
|
3848
|
+
├── index.html # HTML/CSS/JS template with {{variable}} placeholders
|
|
3849
|
+
└── meta.json # Declares dimensions, inputs, and params
|
|
3850
|
+
```
|
|
3851
|
+
|
|
3852
|
+
#### `meta.json`
|
|
3853
|
+
|
|
3854
|
+
Declares what the composition accepts:
|
|
3855
|
+
|
|
3856
|
+
```json
|
|
3857
|
+
{
|
|
3858
|
+
"id": "my-overlay",
|
|
3859
|
+
"title": "Text overlay on a background image",
|
|
3860
|
+
"width": 1080,
|
|
3861
|
+
"height": 1920,
|
|
3862
|
+
"fps": 30,
|
|
3863
|
+
"default_duration": 3,
|
|
3864
|
+
"inputs": {
|
|
3865
|
+
"background": {
|
|
3866
|
+
"kind": "image",
|
|
3867
|
+
"required": true,
|
|
3868
|
+
"staged_as": "background.png",
|
|
3869
|
+
"description": "Full-bleed background image."
|
|
3870
|
+
}
|
|
3871
|
+
},
|
|
3872
|
+
"params": {
|
|
3873
|
+
"headline": {
|
|
3874
|
+
"kind": "string",
|
|
3875
|
+
"required": true,
|
|
3876
|
+
"description": "Text to render."
|
|
3877
|
+
},
|
|
3878
|
+
"font_size": {
|
|
3879
|
+
"kind": "integer",
|
|
3880
|
+
"min": 24,
|
|
3881
|
+
"max": 300,
|
|
3882
|
+
"default": 128
|
|
3883
|
+
},
|
|
3884
|
+
"text_color": {
|
|
3885
|
+
"kind": "color",
|
|
3886
|
+
"default": "#ffffff"
|
|
3887
|
+
}
|
|
3888
|
+
}
|
|
3889
|
+
}
|
|
3890
|
+
```
|
|
3891
|
+
|
|
3892
|
+
**`inputs`** — keyed assets staged into the render directory. Each declares a `kind` (`image`, `video`, `audio`), whether it's `required`, and the filename it's staged as (`staged_as`). Reference staged files by filename in your HTML (e.g. `<img src="background.png" />`).
|
|
3893
|
+
|
|
3894
|
+
**`params`** — variables substituted into HTML/CSS/JS via `{{var}}` before render. Supported kinds: `string`, `integer`, `number`, `color`, `json`. The canvas passes every variable as a **primitive in `params`** — no `variables` blob, no `JSON.stringify`. The engine inlines arrays/objects as valid JS literals.
|
|
3895
|
+
|
|
3896
|
+
#### `index.html`
|
|
3897
|
+
|
|
3898
|
+
Standard HTML with `{{variable}}` placeholders that get substituted from params:
|
|
3899
|
+
|
|
3900
|
+
```html
|
|
3901
|
+
<!doctype html>
|
|
3902
|
+
<html lang="en">
|
|
3903
|
+
<head>
|
|
3904
|
+
<meta charset="UTF-8" />
|
|
3905
|
+
<script src="./gsap.min.js"></script>
|
|
3906
|
+
<style>
|
|
3907
|
+
* { margin: 0; padding: 0; box-sizing: border-box; }
|
|
3908
|
+
html, body { width: 1080px; height: 1920px; overflow: hidden; }
|
|
3909
|
+
.bg { position: absolute; inset: 0; width: 100%; height: 100%; object-fit: cover; }
|
|
3910
|
+
.text {
|
|
3911
|
+
position: absolute; inset: 0;
|
|
3912
|
+
display: flex; align-items: center; justify-content: center;
|
|
3913
|
+
font-size: {{font_size}}px;
|
|
3914
|
+
color: {{text_color}};
|
|
3915
|
+
font-weight: 900;
|
|
3916
|
+
text-align: center;
|
|
3917
|
+
}
|
|
3918
|
+
</style>
|
|
3919
|
+
</head>
|
|
3920
|
+
<body>
|
|
3921
|
+
<div id="root" data-composition-id="main" data-start="0"
|
|
3922
|
+
data-duration="{{duration}}" data-width="1080" data-height="1920">
|
|
3923
|
+
<img class="bg" src="background.png" alt="" />
|
|
3924
|
+
<div class="text">{{headline}}</div>
|
|
3925
|
+
</div>
|
|
3926
|
+
<script>
|
|
3927
|
+
(() => {
|
|
3928
|
+
var tl = gsap.timeline({ paused: true });
|
|
3929
|
+
tl.to({}, { duration: 0.01 }, 0);
|
|
3930
|
+
window.__timelines = window.__timelines || {};
|
|
3931
|
+
window.__timelines.main = tl;
|
|
3932
|
+
})();
|
|
3933
|
+
</script>
|
|
3934
|
+
</body>
|
|
3935
|
+
</html>
|
|
3936
|
+
```
|
|
3937
|
+
|
|
3938
|
+
#### Using in a canvas
|
|
3939
|
+
|
|
3940
|
+
```jsonc
|
|
3941
|
+
{
|
|
3942
|
+
"id": "overlay",
|
|
3943
|
+
"type": "hyperframe_snapshot",
|
|
3944
|
+
"inputs": { "background": "$ref:person_image.images#0" },
|
|
3945
|
+
"params": {
|
|
3946
|
+
"composition": "./my-composition",
|
|
3947
|
+
"headline": "HELLO WORLD",
|
|
3948
|
+
"font_size": 128,
|
|
3949
|
+
"text_color": "#ffffff"
|
|
3950
|
+
}
|
|
3951
|
+
}
|
|
3952
|
+
```
|
|
3953
|
+
|
|
3954
|
+
For video rendering, use `hyperframe_render` instead — same composition format, outputs video instead of PNG.
|
|
3955
|
+
|
|
3956
|
+
#### Guardrails
|
|
3957
|
+
|
|
3958
|
+
- **GSAP timeline required.** Register a root timeline at `window.__timelines["main"]` — even for static snapshots, use a no-op timeline.
|
|
3959
|
+
- **Element IDs.** Give every overlay element an `id` attribute. `<audio>` / `<video>` elements MUST have an `id` or the renderer drops them silently.
|
|
3960
|
+
- **Reserved `duration` param.** Never declare a param called `duration` — the engine injects it automatically from the input video's duration (or `default_duration` for snapshots).
|
|
3961
|
+
- **GSAP animations.** Use `gsap.set()` for visibility transitions — `fromTo(opacity)` silently fails under Hyperframes' screenshot capture.
|
|
3962
|
+
- **Single root timeline.** `window.__timelines["main"]` only. Sub-composition timelines aren't scrubbed.
|
|
3963
|
+
- **Cache invalidation.** The composition directory is hashed recursively; editing any file invalidates the cache automatically.
|
|
3964
|
+
|
|
3965
|
+
---
|
|
3966
|
+
|
|
3967
|
+
### Bundled canvases & compositions
|
|
3968
|
+
|
|
3969
|
+
A few end-to-end canvases ship with the CLI under `packages/cli/canvas/`:
|
|
3970
|
+
|
|
3971
|
+
- **`hello-world-overlay.json`** + `hello-world-composition/` — minimal "image + text overlay → png" demo.
|
|
3972
|
+
- **`avocado-tutorial.json`** + `tiktok-captions-composition/` — full "persona → first/last frame → Veo video → transcribe → TikTok captions" pipeline.
|
|
3973
|
+
#### `phone-scroll-composition`
|
|
3974
|
+
|
|
3975
|
+
Non-linear vertical scroll of a tall app screenshot, driven by a GSAP timeline. Inputs: one tall image (`image`, required, staged as `screenshot.png`). Key params:
|
|
3976
|
+
|
|
3977
|
+
- **`timeline`** (json, required) — array of `{at: seconds, y: pixels, ease?: gsap-ease, label?: string}` keyframes. Direction changes are just keyframes going the other way. `ease: "none"` between two equal-y keyframes produces a pause. The first keyframe sets the starting position (instant); subsequent keyframes animate from the previous one.
|
|
3978
|
+
- **`taps`** (json, optional) — array of `{at: seconds, x: "NN%"|"NNpx", y: "NN%"|"NNpx", kind?: "tap"|"long-press"}` finger-tap ripple overlays.
|
|
3979
|
+
- **`background_color`**, **`tap_color`** — visual tuning.
|
|
3980
|
+
|
|
3981
|
+
Output is a 720×1280 MP4 (modern phone aspect) intended to feed straight into an `ffmpeg` chromakey filter graph as the `app` overlay.
|
|
3982
|
+
|
|
3983
|
+
---
|
|
3984
|
+
|
|
3985
|
+
### Programmatic use
|
|
3986
|
+
|
|
3987
|
+
The engine library is also exposed for embedding inside another Node program:
|
|
3988
|
+
|
|
3989
|
+
```ts
|
|
3990
|
+
import { createEngineFromEnv, ValidationError } from "@koda-sl/baker-cli/engine";
|
|
3991
|
+
|
|
3992
|
+
const engine = createEngineFromEnv({ log: console.log });
|
|
3993
|
+
try {
|
|
3994
|
+
const result = await engine.run(canvasJson, { cache_policy: "read_write" });
|
|
3995
|
+
console.log(result.run_id, result.stats);
|
|
3996
|
+
} catch (e) {
|
|
3997
|
+
if (e instanceof ValidationError) console.error(e.issues);
|
|
3998
|
+
else throw e;
|
|
3999
|
+
}
|
|
4000
|
+
```
|
|
4001
|
+
|
|
4002
|
+
For full control (custom asset store, custom fetch, in-memory cache, etc.) the lower-level building blocks are also exported:
|
|
4003
|
+
|
|
4004
|
+
```ts
|
|
4005
|
+
import {
|
|
4006
|
+
Engine, BackendClient, LocalAssetStore, LocalCacheStore, defaultRegistry,
|
|
4007
|
+
} from "@koda-sl/baker-cli/engine";
|
|
4008
|
+
```
|
|
4009
|
+
|
|
2263
4010
|
## Help & Discovery
|
|
2264
4011
|
|
|
2265
4012
|
Every command supports `--help` for usage info:
|