@koda-sl/baker-cli 0.71.2 → 0.79.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -9,10 +9,10 @@ import {
9
9
  defaultRegistry,
10
10
  generateCatalog,
11
11
  validateCanvasDeep
12
- } from "./chunk-JIDZ37KG.js";
12
+ } from "./chunk-CCO34ACK.js";
13
13
 
14
14
  // src/cli.ts
15
- import { defineCommand as defineCommand138, runMain } from "citty";
15
+ import { defineCommand as defineCommand141, runMain } from "citty";
16
16
 
17
17
  // src/commands/actions/index.ts
18
18
  import { defineCommand as defineCommand12 } from "citty";
@@ -314,6 +314,52 @@ function testimonialNormalizer(record, full) {
314
314
  }
315
315
  return compactTestimonial(record);
316
316
  }
317
/**
 * Round a numeric score to two decimal places.
 *
 * @param {unknown} value - Candidate number.
 * @returns {number|null} The rounded number, or `null` when `value` is not a
 *   finite number (wrong type, NaN, or +/-Infinity).
 */
function round2(value) {
  // Number.isFinite rejects NaN *and* +/-Infinity; an infinite "score" cannot
  // be rounded meaningfully, so it is normalized to null like NaN.
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return null;
  }
  return Math.round(value * 100) / 100;
}
323
/**
 * Coerce an arbitrary value into something safe to read properties from.
 *
 * @param {unknown} value - Any value.
 * @returns {object} `value` itself when it is a non-null object (arrays
 *   included), otherwise a fresh empty object.
 */
function asRecord(value) {
  const isObjectLike = typeof value === "object" && value !== null;
  return isObjectLike ? value : {};
}
326
/**
 * Build the compact view of a winning-ad record.
 *
 * Text fields are stringified, with null/undefined becoming ""; the two score
 * fields go through `round2`. `summary` is taken from `record.dna.creative_concept`.
 *
 * @param {object} record - Raw winning-ad record.
 * @returns {object} Compact record with a fixed key order.
 */
function compactWinningAd(record) {
  const text = (raw) => String(raw ?? "");
  const dna = asRecord(record.dna);
  return {
    advertiser: text(record.advertiser),
    platform: text(record.platform),
    format: text(record.format),
    relevance: round2(record.relevance),
    winner_score: round2(record.winner_score),
    summary: text(dna.creative_concept),
    advertiser_id: text(record.advertiser_id),
    ad_id: text(record.ad_id),
    media_url: text(record.media_url)
  };
}
340
/**
 * Build the full view of a winning-ad record: every compact field plus
 * category, media, activity and creative-DNA details.
 *
 * Missing optional values are reported as `null`; `active` is kept only when
 * it is a real boolean.
 *
 * @param {object} record - Raw winning-ad record.
 * @returns {object} Compact fields followed by the extended fields.
 */
function fullWinningAd(record) {
  const base = compactWinningAd(record);
  const dna = asRecord(record.dna);
  const orNull = (raw) => raw ?? null;
  const isActiveKnown = typeof record.active === "boolean";
  const extended = {
    winner_category: String(record.winner_category ?? ""),
    media_kind: orNull(record.media_kind),
    days_active: orNull(record.days_active),
    reach: orNull(record.reach),
    active: isActiveKnown ? record.active : null,
    angle: orNull(dna.angle),
    awareness_stage: orNull(dna.awareness_stage),
    target_persona: orNull(dna.target_persona),
    hook_archetype: orNull(dna.hook_archetype),
    industry: orNull(dna.industry)
  };
  return { ...base, ...extended };
}
357
/**
 * Normalize a winning-ad record, choosing the output level.
 *
 * @param {object} record - Raw winning-ad record.
 * @param {boolean} full - Truthy for the full view, falsy for the compact view.
 * @returns {object} The normalized record.
 */
function winningAdNormalizer(record, full) {
  return full ? fullWinningAd(record) : compactWinningAd(record);
}
317
363
  function applyFieldMask(data, fields) {
318
364
  const result = {};
319
365
  for (const field of fields) {
@@ -5633,10 +5679,10 @@ var IDENTITY_FIELDS_BY_LEVEL = {
5633
5679
  };
5634
5680
  function composeFields2(intent, level) {
5635
5681
  const intentFields = INSIGHTS_INTENTS[intent].fields;
5636
- const identity = IDENTITY_FIELDS_BY_LEVEL[level];
5682
+ const identity2 = IDENTITY_FIELDS_BY_LEVEL[level];
5637
5683
  const seen = /* @__PURE__ */ new Set();
5638
5684
  const out = [];
5639
- for (const f of [...identity, ...intentFields]) {
5685
+ for (const f of [...identity2, ...intentFields]) {
5640
5686
  if (!seen.has(f)) {
5641
5687
  seen.add(f);
5642
5688
  out.push(f);
@@ -8234,11 +8280,121 @@ import { defineCommand as defineCommand76 } from "citty";
8234
8280
 
8235
8281
  // src/engine/scaffold/video.ts
8236
8282
  import { z as z3 } from "zod";
8283
+
8284
+ // src/engine/scaffold/lib/shoot-modes.ts
8285
// Every shoot-mode identifier the scaffold recognizes. Frozen because
// `isShootMode` validates against this list, so an accidental push/splice
// elsewhere would silently change which modes are accepted.
var SHOOT_MODES = Object.freeze([
  "ugc_selfie",
  "ugc_broll",
  "studio_product",
  "lifestyle_cinematic",
  "screen_ui"
]);
8292
// Per-shoot-mode prompt fragments, keyed by the identifiers in SHOOT_MODES.
// Each entry carries:
//   label     - human-readable name of the mode
//   allowsDoF - whether the capture text permits shallow depth of field
//   capture   - multi-line capture block (returned by captureBlockFor)
//   motion    - camera-motion guidance (returned by seedanceMotionFor)
//   diegetic  - default ambient-sound description (returned by diegeticFor)
var SHOOT_MODE_SPECS = {
  ugc_selfie: {
    label: "UGC selfie / talking-head",
    allowsDoF: false,
    capture: [
      "CAPTURE \u2014 AUTHENTIC PHONE UGC (front camera):",
      "Shot on a modern phone front camera \u2014 natural lens, real skin texture and pores,",
      "catchlights, mixed indoor white balance, faint sensor grain, slight handheld imperfection.",
      "NO shallow depth of field or background blur \u2014 native phone footage is flat front-to-back;",
      "blur reads as 'produced', not filmed-on-a-phone. Keep the whole frame in focus."
    ].join("\n"),
    motion: "Lock the camera at arm's-length selfie distance; only natural handheld micro-movement. Move the camera only if a move is named above.",
    diegetic: "a quiet room tone with soft fabric and breath under the speaker's own voice"
  },

  ugc_broll: {
    label: "UGC b-roll / handheld",
    allowsDoF: false,
    capture: [
      "CAPTURE \u2014 AUTHENTIC PHONE UGC (rear camera, candid):",
      "Shot on a modern phone rear camera, handheld and candid \u2014 natural lens, real materials and",
      "textures, real hands in frame where natural, mixed natural white balance, faint sensor grain.",
      "NO shallow depth of field or background blur \u2014 native phone footage is flat front-to-back;",
      "keep the whole frame in focus."
    ].join("\n"),
    motion: "Handheld, candid framing; keep any move small and motivated. Move the camera only if a move is named above.",
    diegetic: "the real ambient of the setting \u2014 handling sounds, footsteps, and room or outdoor tone"
  },

  studio_product: {
    label: "Studio / product (pack shot)",
    allowsDoF: true,
    capture: [
      "CAPTURE \u2014 CONTROLLED PRODUCT / STUDIO:",
      "Photographed on a controlled set \u2014 a clean seamless or one styled surface, a soft key with",
      "gentle fill and a subtle rim, true-to-life color. Shallow depth of field IS allowed to isolate",
      "the hero, with crisp specular highlights on the product's real materials.",
      "Still a real photograph, not CGI \u2014 no plastic or waxy surfaces, no over-render; real material",
      "texture and weight."
    ].join("\n"),
    motion: "Lock off, or a slow motivated push-in / settle onto the product; otherwise hold.",
    diegetic: "minimal \u2014 soft product-handling sounds over a quiet room tone"
  },

  lifestyle_cinematic: {
    label: "Lifestyle / cinematic",
    allowsDoF: true,
    capture: [
      "CAPTURE \u2014 LIFESTYLE / CINEMATIC:",
      "A real camera in a real location \u2014 natural motivated light, true color, a gentle filmic grade,",
      "fine grain. A shallow depth of field is allowed when motivated by the moment.",
      "Photographic, not rendered \u2014 real skin and material texture, no airbrushing, no glossy 3D look."
    ].join("\n"),
    motion: "A slow, motivated camera move (gentle push-in, drift, or settle) is allowed; otherwise hold.",
    diegetic: "the location's natural ambience \u2014 wind, traffic, water, or room tone as the setting implies"
  },

  screen_ui: {
    label: "Screen / UI / demo",
    allowsDoF: true,
    capture: [
      "CAPTURE \u2014 SCREEN / UI CAPTURE:",
      "A clean screen or app capture \u2014 crisp pixels, true on-screen color, optionally framed inside a",
      "real device held in a real hand. No human-skin realism is needed; the screen content is the subject.",
      "Do not bake invented UI copy into the plate beyond what the reference shows \u2014 editable text lives",
      "on the overlay layer."
    ].join("\n"),
    motion: "Hold on the screen; allow a slow push-in or a UI scroll only if a move is named above.",
    diegetic: "soft UI taps and device handling over a quiet room tone"
  }
};
8359
/**
 * Type guard: is `value` one of the known shoot-mode identifiers?
 *
 * @param {unknown} value - Candidate value (e.g. a scene's `shoot_mode`).
 * @returns {boolean} True only for a string listed in SHOOT_MODES.
 */
function isShootMode(value) {
  if (typeof value !== "string") {
    return false;
  }
  return SHOOT_MODES.includes(value);
}
8362
/**
 * Look up the capture prompt block for a shoot mode.
 *
 * @param {string} mode - A key of SHOOT_MODE_SPECS.
 * @returns {string} The mode's multi-line `capture` text.
 */
function captureBlockFor(mode) {
  const { capture } = SHOOT_MODE_SPECS[mode];
  return capture;
}
8365
/**
 * Look up the camera-motion guidance for a shoot mode.
 *
 * @param {string} mode - A key of SHOOT_MODE_SPECS.
 * @returns {string} The mode's `motion` text.
 */
function seedanceMotionFor(mode) {
  const { motion } = SHOOT_MODE_SPECS[mode];
  return motion;
}
8368
/**
 * Look up the default ambient-sound description for a shoot mode.
 *
 * @param {string} mode - A key of SHOOT_MODE_SPECS.
 * @returns {string} The mode's `diegetic` text.
 */
function diegeticFor(mode) {
  const { diegetic } = SHOOT_MODE_SPECS[mode];
  return diegetic;
}
8371
/**
 * Pick the shoot mode for a scene.
 *
 * Precedence: a valid explicit mode wins; then a talking scene is a selfie;
 * then a product with no person is a studio pack shot; everything else is
 * handheld b-roll.
 *
 * @param {{explicit?: unknown, talking?: boolean, hasProduct?: boolean, hasPerson?: boolean}} opts
 * @returns {string} One of the SHOOT_MODES identifiers.
 */
function deriveShootMode(opts) {
  const { explicit, talking, hasProduct, hasPerson } = opts;
  if (isShootMode(explicit)) {
    return explicit;
  }
  if (talking) {
    return "ugc_selfie";
  }
  const productOnly = hasProduct && !hasPerson;
  return productOnly ? "studio_product" : "ugc_broll";
}
8377
+
8378
+ // src/engine/scaffold/video.ts
8237
8379
// Model identifiers pinned by the video scaffold.
var FIXED_TTS_MODEL = "elevenlabs/eleven_v3";
var FIXED_SFX_MODEL = "elevenlabs/eleven_text_to_sound_v2";
var FIXED_MUSIC_MODEL = "elevenlabs/music-v1";
// Model used by the `audio_voice_convert` node emitted for native talking scenes.
var FIXED_VOICE_CONVERT_MODEL = "elevenlabs/eleven_multilingual_sts_v2";

// Mix levels in dB. NOTE(review): MUSIC_BED_GAIN_DB presumably applies to the
// music bed track; its use is outside this view.
var MUSIC_BED_GAIN_DB = -12;
// `gain_db` attached to the ambient track extracted from b-roll clips.
var AMBIENT_BED_GAIN_DB = -20;

// Duration, in seconds, given to every recognized scene-to-scene transition.
var TRANSITION_DEFAULT_S = 0.4;
8386
// Maps a blueprint transition type to the xfade transition name used at the
// scene boundary. The map is keyed by free-form blueprint data, so it is built
// on a null prototype: a type such as "constructor" or "__proto__" must look
// up as `undefined` (hard cut) rather than resolve to an inherited
// Object.prototype member.
var XFADE_BY_TYPE = Object.assign(Object.create(null), {
  fade: "fade",
  dissolve: "dissolve",
  whip: "smoothleft",
  swipe: "wipeleft",
  zoom: "zoomin"
});
8393
// Speaking rate used to estimate how long a line takes to say.
var WORDS_PER_SECOND = 2.5;

/**
 * Estimate the spoken duration of a line of text.
 *
 * @param {string} text - The line to be spoken.
 * @returns {number} Estimated seconds: whitespace-separated word count divided
 *   by WORDS_PER_SECOND (0 for blank text).
 */
function estSpeechS(text) {
  const tokens = text.trim().split(/\s+/);
  // Splitting "" yields [""], so empty tokens are not counted as words.
  const wordCount = tokens.reduce((count, token) => (token ? count + 1 : count), 0);
  return wordCount / WORDS_PER_SECOND;
}
8242
8398
  var NARRATOR_SPEAKERS = /* @__PURE__ */ new Set([
8243
8399
  "voiceover",
8244
8400
  "voice_over",
@@ -8314,10 +8470,25 @@ var Scene = z3.object({
8314
8470
  duration_s: z3.number().optional(),
8315
8471
  summary: z3.string().optional(),
8316
8472
  action_detail: z3.string().optional(),
8473
+ // The capture "look" for this scene — selected from the ad-native shoot-mode
8474
+ // grammar (see lib/shoot-modes.ts). When absent the scaffold auto-derives a
8475
+ // UGC/product mode; a human can override per scene by setting this.
8476
+ shoot_mode: z3.string().optional(),
8477
+ // Diegetic ambient the clip's native audio should carry (no music). When
8478
+ // absent the scene falls back to its shoot mode's default ambience.
8479
+ ambient: z3.string().optional(),
8317
8480
  camera_motion: CameraMotion.optional(),
8318
8481
  start_frame_prompt: z3.string().optional(),
8319
8482
  end_frame_prompt: z3.string().optional(),
8320
8483
  motion_prompt: z3.string().optional(),
8484
+ // The scene's role in the ad's persuasion arc (DECON-supplied); drives the
8485
+ // script re-craft checklist. Inferred from position when absent.
8486
+ narrative_role: z3.string().optional(),
8487
+ // How this scene cuts to the next (DECON-supplied). A recognized non-cut type
8488
+ // (fade/whip/zoom/dissolve/swipe) is reproduced as an ffmpeg xfade at the
8489
+ // boundary; cut/match_cut/none/other stay hard cuts. The last scene's value is
8490
+ // ignored (nothing follows it).
8491
+ transition_out: z3.object({ type: z3.string().optional(), description: z3.string().optional() }).loose().optional(),
8321
8492
  dialogue: z3.array(DialogueLine).optional(),
8322
8493
  sfx: z3.array(Sfx).optional(),
8323
8494
  overlays: z3.array(z3.unknown()).optional(),
@@ -8332,6 +8503,10 @@ var VideoBlueprint = z3.object({
8332
8503
  music: z3.object({
8333
8504
  present: z3.boolean().optional(),
8334
8505
  music_prompt: z3.string().optional(),
8506
+ // Absolute second the music enters in the reference (the bed often
8507
+ // kicks in mid-ad, after the hook). We start the regenerated track here
8508
+ // instead of at 0 so the timing matches.
8509
+ starts_at_s: z3.number().optional(),
8335
8510
  // Populated by the deconstruct when AudD (Shazam-style) recognizes the
8336
8511
  // reference track. We never reuse it — only style the regenerated bed.
8337
8512
  identified_track: z3.object({ title: z3.string().optional(), artist: z3.string().optional() }).loose().nullish()
@@ -8355,6 +8530,11 @@ var RecurringElement = z3.object({
8355
8530
  expression: z3.string().nullable().optional(),
8356
8531
  // When the element maps to a global cast entry, its stable id (for annotation).
8357
8532
  cast_id: z3.string().nullable().optional(),
8533
+ // The label of another element that is the SAME individual as this one, shown
8534
+ // in a DIFFERENT wardrobe/persona/state (e.g. one creator playing skeptic in a
8535
+ // pink shirt and believer in a white shirt). Each look gets its own reference
8536
+ // slot, but the face/identity must stay identical across them.
8537
+ same_as: z3.string().nullable().optional(),
8358
8538
  // Scenes the element appears in. Either a bare list of scene indices (both
8359
8539
  // edges) or per-{scene,edge} entries. Both forms are accepted and merged.
8360
8540
  scenes: z3.array(z3.number()).optional(),
@@ -8430,15 +8610,27 @@ function roleForType2(type) {
8430
8610
  return "the showcased product; keep this exact product identity consistent across every frame. Ignore any caption text printed on this reference.";
8431
8611
  case "person":
8432
8612
  case "animal":
8433
- return "a recurring hero subject; keep this exact identity (face, hair, wardrobe, markings) consistent across EVERY frame of the video. Ignore any caption text printed on this reference.";
8613
+ return "a recurring cast member; render the SAME individual as this reference image and keep them consistent across EVERY frame \u2014 their appearance comes from this reference, never from prose. Ignore any caption text printed on this reference.";
8614
+ case "location":
8615
+ return "the fixed set/location; keep the room, background, and layout identical to this reference across EVERY frame \u2014 do not re-invent the environment. Ignore any caption text printed on this reference.";
8434
8616
  default:
8435
8617
  return "a recurring identity element; reproduce it faithfully and keep it consistent across every frame. Ignore any caption text printed on it.";
8436
8618
  }
8437
8619
  }
8620
/**
 * Produce the legend text describing how a reference slot must be used.
 *
 * A slot flagged as the same individual as another slot gets identity-locking
 * wording that names that other slot; every other slot falls back to the
 * generic wording for its element type.
 *
 * @param {{sameAs?: string, description?: string, type: string}} slot
 * @returns {string} Role description for the prompt legend.
 */
function roleForSlot(slot) {
  const { sameAs, description, type } = slot;
  if (!sameAs) {
    return roleForType2(type);
  }
  const what = description ? ` (${description})` : "";
  return `the SAME individual as ${sameAs}, shown in a DIFFERENT wardrobe/persona/state${what} \u2014 keep the FACE and identity IDENTICAL to the ${sameAs} references; change ONLY wardrobe, styling, and expression. Ignore any caption text printed on this reference.`;
}
8438
8627
/**
 * Build the "[TODO: ...]" placeholder shown where a real source image must be
 * dropped for a recurring element.
 *
 * @param {{type: string, description?: string|null, expression?: string|null, same_as?: string|null}} el
 *   The recurring element.
 * @param {string} label - The element's slot label.
 * @returns {string} Placeholder text. People, animals and locations carry a
 *   "SOURCE FRESH" note; elements with `same_as` carry a "SAME INDIVIDUAL" note.
 */
function todoPath2(el, label) {
  const kind = el.type.toLowerCase();
  const needsFreshSource = ["person", "animal", "location"].includes(kind);

  const pieces = [
    `[TODO: drop one real source image for ${label} (${el.type})`,
    el.description ? ` \u2014 ${el.description}` : "",
    el.expression ? `, with a ${el.expression} expression` : "",
    " \u2014 reused across every frame it appears in",
    needsFreshSource ? " [SOURCE FRESH \u2014 a DIFFERENT person/animal/set than the original ad; do not reuse the source's individual]" : "",
    el.same_as ? ` [SAME INDIVIDUAL as ${el.same_as} \u2014 a different wardrobe/look of the same person; reuse that cast person, change only the outfit]` : "",
    "]"
  ];
  return pieces.join("");
}
8443
8635
  function buildElementSlots(elements) {
8444
8636
  const usedIds = /* @__PURE__ */ new Set(["prompt", "spine", "overlaid", "audio_mix", "final", "music_bed"]);
@@ -8453,6 +8645,7 @@ function buildElementSlots(elements) {
8453
8645
  label,
8454
8646
  type: el.type,
8455
8647
  description: el.description,
8648
+ sameAs: el.same_as ?? void 0,
8456
8649
  presence: presenceOf(el)
8457
8650
  });
8458
8651
  });
@@ -8461,38 +8654,90 @@ function buildElementSlots(elements) {
8461
8654
/**
 * Select the element slots present on one specific frame.
 *
 * @param {Array<{presence: Map<number, Set<string>>}>} slots - All element slots.
 * @param {number} sceneIndex - Scene index.
 * @param {string} edge - Frame edge within the scene (e.g. "start" / "end").
 * @returns {Array} Slots whose presence map lists `edge` for `sceneIndex`.
 */
function slotsForFrame(slots, sceneIndex, edge) {
  return slots.filter((slot) => {
    const edges = slot.presence.get(sceneIndex);
    return edges?.has(edge);
  });
}
8657
// Image model used to generate actor reference sheets.
var ACTOR_SHEET_MODEL = "google/gemini-3-pro-image-preview";

/**
 * Insert an `image_reference_sheet` node for every person/animal slot that
 * appears in at least two scenes, and repoint the slot at the sheet output.
 *
 * Mutates both arguments: nodes are appended to `nodes`, and each qualifying
 * slot's `ref` is replaced with `$ref:<slot.id>_sheet.sheet`.
 *
 * @param {Array<object>} slots - Element slots (`id`, `type`, `ref`, `presence`, optional `description`).
 * @param {Array<object>} nodes - Graph node list to append to.
 * @returns {void}
 */
function applyActorSheets(slots, nodes) {
  for (const slot of slots) {
    const kind = slot.type.toLowerCase();
    const isActor = kind === "person" || kind === "animal";
    // A sheet is only emitted for an actor that recurs across scenes.
    if (isActor && slot.presence.size >= 2) {
      const sheetId = `${slot.id}_sheet`;
      const params = {
        model: ACTOR_SHEET_MODEL,
        subject_description: slot.description ?? `the ${slot.type}`,
        subject_type: kind === "person" ? "person" : "character",
        image_size: "2K"
      };
      // The slot's current reference is the sheet's only source image.
      nodes.push({
        id: sheetId,
        type: "image_reference_sheet",
        inputs: { references: [slot.ref] },
        params
      });
      slot.ref = `$ref:${sheetId}.sheet`;
    }
  }
}
8464
8679
/**
 * Select the element slots that appear anywhere in a scene.
 *
 * @param {Array<{presence: Map<number, unknown>}>} slots - All element slots.
 * @param {number} sceneIndex - Scene index.
 * @returns {Array} Slots whose presence map has an entry for `sceneIndex`.
 */
function slotsForScene(slots, sceneIndex) {
  const appearsInScene = ({ presence }) => presence.has(sceneIndex);
  return slots.filter(appearsInScene);
}
8467
- function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor) {
8682
+ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mode) {
8468
8683
  const EDGE = edge.toUpperCase();
8469
8684
  const legend = [
8470
- ...present.map((s) => `- ${s.label} \u2014 ${roleForType2(s.type)}`),
8685
+ ...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
8471
8686
  ...hasAnchor ? [
8472
8687
  "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions of THIS frame. IGNORE its overlay text, captions, and any brand that is being swapped."
8473
8688
  ] : []
8474
8689
  ].join("\n");
8475
8690
  const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
8691
+ const isHookFrame = sceneIndex === 0 && edge === "start";
8476
8692
  return [
8477
8693
  `Render the ${EDGE} frame of scene ${sceneIndex + 1} as a single still image. This prompt is self-contained and edit-per-frame: change the FRAME DESCRIPTION below to alter ONLY this frame.`,
8478
8694
  "",
8479
- "CRITICAL \u2014 RENDER A CLEAN PLATE WITH ZERO TEXT OR GRAPHICS:",
8480
- "This frame is a background plate. ALL words, captions, headlines, lower-third bars,",
8481
- "news tickers/crawls, chyrons, on-screen logos/wordmarks, station bugs, watermarks,",
8482
- "subtitles, UI and numbers are added afterwards as a separate HTML layer. Render NONE",
8483
- "of them \u2014 no legible text anywhere in the image, not even in the background, on the",
8484
- "news desk, on screens, or as part of a 'broadcast look'. If a reference image (a logo,",
8485
- "a desk, a studio) contains any text or graphics, DO NOT reproduce that text \u2014 render",
8486
- "the subject/scene only, with blank surfaces where text would be. Imperfect/garbled",
8487
- "letterforms are the worst outcome; leave those areas clean.",
8695
+ "CRITICAL \u2014 RENDER A CLEAN PLATE WITH ZERO TEXT OR GRAPHIC OVERLAYS:",
8696
+ "This frame is a background plate. Every overlay element is composited afterwards as a",
8697
+ "separate hyperframe HTML overlay layer \u2014 NOT painted into this image. Render NONE of:",
8698
+ "words, captions, subtitles, headlines, lower-third bars, news tickers/crawls, chyrons,",
8699
+ "station bugs, watermarks, numbers, prices; and NONE of the graphic overlays layered on",
8700
+ "the picture either \u2014 icons, stickers, emojis, badges, rating/trust seals, progress bars,",
8701
+ "UI chrome/buttons, and arrows. No legible text anywhere, not even in the background, on a",
8702
+ "desk, on screens, or as part of a 'broadcast look'. If a reference image (a logo, a desk,",
8703
+ "a studio) contains any text or graphics, DO NOT reproduce them \u2014 render the subject/scene",
8704
+ "only, leaving the regions where overlays will sit clean. Imperfect/garbled letterforms or",
8705
+ "stray icons are the worst outcome; leave those areas blank.",
8706
+ "",
8707
+ "FRAMING \u2014 ONE UNCUT FRAME:",
8708
+ "Render ONE single uncut photographic frame: NO split screen, NO panels, NO dividing line,",
8709
+ "NO collage, NO before/after. Avoid the AI look \u2014 no waxy/plastic skin, no airbrushing, no",
8710
+ "over-smoothing, no over-saturation, no glossy 3D render. Every descriptor needs a technical",
8711
+ 'anchor (a named lens / focal length / color grade) \u2014 no empty adjectives like "cinematic",',
8712
+ `"beautiful", "high quality"; they waste tokens and don't move the model.`,
8713
+ "",
8714
+ // The capture aesthetic + depth-of-field rule are SHOOT-MODE specific: a UGC
8715
+ // selfie is flat phone footage; a pack shot is a controlled studio frame. Only
8716
+ // this block varies by mode — the clean-plate and framing rules above are universal.
8717
+ captureBlockFor(mode),
8718
+ "",
8719
+ // Moderation-safe phrasing — Seedance routes around the real-person filter but
8720
+ // prompts still hit provider moderation; age-blind, role-based descriptions trip
8721
+ // it far less. (The client's own brand assets come from the references, not here.)
8722
+ "Describe any person by role, wardrobe, and build \u2014 never by name and never by age",
8723
+ "(no child/kid/teen/young/elderly); do not invent brand logos or marks.",
8724
+ ...isHookFrame ? [
8725
+ "",
8726
+ "HOOK FRAME (scene 1 opens the ad): the feed plays muted, so this frame must read",
8727
+ "INSTANTLY SOUND-OFF \u2014 one clear subject, legible at a glance in ~1 second, no clutter."
8728
+ ] : [],
8488
8729
  "",
8489
8730
  "REFERENCE IMAGES (in the order provided):",
8490
8731
  legend,
8491
8732
  "",
8733
+ "Identity comes from the reference images, not from this prose \u2014 render each person,",
8734
+ "product, and set to MATCH its reference image, and describe only pose, expression, action,",
8735
+ "and camera in the FRAME DESCRIPTION below.",
8736
+ "",
8492
8737
  "FRAME DESCRIPTION (this frame's editable prompt):",
8493
8738
  description,
8494
8739
  "",
8495
- "Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared cast identity, palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above. Again: NO rendered text or graphic overlays \u2014 clean plate only.",
8740
+ "Keep every recurring element identical to its reference image across all frames. Use the GLOBAL STYLE REFERENCE only for shared palette, typography mood, and aspect ratio \u2014 do NOT copy another scene's composition from it; this frame's content is the FRAME DESCRIPTION above. Again: clean plate only \u2014 no rendered text, and no icons/stickers/emojis/badges (those live on the overlay layer).",
8496
8741
  "",
8497
8742
  "GLOBAL STYLE REFERENCE (shared across frames; not this frame's content):",
8498
8743
  "{{target_blueprint}}"
@@ -8506,7 +8751,7 @@ function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
8506
8751
  const genParams = {
8507
8752
  model: ctx.imageModel,
8508
8753
  image_size: "2K",
8509
- prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, Boolean(url))
8754
+ prompt: buildFramePrompt(edge, ctx.sceneIndex, framePrompt, present, Boolean(url), ctx.shootMode)
8510
8755
  };
8511
8756
  if (ctx.ar) genParams.aspect_ratio = ctx.ar;
8512
8757
  const genNode = {
@@ -8521,7 +8766,17 @@ function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
8521
8766
  nodes.push(genNode);
8522
8767
  return `$ref:s${ctx.sceneIndex}_${edge}.images#0`;
8523
8768
  }
8524
- function buildSeedancePrompt(scene, sceneIndex, present) {
8769
/**
 * Build the "Audio:" instruction line for a scene's clip prompt.
 *
 * The ambient description is the scene's own trimmed `ambient` text when
 * non-empty, otherwise the shoot mode's default.
 *
 * @param {{ambient?: string}} scene - The scene.
 * @param {string} mode - Shoot mode used for the ambient fallback.
 * @param {boolean} audio - Whether the clip generates audio at all.
 * @param {string|undefined} nativeLine - Dialogue spoken natively in the clip, if any.
 * @returns {string|null} Voice-plus-ambient wording when `nativeLine` is set,
 *   ambient-only wording when only `audio` is set, otherwise null.
 */
function seedanceAudioLine(scene, mode, audio, nativeLine) {
  // Resolved up front, before either flag is inspected.
  const ambient = scene.ambient?.trim() || diegeticFor(mode);
  if (!nativeLine && !audio) {
    return null;
  }
  return nativeLine
    ? `Audio: diegetic only \u2014 the speaker's own voice over ${ambient}; no music, no song, no soundtrack (the music bed is a separate track).`
    : `Audio: diegetic ambient only \u2014 ${ambient}; no spoken dialogue, no music, no song, no soundtrack (voice and music are separate tracks).`;
}
8779
+ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine) {
8525
8780
  const parts = [];
8526
8781
  const summary = scene.summary?.trim();
8527
8782
  parts.push(summary ? `Scene ${sceneIndex + 1}: ${summary}` : `Scene ${sceneIndex + 1}`);
@@ -8537,18 +8792,109 @@ function buildSeedancePrompt(scene, sceneIndex, present) {
8537
8792
  `Keep these consistent with their references: ${present.map((s) => `${s.label} (${s.description ?? s.type})`).join("; ")}`
8538
8793
  );
8539
8794
  }
8540
- const lines = (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l));
8541
- if (lines.length > 0) parts.push(`Spoken: ${lines.map((l) => `"${l}"`).join(" ")}`);
8795
+ if (nativeLine) {
8796
+ parts.push(
8797
+ `The person speaks to camera. Lip-sync follows the dialogue verbatim; put delivery/emotion cues in [brackets]. Dialogue: "${nativeLine}"`
8798
+ );
8799
+ } else {
8800
+ const lines = (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l));
8801
+ if (lines.length > 0)
8802
+ parts.push(`Spoken context (do not render as audio): ${lines.map((l) => `"${l}"`).join(" ")}`);
8803
+ }
8542
8804
  const transcript = (scene.transcript_slice ?? []).map((w) => w.text?.trim()).filter(Boolean).join(" ").trim();
8543
8805
  if (transcript) parts.push(`Transcript: ${transcript}`);
8806
+ const audioLine = seedanceAudioLine(scene, mode, audio, nativeLine);
8807
+ if (audioLine) parts.push(audioLine);
8808
+ parts.push(
8809
+ `Direction: describe MOTION ONLY \u2014 the frames carry the content; keep it short. ${seedanceMotionFor(mode)} Spell choreography move-by-move (not 'she dances' but the actual beats: head nod, shoulder roll, knee dip). One short continuous beat. Real physical weight on any impact (no weightless AI motion). Describe any person by role and wardrobe, never by name or age.`
8810
+ );
8544
8811
  return parts.join("\n");
8545
8812
  }
8813
/**
 * ffmpeg argument list that extracts the first `durationS` seconds of a
 * clip's audio as MP3.
 *
 * `{{in.clip}}` and `{{out.audio}}` are placeholder tokens emitted verbatim.
 *
 * @param {number} durationS - Seconds of audio to keep (formatted to 3 decimals).
 * @returns {string[]} The argument vector.
 */
function audioExtractArgs(durationS) {
  const input = ["-i", "{{in.clip}}"];
  const limit = ["-t", durationS.toFixed(3)];
  // -vn drops the video stream; the audio is encoded with libmp3lame at -q:a 2.
  const encode = ["-vn", "-acodec", "libmp3lame", "-q:a", "2"];
  return [...input, ...limit, ...encode, "{{out.audio}}"];
}
8827
/**
 * Resolve how a scene transitions into the next one.
 *
 * @param {{transition_out?: {type?: string}}} scene - The scene.
 * @param {boolean} isLast - True for the final scene, which never transitions.
 * @returns {{xfade: string, dur: number}|null} The xfade name mapped from the
 *   lower-cased transition type plus the default duration, or null when the
 *   scene is last, has no type, or the type has no mapping (hard cut).
 */
function sceneOutTransition(scene, isLast) {
  if (isLast) {
    return null;
  }
  const type = scene.transition_out?.type?.toLowerCase();
  if (!type) {
    return null;
  }
  const xfade = XFADE_BY_TYPE[type];
  if (!xfade) {
    return null;
  }
  return { xfade, dur: TRANSITION_DEFAULT_S };
}
8833
/**
 * Determine the shoot mode for one scene.
 *
 * A scene counts as "talking" when it has a native dialogue turn, or when
 * on-camera dialogue is enabled and at least one non-empty dialogue line comes
 * from an on-camera speaker (a missing speaker is treated as "voiceover").
 * The final choice is delegated to `deriveShootMode`.
 *
 * @param {object} scene - The scene (`shoot_mode`, `dialogue`).
 * @param {Array<{type: string}>} present - Element slots present in the scene.
 * @param {object|undefined} nativeTurn - The scene's native dialogue turn, if any.
 * @param {boolean} cameraOn - Whether on-camera dialogue is enabled.
 * @param {Set<string>} casts - Known cast ids.
 * @returns {string} The shoot-mode identifier.
 */
function sceneShootMode(scene, present, nativeTurn, cameraOn, casts) {
  const anyPresentOfType = (...wanted) =>
    present.some((slot) => wanted.includes(slot.type.toLowerCase()));
  const hasOnCameraLine = () =>
    (scene.dialogue ?? []).some(
      (turn) => turn.line?.trim() && isOnCameraSpeaker(turn.speaker ?? "voiceover", casts, cameraOn)
    );

  const talking = Boolean(nativeTurn) || cameraOn && hasOnCameraLine();
  return deriveShootMode({
    explicit: scene.shoot_mode,
    talking,
    // Animals count as a "person" here.
    hasPerson: anyPresentOfType("person", "animal"),
    hasProduct: anyPresentOfType("product")
  });
}
8844
/**
 * Emit the nodes and timeline track that carry a scene clip's own audio.
 *
 * - Native talking scene: extract the clip's audio, run it through voice
 *   conversion with the turn's voice, and add the converted audio as a "vo" track.
 * - Ambient b-roll scene: extract the clip's audio and add it as an "ambient"
 *   track at AMBIENT_BED_GAIN_DB, starting at the scene's `start_s` (0 if unset).
 * - Otherwise: emit nothing.
 *
 * Mutates `nodes` and `voTracks` by appending to them.
 *
 * @param {number} i - Scene index, used in node ids.
 * @param {{start_s?: number}} scene - The scene.
 * @param {{voiceNode: string, start_s: number}|undefined} nativeTurn - Native dialogue turn, if any.
 * @param {boolean} ambientBroll - Whether this scene is an ambient b-roll beat.
 * @param {{dur: number, speech: number, genDur: number}} lengths - Scene, speech and generated-clip durations in seconds.
 * @param {Array<object>} nodes - Graph node list to append to.
 * @param {Array<object>} voTracks - Audio track list to append to.
 * @returns {void}
 */
function emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, lengths, nodes, voTracks) {
  const clipRef = `$ref:s${i}_clip.video`;
  const extractNode = (id, seconds) => ({
    id,
    type: "ffmpeg",
    inputs: { clip: clipRef },
    params: { args: audioExtractArgs(seconds), outputs: { audio: { kind: "audio", ext: "mp3" } } }
  });

  if (nativeTurn) {
    // Keep the longer of scene/speech length, capped at what was generated.
    const extractLen = Math.min(Math.max(lengths.dur, lengths.speech), lengths.genDur);
    const extractId = `s${i}_voextract`;
    const convertId = `s${i}_voconv`;
    nodes.push(extractNode(extractId, extractLen));
    nodes.push({
      id: convertId,
      type: "audio_voice_convert",
      inputs: { audio: `$ref:${extractId}.audio`, voice_ref: `$ref:${nativeTurn.voiceNode}.voice_id` },
      params: { model: FIXED_VOICE_CONVERT_MODEL, voice: "{{voice_ref}}" }
    });
    voTracks.push({
      slot: convertId,
      ref: `$ref:${convertId}.audio`,
      start_s: nativeTurn.start_s,
      end_s: nativeTurn.start_s + extractLen,
      kind: "vo"
    });
    return;
  }

  if (!ambientBroll) {
    return;
  }

  const ambientId = `s${i}_ambient`;
  const ambientStart = scene.start_s ?? 0;
  nodes.push(extractNode(ambientId, lengths.dur));
  voTracks.push({
    slot: ambientId,
    ref: `$ref:${ambientId}.audio`,
    start_s: ambientStart,
    end_s: ambientStart + lengths.dur,
    gain_db: AMBIENT_BED_GAIN_DB,
    kind: "ambient"
  });
}
8546
8884
  function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
8547
8885
  const ar = aspectRatioParam(blueprint);
8548
8886
  const reuse = opts.frames === "reuse";
8549
- const clipRefs = [];
8887
+ const clips = [];
8888
+ const voTracks = [];
8889
+ const lastIndex = blueprint.scenes.length - 1;
8890
+ const cameraOn = onCameraDialogue(blueprint);
8891
+ const casts = castIdSet(blueprint);
8550
8892
  blueprint.scenes.forEach((scene, i) => {
8551
- const ctx = { sceneIndex: i, ar, reuse, imageModel: opts.imageModel };
8893
+ const nativeTurn = (sceneTurns.get(i) ?? []).find((t) => t.native);
8894
+ const present = slotsForScene(slots, i);
8895
+ const mode = sceneShootMode(scene, present, nativeTurn, cameraOn, casts);
8896
+ const ambientBroll = Boolean(opts.ambient) && !nativeTurn && mode !== "ugc_selfie";
8897
+ const ctx = { sceneIndex: i, ar, reuse, imageModel: opts.imageModel, shootMode: mode };
8552
8898
  const firstFrame = buildFrameRef(
8553
8899
  "start",
8554
8900
  scene.start_frame_asset?.url,
@@ -8566,10 +8912,22 @@ function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
8566
8912
  nodes
8567
8913
  );
8568
8914
  const dur = sceneDurationS(scene);
8915
+ let out = sceneOutTransition(scene, i === lastIndex);
8916
+ let trimTarget = dur + (out?.dur ?? 0);
8917
+ if (out && ceilToSeedance(trimTarget) < trimTarget) {
8918
+ out = null;
8919
+ trimTarget = dur;
8920
+ }
8921
+ const speech = nativeTurn ? estSpeechS(nativeTurn.text) : 0;
8922
+ const genDur = ceilToSeedance(Math.max(trimTarget, speech));
8569
8923
  const clipParams = {
8570
8924
  model: opts.videoModel,
8571
- prompt: buildSeedancePrompt(scene, i, slotsForScene(slots, i)),
8572
- duration: ceilToSeedance(dur)
8925
+ prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
8926
+ duration: genDur,
8927
+ // Native talking scene → Seedance generates the spoken audio + lip-sync;
8928
+ // an opt-in ambient b-roll beat generates diegetic ambient only; otherwise the
8929
+ // clip is silent and audio comes from the tts/music timeline.
8930
+ generate_audio: Boolean(nativeTurn) || ambientBroll
8573
8931
  };
8574
8932
  if (ar) clipParams.aspect_ratio = ar;
8575
8933
  nodes.push({
@@ -8578,31 +8936,21 @@ function buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns) {
8578
8936
  inputs: { first_frame: firstFrame, last_frame: lastFrame },
8579
8937
  params: clipParams
8580
8938
  });
8581
- let base = `$ref:s${i}_clip.video`;
8582
- const onCam = (sceneTurns.get(i) ?? []).filter((t) => t.onCamera);
8583
- const solo = onCam.length === 1 ? onCam[0] : void 0;
8584
- if (solo) {
8585
- nodes.push({
8586
- id: `s${i}_lipsync`,
8587
- type: "video_lipsync",
8588
- inputs: { video: base, audio: solo.audioRef },
8589
- params: { model: FIXED_LIPSYNC_MODEL }
8590
- });
8591
- base = `$ref:s${i}_lipsync.video`;
8592
- }
8593
- if (ceilToSeedance(dur) === dur) {
8594
- clipRefs.push(base);
8939
+ emitSceneNativeAudio(i, scene, nativeTurn, ambientBroll, { dur, speech, genDur }, nodes, voTracks);
8940
+ const base = `$ref:s${i}_clip.video`;
8941
+ if (genDur === trimTarget) {
8942
+ clips.push({ ref: base, scene_s: dur, out });
8595
8943
  } else {
8596
8944
  nodes.push({
8597
8945
  id: `s${i}_trim`,
8598
8946
  type: "ffmpeg",
8599
8947
  inputs: { clip: base },
8600
- params: { args: trimArgs(dur), outputs: { video: { kind: "video", ext: "mp4" } } }
8948
+ params: { args: trimArgs(trimTarget), outputs: { video: { kind: "video", ext: "mp4" } } }
8601
8949
  });
8602
- clipRefs.push(`$ref:s${i}_trim.video`);
8950
+ clips.push({ ref: `$ref:s${i}_trim.video`, scene_s: dur, out });
8603
8951
  }
8604
8952
  });
8605
- return clipRefs;
8953
+ return { clips, voTracks };
8606
8954
  }
8607
8955
  function musicBedPrompt(blueprint, musicPrompt) {
8608
8956
  const track2 = blueprint.global?.music?.identified_track;
@@ -8618,6 +8966,33 @@ function onCameraDialogue(blueprint) {
8618
8966
  return mode !== "voiceover" && mode !== "none";
8619
8967
  }
8620
8968
  var castIdSet = (blueprint) => new Set((blueprint.global?.cast ?? []).map((c) => c.id).filter((id) => Boolean(id)));
8969
// Ordered [pattern, language] pairs. Patterns are tested against a lower-cased
// voice description and the first pair that matches decides the language.
var LANGUAGE_WORDS = [
  // `aise?` accepts français / francais as well as the feminine française / francaise.
  [/\b(french|fran[çc]aise?)\b/, "french"],
  [/\b(spanish|espa[ñn]ol|castilian)\b/, "spanish"],
  [/\benglish\b/, "english"],
  [/\b(german|deutsch)\b/, "german"],
  [/\b(italian|italiano)\b/, "italian"],
  [/\b(portuguese|portugu[êe]s|brazilian)\b/, "portuguese"],
  [/\b(dutch|nederlands)\b/, "dutch"],
  [/\b(arabic)\b/, "arabic"],
  [/\b(japanese)\b/, "japanese"],
  [/\b(korean)\b/, "korean"],
  [/\b(hindi)\b/, "hindi"],
  [/\b(polish)\b/, "polish"]
];
/**
 * Extracts coarse voice traits from a free-text voice description.
 *
 * Matching is case-insensitive (the text is lower-cased first) and keyword
 * based, so a description that mentions none of the known words yields no
 * trait for that dimension.
 *
 * @param {string} description - Free-text description of a speaker's voice.
 * @returns {{gender?: "female" | "male", language?: string}} Only the traits
 *   that were recognised; an empty object when nothing matched or when
 *   `description` is not a string.
 */
function parseVoiceTraits(description) {
  // A missing or non-string description carries no traits; return the same
  // empty result as an unmatched string instead of throwing on toLowerCase.
  if (typeof description !== "string") return {};
  const d = description.toLowerCase();
  const out = {};
  // Female words are checked first; the male pattern is only consulted when
  // none of them is present.
  if (/\b(female|woman|women|girl|lady)\b/.test(d)) out.gender = "female";
  else if (/\b(male|man|men|guy|boy)\b/.test(d)) out.gender = "male";
  for (const [re, name] of LANGUAGE_WORDS) {
    if (re.test(d)) {
      out.language = name;
      break;
    }
  }
  return out;
}
8621
8996
  function isOnCameraSpeaker(speaker, casts, cameraOn) {
8622
8997
  if (!cameraOn) return false;
8623
8998
  if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
@@ -8642,7 +9017,9 @@ function buildDialogue(blueprint, nodes) {
8642
9017
  const existing = voiceNodeBySpeaker.get(speaker);
8643
9018
  if (existing) return existing;
8644
9019
  const id = sanitizeId2(`voice_${speaker}`, `voice_${voiceNodeBySpeaker.size}`);
8645
- nodes.push({ id, type: "voice_select", params: { description: speakerDescription(speaker) } });
9020
+ const description = speakerDescription(speaker);
9021
+ const traits = parseVoiceTraits(description);
9022
+ nodes.push({ id, type: "voice_select", params: { description, ...traits } });
8646
9023
  voiceNodeBySpeaker.set(speaker, id);
8647
9024
  return id;
8648
9025
  };
@@ -8657,44 +9034,52 @@ function buildDialogue(blueprint, nodes) {
8657
9034
  if (last && last.speaker === speaker) last.lines.push(line);
8658
9035
  else groups.push({ speaker, lines: [line] });
8659
9036
  }
8660
- const list = [];
8661
- groups.forEach((group, gi) => {
9037
+ const shells = groups.map((group) => {
8662
9038
  const first = group.lines[0];
8663
9039
  const last = group.lines[group.lines.length - 1];
8664
- if (!first || !last) return;
8665
- const start = first.start_s ?? scene.start_s ?? 0;
8666
- const end = last.end_s ?? last.start_s ?? scene.end_s ?? start;
9040
+ if (!first || !last) return void 0;
9041
+ return {
9042
+ group,
9043
+ start: first.start_s ?? scene.start_s ?? 0,
9044
+ end: last.end_s ?? last.start_s ?? scene.end_s ?? first.start_s ?? scene.start_s ?? 0,
9045
+ onCamera: isOnCameraSpeaker(group.speaker, casts, cameraOn)
9046
+ };
9047
+ }).filter((s) => Boolean(s));
9048
+ const onCamCount = shells.filter((s) => s.onCamera).length;
9049
+ const list = [];
9050
+ shells.forEach((shell, gi) => {
9051
+ const { group, start, end, onCamera } = shell;
8667
9052
  const voiceNode = ensureVoiceNode(group.speaker);
8668
- let id = sanitizeId2(`vo_s${sceneIndex}_${group.speaker}`, `vo_${sceneIndex}_${gi}`);
8669
- if (usedVoIds.has(id)) {
8670
- let n = 2;
8671
- while (usedVoIds.has(`${id}_${n}`)) n++;
8672
- id = `${id}_${n}`;
8673
- }
8674
- usedVoIds.add(id);
8675
- nodes.push({
8676
- id,
8677
- type: "tts",
8678
- inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
8679
- // Lines join with a space; each keeps its terminal punctuation so eleven_v3
8680
- // reads the sentence boundaries (and their pauses) within the one turn.
8681
- params: {
8682
- model: FIXED_TTS_MODEL,
8683
- text: group.lines.map((l) => l.line.trim()).join(" "),
8684
- voice: "{{voice_ref}}"
8685
- }
8686
- });
9053
+ const text = group.lines.map((l) => l.line.trim()).join(" ");
9054
+ const native = onCamera && onCamCount === 1;
8687
9055
  const turn = {
8688
9056
  sceneIndex,
8689
9057
  speaker: group.speaker,
8690
- onCamera: isOnCameraSpeaker(group.speaker, casts, cameraOn),
8691
9058
  start_s: start,
8692
9059
  end_s: end,
8693
- ttsId: id,
8694
- audioRef: `$ref:${id}.audio`
9060
+ text,
9061
+ voiceNode,
9062
+ native
8695
9063
  };
9064
+ if (!native) {
9065
+ let id = sanitizeId2(`vo_s${sceneIndex}_${group.speaker}`, `vo_${sceneIndex}_${gi}`);
9066
+ if (usedVoIds.has(id)) {
9067
+ let n = 2;
9068
+ while (usedVoIds.has(`${id}_${n}`)) n++;
9069
+ id = `${id}_${n}`;
9070
+ }
9071
+ usedVoIds.add(id);
9072
+ nodes.push({
9073
+ id,
9074
+ type: "tts",
9075
+ inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
9076
+ params: { model: FIXED_TTS_MODEL, text, voice: "{{voice_ref}}" }
9077
+ });
9078
+ turn.ttsId = id;
9079
+ const audioRef = `$ref:${id}.audio`;
9080
+ tracks.push({ slot: id, ref: audioRef, start_s: start, end_s: end, kind: "vo" });
9081
+ }
8696
9082
  list.push(turn);
8697
- tracks.push({ slot: id, ref: turn.audioRef, start_s: start, end_s: end, kind: "vo" });
8698
9083
  });
8699
9084
  sceneTurns.set(sceneIndex, list);
8700
9085
  });
@@ -8720,14 +9105,22 @@ function buildSfxMusic(blueprint, nodes) {
8720
9105
  });
8721
9106
  const musicPrompt = blueprint.global?.music?.music_prompt;
8722
9107
  if (musicPrompt) {
8723
- const totalMs = Math.round((blueprint.source?.duration_s ?? lastSceneEnd(blueprint)) * 1e3);
9108
+ const total = blueprint.source?.duration_s ?? lastSceneEnd(blueprint);
9109
+ const startAt = Math.min(Math.max(blueprint.global?.music?.starts_at_s ?? 0, 0), Math.max(total - 0.5, 0));
9110
+ const totalMs = Math.round((total - startAt) * 1e3);
8724
9111
  const musicMs = Math.min(Math.max(totalMs, 3e3), ELEVENLABS_MAX_MUSIC_LENGTH_MS);
8725
9112
  nodes.push({
8726
9113
  id: "music_bed",
8727
9114
  type: "music",
8728
9115
  params: { model: FIXED_MUSIC_MODEL, prompt: musicBedPrompt(blueprint, musicPrompt), music_length_ms: musicMs }
8729
9116
  });
8730
- tracks.push({ slot: "music", ref: "$ref:music_bed.audio", start_s: 0, gain_db: MUSIC_BED_GAIN_DB, kind: "music" });
9117
+ tracks.push({
9118
+ slot: "music",
9119
+ ref: "$ref:music_bed.audio",
9120
+ start_s: startAt,
9121
+ gain_db: MUSIC_BED_GAIN_DB,
9122
+ kind: "music"
9123
+ });
8731
9124
  }
8732
9125
  return tracks;
8733
9126
  }
@@ -8777,14 +9170,29 @@ function overlayElement(ov, sceneStart) {
8777
9170
  const detail = ov.animation_detail ? ` data-anim-detail="${escapeHtml(ov.animation_detail)}"` : "";
8778
9171
  return `<div class="ov ${positionClass(ov.position)}" data-start="${at}" data-dur="${dur}"${role}${anim}${detail}>${escapeHtml(ov.text.trim())}</div>`;
8779
9172
  }
9173
/**
 * Suggests the `baker images …` command to use for sourcing a floating element.
 *
 * The element's `kind` (compared case-insensitively) selects the command; the
 * quoted subject is the first non-empty value among brand_name,
 * what_it_represents, description and kind, falling back to "element".
 *
 * @param {object} fe - Floating element descriptor.
 * @returns {string} Human-readable sourcing hint.
 */
function sourceHint(fe) {
  const subject = fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element";
  const kind = (fe.kind ?? "").toLowerCase();
  if (kind === "logo") {
    return "baker images logo <domain> (or baker images library)";
  }
  if (kind === "emoji" || kind === "sticker") {
    return `baker images sticker "${subject}" (or baker images gif)`;
  }
  if (kind === "product_cutout") {
    return `baker images library "${subject}" (the client's own product)`;
  }
  // Any other kind (or none at all) is sourced as an icon.
  return `baker images icon "${subject}"`;
}
8780
9187
  function floatingStub(fe, sceneStart) {
8781
9188
  const at = fe.appears_at_s ?? sceneStart;
8782
9189
  const dur = fe.duration_s ?? 2.5;
8783
9190
  const kind = commentSafe(fe.kind ?? "element");
8784
9191
  const label = commentSafe(fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element");
9192
+ const hint = commentSafe(sourceHint(fe));
8785
9193
  const slug = (fe.kind ?? "element").toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "") || "element";
8786
9194
  return [
8787
- `<!-- ${kind}: ${label} @ ${at}s for ${dur}s (${positionClass(fe.position)}). Drop an image in this dir and uncomment:`,
9195
+ `<!-- ${kind}: ${label} @ ${at}s for ${dur}s (${positionClass(fe.position)}). Source a real asset: ${hint} \u2014 drop it in this dir and uncomment:`,
8788
9196
  `<img class="ov ${positionClass(fe.position)}" src="your-${slug}.png" data-start="${at}" data-dur="${dur}" alt="" /> -->`
8789
9197
  ].join("\n");
8790
9198
  }
@@ -8825,6 +9233,52 @@ function concatArgs(count) {
8825
9233
  }
8826
9234
  return [...inputs, "-filter_complex", `${labels}concat=n=${count}:v=1:a=0[v]`, "-map", "[v]", "{{out.video}}"];
8827
9235
  }
9236
/**
 * Seconds of footage one clip feeds into the spine: its scene length plus the
 * duration of its outgoing transition (`out.dur`), or the scene length alone
 * when the clip has no transition.
 *
 * @param {{scene_s: number, out?: {dur: number}}} c - Clip descriptor.
 * @returns {number} Input length in seconds.
 */
function clipInputLen(c) {
  return c.scene_s + (c.out?.dur ?? 0);
}
/**
 * Builds the ffmpeg argument list that joins `clips`, in order, into a single
 * video stream labelled `[v]`.
 *
 * Every input is first passed through `format=yuv420p,fps=30,setsar=1,settb=AVTB`
 * (presumably so all streams reach xfade with the same pixel format, frame
 * rate and timebase — see the FFmpeg xfade documentation). Clips are then
 * folded left to right: a boundary whose left clip has an `out` transition
 * becomes an `xfade`, every other boundary a two-input `concat` (hard cut).
 *
 * @param {Array<{scene_s: number, out?: {dur: number, xfade: string}}>} clips
 *   Clips in playback order; inputs are referenced as `{{in.c<i>}}`.
 * @returns {string[]} ffmpeg arguments ending in `-map [v] {{out.video}}`.
 * @throws {TypeError} When `clips` is empty.
 */
function xfadeSpineArgs(clips) {
  const n = clips.length;
  if (n === 0) {
    throw new TypeError("xfadeSpineArgs: expected at least one clip");
  }
  const inputs = [];
  const filt = [];
  for (let i = 0; i < n; i++) {
    inputs.push("-i", `{{in.c${i}}}`);
    // With a single clip there is no join to produce [v], so the normalized
    // stream takes the final label itself and `-map [v]` still resolves.
    const label = n === 1 ? "v" : `c${i}`;
    filt.push(`[${i}:v]format=yuv420p,fps=30,setsar=1,settb=AVTB[${label}]`);
  }
  let cur = "c0";
  // Running length of the stream joined so far; an xfade starts this far in,
  // minus the overlap it consumes.
  let accLen = clipInputLen(clips[0]);
  for (let k = 0; k < n - 1; k++) {
    const join = clips[k].out;
    const next = `c${k + 1}`;
    // The last join writes the final [v] label; earlier joins use j1, j2, …
    const out = k === n - 2 ? "v" : `j${k + 1}`;
    if (join) {
      const offset = Math.max(0, accLen - join.dur);
      filt.push(
        `[${cur}][${next}]xfade=transition=${join.xfade}:duration=${join.dur.toFixed(3)}:offset=${offset.toFixed(3)}[${out}]`
      );
      // The crossfade overlaps the two streams by `join.dur`.
      accLen = accLen - join.dur + clipInputLen(clips[k + 1]);
    } else {
      filt.push(`[${cur}][${next}]concat=n=2:v=1[${out}]`);
      accLen += clipInputLen(clips[k + 1]);
    }
    cur = out;
  }
  return [...inputs, "-filter_complex", filt.join(";"), "-map", "[v]", "{{out.video}}"];
}
9267
/**
 * Appends the `spine` ffmpeg node that joins every scene clip into one video.
 *
 * Each clip's `ref` is wired to an input named `c<index>`. When there is more
 * than one clip and at least one carries an `out` transition the arguments
 * come from `xfadeSpineArgs`; otherwise from `concatArgs`.
 *
 * @param {Array<{ref: string, out?: object}>} clips - Scene clips in order.
 * @param {Array<object>} nodes - Node list; the spine node is pushed onto it.
 * @returns {string} The reference to the spine's video output.
 */
function buildSpine(clips, nodes) {
  const inputs = clips.reduce((wired, clip, idx) => {
    wired[`c${idx}`] = clip.ref;
    return wired;
  }, {});
  let args;
  if (clips.length > 1 && clips.some((clip) => clip.out)) {
    args = xfadeSpineArgs(clips);
  } else {
    args = concatArgs(clips.length);
  }
  const spineNode = {
    id: "spine",
    type: "ffmpeg",
    inputs,
    params: { args, outputs: { video: { kind: "video", ext: "mp4" } } }
  };
  nodes.push(spineNode);
  return "$ref:spine.video";
}
8828
9282
  function scaffoldVideoCanvas(input, elementsInput, opts) {
8829
9283
  const blueprint = VideoBlueprint.parse(input);
8830
9284
  const elements = RecurringElements.parse(elementsInput);
@@ -8842,19 +9296,11 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
8842
9296
  params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
8843
9297
  });
8844
9298
  });
8845
- const { tracks: voTracks, sceneTurns } = buildDialogue(blueprint, nodes);
8846
- const clipRefs = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
8847
- const concatInputs = {};
8848
- clipRefs.forEach((ref, i) => {
8849
- concatInputs[`c${i}`] = ref;
8850
- });
8851
- nodes.push({
8852
- id: "spine",
8853
- type: "ffmpeg",
8854
- inputs: concatInputs,
8855
- params: { args: concatArgs(clipRefs.length), outputs: { video: { kind: "video", ext: "mp4" } } }
8856
- });
8857
- let videoRef = "$ref:spine.video";
9299
+ if (opts.actorSheets) applyActorSheets(slots, nodes);
9300
+ const { tracks: ttsTracks, sceneTurns } = buildDialogue(blueprint, nodes);
9301
+ const { clips, voTracks: nativeVoTracks } = buildSceneVisuals(blueprint, slots, opts, nodes, sceneTurns);
9302
+ const voTracks = [...ttsTracks, ...nativeVoTracks];
9303
+ let videoRef = buildSpine(clips, nodes);
8858
9304
  let videoNode = "spine";
8859
9305
  const overlays = blueprint.scenes.flatMap((s) => s.overlays ?? []);
8860
9306
  const floating = blueprint.scenes.flatMap((s) => s.floating_elements ?? []);
@@ -8926,7 +9372,7 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
8926
9372
  metadata: {
8927
9373
  name: "video reproduction",
8928
9374
  description: VIDEO_GUIDE,
8929
- todo: buildVideoTodo(videoReport(input, elementsInput), overlays.length, floating.length, opts),
9375
+ todo: buildVideoTodo(videoReport(input, elementsInput), overlays.length, floating.length, opts, blueprint),
8930
9376
  // The timing plan `baker canvas validate` checks before any billed render:
8931
9377
  // sequenced voiceover turns (no overlap), audio ≈ video length, and which
8932
9378
  // scenes must be lip-synced.
@@ -8941,60 +9387,162 @@ function buildVideoMeta(blueprint, sceneTurns) {
8941
9387
  const talking_scenes = [];
8942
9388
  for (const [scene, turns] of [...sceneTurns.entries()].sort((a, b) => a[0] - b[0])) {
8943
9389
  for (const t of turns) {
8944
- vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
8945
- }
8946
- if (turns.filter((t) => t.onCamera).length === 1) {
8947
- talking_scenes.push({ scene, lipsync_node: `s${scene}_lipsync` });
9390
+ if (t.ttsId) vo_segments.push({ slot: t.ttsId, start_s: t.start_s, end_s: t.end_s, scene, speaker: t.speaker });
9391
+ }
9392
+ const nativeTurn = turns.find((t) => t.native);
9393
+ if (nativeTurn) {
9394
+ const sceneObj = blueprint.scenes[scene];
9395
+ talking_scenes.push({
9396
+ scene,
9397
+ voice_convert_node: `s${scene}_voconv`,
9398
+ scene_s: sceneObj ? Math.round(sceneDurationS(sceneObj) * 100) / 100 : void 0,
9399
+ est_speech_s: Math.round(estSpeechS(nativeTurn.text) * 100) / 100
9400
+ });
8948
9401
  }
8949
9402
  }
8950
9403
  return {
8951
9404
  duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
8952
9405
  vo_segments,
8953
- talking_scenes
9406
+ talking_scenes,
9407
+ motion_board: buildMotionBoard(blueprint, sceneTurns)
8954
9408
  };
8955
9409
  }
9410
/**
 * Builds the per-scene motion board: for every scene its index, narrative
 * role, time window, storyboard frame ids, the spoken text of its turns and
 * the graphics (text overlays and floating elements) scheduled in it, ordered
 * by start time.
 *
 * @param {object} blueprint - Parsed video blueprint; `blueprint.scenes` is read.
 * @param {Map<number, Array<object>>} sceneTurns - Dialogue turns keyed by scene index.
 * @returns {Array<object>} One board entry per scene.
 */
function buildMotionBoard(blueprint, sceneTurns) {
  const toCenti = (value) => Math.round(value * 100) / 100;
  // End of the previous scene; used as the start of a scene that has no start_s.
  let timelineCursor = 0;
  return blueprint.scenes.map((scene, sceneIdx) => {
    const sceneStart = scene.start_s ?? timelineCursor;
    const sceneEnd = scene.end_s ?? sceneStart + sceneDurationS(scene);
    timelineCursor = sceneEnd;

    const spokenText = (sceneTurns.get(sceneIdx) ?? [])
      .map((turn) => turn.text?.trim())
      .filter((line) => Boolean(line))
      .join(" ");

    // A list that fails schema validation contributes no graphics.
    const parsedOverlays = z3.array(Overlay).safeParse(scene.overlays ?? []);
    const parsedFloats = z3.array(FloatingElement).safeParse(scene.floating_elements ?? []);
    const overlayItems = parsedOverlays.success ? parsedOverlays.data : [];
    const floatItems = parsedFloats.success ? parsedFloats.data : [];

    const asTextGraphic = (ov) => ({
      kind: "text",
      at_s: toCenti(ov.appears_at_s ?? sceneStart),
      dur_s: toCenti(ov.duration_s ?? 2.5),
      position: ov.position ?? "bottom_center",
      text: ov.text?.trim()
    });
    const asFloatGraphic = (fe) => ({
      kind: "graphic",
      at_s: toCenti(fe.appears_at_s ?? sceneStart),
      dur_s: toCenti(fe.duration_s ?? 2.5),
      position: fe.position ?? "bottom_center",
      label: fe.brand_name || fe.what_it_represents || fe.description || fe.kind || "element"
    });

    // Overlays whose text is empty after trimming are left out.
    const textGraphics = overlayItems.filter((ov) => ov.text?.trim()).map((ov) => asTextGraphic(ov));
    const graphics = textGraphics.concat(floatItems.map((fe) => asFloatGraphic(fe)));
    graphics.sort((a, b) => a.at_s - b.at_s);

    return {
      scene: sceneIdx,
      role: scene.narrative_role?.trim() || inferNarrativeRole(sceneIdx, blueprint.scenes.length),
      window_s: [toCenti(sceneStart), toCenti(sceneEnd)],
      storyboard_frames: [`s${sceneIdx}_start`, `s${sceneIdx}_end`],
      spoken: spokenText || null,
      graphics
    };
  });
}
8956
9446
  var VIDEO_GUIDE = [
8957
- "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192 optional lip-sync \u2192 concatenated. On-screen text is a separate HTML layer you paint; audio is sequenced voiceover + SFX + a ducked music bed, normalized stereo. It is a DRAFT: edit it, supply the real assets, then validate and run.",
9447
+ "Scaffolded by `baker canvas scaffold-video` \u2014 a runnable reproduction of your reference video. Per scene: two AI-generated CLEAN-PLATE frames (no baked text) \u2192 a clip \u2192 trimmed to the real scene length so the picture stays on the audio timeline \u2192 concatenated. Talking heads are voiced NATIVELY by Seedance (lips+voice generated together) then re-voiced to one brand voice; off-camera narration is sequenced tts. On-screen text/graphics are a separate HTML overlay layer you paint; audio is the voiceover + SFX + a ducked music bed, normalized stereo. It is a DRAFT: edit it, supply the real assets, then validate and run.",
8958
9448
  "",
8959
9449
  "WHAT TO DO NEXT:",
9450
+ "0. RE-CRAFT THE SCRIPT FIRST (don't clone). This reference already won in-market, but copying a video is much harder than a static: the hook is targeting and may not transfer, and the message must become TRUE for our brand. Work the `metadata.todo.script_recraft` checklist \u2014 for each scene judge its role (hook/body/CTA), decide keep/cut/reorder/replace, and re-author every line for OUR customer's pain + OUR offer. See `references/script-craft.md` (hook/body/CTA framework) and the `meta-ads-playbook` skill. Most of the work lives here.",
8960
9451
  "1. Edit each frame's prompt IN PLACE. Every `s<i>_start` / `s<i>_end` node has its OWN self-contained `params.prompt` (the FRAME DESCRIPTION) \u2014 editing one changes only that frame. Rewrite the cast, product, claims, palette into the ad you want.",
9452
+ "1b. STORYBOARD FIRST \u2014 align the look on the cheap stills before clips bill. The boundary frames ARE your storyboard; `metadata.video.motion_board` lays out each scene's frames, time window, spoken line, and the graphics scheduled in it. Lock the frames + check each graphic lands on its spoken beat, THEN run the clips (images are cheap, videos aren't; the cache re-bills only what you change). See references/video-flow.md.",
8961
9453
  "2. Drop ONE real source image at each `el_*` ingest `[TODO]` path. Each recurring element (person/product/logo) is reused across every frame it appears in, so the same identity stays consistent. `baker canvas run` REFUSES to start until every `[TODO]` slot holds a real source \u2014 so this is mandatory, not optional.",
8962
- "3. Confirm the `voice_select` casting (one per speaker). Voiceover is SEQUENCED: each contiguous same-speaker turn is its own `tts` placed at its real start, so dialogue alternates instead of stacking. Edit a turn's `params.text` (punctuation / ALL-CAPS / line breaks are read verbatim by eleven_v3 for emphasis and pauses) to shape delivery; re-author the words to be TRUE for your brand.",
8963
- "4. Lip-sync: scenes with a single on-camera speaker route their clip through `video_lipsync` (~20 cr each) so the mouth matches the line. Two-speaker scenes are left un-synced \u2014 split them or pick a primary speaker if you want sync. Drop the node to skip.",
8964
- "5. Overlays are REAL HTML you paint. Open `video-overlay-composition/index.html`: the reference's overlays are seeded inside `#overlay-root` as plain elements (text + a `.pos-*` class + `data-start`/`data-dur`). Restyle the CSS freely \u2014 build lower-thirds, a ticker, whatever the look needs \u2014 and replace a logo placeholder with a real `<img>` you drop in that dir. The runtime only shows/hides by timestamp; it makes no styling decisions. Drop `brand-bold.otf` / `brand-regular.otf` there for on-brand type.",
8965
- "6. `baker canvas validate` (proves audio/lip-sync timing for free) then `baker canvas run` (generates many billed image/video/audio assets \u2014 not free).",
9454
+ "3. Talking heads are NATIVE: a scene with one on-camera speaker is voiced by Seedance itself (the line is in the clip's prompt + `generate_audio`), so lips and voice are generated together \u2014 no separate tts, no post-hoc lip-sync. Edit the line in the scene's `s<i>_clip` prompt to re-author the words TRUE for your brand. Off-camera narration scenes still use a sequenced `tts` per turn.",
9455
+ "4. Voice consistency: every native talking clip's audio is re-voiced to ONE brand voice via `audio_voice_convert` (timing preserved \u2192 lips stay matched). Confirm the `voice_select` casting (one per speaker) \u2014 its `voice_id` is the brand voice; set its gender/language so the voice matches the creator.",
9456
+ "5. Overlays are REAL HTML you paint. Open `video-overlay-composition/index.html`: the reference's overlays are seeded inside `#overlay-root` as plain elements (text + a `.pos-*` class + `data-start`/`data-dur`). Restyle the CSS freely \u2014 build lower-thirds, a ticker, whatever the look needs \u2014 and replace a logo placeholder with a real `<img>` you source (`baker images icon/sticker/gif/logo`) and drop in that dir. The runtime only shows/hides by timestamp; it makes no styling decisions. Drop `brand-bold.otf` / `brand-regular.otf` there for on-brand type.",
9457
+ "6. `baker canvas validate` (proves native-audio + timing for free) then `baker canvas run` (generates many billed image/video/audio assets \u2014 not free).",
9458
+ "",
9459
+ "CRAFT \u2014 raises every clip's realism (one-liners; full rationale in references/video-craft.md):",
9460
+ "- Iterate cheaply, do NOT brute-force takes: credits are scarce. Strong inputs (locked reference + a precise move-by-move prompt) make 1\u20132 takes land. When a take misses, change the SMALLEST thing and re-run \u2014 the cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
9461
+ "- Keep clips SHORT (trust the scene-length trim \u2014 don't pad) and LOCK THE CAMERA: unmotivated AI drift is the top tell.",
9462
+ "- One generation = one speaking voice \u2014 Seedance can't voice two on-camera speakers in one clip; split a two-speaker beat into separate scenes (or one stays silent). Lip-sync follows the line verbatim; emotion in [brackets].",
9463
+ "- Preview cheap \u2192 finalize high-res (never prompt the aspect ratio in prose \u2014 the pipeline sets it). Match-cut continuous action by setting scene N+1's start frame = scene N's end frame (costs no extra gens).",
9464
+ "- Nail the pack shot (final product hero \u2014 motivated move, matched light). Geometry drifting? drop a high-angle SCHEMATIC still as an extra reference (a map beats a paragraph). Before/after (dry\u2192wet) = two separate locked references, not words mid-scene.",
9465
+ "- The hook reads SOUND-OFF: scene 1's frame + overlay carry it in ~1s \u2014 don't bury the hook in the spoken line (meta-ads-playbook \xA739/\xA748).",
8966
9466
  "",
8967
- "Tip: `prompt.json` is the deconstruction provenance + the demoted GLOBAL STYLE REFERENCE each frame reads for shared palette/cast cohesion. It is NOT the per-frame editing surface \u2014 the frame nodes are."
9467
+ "Tip: `prompt.json` is the deconstruction provenance + the demoted GLOBAL STYLE REFERENCE each frame reads for shared palette/cast cohesion. It is NOT the per-frame editing surface \u2014 the frame nodes are.",
9468
+ "Production craft: references/video-craft.md. Hook/message/script-structure layer: references/script-craft.md + the meta-ads-playbook skill."
8968
9469
  ].join("\n");
8969
- function buildVideoTodo(report, overlayCount, floatingCount, opts) {
9470
/**
 * Guesses a scene's narrative role from its position alone.
 *
 * @param {number} index - Zero-based scene index.
 * @param {number} total - Number of scenes.
 * @returns {"hook" | "cta" | "body"} "hook" for the first of several scenes,
 *   "cta" for the last scene (including a lone scene), "body" otherwise.
 */
function inferNarrativeRole(index, total) {
  const opensMultiSceneVideo = index === 0 && total > 1;
  if (opensMultiSceneVideo) {
    return "hook";
  }
  return index === total - 1 ? "cta" : "body";
}
9475
/**
 * Builds the per-scene script re-craft checklist.
 *
 * For each scene it records the scene index, its role (the trimmed
 * `narrative_role`, or a position-based guess when that is missing or blank),
 * the reference's spoken lines joined with spaces (null when there are none)
 * and a placeholder instruction to rewrite that role.
 *
 * @param {object} blueprint - Parsed video blueprint; `blueprint.scenes` is read.
 * @returns {Array<{scene: number, role: string, original_line: string | null, recraft: string}>}
 */
function buildScriptRecraft(blueprint) {
  const sceneCount = blueprint.scenes.length;
  return blueprint.scenes.map((scene, sceneIdx) => {
    const role = scene.narrative_role?.trim() || inferNarrativeRole(sceneIdx, sceneCount);
    const spokenLines = [];
    (scene.dialogue ?? []).forEach((entry) => {
      const trimmed = entry.line?.trim();
      if (trimmed) spokenLines.push(trimmed);
    });
    const referenceText = spokenLines.join(" ");
    return {
      scene: sceneIdx,
      role,
      original_line: referenceText === "" ? null : referenceText,
      recraft: `[RECRAFT: rewrite this ${role} for OUR brand \u2014 true claims only; do NOT render the reference's words. See references/script-craft.md + meta-ads-playbook.]`
    };
  });
}
9488
+ function buildVideoTodo(report, overlayCount, floatingCount, opts, blueprint) {
8970
9489
  return {
9490
+ recraft_the_script_first: "VIDEO IS INSPIRATION, NOT A CLONE. Before rendering, re-craft the script (hook \u2192 body \u2192 CTA) for OUR brand \u2014 the reference already won in-market, but its hook is targeting and may not transfer. Work the per-scene `script_recraft` checklist below; see references/script-craft.md + the meta-ads-playbook skill.",
9491
+ script_recraft: buildScriptRecraft(blueprint),
8971
9492
  edit_frames_in_place: "Each s<i>_start / s<i>_end node has its own editable params.prompt (FRAME DESCRIPTION). Edit per frame; the blueprint is only a shared style reference. Frames are CLEAN PLATES \u2014 they render no on-screen text; all text is the overlay HTML layer.",
8972
9493
  frames_mode: opts.frames ?? "generate",
9494
+ review_storyboard_before_clips: "STORYBOARD FIRST. The per-scene boundary frames (s<i>_start / s<i>_end) ARE your storyboard \u2014 align the LOOK on them before clips bill. Images are cheap, videos aren't, and the content-addressed cache re-bills only changed nodes, so iterate prompts here first and let the storyboard lock before the video_generate clips run. `metadata.video.motion_board` lists each scene's frames, window, spoken line, and the graphics scheduled in it \u2014 walk it scene by scene.",
9495
+ motion_board: "`metadata.video.motion_board` maps which overlay/graphic fires on which spoken beat (per scene: window_s, spoken line, each graphic's at_s). Review it so every graphic lands on the word it punctuates \u2014 don't let an overlay drift off its beat. Adjust an overlay's data-start in video-overlay-composition/index.html (or appears_at_s in prompt.json) to retime it.",
8973
9496
  assets_required: "MANDATORY: drop a real image at every el_* [TODO] ingest before running \u2014 `baker canvas run` refuses to start (and bills nothing) until each placeholder holds a real source.",
9497
+ sourcing: 'Resolve each el_* [TODO]: (1) the client\'s OWN assets first \u2014 `baker images library "<subject>"` (e.g. the company owner as the actor) and `baker images logo <domain>` for the brand mark; (2) otherwise search a FRESH real reference \u2014 `baker images find "<subject>" --sources pinterest` (Pinterest is photo-real/candid, best for people & sets) or generate one with the image model. el_creator_* and el_location are [SOURCE FRESH]: cast a DIFFERENT person/animal/set than the original ad \u2014 never the source\'s individual.',
8974
9498
  recurring_elements_to_supply: report.elements,
8975
9499
  text_strategy: "Decide per ad: text is either baked by the generated creative OR painted via the overlay HTML \u2014 not both. Default here is clean text-free frames + the HTML overlay layer (video-overlay-composition/index.html) as the single text source, which you fully control.",
8976
- timeline: "Automatic: each clip is generated at >= its scene length then trimmed back to the real scene duration, so the concatenated picture stays on the same timeline as the absolute-timed audio (this is what makes the lips line up). You don't manage it.",
9500
+ timeline: "Automatic: each clip is generated at >= its scene length then trimmed back to the real scene duration, so the concatenated picture stays on the same timeline as the audio. You don't manage it.",
8977
9501
  voices_to_confirm: report.dialogue.map((d) => ({
8978
9502
  scene: d.scene,
8979
9503
  speaker: d.speaker,
8980
9504
  voice_description: d.voice_description,
8981
9505
  line: d.line
8982
9506
  })),
8983
- voiceover_note: "Sequenced: one tts per contiguous same-speaker TURN, placed at its real start_s so turns alternate (no parallel monologues); same voice locked via voice_select.voice_id. Edit a turn's params.text (punctuation / ALL CAPS / line breaks read verbatim) to shape delivery.",
8984
- lip_sync_note: "Scenes with a single on-camera speaker route their clip through video_lipsync (~20 cr each) so the mouth matches the line. Two-speaker scenes are left un-synced (one track can't drive two faces) \u2014 split or pick a primary. `baker canvas validate` checks every talking scene is synced.",
9507
+ talking_head_note: "NATIVE: a single-on-camera-speaker scene is voiced by Seedance itself (line in s<i>_clip prompt + generate_audio) so lips+voice are generated together \u2014 no tts, no veed-lipsync. Edit the line in the clip's prompt to re-author it.",
9508
+ voice_note: "Every native talking clip's audio is re-voiced to ONE brand voice via audio_voice_convert (eleven_multilingual_sts_v2), timing preserved so lips stay matched. voice_select.voice_id is that brand voice \u2014 set its gender/language to match the creator. Off-camera narration uses a sequenced tts per turn.",
9509
+ native_timing: "Seedance paces the spoken line to fill the clip, so each native talking clip is generated long enough for the estimated speech and its audio is kept full-length (not hard-trimmed to the visual scene) \u2014 the line is never cut mid-word; the voice may continue a beat past the visual cut (natural VO continuity). `metadata.video.talking_scenes` carries each scene's scene_s vs est_speech_s. If a rendered line still sounds clipped, the line is simply longer than the scene: shorten the line or lengthen the scene in the deconstruct.",
9510
+ craft: {
9511
+ note: "Production-craft principles that raise every clip's realism. Full rationale: references/video-craft.md (production craft); references/script-craft.md + meta-ads-playbook for the hook/message layer.",
9512
+ principles: [
9513
+ "Iterate cheaply \u2014 do NOT brute-force takes. Credits are scarce. Strong inputs (a locked reference + a precise move-by-move prompt) make 1\u20132 takes land; bad input = bad output. When a take misses, change the SMALLEST thing and re-run \u2014 the content-addressed cache re-bills only that node. Images are cheap, videos aren't: fix on the frame/reference, not by re-rolling clips.",
9514
+ "Keep clips SHORT \u2014 the longer a shot holds, the more the eye catches the AI tell. Trust the scene-length trim; don't pad.",
9515
+ "LOCK THE CAMERA \u2014 a first/last-frame clip holds the framing the two frames define; only move when a move is specified. Unmotivated camera drift is the top realism tell.",
9516
+ "One generation = one speaking voice. Seedance can't voice two on-camera speakers in one clip \u2014 split a two-speaker beat into separate scenes (or one stays silent). Fragment long dialogue at a natural pause and stitch via the shared boundary frame. Lip-sync follows the line verbatim; delivery cues go in [brackets].",
9517
+ "Preview cheap \u2192 finalize high-res (credit discipline). Never prompt the aspect ratio in prose ('make it vertical' \u2192 stretch artifacts) \u2014 the pipeline sets aspect_ratio as a param.",
9518
+ "Match-cut continuous action across a cut by reusing the boundary frame: scene N+1's start frame = scene N's end frame (a composition choice, costs no extra gens).",
9519
+ "Nail the PACK SHOT \u2014 the final product hero sells (motivated camera move, light matched to the rest of the ad).",
9520
+ "Geometry/objects drifting frame to frame? A MAP beats a paragraph \u2014 drop a high-angle schematic still (marked positions) as an extra el_*/reference rather than adding more words.",
9521
+ "Before/after (dry\u2192wet, skeptic\u2192believer) = two SEPARATE locked reference images \u2014 don't ask words to transform a subject mid-scene (cheaper and more reliable than re-rolling clips).",
9522
+ "The hook is VISUAL-FIRST: the feed plays muted, so scene 1's opening frame + its overlay text must read sound-off in ~1 second \u2014 don't bury the hook in the spoken line alone (meta-ads-playbook \xA739 visual-hooks-beat-audio, \xA748 the 1-second/feed-native rule)."
9523
+ ]
9524
+ },
9525
+ transitions: "Scene-to-scene cuts the deconstruct flagged as fade/whip/zoom/dissolve/swipe are reproduced as an ffmpeg xfade at the boundary (everything else stays a hard cut). The overlap is consumed from extra generated footage, so the picture stays exactly on the audio timeline. To change a transition, edit the scene's `transition_out.type` in prompt.json and re-scaffold, or hand-edit the `spine` node's ffmpeg args.",
8985
9526
  text_overlays: {
8986
9527
  count: overlayCount,
8987
9528
  note: "Seeded as editable HTML inside `#overlay-root` in video-overlay-composition/index.html (text + a .pos-* class + data-start/data-dur). PAINT it: restyle the CSS, build lower-thirds/tickers, drop brand-*.otf for on-brand type. The runtime only shows/hides by timestamp."
8988
9529
  },
8989
9530
  floating_elements: {
8990
9531
  count: floatingCount,
8991
- note: floatingCount > 0 ? "Seeded as labeled placeholders in index.html \u2014 replace each with a real <img> you drop into video-overlay-composition/. Recurring logos are also handled well as an el_* element baked into frames." : "none detected"
9532
+ note: floatingCount > 0 ? "Seeded as commented <img> stubs in index.html (each names the `baker images icon/sticker/gif/logo` command to source it) \u2014 source the asset, drop it in video-overlay-composition/, uncomment the <img>." : "none detected by the deconstruct \u2014 see `completeness_check`, the reference may still have icons/stickers it missed."
9533
+ },
9534
+ sound_effects: {
9535
+ count: report.sfx_count,
9536
+ note: report.sfx_count > 0 ? "Seeded as `sound_effect` nodes on `audio_mix` at their timestamps \u2014 edit the prompt or retime." : "none detected by the deconstruct \u2014 see `completeness_check`, the reference may still have sound cues it missed."
8992
9537
  },
8993
- sound_effects: { count: report.sfx_count },
8994
9538
  music: {
8995
9539
  present: report.has_music,
8996
- note: report.has_music ? "Original bed regenerated from the deconstruct prompt (styled after the AudD-identified track when available); ducked under the voices." : "no music bed scaffolded"
9540
+ note: report.has_music ? "Original bed regenerated from the deconstruct prompt (styled after the AudD-identified track when available); ducked under the voices." : "no music bed scaffolded \u2014 if the reference has music, see `completeness_check`."
8997
9541
  },
9542
+ // ALWAYS-ON safety net: the scaffold can only seed what the deconstruct
9543
+ // cataloged, and it under-detects on-image graphics + sound cues. Never trust
9544
+ // "none detected" — re-watch the reference and fill the gaps with the right tool.
9545
+ completeness_check: 'The scaffold mirrors the deconstruct\'s catalog, which UNDER-DETECTS \u2014 never trust a 0 count. Re-watch the reference frame-by-frame and add anything missing: (1) ON-IMAGE GRAPHICS not in floating_elements (dollar/coin icons, emojis, checkmarks, rating stars, price tags, arrows, progress bars, app UI) \u2192 source each with `baker images icon "<desc>"` / `baker images sticker` / `baker images gif` / `baker images logo <domain>` and add it as an <img class="ov pos-* " data-start data-dur> in video-overlay-composition/index.html (NEVER bake graphics into the frame plates). (2) SOUND CUES not in sound_effects (cha-ching/coin, whoosh, ding, pop, notification, keyboard) \u2192 add a `sound_effect` node (eleven_text_to_sound) and wire it onto `audio_mix` at its timestamp. (3) RECURRING people/animals/products/logos/sets with no el_* slot \u2192 add an `ingest` [TODO] slot and reference it from the frames they appear in. (4) Burned-in captions/text not in text_overlays \u2192 add an <img>-free <div class="ov"> in index.html. (5) ONE person playing MULTIPLE personas/wardrobes (skeptic vs believer, before vs after, two outfits) collapsed into a single el_* slot \u2192 split into one el_* slot PER look, each linked as the SAME individual via `same_as` so every outfit has its own reference image but the face/identity stays identical.',
8998
9546
  scenes_clamped_to_15s: report.clamped_scenes,
8999
9547
  run_warning: "`baker canvas run` generates many billed image/video/audio assets \u2014 validate first, it is not free."
9000
9548
  };
@@ -9072,10 +9620,13 @@ List ONLY the elements worth keeping consistent across frames \u2014 the ones a
9072
9620
  - a showcased product, package, card, or device the ad sells or demonstrates -> type "product"
9073
9621
  - the advertiser brand logo/wordmark (from global.branding) -> type "logo"
9074
9622
  - a recurring trust/rating/certification badge -> type "badge"
9623
+ - the dominant recording set/location the scenes share (e.g. the same living room, car interior, kitchen) -> type "location"
9075
9624
 
9076
- DROP one-off background extras, incidental props, and generic scenery. A person in global.cast is almost always recurring. Keep at most ~8.
9625
+ DROP one-off background extras and incidental props \u2014 but the shared set/location is NOT generic scenery: pin it as ONE "location" element so the room stays identical across scenes. A person in global.cast is almost always recurring. Keep at most ~8.
9077
9626
 
9078
- For each kept element return: { "type": one of person|animal|product|logo|badge, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset (for a person/animal repeat the exact look from global.cast verbatim), "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "scenes": the 0-based indices of EVERY scene the element is visually present in \u2014 read each scene's action_detail, its start_frame/end_frame subjects, its dialogue speaker, and its floating_elements. Be generous: if the same person narrates throughout, list every scene index. Output ONLY the JSON object.`;
9627
+ ONE PERSON, MULTIPLE LOOKS: if a single individual plays MULTIPLE personas or wardrobes (e.g. a creator as a skeptic in one outfit and a believer in another, or a before/after), emit ONE element PER look (so each outfit gets its own real reference image) and link the extra looks to the first via "same_as" \u2014 they are the SAME person, only the wardrobe/state differs, so the face must stay identical.
9628
+
9629
+ For each kept element return: { "type": one of person|animal|product|logo|badge|location, "label": a short UPPER_SNAKE_CASE name (e.g. HERO, CREATOR_SKEPTIC, INSURANCE_CARD, LOGO), "description": a concrete reusable description to source/shoot the real asset \u2014 for a person/animal give a NEUTRAL castable role (e.g. "hero pet-owner, woman in her 30s" or "a small beagle"), NOT the original individual's literal face/identity: we RECAST with a FRESH person/animal, so never tell the agent to reuse the original. "expression": a living subject's typical expression or null, "cast_id": the global.cast id if it maps to one else null, "same_as": the label of another element this is the SAME individual as (different wardrobe/persona) else null, "scenes": the 0-based indices of EVERY scene the element is visually present in \u2014 read each scene's action_detail, its start_frame/end_frame subjects, its dialogue speaker, and its floating_elements. Be generous: if the same person narrates throughout, list every scene index. Output ONLY the JSON object.`;
9079
9630
  async function loadAssetText2(ref, label) {
9080
9631
  const r = ref;
9081
9632
  if (typeof r?.path === "string") return readFile4(r.path, "utf8");
@@ -9169,6 +9720,14 @@ var scaffoldVideoCommand = defineCommand76({
9169
9720
  file: { type: "positional", required: true, description: "Path to the reference video" },
9170
9721
  out: { type: "string", description: "Output canvas path (default <video-dir>/<name>.video.canvas.json)" },
9171
9722
  frames: { type: "string", description: '"generate" (default, anchored regen) or "reuse" (wire real frames in)' },
9723
+ ambient: {
9724
+ type: "boolean",
9725
+ description: "Give silent b-roll scenes native diegetic ambient mixed deep under the music bed (off by default)"
9726
+ },
9727
+ "actor-sheets": {
9728
+ type: "boolean",
9729
+ description: "Lock a recast person/animal that recurs across \u22652 scenes to ONE turnaround sheet grounding every frame"
9730
+ },
9172
9731
  "max-scenes": { type: "string", description: "Cap the number of scenes the deconstruct emits" },
9173
9732
  language: { type: "string", description: "Transcript/dialogue language hint (e.g. fr, en)" },
9174
9733
  focus: { type: "string", description: "Known provenance/emphasis to ground the deconstruct" },
@@ -9220,7 +9779,9 @@ var scaffoldVideoCommand = defineCommand76({
9220
9779
  videoModel,
9221
9780
  overlayCompositionPath: compositionDest,
9222
9781
  blueprintPath,
9223
- frames
9782
+ frames,
9783
+ ambient: Boolean(args.ambient),
9784
+ actorSheets: Boolean(args["actor-sheets"])
9224
9785
  };
9225
9786
  let canvas;
9226
9787
  let report;
@@ -10511,7 +11072,7 @@ registerSchema({
10511
11072
  query: { type: "string", description: "Search query", required: true },
10512
11073
  sources: {
10513
11074
  type: "string",
10514
- description: "Comma-separated providers: library,magnific,google,iconify,giphy (brandfetch lives at `baker images logo`)",
11075
+ description: "Comma-separated providers: library,magnific,google,iconify,giphy,pinterest (brandfetch lives at `baker images logo`)",
10515
11076
  required: false
10516
11077
  },
10517
11078
  limit: { type: "number", description: "Max results per group", required: false, default: 20 },
@@ -14600,6 +15161,317 @@ Examples:
14600
15161
  }
14601
15162
  });
14602
15163
 
15164
+ // src/commands/winning-ads/index.ts
15165
+ import { defineCommand as defineCommand140 } from "citty";
15166
+
15167
+ // src/commands/winning-ads/advertisers.ts
15168
+ import { defineCommand as defineCommand138 } from "citty";
15169
// Registers the argument schema for the `winning-ads.advertisers` command.
{
  const advertisersSchemaArgs = {};
  advertisersSchemaArgs.query = {
    type: "string",
    description: "Brand / advertiser name to match (case-insensitive)",
    required: false
  };
  advertisersSchemaArgs.limit = {
    type: "number",
    description: "Max results (default 20)",
    required: false,
    default: 20
  };
  advertisersSchemaArgs.platform = {
    type: "string",
    description: "Filter to a single platform",
    required: false
  };
  registerSchema({
    command: "winning-ads.advertisers",
    description: "Resolve a brand name to advertiser_id(s) in the ad-dna corpus \u2014 to find your OWN advertiser (to --exclude-advertiser) or a competitor (to --advertiser-id).",
    args: advertisersSchemaArgs
  });
}
15178
/**
 * Pass-through normalizer: hands the record back exactly as received.
 *
 * @param {*} record - Any value.
 * @returns {*} The very same value (same reference, no copy).
 */
function identity(record) {
  const untouched = record;
  return untouched;
}
15181
/**
 * `baker winning-ads advertisers` — looks up advertisers by brand name via
 * GET /api/winning-ads/advertisers and prints the result.
 *
 * With `--output json` (the default) the raw API payload is written as-is;
 * any other output format renders only the `advertisers` array.
 * Failures are reported as `{ ok: false, error }` JSON followed by exit code 1.
 */
var advertisersCommand2 = defineCommand138({
  meta: {
    name: "advertisers",
    description: 'Resolve a brand name to advertiser_id(s). Use it to find your own advertiser for --exclude-advertiser, or a competitor for --advertiser-id. Example: baker winning-ads advertisers "Deel" --output md'
  },
  args: {
    query: { type: "positional", description: "Brand / advertiser name", required: false },
    limit: { type: "string", description: "Max results (default 20)", required: false },
    platform: { type: "string", description: "Filter to a single platform", required: false },
    output: { type: "string", description: "Output format: json|files|md", required: false, default: "json" },
    fields: { type: "string", description: "Comma-separated field names to include", required: false }
  },
  run: async ({ args }) => {
    try {
      // Only forward the query-string parameters the caller actually supplied.
      const params = {};
      if (args.query) {
        params.q = args.query;
      }
      if (args.limit) {
        params.limit = String(args.limit);
      }
      if (args.platform) {
        params.platform = String(args.platform);
      }

      const data = await apiGet("/api/winning-ads/advertisers", params);
      const format = args.output || "json";

      if (format === "json") {
        writeJson({ ok: true, data });
        return;
      }

      // Non-JSON formats render the advertiser rows only.
      const advertisers = Array.isArray(data?.advertisers) ? data.advertisers : [];
      const selectedFields = args.fields ? args.fields.split(",") : void 0;
      writeOutput({ ok: true, data: advertisers }, format, selectedFields, false, identity);
    } catch (err) {
      if (err instanceof ApiError) {
        const { code, message } = err;
        writeJson({ ok: false, error: { code, message } });
        process.exit(1);
      }
      writeJson({ ok: false, error: { code: "INTERNAL_ERROR", message: "Unexpected error" } });
      process.exit(1);
    }
  }
});
15230
+
15231
+ // src/commands/winning-ads/search.ts
15232
+ import { defineCommand as defineCommand139 } from "citty";
15233
// Registers the argument schema for the `winning-ads.search` command.
// Every argument is optional; the two local builders keep the descriptors uniform.
{
  const optionalString = (description) => ({ type: "string", description, required: false });
  const optionalNumber = (description, fallback) => {
    const descriptor = { type: "number", description, required: false };
    if (fallback !== void 0) {
      descriptor.default = fallback;
    }
    return descriptor;
  };

  registerSchema({
    command: "winning-ads.search",
    description: "Search the ad-dna corpus of scored winning ads. Returns a lean shortlist (advertiser, summary, scores, media_url) to pick a reference to reproduce.",
    args: {
      query: optionalString("Free-text natural-language query"),
      "ref-ad-id": optionalString("Find ads similar to this ad id (instead of free text)"),
      limit: optionalNumber("Max results 1-100 (default 10)", 10),
      "max-per-advertiser": optionalNumber("Cap results per advertiser 1-50 (default 3)", 3),
      "min-relevance": optionalNumber("Relevance floor 0-1; trims weak matches"),
      platform: optionalString(
        "Comma list: meta,tiktok,linkedin,google_search,google_display,youtube,reddit,x,pinterest,snapchat"
      ),
      format: optionalString("Comma list: video,static,carousel"),
      "winner-category": optionalString("Comma list: winner,scaled_winner,evergreen,rising,untested,dud,\u2026"),
      awareness: optionalString("Comma list: unaware,problem_aware,solution_aware,product_aware,most_aware"),
      country: optionalString("Comma list of country codes"),
      language: optionalString("Comma list of language codes"),
      "advertiser-id": optionalString("Restrict to these advertiser ids (browse one brand's winners)"),
      "exclude-advertiser": optionalString(
        "Drop these advertiser ids \u2014 your own brand + already-used references"
      ),
      "first-seen-after": optionalString("ISO datetime; only ads first seen after this"),
      "first-seen-before": optionalString("ISO datetime; only ads first seen before this")
    }
  });
}
15291
/**
 * Turns a comma-separated flag value into a clean list.
 *
 * @param {string | undefined | null} value - Raw flag text such as "meta, tiktok".
 * @returns {string[]} Trimmed, non-empty items in their original order;
 *   an empty array when `value` is falsy.
 */
function splitList(value) {
  const items = [];
  if (value) {
    for (const piece of value.split(",")) {
      const trimmed = piece.trim();
      if (trimmed) {
        items.push(trimmed);
      }
    }
  }
  return items;
}
15297
/**
 * Writes `value`, coerced with `Number()`, onto `body[key]`.
 * Leaves `body` untouched when `value` is `undefined` or the empty string.
 *
 * @param {object} body - Object to mutate.
 * @param {string} key - Property name to assign.
 * @param {string | number | undefined} value - Raw flag value.
 * @returns {void}
 */
function setNumber(body, key, value) {
  const isAbsent = value === void 0 || value === "";
  if (isAbsent) {
    return;
  }
  body[key] = Number(value);
}
15302
/**
 * Parses a comma-separated flag with `splitList` and stores the resulting
 * array on `target[key]`, but only when at least one item survives.
 *
 * @param {object} target - Object to mutate.
 * @param {string} key - Property name to assign.
 * @param {string | undefined} value - Raw comma-separated flag value.
 * @returns {void}
 */
function setList(target, key, value) {
  const entries = splitList(value);
  if (entries.length === 0) {
    return;
  }
  target[key] = entries;
}
15308
/**
 * Copies `value` onto `target[key]` when it is truthy; otherwise does nothing.
 *
 * @param {object} target - Object to mutate.
 * @param {string} key - Property name to assign.
 * @param {string | undefined} value - Flag value to store unchanged.
 * @returns {void}
 */
function setString(target, key, value) {
  if (!value) {
    return;
  }
  target[key] = value;
}
15313
/**
 * Maps the camelCase CLI arguments onto the snake_case search request body.
 *
 * Top-level keys: `free_text_query`, `ref_ad_id`, `limit`,
 * `max_per_advertiser`, `min_relevance`. All filter flags are grouped under
 * `hard_filters`, which is attached only when at least one filter was set.
 *
 * @param {object} args - Collected flag values (query, refAdId, limit, ...).
 * @returns {object} Request body containing only the fields that were provided.
 */
function buildSearchBody(args) {
  const body = {};
  if (args.query) {
    body.free_text_query = args.query;
  }
  if (args.refAdId) {
    body.ref_ad_id = args.refAdId;
  }

  const numericFields = [
    ["limit", args.limit],
    ["max_per_advertiser", args.maxPerAdvertiser],
    ["min_relevance", args.minRelevance]
  ];
  for (const [field, raw] of numericFields) {
    setNumber(body, field, raw);
  }

  const hardFilters = {};
  const listFilters = [
    ["platform", args.platform],
    ["format", args.format],
    ["winner_category", args.winnerCategory],
    ["awareness_stage", args.awareness],
    ["country", args.country],
    ["language", args.language],
    ["advertiser_ids", args.advertiserId],
    ["exclude_advertiser_ids", args.excludeAdvertiser]
  ];
  for (const [field, raw] of listFilters) {
    setList(hardFilters, field, raw);
  }
  setString(hardFilters, "first_seen_after", args.firstSeenAfter);
  setString(hardFilters, "first_seen_before", args.firstSeenBefore);

  if (Object.keys(hardFilters).length > 0) {
    body.hard_filters = hardFilters;
  }
  return body;
}
15340
/**
 * `baker winning-ads search` — POSTs a search request to
 * /api/winning-ads/search and prints the matching ads.
 *
 * Flow:
 *   1. Validate the numeric flags (they are declared as free-form strings).
 *   2. Build the request body with `buildSearchBody`; refuse to search when
 *      there is no query, no --ref-ad-id and no filter.
 *   3. `--output json` (default) prints normalized results together with
 *      `pool_size` and `match_confidence`; other formats hand the raw results
 *      and the normalizer to `writeOutput`.
 *
 * Failures are reported as `{ ok: false, error }` JSON followed by exit code 1.
 */
var searchCommand4 = defineCommand139({
  meta: {
    name: "search",
    description: "Search winning reference ads. Example: baker winning-ads search 'B2B SaaS before/after AI automation' --platform meta --format static --winner-category winner --exclude-advertiser adv_123 --output md"
  },
  args: {
    query: { type: "positional", description: "Free-text search query", required: false },
    "ref-ad-id": { type: "string", description: "Find ads similar to this ad id", required: false },
    limit: {
      type: "string",
      description: "Max results 1-100 (default 10 \u2014 a shortlist)",
      required: false,
      default: "10"
    },
    "max-per-advertiser": {
      type: "string",
      description: "Cap results per advertiser 1-50 (default 3)",
      required: false
    },
    "min-relevance": { type: "string", description: "Relevance floor 0-1", required: false },
    platform: {
      type: "string",
      description: "Comma list of platforms (meta,linkedin,tiktok,\u2026) \u2014 search one or many",
      required: false
    },
    format: { type: "string", description: "Comma list of formats (video,static,carousel)", required: false },
    "winner-category": {
      type: "string",
      description: "Comma list of winner categories (default: all)",
      required: false
    },
    awareness: { type: "string", description: "Comma list of awareness stages", required: false },
    country: { type: "string", description: "Comma list of country codes", required: false },
    language: { type: "string", description: "Comma list of language codes", required: false },
    "advertiser-id": { type: "string", description: "Restrict to these advertiser ids", required: false },
    "exclude-advertiser": {
      type: "string",
      description: "Drop these advertiser ids (your own + already-used)",
      required: false
    },
    "first-seen-after": { type: "string", description: "ISO datetime lower bound", required: false },
    "first-seen-before": { type: "string", description: "ISO datetime upper bound", required: false },
    output: { type: "string", description: "Output format: json|files|md", required: false, default: "json" },
    fields: { type: "string", description: "Comma-separated field names to include", required: false },
    full: {
      type: "boolean",
      description: "Include DNA detail (angle, persona, hook) + longevity",
      required: false,
      default: false
    }
  },
  run: async ({ args }) => {
    try {
      // The numeric flags arrive as strings and are later coerced with Number();
      // a typo such as `--limit abc` would otherwise put NaN into the request
      // body. Reject it here with an explicit validation error instead.
      const numericFlags = [
        ["limit", args.limit],
        ["max-per-advertiser", args["max-per-advertiser"]],
        ["min-relevance", args["min-relevance"]]
      ];
      for (const [flag, raw] of numericFlags) {
        if (raw === void 0 || raw === "") {
          continue;
        }
        if (String(raw).trim() === "" || !Number.isFinite(Number(raw))) {
          writeJson({
            ok: false,
            error: {
              code: "VALIDATION_ERROR",
              message: `--${flag} must be a number (got ${JSON.stringify(raw)})`
            }
          });
          process.exit(1);
        }
      }
      const searchArgs = {
        query: args.query,
        refAdId: args["ref-ad-id"],
        limit: args.limit,
        maxPerAdvertiser: args["max-per-advertiser"],
        minRelevance: args["min-relevance"],
        platform: args.platform,
        format: args.format,
        winnerCategory: args["winner-category"],
        awareness: args.awareness,
        country: args.country,
        language: args.language,
        advertiserId: args["advertiser-id"],
        excludeAdvertiser: args["exclude-advertiser"],
        firstSeenAfter: args["first-seen-after"],
        firstSeenBefore: args["first-seen-before"]
      };
      const body = buildSearchBody(searchArgs);
      // `limit` has a default, so its presence alone does not count as search input.
      if (!("free_text_query" in body) && !("ref_ad_id" in body) && !("hard_filters" in body)) {
        writeJson({
          ok: false,
          error: { code: "VALIDATION_ERROR", message: "Provide a query, --ref-ad-id, or a filter flag" }
        });
        process.exit(1);
      }
      const data = await apiPost(
        "/api/winning-ads/search",
        body
      );
      const output = args.output || "json";
      const full = args.full;
      const rawResults = Array.isArray(data?.results) ? data.results : [];
      if (output === "json") {
        // JSON mode normalizes here so pool_size / match_confidence can ride along.
        const results = rawResults.map((r) => winningAdNormalizer(r, full));
        writeJson({
          ok: true,
          data: { results, pool_size: data?.pool_size ?? null, match_confidence: data?.match_confidence ?? null }
        });
        return;
      }
      writeOutput(
        { ok: true, data: rawResults },
        output,
        args.fields ? args.fields.split(",") : void 0,
        full,
        winningAdNormalizer
      );
    } catch (err) {
      if (err instanceof ApiError) {
        writeJson({ ok: false, error: { code: err.code, message: err.message } });
        process.exit(1);
      }
      writeJson({ ok: false, error: { code: "INTERNAL_ERROR", message: "Unexpected error" } });
      process.exit(1);
    }
  }
});
15450
+
15451
+ // src/commands/winning-ads/index.ts
15452
// Parent `winning-ads` command: it has no run handler of its own and only groups
// the `search` and `advertisers` subcommands. `meta.description` carries the
// usage text (auth notes, subcommand list, examples) as one template literal,
// so its line breaks and spacing are part of the runtime string.
+ var winningAdsCommand = defineCommand140({
15453
+ meta: {
15454
+ name: "winning-ads",
15455
+ description: `Search the ad-dna corpus of scored "winning" ads for reference creatives to reproduce. Proxied through the Baker backend (BAKER_API_KEY) \u2014 no separate token needed.
15456
+
15457
+ Auth: BAKER_API_KEY (must start with bk_) + BAKER_API_URL (your Convex .convex.site URL).
15458
+
15459
+ Subcommands:
15460
+ baker winning-ads search "<free text>" \u2014 semantic search; returns a lean shortlist (advertiser, summary, scores, media_url)
15461
+ baker winning-ads advertisers "<brand>" \u2014 resolve a brand \u2192 advertiser_id(s), to --exclude-advertiser (your own) or --advertiser-id (a competitor)
15462
+
15463
+ Examples:
15464
+ baker winning-ads search "B2B SaaS ad: before/after of an overworked team replaced by AI automation" --platform meta --format static
15465
+ baker winning-ads search "skincare UGC testimonial" --platform tiktok --format video --output md
15466
+ baker winning-ads search "fintech onboarding" --winner-category winner --exclude-advertiser adv_ourbrand,adv_usedbefore --output md
15467
+ baker winning-ads advertisers "Acme" --output md # find our own advertiser id to exclude`
15468
+ },
15469
+ subCommands: {
15470
+ search: searchCommand4,
15471
+ advertisers: advertisersCommand2
15472
+ }
15473
+ });
15474
+
14603
15475
  // src/version.ts
14604
15476
  import { readFileSync as readFileSync8 } from "fs";
14605
15477
  function packageJsonUrl() {
@@ -14617,7 +15489,7 @@ function getCliVersion() {
14617
15489
  }
14618
15490
 
14619
15491
  // src/cli.ts
14620
- var main = defineCommand138({
15492
+ var main = defineCommand141({
14621
15493
  meta: {
14622
15494
  name: "baker",
14623
15495
  version: getCliVersion(),
@@ -14640,6 +15512,7 @@ Introspection: Run 'baker schema <command>' to inspect argument schemas.`
14640
15512
  videos: videosCommand,
14641
15513
  testimonials: testimonialsCommand,
14642
15514
  canvas: canvasCommand,
15515
+ "winning-ads": winningAdsCommand,
14643
15516
  schema: schemaCommand
14644
15517
  }
14645
15518
  });