@koda-sl/baker-cli 0.91.0 → 0.92.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  defaultRegistry,
10
10
  generateCatalog,
11
11
  validateCanvasDeep
12
- } from "./chunk-LMVDA3EZ.js";
12
+ } from "./chunk-RCPMJKI7.js";
13
13
 
14
14
  // src/cli.ts
15
15
  import { defineCommand as defineCommand141, runMain } from "citty";
@@ -8369,6 +8369,18 @@ async function detectSceneCutsPySceneDetect(filePath, opts = {}) {
8369
8369
  }
8370
8370
 
8371
8371
  // src/engine/scaffold/video.ts
8372
+ import { toCardinal as nwAr } from "n2words/ar-SA";
8373
+ import { toCardinal as nwDe } from "n2words/de-DE";
8374
+ import { toCardinal as nwEn } from "n2words/en-US";
8375
+ import { toCardinal as nwEs } from "n2words/es-ES";
8376
+ import { toCardinal as nwFr } from "n2words/fr-FR";
8377
+ import { toCardinal as nwHi } from "n2words/hi-IN";
8378
+ import { toCardinal as nwIt } from "n2words/it-IT";
8379
+ import { toCardinal as nwJa } from "n2words/ja-JP";
8380
+ import { toCardinal as nwKo } from "n2words/ko-KR";
8381
+ import { toCardinal as nwNl } from "n2words/nl-NL";
8382
+ import { toCardinal as nwPl } from "n2words/pl-PL";
8383
+ import { toCardinal as nwPt } from "n2words/pt-PT";
8372
8384
  import { z as z3 } from "zod";
8373
8385
 
8374
8386
  // src/engine/scaffold/lib/shoot-modes.ts
@@ -8480,6 +8492,14 @@ var XFADE_BY_TYPE = {
8480
8492
  swipe: "wipeleft",
8481
8493
  zoom: "zoomin"
8482
8494
  };
// Resolution used for generated clips when the caller does not supply one.
var DEFAULT_VIDEO_RESOLUTION = "1080p";
// Ids of the video_generate registry entries whose params declare a `resolution` key.
var VIDEO_MODELS_WITH_RESOLUTION = new Set();
for (const [modelId, modelSpec] of Object.entries(MODEL_REGISTRY.video_generate)) {
  if ("resolution" in modelSpec.params) VIDEO_MODELS_WITH_RESOLUTION.add(modelId);
}
/**
 * Builds the optional `resolution` entry that callers spread into a clip's params.
 *
 * @param {string} videoModel - video_generate model id the clip will use.
 * @param {string} [resolution] - requested resolution; null/undefined falls back
 *   to DEFAULT_VIDEO_RESOLUTION.
 * @returns {{resolution?: string}} `{ resolution }` when the model is listed in
 *   VIDEO_MODELS_WITH_RESOLUTION, otherwise an empty object.
 */
function videoResolutionParam(videoModel, resolution) {
  const supportsResolution = VIDEO_MODELS_WITH_RESOLUTION.has(videoModel);
  return supportsResolution ? { resolution: resolution ?? DEFAULT_VIDEO_RESOLUTION } : {};
}
8483
8503
  var WORDS_PER_SECOND = 2.5;
8484
8504
  function estSpeechS(text) {
8485
8505
  const words = text.trim().split(/\s+/).filter(Boolean).length;
@@ -8697,12 +8717,21 @@ var VideoBlueprint = z3.object({
8697
8717
  // reference track. We never reuse it — only style the regenerated bed.
8698
8718
  identified_track: z3.object({ title: z3.string().optional(), artist: z3.string().optional() }).loose().nullish()
8699
8719
  }).loose().optional(),
8700
- cast: z3.array(z3.object({ id: z3.string().optional(), description: z3.string().optional() }).loose()).optional(),
8720
+ cast: z3.array(
8721
+ z3.object({
8722
+ id: z3.string().optional(),
8723
+ description: z3.string().optional(),
8724
+ // The deconstruct's note on the target-market localization (e.g. "native
8725
+ // French speaker") — read to derive the spoken-track language code.
8726
+ market_localization_note: z3.string().optional()
8727
+ }).loose()
8728
+ ).optional(),
8701
8729
  voiceover: z3.object({
8702
8730
  // on_camera | mixed → mouths are on screen (lip-sync candidates);
8703
8731
  // voiceover | none → narration over the picture (no lip-sync).
8704
8732
  mode: z3.string().optional(),
8705
- voice_description: z3.string().optional()
8733
+ voice_description: z3.string().optional(),
8734
+ persona: z3.string().optional()
8706
8735
  }).loose().optional()
8707
8736
  }).loose().optional(),
8708
8737
  scenes: z3.array(Scene).min(1)
@@ -8885,11 +8914,18 @@ function slotsForFrame(slots, sceneIndex, edge) {
8885
8914
  return slots.filter((s) => s.presence.get(sceneIndex)?.has(edge));
8886
8915
  }
8887
8916
  var ACTOR_SHEET_MODEL = "google/gemini-3-pro-image-preview";
8888
- function applyActorSheets(slots, nodes) {
// Maps a lower-cased slot type to the `subject_type` sent with its sheet node.
// The table has a null prototype so that a slot type such as "constructor" or
// "__proto__" cannot resolve to an inherited Object.prototype member and slip
// past a truthiness check on the lookup result.
var SHEET_SUBJECT_TYPE = Object.assign(Object.create(null), {
  person: "person",
  animal: "character",
  product: "product",
  location: "location"
});
8923
+ function buildElementSheets(slots, nodes) {
8889
8924
  for (const slot of slots) {
8890
- const t = slot.type.toLowerCase();
8891
- if (t !== "person" && t !== "animal") continue;
8892
- if (slot.presence.size < 2) continue;
8925
+ const subjectType = SHEET_SUBJECT_TYPE[slot.type.toLowerCase()];
8926
+ if (!subjectType) continue;
8927
+ if (slot.sameAs) continue;
8928
+ if (slot.presence.size < 1) continue;
8893
8929
  const sheetId = `${slot.id}_sheet`;
8894
8930
  nodes.push({
8895
8931
  id: sheetId,
@@ -8899,11 +8935,15 @@ function applyActorSheets(slots, nodes) {
8899
8935
  params: {
8900
8936
  model: ACTOR_SHEET_MODEL,
8901
8937
  subject_description: slot.description ?? `the ${slot.type}`,
8902
- subject_type: t === "person" ? "person" : "character",
8903
- image_size: "2K"
8938
+ subject_type: subjectType,
8939
+ // 4K: the sheet packs up to 8 cells (angles + tight face/detail close-ups), and
8940
+ // it's the ONE reference every frame grounds on — per-cell sharpness here
8941
+ // propagates to every clip, so it's worth the highest tier on this single asset.
8942
+ image_size: "4K"
8904
8943
  }
8905
8944
  });
8906
8945
  slot.ref = `$ref:${sheetId}.sheet`;
8946
+ slot.sheetBacked = true;
8907
8947
  }
8908
8948
  }
8909
8949
  function slotsForScene(slots, sceneIndex) {
@@ -8914,7 +8954,7 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
8914
8954
  const legend = [
8915
8955
  ...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
8916
8956
  ...hasAnchor ? [
8917
- "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions. IGNORE its text, its logo, its brand name, and its colors entirely \u2014 it is a DIFFERENT brand's footage, here only to anchor layout/pose, never identity or palette."
8957
+ "- ORIGINAL_FRAME \u2014 use ONLY for composition: framing, camera angle, shot size, subject placement, pose, and proportions. IGNORE its text, logo, brand name, colors, AND the identity of every person/animal/object in it \u2014 those come from the labeled reference images above, never from this frame. It is a DIFFERENT brand's footage with DIFFERENT actors, here ONLY to anchor where things sit and how the shot is framed (e.g. a profile/side angle stays a profile/side angle), never who they are or what palette to use."
8918
8958
  ] : []
8919
8959
  ].join("\n");
8920
8960
  const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
@@ -9003,11 +9043,12 @@ function ingestFrameRef(url, edge, ctx, nodes) {
9003
9043
  function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
9004
9044
  const tag = ctx.tag ?? "";
9005
9045
  if (ctx.reuse && url) return ingestFrameRef(url, edge, ctx, nodes);
9006
- const hasPersonOrAnimal = present.some((s) => {
9046
+ const castSlots = present.filter((s) => {
9007
9047
  const t = s.type.toLowerCase();
9008
9048
  return t === "person" || t === "animal";
9009
9049
  });
9010
- const useOriginalAnchor = Boolean(url) && !hasPersonOrAnimal;
9050
+ const castIdentityLocked = castSlots.every((s) => s.sheetBacked);
9051
+ const useOriginalAnchor = Boolean(url) && (castSlots.length === 0 || castIdentityLocked);
9011
9052
  const hasOriginal = useOriginalAnchor;
9012
9053
  const originalRef = useOriginalAnchor && url ? ingestFrameRef(url, edge, ctx, nodes) : void 0;
9013
9054
  const reference = [...present.map((s) => s.ref), ...originalRef ? [originalRef] : []];
@@ -9039,17 +9080,18 @@ function seedanceAudioLine(scene, mode, audio, nativeLine) {
9039
9080
  }
9040
9081
  return null;
9041
9082
  }
9042
- function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine) {
9083
+ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine, nativeLang) {
9084
+ const loc = (s) => nativeLine ? localizeNumeralsForNative(s, nativeLang) : s;
9043
9085
  const parts = [];
9044
9086
  const summary = scene.summary?.trim();
9045
- parts.push(summary ? `Scene ${sceneIndex + 1}: ${summary}` : `Scene ${sceneIndex + 1}`);
9046
- if (scene.action_detail) parts.push(`Action: ${scene.action_detail}`);
9087
+ parts.push(summary ? `Scene ${sceneIndex + 1}: ${loc(summary)}` : `Scene ${sceneIndex + 1}`);
9088
+ if (scene.action_detail) parts.push(`Action: ${loc(scene.action_detail)}`);
9047
9089
  const cm = scene.camera_motion;
9048
9090
  if (cm) {
9049
9091
  const camera = [cm.movement, cm.detail].filter(Boolean).join(" \u2014 ");
9050
9092
  if (camera) parts.push(`Camera: ${camera}`);
9051
9093
  }
9052
- if (scene.motion_prompt) parts.push(`Motion: ${scene.motion_prompt}`);
9094
+ if (scene.motion_prompt) parts.push(`Motion: ${loc(scene.motion_prompt)}`);
9053
9095
  if (present.length > 0) {
9054
9096
  parts.push(
9055
9097
  `Keep these consistent with their references: ${present.map((s) => `${s.label} (${s.description ?? s.type})`).join("; ")}`
@@ -9057,7 +9099,7 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
9057
9099
  }
9058
9100
  if (nativeLine) {
9059
9101
  parts.push(
9060
- `The person speaks to camera. Lip-sync follows the dialogue verbatim; put delivery/emotion cues in [brackets]. Dialogue: "${nativeLine}"`
9102
+ `The person speaks to camera. Lip-sync follows the dialogue verbatim; put delivery/emotion cues in [brackets]. Dialogue: "${loc(nativeLine)}"`
9061
9103
  );
9062
9104
  } else {
9063
9105
  const lines = (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l));
@@ -9065,7 +9107,7 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
9065
9107
  parts.push(`Spoken context (do not render as audio): ${lines.map((l) => `"${l}"`).join(" ")}`);
9066
9108
  }
9067
9109
  const transcript = (scene.transcript_slice ?? []).map((w) => w.text?.trim()).filter(Boolean).join(" ").trim();
9068
- if (transcript) parts.push(`Transcript: ${transcript}`);
9110
+ if (transcript) parts.push(`Transcript: ${loc(transcript)}`);
9069
9111
  const audioLine = seedanceAudioLine(scene, mode, audio, nativeLine);
9070
9112
  if (audioLine) parts.push(audioLine);
9071
9113
  parts.push(
@@ -9176,8 +9218,17 @@ function buildPerSpeakerVoiceConversion(segments, totalMs, nodes) {
9176
9218
  function emitSceneClip(i, scene, present, mode, nativeTurn, ambientBroll, frames, lengths, out, opts, nodes, tag = "") {
9177
9219
  const clipParams = {
9178
9220
  model: opts.videoModel,
9179
- prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
9221
+ prompt: buildSeedancePrompt(
9222
+ scene,
9223
+ i,
9224
+ present,
9225
+ mode,
9226
+ Boolean(nativeTurn) || ambientBroll,
9227
+ nativeTurn?.text,
9228
+ opts.nativeLang
9229
+ ),
9180
9230
  duration: lengths.genDur,
9231
+ ...videoResolutionParam(opts.videoModel, opts.resolution),
9181
9232
  // Native talking scene → Seedance generates the spoken audio + lip-sync; an opt-in
9182
9233
  // ambient b-roll beat generates diegetic ambient only; otherwise the clip is silent.
9183
9234
  generate_audio: Boolean(nativeTurn) || ambientBroll
@@ -9281,7 +9332,7 @@ function buildCompositeScene(layout, regions, comp, scene, i, present, mode, nat
9281
9332
  { first, last },
9282
9333
  lengths,
9283
9334
  null,
9284
- { ar: opts.ar, videoModel: opts.videoModel },
9335
+ { ar: opts.ar, videoModel: opts.videoModel, resolution: opts.resolution, nativeLang: opts.nativeLang },
9285
9336
  nodes,
9286
9337
  tag
9287
9338
  );
@@ -9413,6 +9464,65 @@ var LANGUAGE_WORDS = [
9413
9464
  [/\b(hindi)\b/, "hindi"],
9414
9465
  [/\b(polish)\b/, "polish"]
9415
9466
  ];
// Two-letter language code for each language name; deriveTtsLanguageCode looks
// entries up by the `language` value that parseVoiceTraits returns.
var LANGUAGE_ISO = {
  "french": "fr", "spanish": "es", "english": "en", "german": "de",
  "italian": "it", "portuguese": "pt", "dutch": "nl", "arabic": "ar",
  "japanese": "ja", "korean": "ko", "hindi": "hi", "polish": "pl"
};
/**
 * Collects the free-text fields of a blueprint that may mention the spoken language.
 *
 * Order of the result: the global voiceover's voice_description and persona,
 * then each cast member's market_localization_note and description, then the
 * voice_description of every dialogue line in scene order. Empty or missing
 * values are dropped.
 *
 * @param {object} blueprint - blueprint with a `scenes` array and an optional `global`.
 * @returns {string[]} the non-empty candidate texts.
 */
function languageHaystacks(blueprint) {
  const globals = blueprint.global;
  const candidates = [globals?.voiceover?.voice_description, globals?.voiceover?.persona];
  for (const member of globals?.cast ?? []) {
    candidates.push(member.market_localization_note, member.description);
  }
  for (const scene of blueprint.scenes) {
    for (const line of scene.dialogue ?? []) candidates.push(line.voice_description);
  }
  return candidates.filter(Boolean);
}
/**
 * Derives the language code for the spoken track from the blueprint's text fields.
 *
 * Scans the texts from languageHaystacks in order and returns the LANGUAGE_ISO
 * entry for the first one whose parsed voice traits name a mapped language.
 *
 * @param {object} blueprint - blueprint passed through to languageHaystacks.
 * @returns {string|undefined} the language code, or undefined when no text yields one.
 */
function deriveTtsLanguageCode(blueprint) {
  for (const haystack of languageHaystacks(blueprint)) {
    const { language } = parseVoiceTraits(haystack);
    const code = language ? LANGUAGE_ISO[language] : void 0;
    if (code) return code;
  }
  return void 0;
}
// Per-language cardinal spellers (the n2words `toCardinal` imports), keyed by the
// same two-letter codes used as LANGUAGE_ISO values.
var INTEGER_SPELLERS = {
  "fr": nwFr, "es": nwEs, "en": nwEn, "de": nwDe,
  "it": nwIt, "pt": nwPt, "nl": nwNl, "pl": nwPl,
  "ar": nwAr, "ja": nwJa, "ko": nwKo, "hi": nwHi
};
/**
 * Spells a number in words for the given language, best effort.
 *
 * @param {string|undefined} langCode - key into INTEGER_SPELLERS; falsy means "no speller".
 * @param {number} n - value to spell.
 * @returns {string} the speller's output, or `String(n)` when there is no speller
 *   for the language, `n` is not a finite number, or the speller throws.
 */
function spellNumber(langCode, n) {
  const speller = langCode ? INTEGER_SPELLERS[langCode] : void 0;
  if (speller && Number.isFinite(n)) {
    try {
      return speller(n);
    } catch {
      // Deliberate best effort: fall back to the plain digits below.
    }
  }
  return String(n);
}
// Matches a standalone run of 1-9 digits that should be spelled out.
// Rejected: digits glued to a word character or hyphen on either side
// ("v2", "2nd", "3-5"), digits preceded by "." or ",", and digits followed by
// "." or "," plus another digit ("3.14", "1,000"). A "." or "," that is plain
// punctuation after the number ("I am 25.", "1, 2, 3") no longer blocks the match.
// Only used with String.prototype.replace, which resets lastIndex on a global
// regex before matching, so sharing one instance is safe.
var NATIVE_NUMERAL_RE = /(?<![\w.,-])\d{1,9}(?![\w-]|[.,]\d)/g;
/**
 * Replaces standalone integers in `text` with their spelled-out form.
 *
 * @param {string} text - text to localize.
 * @param {string|undefined} langCode - key into INTEGER_SPELLERS.
 * @returns {string} `text` unchanged when `langCode` is falsy or has no speller;
 *   otherwise `text` with every NATIVE_NUMERAL_RE match replaced by
 *   `spellNumber(langCode, <parsed integer>)`.
 */
function localizeNumeralsForNative(text, langCode) {
  if (!langCode || !INTEGER_SPELLERS[langCode]) return text;
  return text.replace(NATIVE_NUMERAL_RE, (digits) => spellNumber(langCode, Number.parseInt(digits, 10)));
}
9416
9526
  function parseVoiceTraits(description) {
9417
9527
  const d = description.toLowerCase();
9418
9528
  const out = {};
@@ -9431,14 +9541,14 @@ function isOnCameraSpeaker(speaker, casts, cameraOn) {
9431
9541
  if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
9432
9542
  return casts.has(speaker);
9433
9543
  }
/**
 * Builds a predicate telling whether a speaker is present in a given scene.
 *
 * Only slots whose type is "person" (case-insensitive) are considered. Each one
 * with a castId is indexed under `canonical(castId)`. In non-strict mode, when
 * there is exactly one person slot, its presence also serves as the fallback
 * for speakers that are not indexed.
 *
 * @param {Array<object>} slots - element slots with `type`, optional `castId`, and `presence`.
 * @param {(id: string) => string} canonical - maps a cast id to its canonical speaker key.
 * @param {{strict?: boolean}} [opts] - with `strict`, there is no sole-person fallback
 *   and an unknown speaker counts as absent instead of present.
 * @returns {(speaker: string, sceneIndex: number) => boolean} presence predicate.
 */
function makePresenterPresent(slots, canonical, opts = {}) {
  const people = slots.filter((slot) => slot.type.toLowerCase() === "person");
  const presenceBySpeaker = new Map(
    people.filter((slot) => slot.castId).map((slot) => [canonical(slot.castId), slot.presence])
  );
  let fallback = null;
  if (!opts.strict && people.length === 1) fallback = people[0].presence;
  return (speaker, sceneIndex) => {
    const presence = presenceBySpeaker.get(speaker) ?? fallback;
    // No presence information at all: present unless strict mode was requested.
    return presence ? presence.has(sceneIndex) : !opts.strict;
  };
}
@@ -9457,16 +9567,18 @@ function collapseVoiceover(blueprint) {
9457
9567
  const presenter = [...presenters][0];
9458
9568
  return (speaker) => NARRATOR_SPEAKERS.has(speaker.toLowerCase()) ? presenter : speaker;
9459
9569
  }
9460
- function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent) {
9570
+ function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent, presentStrict) {
9461
9571
  const casts = castIdSet(blueprint);
9462
9572
  const cameraOn = onCameraDialogue(blueprint);
9463
9573
  const sceneEndS = (i) => blueprint.scenes[i]?.end_s ?? blueprint.scenes[i]?.start_s ?? 0;
9464
9574
  const multiSpeaker = /* @__PURE__ */ new Set();
9465
9575
  blueprint.scenes.forEach((scene, i) => {
9466
- const onCam = new Set(
9576
+ const onCamAll = new Set(
9467
9577
  (scene.dialogue ?? []).map((l) => l.speaker ?? "voiceover").filter((sp) => isOnCameraSpeaker(sp, casts, cameraOn))
9468
9578
  );
9469
- if (onCam.size >= 2) multiSpeaker.add(i);
9579
+ const onCamPresent = [...onCamAll].filter((sp) => presentStrict(canonical(sp), i));
9580
+ const effective = onCamPresent.length > 0 ? new Set(onCamPresent) : onCamAll;
9581
+ if (effective.size >= 2) multiSpeaker.add(i);
9470
9582
  });
9471
9583
  const lines = blueprint.scenes.flatMap(
9472
9584
  (scene, sceneIndex) => compositeScenes.has(sceneIndex) ? [] : (scene.dialogue ?? []).filter((l) => Boolean(l.line?.trim())).map((l) => {
@@ -9605,8 +9717,9 @@ function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
9605
9717
  const genDur = ceilToSeedance(phraseLen);
9606
9718
  const clipParams = {
9607
9719
  model: env.opts.videoModel,
9608
- prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text),
9720
+ prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text, env.ttsLanguageCode),
9609
9721
  duration: genDur,
9722
+ ...videoResolutionParam(env.opts.videoModel, env.opts.resolution),
9610
9723
  generate_audio: true
9611
9724
  };
9612
9725
  if (env.ar) clipParams.aspect_ratio = env.ar;
@@ -9666,7 +9779,7 @@ function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
9666
9779
  });
9667
9780
  }
9668
9781
  }
9669
- function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
9782
+ function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out, languageCode) {
9670
9783
  let id = sanitizeId2(`vo_ph${idx}_${phrase.speaker}`, `vo_ph${idx}`);
9671
9784
  while (used.has(id)) id = `${id}_x`;
9672
9785
  used.add(id);
@@ -9674,7 +9787,12 @@ function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
9674
9787
  id,
9675
9788
  type: "tts",
9676
9789
  inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
9677
- params: { model: FIXED_TTS_MODEL, text: phrase.text, voice: "{{voice_ref}}" }
9790
+ params: {
9791
+ model: FIXED_TTS_MODEL,
9792
+ text: phrase.text,
9793
+ voice: "{{voice_ref}}",
9794
+ ...languageCode ? { language_code: languageCode } : {}
9795
+ }
9678
9796
  });
9679
9797
  out.voTracks.push({ slot: id, ref: `$ref:${id}.audio`, start_s: phrase.start_s, end_s: phrase.end_s, kind: "vo" });
9680
9798
  out.voSegments.push({
@@ -9717,17 +9835,34 @@ function emitCompositeInTimeline(composite, scene, i, isLast, env, canonical, en
9717
9835
  nativeTurn,
9718
9836
  lengths,
9719
9837
  lengths.out,
9720
- { ar: env.ar, reuse: env.reuse, imageModel: env.opts.imageModel, videoModel: env.opts.videoModel },
9838
+ {
9839
+ ar: env.ar,
9840
+ reuse: env.reuse,
9841
+ imageModel: env.opts.imageModel,
9842
+ videoModel: env.opts.videoModel,
9843
+ resolution: env.opts.resolution,
9844
+ nativeLang: env.ttsLanguageCode
9845
+ },
9721
9846
  nodes,
9722
9847
  out.voTracks,
9723
9848
  out.nativeSegments,
9724
9849
  out.clips
9725
9850
  );
9726
9851
  if (!nativeTurn && distinctSpeakers.size >= 2) {
9727
- emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out);
9852
+ emitCompositeMultiSpeakerVoice(
9853
+ onCam,
9854
+ scene,
9855
+ i,
9856
+ canonical,
9857
+ ensureVoiceNode,
9858
+ usedVoIds,
9859
+ nodes,
9860
+ out,
9861
+ env.ttsLanguageCode
9862
+ );
9728
9863
  }
9729
9864
  }
9730
- function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
9865
+ function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out, languageCode) {
9731
9866
  const bySpeaker = /* @__PURE__ */ new Map();
9732
9867
  for (const l of onCam) {
9733
9868
  const speaker = canonical(l.speaker ?? "voiceover");
@@ -9759,7 +9894,8 @@ function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceN
9759
9894
  i,
9760
9895
  usedVoIds,
9761
9896
  nodes,
9762
- out
9897
+ out,
9898
+ languageCode
9763
9899
  );
9764
9900
  }
9765
9901
  }
@@ -9806,7 +9942,7 @@ function emitBrollScene(scene, i, isLast, env, nodes, out, prevEndFrame) {
9806
9942
  { first, last },
9807
9943
  { dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur },
9808
9944
  lengths.out,
9809
- { ar: env.ar, videoModel: env.opts.videoModel },
9945
+ { ar: env.ar, videoModel: env.opts.videoModel, resolution: env.opts.resolution, nativeLang: env.ttsLanguageCode },
9810
9946
  nodes
9811
9947
  );
9812
9948
  if (ambientBroll) {
@@ -9842,7 +9978,8 @@ function buildTimeline(blueprint, slots, opts, nodes) {
9842
9978
  reuse,
9843
9979
  cameraOn: onCameraDialogue(blueprint),
9844
9980
  casts: castIdSet(blueprint),
9845
- ingestCache: /* @__PURE__ */ new Map()
9981
+ ingestCache: /* @__PURE__ */ new Map(),
9982
+ ttsLanguageCode: deriveTtsLanguageCode(blueprint)
9846
9983
  };
9847
9984
  const out = {
9848
9985
  clips: [],
@@ -9853,7 +9990,8 @@ function buildTimeline(blueprint, slots, opts, nodes) {
9853
9990
  sceneSlice: /* @__PURE__ */ new Map()
9854
9991
  };
9855
9992
  const presenterPresent = makePresenterPresent(slots, canonical);
9856
- const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent);
9993
+ const presentStrict = makePresenterPresent(slots, canonical, { strict: true });
9994
+ const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent, presentStrict);
9857
9995
  const usedVoIds = /* @__PURE__ */ new Set();
9858
9996
  const claimed = /* @__PURE__ */ new Set();
9859
9997
  phrases.forEach((phrase, k) => {
@@ -9863,7 +10001,7 @@ function buildTimeline(blueprint, slots, opts, nodes) {
9863
10001
  for (const s of available) claimed.add(s);
9864
10002
  emitPhraseClip({ ...phrase, shownScenes: available }, voiceNode, env, nodes, out);
9865
10003
  } else {
9866
- emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out);
10004
+ emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out, env.ttsLanguageCode);
9867
10005
  }
9868
10006
  });
9869
10007
  const lastIndex = blueprint.scenes.length - 1;
@@ -10200,7 +10338,7 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
10200
10338
  params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
10201
10339
  });
10202
10340
  });
10203
- applyActorSheets(slots, nodes);
10341
+ buildElementSheets(slots, nodes);
10204
10342
  const { clips, voTracks, vo_segments, talking_scenes } = buildTimeline(blueprint, slots, opts, nodes);
10205
10343
  let videoRef = buildSpine(clips, nodes);
10206
10344
  let videoNode = "spine";
@@ -10309,9 +10447,27 @@ function buildVideoMeta(blueprint, meta) {
10309
10447
  duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
10310
10448
  vo_segments: [...meta.vo_segments].sort((a, b) => a.start_s - b.start_s),
10311
10449
  talking_scenes: meta.talking_scenes,
10450
+ lip_sync_caution: buildLipSyncCaution(meta.vo_segments),
10312
10451
  motion_board: buildMotionBoard(blueprint)
10313
10452
  };
10314
10453
  }
/**
 * Lists, per scene, the speakers that have both a native segment (slot ending in
 * "_conv") and one or more other segments in that same scene.
 *
 * @param {Array<{scene: number, slot: string, speaker: string}>} segments - voice segments.
 * @returns {Array<{scene: number, speaker: string, tts_over_native: string[]}>}
 *   one entry per (scene, speaker) pair, scenes in ascending order;
 *   `tts_over_native` holds the slots of that speaker's non-"_conv" segments.
 */
function buildLipSyncCaution(segments) {
  const isNative = (seg) => seg.slot.endsWith("_conv");
  const segsByScene = new Map();
  segments.forEach((seg) => {
    if (!segsByScene.has(seg.scene)) segsByScene.set(seg.scene, []);
    segsByScene.get(seg.scene).push(seg);
  });
  const cautions = [];
  const orderedScenes = [...segsByScene.keys()].sort((a, b) => a - b);
  for (const scene of orderedScenes) {
    const sceneSegs = segsByScene.get(scene);
    const nativeSpeakers = new Set(sceneSegs.filter(isNative).map((seg) => seg.speaker));
    const otherSegs = sceneSegs.filter((seg) => !isNative(seg));
    nativeSpeakers.forEach((speaker) => {
      const overlapping = otherSegs.filter((seg) => seg.speaker === speaker).map((seg) => seg.slot);
      if (overlapping.length > 0) cautions.push({ scene, speaker, tts_over_native: overlapping });
    });
  }
  return cautions;
}
/**
 * Joins a scene's dialogue lines into one string.
 *
 * @param {{dialogue?: Array<{line?: string}>}} scene - scene whose dialogue is read.
 * @returns {string|null} the trimmed, non-empty lines joined by single spaces,
 *   or null when the scene has none.
 */
function sceneSpokenText(scene) {
  const spoken = [];
  for (const entry of scene.dialogue ?? []) {
    const line = entry.line?.trim();
    if (line) spoken.push(line);
  }
  return spoken.length > 0 ? spoken.join(" ") : null;
}
@@ -10743,7 +10899,11 @@ var scaffoldVideoCommand = defineCommand76({
10743
10899
  "deconstruct-model": { type: "string", description: "Override the video_deconstruct model id" },
10744
10900
  "select-model": { type: "string", description: "Override the text_generate model id for element selection" },
10745
10901
  "image-model": { type: "string", description: "Override the image_generate model id for frames" },
10746
- "video-model": { type: "string", description: "Override the video_generate model id for clips" }
10902
+ "video-model": { type: "string", description: "Override the video_generate model id for clips" },
10903
+ resolution: {
10904
+ type: "string",
10905
+ description: `Output resolution for generated clips (e.g. "1080p"). Default 1080p \u2014 the highest the video model supports \u2014 so clips keep the keyframe sharpness instead of the model's low default.`
10906
+ }
10747
10907
  },
10748
10908
  async run({ args }) {
10749
10909
  const videoPath = path5.resolve(String(args.file));
@@ -10795,7 +10955,8 @@ var scaffoldVideoCommand = defineCommand76({
10795
10955
  transcriptPath: captions.transcriptPath,
10796
10956
  blueprintPath,
10797
10957
  frames,
10798
- ambient: Boolean(args.ambient)
10958
+ ambient: Boolean(args.ambient),
10959
+ ...args.resolution ? { resolution: String(args.resolution) } : {}
10799
10960
  };
10800
10961
  let canvas;
10801
10962
  let report;