npm - @koda-sl/baker-cli - Versions diffs - 0.91.0 → 0.92.1 - Mend

@koda-sl/baker-cli 0.91.0 → 0.92.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/README.md +6 -3
package/dist/{chunk-LMVDA3EZ.js → chunk-RCPMJKI7.js} +13 -6
package/dist/chunk-RCPMJKI7.js.map +1 -0
package/dist/cli.js +201 -40
package/dist/cli.js.map +1 -1
package/dist/engine/index.d.ts +5 -0
package/dist/engine/index.js +1 -1
package/package.json +2 -1
package/dist/chunk-LMVDA3EZ.js.map +0 -1

package/dist/cli.js CHANGED Viewed

@@ -9,7 +9,7 @@ import {
   defaultRegistry,
   generateCatalog,
   validateCanvasDeep
-} from "./chunk-LMVDA3EZ.js";
+} from "./chunk-RCPMJKI7.js";
 // src/cli.ts
 import { defineCommand as defineCommand141, runMain } from "citty";
@@ -8369,6 +8369,18 @@ async function detectSceneCutsPySceneDetect(filePath, opts = {}) {
 }
 // src/engine/scaffold/video.ts
+import { toCardinal as nwAr } from "n2words/ar-SA";
+import { toCardinal as nwDe } from "n2words/de-DE";
+import { toCardinal as nwEn } from "n2words/en-US";
+import { toCardinal as nwEs } from "n2words/es-ES";
+import { toCardinal as nwFr } from "n2words/fr-FR";
+import { toCardinal as nwHi } from "n2words/hi-IN";
+import { toCardinal as nwIt } from "n2words/it-IT";
+import { toCardinal as nwJa } from "n2words/ja-JP";
+import { toCardinal as nwKo } from "n2words/ko-KR";
+import { toCardinal as nwNl } from "n2words/nl-NL";
+import { toCardinal as nwPl } from "n2words/pl-PL";
+import { toCardinal as nwPt } from "n2words/pt-PT";
 import { z as z3 } from "zod";
 // src/engine/scaffold/lib/shoot-modes.ts
@@ -8480,6 +8492,14 @@ var XFADE_BY_TYPE = {
   swipe: "wipeleft",
   zoom: "zoomin"
 };
+var DEFAULT_VIDEO_RESOLUTION = "1080p";
+var VIDEO_MODELS_WITH_RESOLUTION = new Set(
+  Object.entries(MODEL_REGISTRY.video_generate).filter(([, spec]) => "resolution" in spec.params).map(([id]) => id)
+);
+function videoResolutionParam(videoModel, resolution) {
+  if (!VIDEO_MODELS_WITH_RESOLUTION.has(videoModel)) return {};
+  return { resolution: resolution ?? DEFAULT_VIDEO_RESOLUTION };
+}
 var WORDS_PER_SECOND = 2.5;
 function estSpeechS(text) {
   const words = text.trim().split(/\s+/).filter(Boolean).length;
@@ -8697,12 +8717,21 @@ var VideoBlueprint = z3.object({
       // reference track. We never reuse it — only style the regenerated bed.
       identified_track: z3.object({ title: z3.string().optional(), artist: z3.string().optional() }).loose().nullish()
     }).loose().optional(),
-    cast: z3.array(z3.object({ id: z3.string().optional(), description: z3.string().optional() }).loose()).optional(),
+    cast: z3.array(
+      z3.object({
+        id: z3.string().optional(),
+        description: z3.string().optional(),
+        // The deconstruct's note on the target-market localization (e.g. "native
+        // French speaker") — read to derive the spoken-track language code.
+        market_localization_note: z3.string().optional()
+      }).loose()
+    ).optional(),
     voiceover: z3.object({
       // on_camera | mixed → mouths are on screen (lip-sync candidates);
       // voiceover | none → narration over the picture (no lip-sync).
       mode: z3.string().optional(),
-      voice_description: z3.string().optional()
+      voice_description: z3.string().optional(),
+      persona: z3.string().optional()
     }).loose().optional()
   }).loose().optional(),
   scenes: z3.array(Scene).min(1)
@@ -8885,11 +8914,18 @@ function slotsForFrame(slots, sceneIndex, edge) {
   return slots.filter((s) => s.presence.get(sceneIndex)?.has(edge));
 }
 var ACTOR_SHEET_MODEL = "google/gemini-3-pro-image-preview";
-function applyActorSheets(slots, nodes) {
+var SHEET_SUBJECT_TYPE = {
+  person: "person",
+  animal: "character",
+  product: "product",
+  location: "location"
+};
+function buildElementSheets(slots, nodes) {
   for (const slot of slots) {
-    const t = slot.type.toLowerCase();
-    if (t !== "person" && t !== "animal") continue;
-    if (slot.presence.size < 2) continue;
+    const subjectType = SHEET_SUBJECT_TYPE[slot.type.toLowerCase()];
+    if (!subjectType) continue;
+    if (slot.sameAs) continue;
+    if (slot.presence.size < 1) continue;
     const sheetId = `${slot.id}_sheet`;
     nodes.push({
       id: sheetId,
@@ -8899,11 +8935,15 @@ function applyActorSheets(slots, nodes) {
       params: {
         model: ACTOR_SHEET_MODEL,
         subject_description: slot.description ?? `the ${slot.type}`,
-        subject_type: t === "person" ? "person" : "character",
-        image_size: "2K"
+        subject_type: subjectType,
+        // 4K: the sheet packs up to 8 cells (angles + tight face/detail close-ups), and
+        // it's the ONE reference every frame grounds on — per-cell sharpness here
+        // propagates to every clip, so it's worth the highest tier on this single asset.
+        image_size: "4K"
       }
     });
     slot.ref = `$ref:${sheetId}.sheet`;
+    slot.sheetBacked = true;
   }
 }
 function slotsForScene(slots, sceneIndex) {
@@ -8914,7 +8954,7 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
   const legend = [
     ...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
     ...hasAnchor ? [
-      "- ORIGINAL_FRAME \u2014 use ONLY for composition, framing, pose, and proportions. IGNORE its text, its logo, its brand name, and its colors entirely \u2014 it is a DIFFERENT brand's footage, here only to anchor layout/pose, never identity or palette."
+      "- ORIGINAL_FRAME \u2014 use ONLY for composition: framing, camera angle, shot size, subject placement, pose, and proportions. IGNORE its text, logo, brand name, colors, AND the identity of every person/animal/object in it \u2014 those come from the labeled reference images above, never from this frame. It is a DIFFERENT brand's footage with DIFFERENT actors, here ONLY to anchor where things sit and how the shot is framed (e.g. a profile/side angle stays a profile/side angle), never who they are or what palette to use."
     ] : []
   ].join("\n");
   const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
@@ -9003,11 +9043,12 @@ function ingestFrameRef(url, edge, ctx, nodes) {
 function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
   const tag = ctx.tag ?? "";
   if (ctx.reuse && url) return ingestFrameRef(url, edge, ctx, nodes);
-  const hasPersonOrAnimal = present.some((s) => {
+  const castSlots = present.filter((s) => {
     const t = s.type.toLowerCase();
     return t === "person" || t === "animal";
   });
-  const useOriginalAnchor = Boolean(url) && !hasPersonOrAnimal;
+  const castIdentityLocked = castSlots.every((s) => s.sheetBacked);
+  const useOriginalAnchor = Boolean(url) && (castSlots.length === 0 || castIdentityLocked);
   const hasOriginal = useOriginalAnchor;
   const originalRef = useOriginalAnchor && url ? ingestFrameRef(url, edge, ctx, nodes) : void 0;
   const reference = [...present.map((s) => s.ref), ...originalRef ? [originalRef] : []];
@@ -9039,17 +9080,18 @@ function seedanceAudioLine(scene, mode, audio, nativeLine) {
   }
   return null;
 }
-function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine) {
+function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine, nativeLang) {
+  const loc = (s) => nativeLine ? localizeNumeralsForNative(s, nativeLang) : s;
   const parts = [];
   const summary = scene.summary?.trim();
-  parts.push(summary ? `Scene ${sceneIndex + 1}: ${summary}` : `Scene ${sceneIndex + 1}`);
-  if (scene.action_detail) parts.push(`Action: ${scene.action_detail}`);
+  parts.push(summary ? `Scene ${sceneIndex + 1}: ${loc(summary)}` : `Scene ${sceneIndex + 1}`);
+  if (scene.action_detail) parts.push(`Action: ${loc(scene.action_detail)}`);
   const cm = scene.camera_motion;
   if (cm) {
     const camera = [cm.movement, cm.detail].filter(Boolean).join(" \u2014 ");
     if (camera) parts.push(`Camera: ${camera}`);
   }
-  if (scene.motion_prompt) parts.push(`Motion: ${scene.motion_prompt}`);
+  if (scene.motion_prompt) parts.push(`Motion: ${loc(scene.motion_prompt)}`);
   if (present.length > 0) {
     parts.push(
       `Keep these consistent with their references: ${present.map((s) => `${s.label} (${s.description ?? s.type})`).join("; ")}`
@@ -9057,7 +9099,7 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
   }
   if (nativeLine) {
     parts.push(
-      `The person speaks to camera. Lip-sync follows the dialogue verbatim; put delivery/emotion cues in [brackets]. Dialogue: "${nativeLine}"`
+      `The person speaks to camera. Lip-sync follows the dialogue verbatim; put delivery/emotion cues in [brackets]. Dialogue: "${loc(nativeLine)}"`
     );
   } else {
     const lines = (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l));
@@ -9065,7 +9107,7 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
       parts.push(`Spoken context (do not render as audio): ${lines.map((l) => `"${l}"`).join(" ")}`);
   }
   const transcript = (scene.transcript_slice ?? []).map((w) => w.text?.trim()).filter(Boolean).join(" ").trim();
-  if (transcript) parts.push(`Transcript: ${transcript}`);
+  if (transcript) parts.push(`Transcript: ${loc(transcript)}`);
   const audioLine = seedanceAudioLine(scene, mode, audio, nativeLine);
   if (audioLine) parts.push(audioLine);
   parts.push(
@@ -9176,8 +9218,17 @@ function buildPerSpeakerVoiceConversion(segments, totalMs, nodes) {
 function emitSceneClip(i, scene, present, mode, nativeTurn, ambientBroll, frames, lengths, out, opts, nodes, tag = "") {
   const clipParams = {
     model: opts.videoModel,
-    prompt: buildSeedancePrompt(scene, i, present, mode, Boolean(nativeTurn) || ambientBroll, nativeTurn?.text),
+    prompt: buildSeedancePrompt(
+      scene,
+      i,
+      present,
+      mode,
+      Boolean(nativeTurn) || ambientBroll,
+      nativeTurn?.text,
+      opts.nativeLang
+    ),
     duration: lengths.genDur,
+    ...videoResolutionParam(opts.videoModel, opts.resolution),
     // Native talking scene → Seedance generates the spoken audio + lip-sync; an opt-in
     // ambient b-roll beat generates diegetic ambient only; otherwise the clip is silent.
     generate_audio: Boolean(nativeTurn) || ambientBroll
@@ -9281,7 +9332,7 @@ function buildCompositeScene(layout, regions, comp, scene, i, present, mode, nat
       { first, last },
       lengths,
       null,
-      { ar: opts.ar, videoModel: opts.videoModel },
+      { ar: opts.ar, videoModel: opts.videoModel, resolution: opts.resolution, nativeLang: opts.nativeLang },
       nodes,
       tag
     );
@@ -9413,6 +9464,65 @@ var LANGUAGE_WORDS = [
   [/\b(hindi)\b/, "hindi"],
   [/\b(polish)\b/, "polish"]
 ];
+var LANGUAGE_ISO = {
+  french: "fr",
+  spanish: "es",
+  english: "en",
+  german: "de",
+  italian: "it",
+  portuguese: "pt",
+  dutch: "nl",
+  arabic: "ar",
+  japanese: "ja",
+  korean: "ko",
+  hindi: "hi",
+  polish: "pl"
+};
+function languageHaystacks(blueprint) {
+  const vo = blueprint.global?.voiceover;
+  const cast = blueprint.global?.cast ?? [];
+  const dialogue = blueprint.scenes.flatMap((s) => s.dialogue ?? []);
+  return [
+    vo?.voice_description,
+    vo?.persona,
+    ...cast.flatMap((c) => [c.market_localization_note, c.description]),
+    ...dialogue.map((l) => l.voice_description)
+  ].filter((s) => Boolean(s));
+}
+function deriveTtsLanguageCode(blueprint) {
+  for (const text of languageHaystacks(blueprint)) {
+    const name = parseVoiceTraits(text).language;
+    if (name && LANGUAGE_ISO[name]) return LANGUAGE_ISO[name];
+  }
+  return void 0;
+}
+var INTEGER_SPELLERS = {
+  fr: nwFr,
+  es: nwEs,
+  en: nwEn,
+  de: nwDe,
+  it: nwIt,
+  pt: nwPt,
+  nl: nwNl,
+  pl: nwPl,
+  ar: nwAr,
+  ja: nwJa,
+  ko: nwKo,
+  hi: nwHi
+};
+function spellNumber(langCode, n) {
+  const spell = langCode ? INTEGER_SPELLERS[langCode] : void 0;
+  if (!spell || !Number.isFinite(n)) return String(n);
+  try {
+    return spell(n);
+  } catch {
+    return String(n);
+  }
+}
+function localizeNumeralsForNative(text, langCode) {
+  if (!langCode || !INTEGER_SPELLERS[langCode]) return text;
+  return text.replace(/(?<![\w.,-])\d{1,9}(?![\w.,-])/g, (m) => spellNumber(langCode, Number.parseInt(m, 10)));
+}
 function parseVoiceTraits(description) {
   const d = description.toLowerCase();
   const out = {};
@@ -9431,14 +9541,14 @@ function isOnCameraSpeaker(speaker, casts, cameraOn) {
   if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
   return casts.has(speaker);
 }
-function makePresenterPresent(slots, canonical) {
+function makePresenterPresent(slots, canonical, opts = {}) {
   const personSlots = slots.filter((s) => s.type.toLowerCase() === "person");
   const bySpeaker = /* @__PURE__ */ new Map();
   for (const slot of personSlots) if (slot.castId) bySpeaker.set(canonical(slot.castId), slot.presence);
-  const solePerson = personSlots.length === 1 ? personSlots[0].presence : null;
+  const solePerson = !opts.strict && personSlots.length === 1 ? personSlots[0].presence : null;
   return (speaker, sceneIndex) => {
     const presence = bySpeaker.get(speaker) ?? solePerson;
-    if (!presence) return true;
+    if (!presence) return opts.strict ? false : true;
     return presence.has(sceneIndex);
   };
 }
@@ -9457,16 +9567,18 @@ function collapseVoiceover(blueprint) {
   const presenter = [...presenters][0];
   return (speaker) => NARRATOR_SPEAKERS.has(speaker.toLowerCase()) ? presenter : speaker;
 }
-function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent) {
+function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent, presentStrict) {
   const casts = castIdSet(blueprint);
   const cameraOn = onCameraDialogue(blueprint);
   const sceneEndS = (i) => blueprint.scenes[i]?.end_s ?? blueprint.scenes[i]?.start_s ?? 0;
   const multiSpeaker = /* @__PURE__ */ new Set();
   blueprint.scenes.forEach((scene, i) => {
-    const onCam = new Set(
+    const onCamAll = new Set(
       (scene.dialogue ?? []).map((l) => l.speaker ?? "voiceover").filter((sp) => isOnCameraSpeaker(sp, casts, cameraOn))
     );
-    if (onCam.size >= 2) multiSpeaker.add(i);
+    const onCamPresent = [...onCamAll].filter((sp) => presentStrict(canonical(sp), i));
+    const effective = onCamPresent.length > 0 ? new Set(onCamPresent) : onCamAll;
+    if (effective.size >= 2) multiSpeaker.add(i);
   });
   const lines = blueprint.scenes.flatMap(
     (scene, sceneIndex) => compositeScenes.has(sceneIndex) ? [] : (scene.dialogue ?? []).filter((l) => Boolean(l.line?.trim())).map((l) => {
@@ -9605,8 +9717,9 @@ function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
   const genDur = ceilToSeedance(phraseLen);
   const clipParams = {
     model: env.opts.videoModel,
-    prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text),
+    prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text, env.ttsLanguageCode),
     duration: genDur,
+    ...videoResolutionParam(env.opts.videoModel, env.opts.resolution),
     generate_audio: true
   };
   if (env.ar) clipParams.aspect_ratio = env.ar;
@@ -9666,7 +9779,7 @@ function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
     });
   }
 }
-function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
+function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out, languageCode) {
   let id = sanitizeId2(`vo_ph${idx}_${phrase.speaker}`, `vo_ph${idx}`);
   while (used.has(id)) id = `${id}_x`;
   used.add(id);
@@ -9674,7 +9787,12 @@ function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
     id,
     type: "tts",
     inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
-    params: { model: FIXED_TTS_MODEL, text: phrase.text, voice: "{{voice_ref}}" }
+    params: {
+      model: FIXED_TTS_MODEL,
+      text: phrase.text,
+      voice: "{{voice_ref}}",
+      ...languageCode ? { language_code: languageCode } : {}
+    }
   });
   out.voTracks.push({ slot: id, ref: `$ref:${id}.audio`, start_s: phrase.start_s, end_s: phrase.end_s, kind: "vo" });
   out.voSegments.push({
@@ -9717,17 +9835,34 @@ function emitCompositeInTimeline(composite, scene, i, isLast, env, canonical, en
     nativeTurn,
     lengths,
     lengths.out,
-    { ar: env.ar, reuse: env.reuse, imageModel: env.opts.imageModel, videoModel: env.opts.videoModel },
+    {
+      ar: env.ar,
+      reuse: env.reuse,
+      imageModel: env.opts.imageModel,
+      videoModel: env.opts.videoModel,
+      resolution: env.opts.resolution,
+      nativeLang: env.ttsLanguageCode
+    },
     nodes,
     out.voTracks,
     out.nativeSegments,
     out.clips
   );
   if (!nativeTurn && distinctSpeakers.size >= 2) {
-    emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out);
+    emitCompositeMultiSpeakerVoice(
+      onCam,
+      scene,
+      i,
+      canonical,
+      ensureVoiceNode,
+      usedVoIds,
+      nodes,
+      out,
+      env.ttsLanguageCode
+    );
   }
 }
-function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
+function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out, languageCode) {
   const bySpeaker = /* @__PURE__ */ new Map();
   for (const l of onCam) {
     const speaker = canonical(l.speaker ?? "voiceover");
@@ -9759,7 +9894,8 @@ function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceN
       i,
       usedVoIds,
       nodes,
-      out
+      out,
+      languageCode
     );
   }
 }
@@ -9806,7 +9942,7 @@ function emitBrollScene(scene, i, isLast, env, nodes, out, prevEndFrame) {
     { first, last },
     { dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur },
     lengths.out,
-    { ar: env.ar, videoModel: env.opts.videoModel },
+    { ar: env.ar, videoModel: env.opts.videoModel, resolution: env.opts.resolution, nativeLang: env.ttsLanguageCode },
     nodes
   );
   if (ambientBroll) {
@@ -9842,7 +9978,8 @@ function buildTimeline(blueprint, slots, opts, nodes) {
     reuse,
     cameraOn: onCameraDialogue(blueprint),
     casts: castIdSet(blueprint),
-    ingestCache: /* @__PURE__ */ new Map()
+    ingestCache: /* @__PURE__ */ new Map(),
+    ttsLanguageCode: deriveTtsLanguageCode(blueprint)
   };
   const out = {
     clips: [],
@@ -9853,7 +9990,8 @@ function buildTimeline(blueprint, slots, opts, nodes) {
     sceneSlice: /* @__PURE__ */ new Map()
   };
   const presenterPresent = makePresenterPresent(slots, canonical);
-  const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent);
+  const presentStrict = makePresenterPresent(slots, canonical, { strict: true });
+  const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent, presentStrict);
   const usedVoIds = /* @__PURE__ */ new Set();
   const claimed = /* @__PURE__ */ new Set();
   phrases.forEach((phrase, k) => {
@@ -9863,7 +10001,7 @@ function buildTimeline(blueprint, slots, opts, nodes) {
       for (const s of available) claimed.add(s);
       emitPhraseClip({ ...phrase, shownScenes: available }, voiceNode, env, nodes, out);
     } else {
-      emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out);
+      emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out, env.ttsLanguageCode);
     }
   });
   const lastIndex = blueprint.scenes.length - 1;
@@ -10200,7 +10338,7 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
       params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
     });
   });
-  applyActorSheets(slots, nodes);
+  buildElementSheets(slots, nodes);
   const { clips, voTracks, vo_segments, talking_scenes } = buildTimeline(blueprint, slots, opts, nodes);
   let videoRef = buildSpine(clips, nodes);
   let videoNode = "spine";
@@ -10309,9 +10447,27 @@ function buildVideoMeta(blueprint, meta) {
     duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
     vo_segments: [...meta.vo_segments].sort((a, b) => a.start_s - b.start_s),
     talking_scenes: meta.talking_scenes,
+    lip_sync_caution: buildLipSyncCaution(meta.vo_segments),
     motion_board: buildMotionBoard(blueprint)
   };
 }
+function buildLipSyncCaution(segments) {
+  const out = [];
+  const byScene = /* @__PURE__ */ new Map();
+  for (const s of segments) {
+    const arr = byScene.get(s.scene) ?? [];
+    arr.push(s);
+    byScene.set(s.scene, arr);
+  }
+  for (const [scene, segs] of [...byScene.entries()].sort((a, b) => a[0] - b[0])) {
+    const nativeSpeakers = new Set(segs.filter((s) => s.slot.endsWith("_conv")).map((s) => s.speaker));
+    for (const speaker of nativeSpeakers) {
+      const ttsOver = segs.filter((s) => !s.slot.endsWith("_conv") && s.speaker === speaker).map((s) => s.slot);
+      if (ttsOver.length > 0) out.push({ scene, speaker, tts_over_native: ttsOver });
+    }
+  }
+  return out;
+}
 function sceneSpokenText(scene) {
   return (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l)).join(" ") || null;
 }
@@ -10743,7 +10899,11 @@ var scaffoldVideoCommand = defineCommand76({
     "deconstruct-model": { type: "string", description: "Override the video_deconstruct model id" },
     "select-model": { type: "string", description: "Override the text_generate model id for element selection" },
     "image-model": { type: "string", description: "Override the image_generate model id for frames" },
-    "video-model": { type: "string", description: "Override the video_generate model id for clips" }
+    "video-model": { type: "string", description: "Override the video_generate model id for clips" },
+    resolution: {
+      type: "string",
+      description: `Output resolution for generated clips (e.g. "1080p"). Default 1080p \u2014 the highest the video model supports \u2014 so clips keep the keyframe sharpness instead of the model's low default.`
+    }
   },
   async run({ args }) {
     const videoPath = path5.resolve(String(args.file));
@@ -10795,7 +10955,8 @@ var scaffoldVideoCommand = defineCommand76({
       transcriptPath: captions.transcriptPath,
       blueprintPath,
       frames,
-      ambient: Boolean(args.ambient)
+      ambient: Boolean(args.ambient),
+      ...args.resolution ? { resolution: String(args.resolution) } : {}
     };
     let canvas;
     let report;