@koda-sl/baker-cli 0.91.0 → 0.92.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/{chunk-LMVDA3EZ.js → chunk-RCPMJKI7.js} +13 -6
- package/dist/chunk-RCPMJKI7.js.map +1 -0
- package/dist/cli.js +201 -40
- package/dist/cli.js.map +1 -1
- package/dist/engine/index.d.ts +5 -0
- package/dist/engine/index.js +1 -1
- package/package.json +2 -1
- package/dist/chunk-LMVDA3EZ.js.map +0 -1
package/dist/cli.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
defaultRegistry,
|
|
10
10
|
generateCatalog,
|
|
11
11
|
validateCanvasDeep
|
|
12
|
-
} from "./chunk-LMVDA3EZ.js";
|
|
12
|
+
} from "./chunk-RCPMJKI7.js";
|
|
13
13
|
|
|
14
14
|
// src/cli.ts
|
|
15
15
|
import { defineCommand as defineCommand141, runMain } from "citty";
|
|
@@ -8369,6 +8369,18 @@ async function detectSceneCutsPySceneDetect(filePath, opts = {}) {
|
|
|
8369
8369
|
}
|
|
8370
8370
|
|
|
8371
8371
|
// src/engine/scaffold/video.ts
|
|
8372
|
+
import { toCardinal as nwAr } from "n2words/ar-SA";
|
|
8373
|
+
import { toCardinal as nwDe } from "n2words/de-DE";
|
|
8374
|
+
import { toCardinal as nwEn } from "n2words/en-US";
|
|
8375
|
+
import { toCardinal as nwEs } from "n2words/es-ES";
|
|
8376
|
+
import { toCardinal as nwFr } from "n2words/fr-FR";
|
|
8377
|
+
import { toCardinal as nwHi } from "n2words/hi-IN";
|
|
8378
|
+
import { toCardinal as nwIt } from "n2words/it-IT";
|
|
8379
|
+
import { toCardinal as nwJa } from "n2words/ja-JP";
|
|
8380
|
+
import { toCardinal as nwKo } from "n2words/ko-KR";
|
|
8381
|
+
import { toCardinal as nwNl } from "n2words/nl-NL";
|
|
8382
|
+
import { toCardinal as nwPl } from "n2words/pl-PL";
|
|
8383
|
+
import { toCardinal as nwPt } from "n2words/pt-PT";
|
|
8372
8384
|
import { z as z3 } from "zod";
|
|
8373
8385
|
|
|
8374
8386
|
// src/engine/scaffold/lib/shoot-modes.ts
|
|
@@ -8480,6 +8492,14 @@ var XFADE_BY_TYPE = {
|
|
|
8480
8492
|
swipe: "wipeleft",
|
|
8481
8493
|
zoom: "zoomin"
|
|
8482
8494
|
};
|
|
8495
|
+
// Resolution requested for generated clips when the caller does not choose one.
var DEFAULT_VIDEO_RESOLUTION = "1080p";
// Ids of the registered `video_generate` models whose parameter spec declares a
// `resolution` key. Built once, when the module is evaluated.
var VIDEO_MODELS_WITH_RESOLUTION = new Set();
for (const [modelId, modelSpec] of Object.entries(MODEL_REGISTRY.video_generate)) {
  if ("resolution" in modelSpec.params) VIDEO_MODELS_WITH_RESOLUTION.add(modelId);
}
|
|
8499
|
+
/**
 * Build the optional `resolution` fragment that callers spread into a clip's
 * params.
 *
 * @param {string} videoModel - Id of the video model the clip is generated with.
 * @param {string | null | undefined} resolution - Requested resolution; a
 *   nullish value falls back to DEFAULT_VIDEO_RESOLUTION.
 * @returns {{ resolution?: string }} `{ resolution }` when the model is listed
 *   in VIDEO_MODELS_WITH_RESOLUTION, otherwise an empty object so that
 *   spreading it adds nothing.
 */
function videoResolutionParam(videoModel, resolution) {
  const supportsResolution = VIDEO_MODELS_WITH_RESOLUTION.has(videoModel);
  return supportsResolution ? { resolution: resolution ?? DEFAULT_VIDEO_RESOLUTION } : {};
}
|
|
8483
8503
|
var WORDS_PER_SECOND = 2.5;
|
|
8484
8504
|
function estSpeechS(text) {
|
|
8485
8505
|
const words = text.trim().split(/\s+/).filter(Boolean).length;
|
|
@@ -8697,12 +8717,21 @@ var VideoBlueprint = z3.object({
|
|
|
8697
8717
|
// reference track. We never reuse it — only style the regenerated bed.
|
|
8698
8718
|
identified_track: z3.object({ title: z3.string().optional(), artist: z3.string().optional() }).loose().nullish()
|
|
8699
8719
|
}).loose().optional(),
|
|
8700
|
-
cast: z3.array(
|
|
8720
|
+
cast: z3.array(
|
|
8721
|
+
z3.object({
|
|
8722
|
+
id: z3.string().optional(),
|
|
8723
|
+
description: z3.string().optional(),
|
|
8724
|
+
// The deconstruct's note on the target-market localization (e.g. "native
|
|
8725
|
+
// French speaker") — read to derive the spoken-track language code.
|
|
8726
|
+
market_localization_note: z3.string().optional()
|
|
8727
|
+
}).loose()
|
|
8728
|
+
).optional(),
|
|
8701
8729
|
voiceover: z3.object({
|
|
8702
8730
|
// on_camera | mixed → mouths are on screen (lip-sync candidates);
|
|
8703
8731
|
// voiceover | none → narration over the picture (no lip-sync).
|
|
8704
8732
|
mode: z3.string().optional(),
|
|
8705
|
-
voice_description: z3.string().optional()
|
|
8733
|
+
voice_description: z3.string().optional(),
|
|
8734
|
+
persona: z3.string().optional()
|
|
8706
8735
|
}).loose().optional()
|
|
8707
8736
|
}).loose().optional(),
|
|
8708
8737
|
scenes: z3.array(Scene).min(1)
|
|
@@ -8885,11 +8914,18 @@ function slotsForFrame(slots, sceneIndex, edge) {
|
|
|
8885
8914
|
return slots.filter((s) => s.presence.get(sceneIndex)?.has(edge));
|
|
8886
8915
|
}
|
|
8887
8916
|
var ACTOR_SHEET_MODEL = "google/gemini-3-pro-image-preview";
|
|
8888
|
-
|
|
8917
|
+
// Maps a lower-cased slot type to the `subject_type` passed to its reference
// sheet node; a slot type with no entry here gets no sheet.
//
// The table is indexed with free-form slot types, so it is created with a null
// prototype: on a plain object literal a type such as "constructor" or
// "__proto__" would resolve to a member inherited from Object.prototype, read
// as truthy, and slip past the "no entry" check at the lookup site.
var SHEET_SUBJECT_TYPE = Object.assign(Object.create(null), {
  person: "person",
  animal: "character",
  product: "product",
  location: "location"
});
|
|
8923
|
+
function buildElementSheets(slots, nodes) {
|
|
8889
8924
|
for (const slot of slots) {
|
|
8890
|
-
const
|
|
8891
|
-
if (
|
|
8892
|
-
if (slot.
|
|
8925
|
+
const subjectType = SHEET_SUBJECT_TYPE[slot.type.toLowerCase()];
|
|
8926
|
+
if (!subjectType) continue;
|
|
8927
|
+
if (slot.sameAs) continue;
|
|
8928
|
+
if (slot.presence.size < 1) continue;
|
|
8893
8929
|
const sheetId = `${slot.id}_sheet`;
|
|
8894
8930
|
nodes.push({
|
|
8895
8931
|
id: sheetId,
|
|
@@ -8899,11 +8935,15 @@ function applyActorSheets(slots, nodes) {
|
|
|
8899
8935
|
params: {
|
|
8900
8936
|
model: ACTOR_SHEET_MODEL,
|
|
8901
8937
|
subject_description: slot.description ?? `the ${slot.type}`,
|
|
8902
|
-
subject_type:
|
|
8903
|
-
|
|
8938
|
+
subject_type: subjectType,
|
|
8939
|
+
// 4K: the sheet packs up to 8 cells (angles + tight face/detail close-ups), and
|
|
8940
|
+
// it's the ONE reference every frame grounds on — per-cell sharpness here
|
|
8941
|
+
// propagates to every clip, so it's worth the highest tier on this single asset.
|
|
8942
|
+
image_size: "4K"
|
|
8904
8943
|
}
|
|
8905
8944
|
});
|
|
8906
8945
|
slot.ref = `$ref:${sheetId}.sheet`;
|
|
8946
|
+
slot.sheetBacked = true;
|
|
8907
8947
|
}
|
|
8908
8948
|
}
|
|
8909
8949
|
function slotsForScene(slots, sceneIndex) {
|
|
@@ -8914,7 +8954,7 @@ function buildFramePrompt(edge, sceneIndex, framePrompt, present, hasAnchor, mod
|
|
|
8914
8954
|
const legend = [
|
|
8915
8955
|
...present.map((s) => `- ${s.label} \u2014 ${roleForSlot(s)}`),
|
|
8916
8956
|
...hasAnchor ? [
|
|
8917
|
-
"- ORIGINAL_FRAME \u2014 use ONLY for composition
|
|
8957
|
+
"- ORIGINAL_FRAME \u2014 use ONLY for composition: framing, camera angle, shot size, subject placement, pose, and proportions. IGNORE its text, logo, brand name, colors, AND the identity of every person/animal/object in it \u2014 those come from the labeled reference images above, never from this frame. It is a DIFFERENT brand's footage with DIFFERENT actors, here ONLY to anchor where things sit and how the shot is framed (e.g. a profile/side angle stays a profile/side angle), never who they are or what palette to use."
|
|
8918
8958
|
] : []
|
|
8919
8959
|
].join("\n");
|
|
8920
8960
|
const description = framePrompt?.trim() || `the ${edge} frame of scene ${sceneIndex + 1} \u2014 describe the full composition, subjects, setting, action, lighting, and palette here. (Edit this line to change ONLY this frame.)`;
|
|
@@ -9003,11 +9043,12 @@ function ingestFrameRef(url, edge, ctx, nodes) {
|
|
|
9003
9043
|
function buildFrameRef(edge, url, framePrompt, present, ctx, nodes) {
|
|
9004
9044
|
const tag = ctx.tag ?? "";
|
|
9005
9045
|
if (ctx.reuse && url) return ingestFrameRef(url, edge, ctx, nodes);
|
|
9006
|
-
const
|
|
9046
|
+
const castSlots = present.filter((s) => {
|
|
9007
9047
|
const t = s.type.toLowerCase();
|
|
9008
9048
|
return t === "person" || t === "animal";
|
|
9009
9049
|
});
|
|
9010
|
-
const
|
|
9050
|
+
const castIdentityLocked = castSlots.every((s) => s.sheetBacked);
|
|
9051
|
+
const useOriginalAnchor = Boolean(url) && (castSlots.length === 0 || castIdentityLocked);
|
|
9011
9052
|
const hasOriginal = useOriginalAnchor;
|
|
9012
9053
|
const originalRef = useOriginalAnchor && url ? ingestFrameRef(url, edge, ctx, nodes) : void 0;
|
|
9013
9054
|
const reference = [...present.map((s) => s.ref), ...originalRef ? [originalRef] : []];
|
|
@@ -9039,17 +9080,18 @@ function seedanceAudioLine(scene, mode, audio, nativeLine) {
|
|
|
9039
9080
|
}
|
|
9040
9081
|
return null;
|
|
9041
9082
|
}
|
|
9042
|
-
function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine) {
|
|
9083
|
+
function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine, nativeLang) {
|
|
9084
|
+
const loc = (s) => nativeLine ? localizeNumeralsForNative(s, nativeLang) : s;
|
|
9043
9085
|
const parts = [];
|
|
9044
9086
|
const summary = scene.summary?.trim();
|
|
9045
|
-
parts.push(summary ? `Scene ${sceneIndex + 1}: ${summary}` : `Scene ${sceneIndex + 1}`);
|
|
9046
|
-
if (scene.action_detail) parts.push(`Action: ${scene.action_detail}`);
|
|
9087
|
+
parts.push(summary ? `Scene ${sceneIndex + 1}: ${loc(summary)}` : `Scene ${sceneIndex + 1}`);
|
|
9088
|
+
if (scene.action_detail) parts.push(`Action: ${loc(scene.action_detail)}`);
|
|
9047
9089
|
const cm = scene.camera_motion;
|
|
9048
9090
|
if (cm) {
|
|
9049
9091
|
const camera = [cm.movement, cm.detail].filter(Boolean).join(" \u2014 ");
|
|
9050
9092
|
if (camera) parts.push(`Camera: ${camera}`);
|
|
9051
9093
|
}
|
|
9052
|
-
if (scene.motion_prompt) parts.push(`Motion: ${scene.motion_prompt}`);
|
|
9094
|
+
if (scene.motion_prompt) parts.push(`Motion: ${loc(scene.motion_prompt)}`);
|
|
9053
9095
|
if (present.length > 0) {
|
|
9054
9096
|
parts.push(
|
|
9055
9097
|
`Keep these consistent with their references: ${present.map((s) => `${s.label} (${s.description ?? s.type})`).join("; ")}`
|
|
@@ -9057,7 +9099,7 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
|
|
|
9057
9099
|
}
|
|
9058
9100
|
if (nativeLine) {
|
|
9059
9101
|
parts.push(
|
|
9060
|
-
`The person speaks to camera. Lip-sync follows the dialogue verbatim; put delivery/emotion cues in [brackets]. Dialogue: "${nativeLine}"`
|
|
9102
|
+
`The person speaks to camera. Lip-sync follows the dialogue verbatim; put delivery/emotion cues in [brackets]. Dialogue: "${loc(nativeLine)}"`
|
|
9061
9103
|
);
|
|
9062
9104
|
} else {
|
|
9063
9105
|
const lines = (scene.dialogue ?? []).map((d) => d.line?.trim()).filter((l) => Boolean(l));
|
|
@@ -9065,7 +9107,7 @@ function buildSeedancePrompt(scene, sceneIndex, present, mode, audio, nativeLine
|
|
|
9065
9107
|
parts.push(`Spoken context (do not render as audio): ${lines.map((l) => `"${l}"`).join(" ")}`);
|
|
9066
9108
|
}
|
|
9067
9109
|
const transcript = (scene.transcript_slice ?? []).map((w) => w.text?.trim()).filter(Boolean).join(" ").trim();
|
|
9068
|
-
if (transcript) parts.push(`Transcript: ${transcript}`);
|
|
9110
|
+
if (transcript) parts.push(`Transcript: ${loc(transcript)}`);
|
|
9069
9111
|
const audioLine = seedanceAudioLine(scene, mode, audio, nativeLine);
|
|
9070
9112
|
if (audioLine) parts.push(audioLine);
|
|
9071
9113
|
parts.push(
|
|
@@ -9176,8 +9218,17 @@ function buildPerSpeakerVoiceConversion(segments, totalMs, nodes) {
|
|
|
9176
9218
|
function emitSceneClip(i, scene, present, mode, nativeTurn, ambientBroll, frames, lengths, out, opts, nodes, tag = "") {
|
|
9177
9219
|
const clipParams = {
|
|
9178
9220
|
model: opts.videoModel,
|
|
9179
|
-
prompt: buildSeedancePrompt(
|
|
9221
|
+
prompt: buildSeedancePrompt(
|
|
9222
|
+
scene,
|
|
9223
|
+
i,
|
|
9224
|
+
present,
|
|
9225
|
+
mode,
|
|
9226
|
+
Boolean(nativeTurn) || ambientBroll,
|
|
9227
|
+
nativeTurn?.text,
|
|
9228
|
+
opts.nativeLang
|
|
9229
|
+
),
|
|
9180
9230
|
duration: lengths.genDur,
|
|
9231
|
+
...videoResolutionParam(opts.videoModel, opts.resolution),
|
|
9181
9232
|
// Native talking scene → Seedance generates the spoken audio + lip-sync; an opt-in
|
|
9182
9233
|
// ambient b-roll beat generates diegetic ambient only; otherwise the clip is silent.
|
|
9183
9234
|
generate_audio: Boolean(nativeTurn) || ambientBroll
|
|
@@ -9281,7 +9332,7 @@ function buildCompositeScene(layout, regions, comp, scene, i, present, mode, nat
|
|
|
9281
9332
|
{ first, last },
|
|
9282
9333
|
lengths,
|
|
9283
9334
|
null,
|
|
9284
|
-
{ ar: opts.ar, videoModel: opts.videoModel },
|
|
9335
|
+
{ ar: opts.ar, videoModel: opts.videoModel, resolution: opts.resolution, nativeLang: opts.nativeLang },
|
|
9285
9336
|
nodes,
|
|
9286
9337
|
tag
|
|
9287
9338
|
);
|
|
@@ -9413,6 +9464,65 @@ var LANGUAGE_WORDS = [
|
|
|
9413
9464
|
[/\b(hindi)\b/, "hindi"],
|
|
9414
9465
|
[/\b(polish)\b/, "polish"]
|
|
9415
9466
|
];
|
|
9467
|
+
// Language name → two-letter language code (presumably ISO 639-1 — confirm).
// Keys are the language names looked up from `parseVoiceTraits(...).language`;
// a name without an entry has no code.
var LANGUAGE_ISO = Object.fromEntries([
  ["french", "fr"],
  ["spanish", "es"],
  ["english", "en"],
  ["german", "de"],
  ["italian", "it"],
  ["portuguese", "pt"],
  ["dutch", "nl"],
  ["arabic", "ar"],
  ["japanese", "ja"],
  ["korean", "ko"],
  ["hindi", "hi"],
  ["polish", "pl"]
]);
|
|
9481
|
+
/**
 * Collect every free-text field of a blueprint that may mention the spoken
 * language, in priority order: the global voiceover's `voice_description` and
 * `persona`, then each cast member's `market_localization_note` and
 * `description`, then each dialogue line's `voice_description`.
 *
 * @param {object} blueprint - Blueprint with a `scenes` array and an optional
 *   `global` section.
 * @returns {Array} The non-empty values found, in the order listed above.
 */
function languageHaystacks(blueprint) {
  const voiceover = blueprint.global?.voiceover;
  const castMembers = blueprint.global?.cast ?? [];
  const spokenLines = blueprint.scenes.flatMap((scene) => scene.dialogue ?? []);
  const haystacks = [];
  // Missing and empty fields are dropped so callers only see usable text.
  const keep = (value) => {
    if (value) haystacks.push(value);
  };
  keep(voiceover?.voice_description);
  keep(voiceover?.persona);
  castMembers.forEach((member) => {
    keep(member.market_localization_note);
    keep(member.description);
  });
  spokenLines.forEach((line) => keep(line.voice_description));
  return haystacks;
}
|
|
9492
|
+
/**
 * Derive the language code for the spoken track from the blueprint's text.
 *
 * Walks the strings returned by `languageHaystacks` in order, asks
 * `parseVoiceTraits` for a language name in each, and stops at the first name
 * that has an entry in LANGUAGE_ISO.
 *
 * @param {object} blueprint - Blueprint to inspect.
 * @returns {string | undefined} The matching LANGUAGE_ISO value, or
 *   `undefined` when no text names a mapped language.
 */
function deriveTtsLanguageCode(blueprint) {
  let found;
  // `some` stops at the first hit, so later texts are never parsed.
  languageHaystacks(blueprint).some((text) => {
    const { language } = parseVoiceTraits(text);
    const code = language ? LANGUAGE_ISO[language] : void 0;
    if (!code) return false;
    found = code;
    return true;
  });
  return found;
}
|
|
9499
|
+
// Language code → the n2words `toCardinal` function imported for that language.
// Each entry is bound to a single locale build (noted per row); a language code
// without an entry has no speller.
var INTEGER_SPELLERS = Object.fromEntries([
  ["fr", nwFr], // n2words/fr-FR
  ["es", nwEs], // n2words/es-ES
  ["en", nwEn], // n2words/en-US
  ["de", nwDe], // n2words/de-DE
  ["it", nwIt], // n2words/it-IT
  ["pt", nwPt], // n2words/pt-PT
  ["nl", nwNl], // n2words/nl-NL
  ["pl", nwPl], // n2words/pl-PL
  ["ar", nwAr], // n2words/ar-SA
  ["ja", nwJa], // n2words/ja-JP
  ["ko", nwKo], // n2words/ko-KR
  ["hi", nwHi] // n2words/hi-IN
]);
|
|
9513
|
+
/**
 * Spell a number out in words for the given language, best effort.
 *
 * @param {string | undefined} langCode - Key into INTEGER_SPELLERS.
 * @param {number} n - The number to spell.
 * @returns {string} The speller's output, or `String(n)` when there is no
 *   speller for `langCode`, `n` is not a finite number, or the speller throws.
 */
function spellNumber(langCode, n) {
  const toWords = langCode ? INTEGER_SPELLERS[langCode] : void 0;
  if (toWords && Number.isFinite(n)) {
    try {
      return toWords(n);
    } catch {
      // Deliberate best effort: a failing speller falls back to the digits below.
    }
  }
  return String(n);
}
|
|
9522
|
+
/**
 * Replace standalone integer numerals in `text` with their spelled-out form in
 * the given language.
 *
 * A numeral is rewritten only when it is 1–9 digits long and stands alone:
 * - not preceded by a word character, `.`, `,` or `-`;
 * - not followed by a word character or `-`;
 * - not followed by `.` or `,` that is itself followed by a word character,
 *   which keeps decimals ("3.5", "3,5"), grouped thousands ("1,000") and dotted
 *   tokens intact.
 * A numeral followed by plain punctuation ("You save 50.", "3, 2, 1") is
 * rewritten like any other, so a sentence or list is localized consistently
 * rather than leaving its punctuated numerals as digits.
 *
 * @param {string} text - Text to localize.
 * @param {string | undefined} langCode - Key into INTEGER_SPELLERS.
 * @returns {string} `text` unchanged when `langCode` is empty or has no
 *   speller; otherwise `text` with each matching numeral replaced by
 *   `spellNumber(langCode, value)`.
 */
function localizeNumeralsForNative(text, langCode) {
  if (!langCode || !INTEGER_SPELLERS[langCode]) return text;
  return text.replace(
    /(?<![\w.,-])\d{1,9}(?![\w-]|[.,]\w)/g,
    (digits) => spellNumber(langCode, Number.parseInt(digits, 10))
  );
}
|
|
9416
9526
|
function parseVoiceTraits(description) {
|
|
9417
9527
|
const d = description.toLowerCase();
|
|
9418
9528
|
const out = {};
|
|
@@ -9431,14 +9541,14 @@ function isOnCameraSpeaker(speaker, casts, cameraOn) {
|
|
|
9431
9541
|
if (NARRATOR_SPEAKERS.has(speaker.toLowerCase())) return false;
|
|
9432
9542
|
return casts.has(speaker);
|
|
9433
9543
|
}
|
|
9434
|
-
function makePresenterPresent(slots, canonical) {
|
|
9544
|
+
function makePresenterPresent(slots, canonical, opts = {}) {
|
|
9435
9545
|
const personSlots = slots.filter((s) => s.type.toLowerCase() === "person");
|
|
9436
9546
|
const bySpeaker = /* @__PURE__ */ new Map();
|
|
9437
9547
|
for (const slot of personSlots) if (slot.castId) bySpeaker.set(canonical(slot.castId), slot.presence);
|
|
9438
|
-
const solePerson = personSlots.length === 1 ? personSlots[0].presence : null;
|
|
9548
|
+
const solePerson = !opts.strict && personSlots.length === 1 ? personSlots[0].presence : null;
|
|
9439
9549
|
return (speaker, sceneIndex) => {
|
|
9440
9550
|
const presence = bySpeaker.get(speaker) ?? solePerson;
|
|
9441
|
-
if (!presence) return true;
|
|
9551
|
+
if (!presence) return opts.strict ? false : true;
|
|
9442
9552
|
return presence.has(sceneIndex);
|
|
9443
9553
|
};
|
|
9444
9554
|
}
|
|
@@ -9457,16 +9567,18 @@ function collapseVoiceover(blueprint) {
|
|
|
9457
9567
|
const presenter = [...presenters][0];
|
|
9458
9568
|
return (speaker) => NARRATOR_SPEAKERS.has(speaker.toLowerCase()) ? presenter : speaker;
|
|
9459
9569
|
}
|
|
9460
|
-
function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent) {
|
|
9570
|
+
function buildPhrases(blueprint, canonical, compositeScenes, presenterPresent, presentStrict) {
|
|
9461
9571
|
const casts = castIdSet(blueprint);
|
|
9462
9572
|
const cameraOn = onCameraDialogue(blueprint);
|
|
9463
9573
|
const sceneEndS = (i) => blueprint.scenes[i]?.end_s ?? blueprint.scenes[i]?.start_s ?? 0;
|
|
9464
9574
|
const multiSpeaker = /* @__PURE__ */ new Set();
|
|
9465
9575
|
blueprint.scenes.forEach((scene, i) => {
|
|
9466
|
-
const
|
|
9576
|
+
const onCamAll = new Set(
|
|
9467
9577
|
(scene.dialogue ?? []).map((l) => l.speaker ?? "voiceover").filter((sp) => isOnCameraSpeaker(sp, casts, cameraOn))
|
|
9468
9578
|
);
|
|
9469
|
-
|
|
9579
|
+
const onCamPresent = [...onCamAll].filter((sp) => presentStrict(canonical(sp), i));
|
|
9580
|
+
const effective = onCamPresent.length > 0 ? new Set(onCamPresent) : onCamAll;
|
|
9581
|
+
if (effective.size >= 2) multiSpeaker.add(i);
|
|
9470
9582
|
});
|
|
9471
9583
|
const lines = blueprint.scenes.flatMap(
|
|
9472
9584
|
(scene, sceneIndex) => compositeScenes.has(sceneIndex) ? [] : (scene.dialogue ?? []).filter((l) => Boolean(l.line?.trim())).map((l) => {
|
|
@@ -9605,8 +9717,9 @@ function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
|
|
|
9605
9717
|
const genDur = ceilToSeedance(phraseLen);
|
|
9606
9718
|
const clipParams = {
|
|
9607
9719
|
model: env.opts.videoModel,
|
|
9608
|
-
prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text),
|
|
9720
|
+
prompt: buildSeedancePrompt(anchorScene, anchor, present, mode, true, phrase.text, env.ttsLanguageCode),
|
|
9609
9721
|
duration: genDur,
|
|
9722
|
+
...videoResolutionParam(env.opts.videoModel, env.opts.resolution),
|
|
9610
9723
|
generate_audio: true
|
|
9611
9724
|
};
|
|
9612
9725
|
if (env.ar) clipParams.aspect_ratio = env.ar;
|
|
@@ -9666,7 +9779,7 @@ function emitPhraseClip(phrase, voiceNode, env, nodes, out) {
|
|
|
9666
9779
|
});
|
|
9667
9780
|
}
|
|
9668
9781
|
}
|
|
9669
|
-
function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
|
|
9782
|
+
function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out, languageCode) {
|
|
9670
9783
|
let id = sanitizeId2(`vo_ph${idx}_${phrase.speaker}`, `vo_ph${idx}`);
|
|
9671
9784
|
while (used.has(id)) id = `${id}_x`;
|
|
9672
9785
|
used.add(id);
|
|
@@ -9674,7 +9787,12 @@ function emitPhraseTts(phrase, voiceNode, idx, used, nodes, out) {
|
|
|
9674
9787
|
id,
|
|
9675
9788
|
type: "tts",
|
|
9676
9789
|
inputs: { voice_ref: `$ref:${voiceNode}.voice_id` },
|
|
9677
|
-
params: {
|
|
9790
|
+
params: {
|
|
9791
|
+
model: FIXED_TTS_MODEL,
|
|
9792
|
+
text: phrase.text,
|
|
9793
|
+
voice: "{{voice_ref}}",
|
|
9794
|
+
...languageCode ? { language_code: languageCode } : {}
|
|
9795
|
+
}
|
|
9678
9796
|
});
|
|
9679
9797
|
out.voTracks.push({ slot: id, ref: `$ref:${id}.audio`, start_s: phrase.start_s, end_s: phrase.end_s, kind: "vo" });
|
|
9680
9798
|
out.voSegments.push({
|
|
@@ -9717,17 +9835,34 @@ function emitCompositeInTimeline(composite, scene, i, isLast, env, canonical, en
|
|
|
9717
9835
|
nativeTurn,
|
|
9718
9836
|
lengths,
|
|
9719
9837
|
lengths.out,
|
|
9720
|
-
{
|
|
9838
|
+
{
|
|
9839
|
+
ar: env.ar,
|
|
9840
|
+
reuse: env.reuse,
|
|
9841
|
+
imageModel: env.opts.imageModel,
|
|
9842
|
+
videoModel: env.opts.videoModel,
|
|
9843
|
+
resolution: env.opts.resolution,
|
|
9844
|
+
nativeLang: env.ttsLanguageCode
|
|
9845
|
+
},
|
|
9721
9846
|
nodes,
|
|
9722
9847
|
out.voTracks,
|
|
9723
9848
|
out.nativeSegments,
|
|
9724
9849
|
out.clips
|
|
9725
9850
|
);
|
|
9726
9851
|
if (!nativeTurn && distinctSpeakers.size >= 2) {
|
|
9727
|
-
emitCompositeMultiSpeakerVoice(
|
|
9852
|
+
emitCompositeMultiSpeakerVoice(
|
|
9853
|
+
onCam,
|
|
9854
|
+
scene,
|
|
9855
|
+
i,
|
|
9856
|
+
canonical,
|
|
9857
|
+
ensureVoiceNode,
|
|
9858
|
+
usedVoIds,
|
|
9859
|
+
nodes,
|
|
9860
|
+
out,
|
|
9861
|
+
env.ttsLanguageCode
|
|
9862
|
+
);
|
|
9728
9863
|
}
|
|
9729
9864
|
}
|
|
9730
|
-
function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out) {
|
|
9865
|
+
function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceNode, usedVoIds, nodes, out, languageCode) {
|
|
9731
9866
|
const bySpeaker = /* @__PURE__ */ new Map();
|
|
9732
9867
|
for (const l of onCam) {
|
|
9733
9868
|
const speaker = canonical(l.speaker ?? "voiceover");
|
|
@@ -9759,7 +9894,8 @@ function emitCompositeMultiSpeakerVoice(onCam, scene, i, canonical, ensureVoiceN
|
|
|
9759
9894
|
i,
|
|
9760
9895
|
usedVoIds,
|
|
9761
9896
|
nodes,
|
|
9762
|
-
out
|
|
9897
|
+
out,
|
|
9898
|
+
languageCode
|
|
9763
9899
|
);
|
|
9764
9900
|
}
|
|
9765
9901
|
}
|
|
@@ -9806,7 +9942,7 @@ function emitBrollScene(scene, i, isLast, env, nodes, out, prevEndFrame) {
|
|
|
9806
9942
|
{ first, last },
|
|
9807
9943
|
{ dur: lengths.dur, trimTarget: lengths.trimTarget, genDur: lengths.genDur },
|
|
9808
9944
|
lengths.out,
|
|
9809
|
-
{ ar: env.ar, videoModel: env.opts.videoModel },
|
|
9945
|
+
{ ar: env.ar, videoModel: env.opts.videoModel, resolution: env.opts.resolution, nativeLang: env.ttsLanguageCode },
|
|
9810
9946
|
nodes
|
|
9811
9947
|
);
|
|
9812
9948
|
if (ambientBroll) {
|
|
@@ -9842,7 +9978,8 @@ function buildTimeline(blueprint, slots, opts, nodes) {
|
|
|
9842
9978
|
reuse,
|
|
9843
9979
|
cameraOn: onCameraDialogue(blueprint),
|
|
9844
9980
|
casts: castIdSet(blueprint),
|
|
9845
|
-
ingestCache: /* @__PURE__ */ new Map()
|
|
9981
|
+
ingestCache: /* @__PURE__ */ new Map(),
|
|
9982
|
+
ttsLanguageCode: deriveTtsLanguageCode(blueprint)
|
|
9846
9983
|
};
|
|
9847
9984
|
const out = {
|
|
9848
9985
|
clips: [],
|
|
@@ -9853,7 +9990,8 @@ function buildTimeline(blueprint, slots, opts, nodes) {
|
|
|
9853
9990
|
sceneSlice: /* @__PURE__ */ new Map()
|
|
9854
9991
|
};
|
|
9855
9992
|
const presenterPresent = makePresenterPresent(slots, canonical);
|
|
9856
|
-
const
|
|
9993
|
+
const presentStrict = makePresenterPresent(slots, canonical, { strict: true });
|
|
9994
|
+
const phrases = buildPhrases(blueprint, canonical, compositeScenes, presenterPresent, presentStrict);
|
|
9857
9995
|
const usedVoIds = /* @__PURE__ */ new Set();
|
|
9858
9996
|
const claimed = /* @__PURE__ */ new Set();
|
|
9859
9997
|
phrases.forEach((phrase, k) => {
|
|
@@ -9863,7 +10001,7 @@ function buildTimeline(blueprint, slots, opts, nodes) {
|
|
|
9863
10001
|
for (const s of available) claimed.add(s);
|
|
9864
10002
|
emitPhraseClip({ ...phrase, shownScenes: available }, voiceNode, env, nodes, out);
|
|
9865
10003
|
} else {
|
|
9866
|
-
emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out);
|
|
10004
|
+
emitPhraseTts(phrase, voiceNode, k, usedVoIds, nodes, out, env.ttsLanguageCode);
|
|
9867
10005
|
}
|
|
9868
10006
|
});
|
|
9869
10007
|
const lastIndex = blueprint.scenes.length - 1;
|
|
@@ -10200,7 +10338,7 @@ function scaffoldVideoCanvas(input, elementsInput, opts) {
|
|
|
10200
10338
|
params: { source: "path", path: todoPath2(elements[i], slot.label), expect: "image" }
|
|
10201
10339
|
});
|
|
10202
10340
|
});
|
|
10203
|
-
|
|
10341
|
+
buildElementSheets(slots, nodes);
|
|
10204
10342
|
const { clips, voTracks, vo_segments, talking_scenes } = buildTimeline(blueprint, slots, opts, nodes);
|
|
10205
10343
|
let videoRef = buildSpine(clips, nodes);
|
|
10206
10344
|
let videoNode = "spine";
|
|
@@ -10309,9 +10447,27 @@ function buildVideoMeta(blueprint, meta) {
|
|
|
10309
10447
|
duration_s: blueprint.source?.duration_s ?? lastSceneEnd(blueprint),
|
|
10310
10448
|
vo_segments: [...meta.vo_segments].sort((a, b) => a.start_s - b.start_s),
|
|
10311
10449
|
talking_scenes: meta.talking_scenes,
|
|
10450
|
+
lip_sync_caution: buildLipSyncCaution(meta.vo_segments),
|
|
10312
10451
|
motion_board: buildMotionBoard(blueprint)
|
|
10313
10452
|
};
|
|
10314
10453
|
}
|
|
10454
|
+
/**
 * List the speakers who, within one scene, have both a segment whose slot ends
 * in "_conv" and at least one other segment.
 *
 * Segments are grouped by `scene` and the scenes visited in ascending numeric
 * order. For each speaker owning a "_conv" slot in a scene, the slots of that
 * speaker's remaining segments in the same scene are gathered; speakers with
 * none are omitted.
 *
 * @param {Array<{ scene: number, speaker: string, slot: string }>} segments
 * @returns {Array<{ scene: number, speaker: string, tts_over_native: string[] }>}
 *   One entry per (scene, speaker) pair, with the non-"_conv" slots in segment
 *   order.
 */
function buildLipSyncCaution(segments) {
  // Presumably "_conv" marks the converted native-audio track — confirm with the emitter.
  const isNativeSlot = (segment) => segment.slot.endsWith("_conv");
  const segmentsByScene = /* @__PURE__ */ new Map();
  for (const segment of segments) {
    if (!segmentsByScene.has(segment.scene)) segmentsByScene.set(segment.scene, []);
    segmentsByScene.get(segment.scene).push(segment);
  }
  const orderedScenes = [...segmentsByScene].sort(([sceneA], [sceneB]) => sceneA - sceneB);
  const cautions = [];
  for (const [scene, sceneSegments] of orderedScenes) {
    const nativeSpeakers = /* @__PURE__ */ new Set();
    for (const segment of sceneSegments) {
      if (isNativeSlot(segment)) nativeSpeakers.add(segment.speaker);
    }
    for (const speaker of nativeSpeakers) {
      const ttsSlots = [];
      for (const segment of sceneSegments) {
        if (!isNativeSlot(segment) && segment.speaker === speaker) ttsSlots.push(segment.slot);
      }
      if (ttsSlots.length > 0) cautions.push({ scene, speaker, tts_over_native: ttsSlots });
    }
  }
  return cautions;
}
|
|
10315
10471
|
/**
 * Join a scene's dialogue lines into one string.
 *
 * @param {{ dialogue?: Array<{ line?: string }> }} scene
 * @returns {string | null} The trimmed, non-empty lines joined with single
 *   spaces, or `null` when the scene has no such line.
 */
function sceneSpokenText(scene) {
  const spoken = (scene.dialogue ?? []).flatMap((entry) => {
    const line = entry.line?.trim();
    return line ? [line] : [];
  });
  return spoken.length === 0 ? null : spoken.join(" ");
}
|
|
@@ -10743,7 +10899,11 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
10743
10899
|
"deconstruct-model": { type: "string", description: "Override the video_deconstruct model id" },
|
|
10744
10900
|
"select-model": { type: "string", description: "Override the text_generate model id for element selection" },
|
|
10745
10901
|
"image-model": { type: "string", description: "Override the image_generate model id for frames" },
|
|
10746
|
-
"video-model": { type: "string", description: "Override the video_generate model id for clips" }
|
|
10902
|
+
"video-model": { type: "string", description: "Override the video_generate model id for clips" },
|
|
10903
|
+
resolution: {
|
|
10904
|
+
type: "string",
|
|
10905
|
+
description: `Output resolution for generated clips (e.g. "1080p"). Default 1080p \u2014 the highest the video model supports \u2014 so clips keep the keyframe sharpness instead of the model's low default.`
|
|
10906
|
+
}
|
|
10747
10907
|
},
|
|
10748
10908
|
async run({ args }) {
|
|
10749
10909
|
const videoPath = path5.resolve(String(args.file));
|
|
@@ -10795,7 +10955,8 @@ var scaffoldVideoCommand = defineCommand76({
|
|
|
10795
10955
|
transcriptPath: captions.transcriptPath,
|
|
10796
10956
|
blueprintPath,
|
|
10797
10957
|
frames,
|
|
10798
|
-
ambient: Boolean(args.ambient)
|
|
10958
|
+
ambient: Boolean(args.ambient),
|
|
10959
|
+
...args.resolution ? { resolution: String(args.resolution) } : {}
|
|
10799
10960
|
};
|
|
10800
10961
|
let canvas;
|
|
10801
10962
|
let report;
|