npm - mulmocast - Versions diffs - 0.0.28 → 0.1.1 - Mend

mulmocast 0.0.28 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

package/README.md +25 -8
package/assets/templates/ghibli_shorts.json +2 -2
package/assets/templates/sensei_and_taro.json +1 -1
package/lib/actions/captions.js +16 -2
package/lib/actions/images.d.ts +5 -2
package/lib/actions/images.js +14 -34
package/lib/actions/movie.d.ts +1 -1
package/lib/actions/movie.js +110 -77
package/lib/agents/add_bgm_agent.js +15 -2
package/lib/agents/combine_audio_files_agent.js +53 -5
package/lib/agents/tts_openai_agent.js +2 -1
package/lib/cli/commands/tool/scripting/handler.js +1 -0
package/lib/methods/mulmo_presentation_style.d.ts +3 -2
package/lib/methods/mulmo_presentation_style.js +8 -4
package/lib/types/schema.d.ts +309 -115
package/lib/types/schema.js +48 -11
package/lib/types/type.d.ts +5 -2
package/lib/utils/context.d.ts +13 -4
package/lib/utils/file.js +8 -0
package/lib/utils/image_plugins/index.d.ts +2 -1
package/lib/utils/image_plugins/index.js +2 -1
package/lib/utils/image_plugins/voice_over.d.ts +5 -0
package/lib/utils/image_plugins/voice_over.js +9 -0
package/lib/utils/preprocess.d.ts +12 -3
package/lib/utils/utils.d.ts +1 -0
package/lib/utils/utils.js +14 -0
package/package.json +12 -12
package/scripts/templates/voice_over.json +60 -0

package/lib/agents/combine_audio_files_agent.js CHANGED Viewed

@@ -1,10 +1,12 @@
 import { assert, GraphAILogger } from "graphai";
 import { silent60secPath } from "../utils/file.js";
 import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
+import { userAssert } from "../utils/utils.js";
 const getMovieDulation = async (beat) => {
     if (beat.image?.type === "movie" && (beat.image.source.kind === "url" || beat.image.source.kind === "path")) {
         const pathOrUrl = beat.image.source.kind === "url" ? beat.image.source.url : beat.image.source.path;
-        return await ffmpegGetMediaDuration(pathOrUrl);
+        const speed = beat.movieParams?.speed ?? 1.0;
+        return (await ffmpegGetMediaDuration(pathOrUrl)) / speed;
     }
     return 0;
 };
@@ -65,7 +67,45 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
     const mediaDurations = await getMediaDurations(context);
     const beatDurations = [];
     context.studio.script.beats.forEach((beat, index) => {
+        if (beatDurations.length > index) {
+            // The current beat has already been processed.
+            return;
+        }
+        assert(beatDurations.length === index, "beatDurations.length !== index");
         const { audioDuration, movieDuration } = mediaDurations[index];
+        // Check if we are processing a voice-over beat.
+        if (movieDuration > 0) {
+            const group = [index];
+            for (let i = index + 1; i < context.studio.beats.length && context.studio.script.beats[i].image?.type === "voice_over"; i++) {
+                group.push(i);
+            }
+            if (group.length > 1) {
+                group.reduce((remaining, idx, iGroup) => {
+                    const subBeatDurations = mediaDurations[idx];
+                    userAssert(subBeatDurations.audioDuration <= remaining, `Duration Overflow: At index(${idx}) audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
+                    if (iGroup === group.length - 1) {
+                        beatDurations.push(remaining);
+                        subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
+                        return 0;
+                    }
+                    const nextBeat = context.studio.script.beats[idx + 1];
+                    assert(nextBeat.image?.type === "voice_over", "nextBeat.image.type !== voice_over");
+                    const voiceStartAt = nextBeat.image?.startAt;
+                    if (voiceStartAt) {
+                        const remainingDuration = movieDuration - voiceStartAt;
+                        const duration = remaining - remainingDuration;
+                        userAssert(duration >= 0, `Invalid startAt: At index(${idx}), avaiable duration(${duration}) < 0`);
+                        beatDurations.push(duration);
+                        subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
+                        userAssert(subBeatDurations.silenceDuration >= 0, `Duration Overwrap: At index(${idx}), silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
+                        return remainingDuration;
+                    }
+                    beatDurations.push(subBeatDurations.audioDuration);
+                    return remaining - subBeatDurations.audioDuration;
+                }, movieDuration);
+                return;
+            }
+        }
         // Check if the current beat has media and the next beat does not have media.
         if (audioDuration > 0) {
             // Check if the current beat has spilled over audio.
@@ -111,17 +151,15 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
         }
         else if (movieDuration > 0) {
             // This beat has only a movie, not audio.
-            assert(beatDurations.length === index, "beatDurations.length !== index");
             beatDurations.push(movieDuration);
             mediaDurations[index].silenceDuration = movieDuration;
         }
-        else if (beatDurations.length === index) {
+        else {
             // The current beat has no audio, nor no spilled over audio
             const beatDuration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
             beatDurations.push(beatDuration);
             mediaDurations[index].silenceDuration = beatDuration;
         }
-        // else { Skip this beat if the duration has been already added as a group }
     });
     assert(beatDurations.length === context.studio.beats.length, "beatDurations.length !== studio.beats.length");
     // We cannot reuse longSilentId. We need to explicitly split it for each beat.
@@ -152,9 +190,19 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
     const result = {
         studio: {
             ...context.studio,
-            beats: context.studio.beats.map((studioBeat, index) => ({ ...studioBeat, duration: beatDurations[index] })),
+            beats: context.studio.beats.map((studioBeat, index) => ({
+                ...studioBeat,
+                duration: beatDurations[index],
+                audioDuration: mediaDurations[index].audioDuration,
+                movieDuration: mediaDurations[index].movieDuration,
+                silenceDuration: mediaDurations[index].silenceDuration,
+            })),
         },
     };
+    result.studio.beats.reduce((acc, beat) => {
+        beat.startAt = acc;
+        return acc + beat.duration;
+    }, 0);
     // context.studio = result.studio; // TODO: removing this breaks test/test_movie.ts
     return {
         ...context,

package/lib/agents/tts_openai_agent.js CHANGED Viewed

@@ -29,12 +29,13 @@ export const ttsOpenaiAgent = async ({ namedInputs, params, config }) => {
         if (e && typeof e === "object" && "error" in e) {
             GraphAILogger.info("tts_openai_agent: ");
             GraphAILogger.info(e.error);
+            throw new Error("TTS OpenAI Error: " + JSON.stringify(e.error, null, 2));
         }
         else if (e instanceof Error) {
             GraphAILogger.info("tts_openai_agent: ");
             GraphAILogger.info(e.message);
+            throw new Error("TTS OpenAI Error: " + e.message);
         }
-        throw new Error("TTS OpenAI Error");
     }
 };
 const ttsOpenaiAgentInfo = {

package/lib/cli/commands/tool/scripting/handler.js CHANGED Viewed

@@ -34,6 +34,7 @@ export const handler = async (argv) => {
     const context = { outDirPath, templateName: template, urls, filename: filename, cacheDirPath, llm_model, llm, verbose };
     if (interactive) {
         await createMulmoScriptInteractively(context);
+        return;
     }
     if (inputFile) {
         await createMulmoScriptFromFile(inputFile, context);

package/lib/methods/mulmo_presentation_style.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import "dotenv/config";
-import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData } from "../types/index.js";
+import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData, Text2ImageProvider } from "../types/index.js";
 export declare const MulmoPresentationStyleMethods: {
     getCanvasSize(presentationStyle: MulmoPresentationStyle): MulmoCanvasDimension;
     getSpeechProvider(presentationStyle: MulmoPresentationStyle): Text2SpeechProvider;
@@ -9,7 +9,8 @@ export declare const MulmoPresentationStyleMethods: {
     getSpeaker(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeakerData;
     getProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
     getVoiceId(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
-    getImageAgentInfo(presentationStyle: MulmoPresentationStyle): Text2ImageAgentInfo;
+    getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
+    getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
     getHtmlImageAgentInfo(presentationStyle: MulmoPresentationStyle): Text2HtmlAgentInfo;
     getImageType(_: MulmoPresentationStyle, beat: MulmoBeat): BeatMediaType;
 };

package/lib/methods/mulmo_presentation_style.js CHANGED Viewed

@@ -57,17 +57,21 @@ export const MulmoPresentationStyleMethods = {
         const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
         return speaker.voiceId;
     },
-    getImageAgentInfo(presentationStyle) {
+    getText2ImageProvider(provider) {
+        return text2ImageProviderSchema.parse(provider);
+    },
+    getImageAgentInfo(presentationStyle, beat) {
         // Notice that we copy imageParams from presentationStyle and update
         // provider and model appropriately.
-        const provider = text2ImageProviderSchema.parse(presentationStyle.imageParams?.provider);
+        const imageParams = { ...presentationStyle.imageParams, ...beat?.imageParams };
+        const provider = MulmoPresentationStyleMethods.getText2ImageProvider(imageParams?.provider);
         const defaultImageParams = {
+            provider,
             model: provider === "openai" ? (process.env.DEFAULT_OPENAI_IMAGE_MODEL ?? defaultOpenAIImageModel) : undefined,
         };
         return {
-            provider,
             agent: provider === "google" ? "imageGoogleAgent" : "imageOpenaiAgent",
-            imageParams: { ...defaultImageParams, ...presentationStyle.imageParams },
+            imageParams: { ...defaultImageParams, ...imageParams },
         };
     },
     getHtmlImageAgentInfo(presentationStyle) {