mulmocast 0.0.28 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -82,6 +82,16 @@ brew install ffmpeg
  # Visit https://ffmpeg.org/download.html
  ```

+ You can also use the [`Dockerfile`](./Dockerfile), which installs the prerequisites for you.
+ ```
+ docker build -t mulmo-cli .
+ ```
+
+ You can use the Docker image like this:
+ ```
+ docker run -e OPENAI_API_KEY=<your_openai_api_key> -it mulmo-cli mulmo tool scripting -i -t children_book -o ./ -s story
+ ```
+
  ## Configuration

  Create a `.env` file in your project directory with the following API keys:
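A possible variation of the `docker run` command above, sketched with Docker's standard `--env-file` and volume-mount flags so that the `.env` file described under Configuration is reused and generated files stay on the host; the `/work` mount point is an arbitrary choice, not something the image defines:

```
docker run --env-file .env -v "$(pwd)":/work -w /work -it mulmo-cli mulmo tool scripting -i -t children_book -o ./ -s story
```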
@@ -200,11 +210,18 @@ writing: /Users/username/path/to/output/story-1747834931950__ja.mp4
  # Generate script from web content (requires Browserless API KEY)
  mulmo tool scripting -u https://example.com

+ # Generate script from local file
+ mulmo tool scripting --input-file story.txt
+
  # Generate script with interactive mode
  mulmo tool scripting -i
  ```

- When using the `sensei_and_taro` template, a Nijivoice API key is required.
+ Note:
+ - When using the `sensei_and_taro` template, a Nijivoice API key is required
+ - When `-i` is specified, the `--input-file` value is ignored
+ - When `--input-file` is specified, the `-u` value is ignored
+
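Read together, these notes give a simple precedence order: `-i` beats `--input-file`, which beats `-u`. For example (file name and URL are illustrative):

```
# --input-file takes precedence, so the URL is ignored
mulmo tool scripting --input-file story.txt -u https://example.com

# -i takes precedence, so story.txt is ignored
mulmo tool scripting -i --input-file story.txt
```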

  ## Generate content from MulmoScript

@@ -308,7 +325,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -a, --audiodir Audio output directory [string]
  ```
@@ -329,7 +345,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -i, --imagedir Image output directory [string]
  ```
@@ -350,7 +365,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -a, --audiodir Audio output directory [string]
  -i, --imagedir Image output directory [string]
@@ -411,16 +425,19 @@ Options:
  -b, --basedir base dir [string]
  -u, --url URLs to reference (required when not in interactive mode)
  [array] [default: []]
+ --input-file input file name [string]
  -i, --interactive Generate script in interactive mode with user prompts
  [boolean]
  -t, --template Template name to use
- [string] [choices: "business", "children_book", "coding", "comic_strips",
- "ghibli_strips", "podcast_standard", "sensei_and_taro"]
+ [string] [choices: "akira_comic", "business", "children_book", "coding",
+ "comic_strips", "drslump_comic", "ghibli_comic", "ghibli_image_only",
+ "ghibli_shorts", "ghost_comic", "onepiece_comic", "podcast_standard",
+ "portrait_movie", "realistic_movie", "sensei_and_taro", "shorts",
+ "text_and_image", "text_only", "trailer"]
  -c, --cache cache dir [string]
  -s, --script script filename [string] [default: "script"]
  --llm llm
- [string] [choices: "openAIAgent", "anthropicAgent", "geminiAgent",
- "groqAgent"]
+ [string] [choices: "openai", "anthropic", "gemini", "groq"]
  --llm_model llm model [string]
  ```
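The expanded choice lists above can be combined in a single invocation; a sketch using only values documented in this help text (the script name is illustrative):

```
mulmo tool scripting -u https://example.com -t shorts --llm anthropic -s my_story
```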
 
@@ -1,5 +1,5 @@
  {
- "title": "Ghibli comic style",
+ "title": "Ghibli style for YouTube Shorts",
  "description": "Template for Ghibli-style comic presentation.",
  "systemPrompt": "Generate a Japanese script for a Youtube shorts of the given topic. Another AI will generate comic strips for each beat based on the text description of that beat. Mention the reference in one of beats, if it exists. Use the JSON below as a template.",
  "presentationStyle": {
@@ -14,7 +14,7 @@
  "speechParams": {
  "provider": "nijivoice",
  "speakers": {
- "Presenter": { "voiceId": "afd7df65-0fdc-4d31-ae8b-a29f0f5eed62", "speechOptions": { "speed": 1.5 } }
+ "Presenter": { "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c", "speechOptions": { "speed": 1.5 } }
  }
  },
  "imageParams": {
@@ -17,7 +17,7 @@
  "speechParams": {
  "provider": "nijivoice",
  "speakers": {
- "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "afd7df65-0fdc-4d31-ae8b-a29f0f5eed62" },
+ "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c" },
  "Student": { "displayName": { "ja": "太郎" }, "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f" },
  "Teacher": { "displayName": { "ja": "先生" }, "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae" }
  }
@@ -1,14 +1,16 @@
  import { mulmoCaptionParamsSchema } from "../types/index.js";
  import { GraphAI, GraphAILogger } from "graphai";
  import * as agents from "@graphai/vanilla";
- import { getHTMLFile, getCaptionImagePath } from "../utils/file.js";
+ import { getHTMLFile, getCaptionImagePath, getOutputStudioFilePath } from "../utils/file.js";
  import { renderHTMLToImage, interpolate } from "../utils/markdown.js";
  import { MulmoStudioContextMethods, MulmoPresentationStyleMethods } from "../methods/index.js";
+ import { fileWriteAgent } from "@graphai/vanilla_node_agents";
  const vanillaAgents = agents.default ?? agents;
  const graph_data = {
  version: 0.5,
  nodes: {
  context: {},
+ outputStudioFilePath: {},
  map: {
  agent: "mapAgent",
  inputs: { rows: ":context.studio.script.beats", context: ":context" },
@@ -60,14 +62,26 @@ const graph_data = {
  },
  },
  },
+ fileWrite: {
+ agent: "fileWriteAgent",
+ inputs: {
+ onComplete: ":map.generateCaption",
+ file: ":outputStudioFilePath",
+ text: ":context.studio.toJSON()",
+ },
+ },
  },
  };
  export const captions = async (context, callbacks) => {
  if (MulmoStudioContextMethods.getCaption(context)) {
  try {
  MulmoStudioContextMethods.setSessionState(context, "caption", true);
- const graph = new GraphAI(graph_data, { ...vanillaAgents });
+ const graph = new GraphAI(graph_data, { ...vanillaAgents, fileWriteAgent });
+ const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+ const fileName = MulmoStudioContextMethods.getFileName(context);
+ const outputStudioFilePath = getOutputStudioFilePath(outDirPath, fileName);
  graph.injectValue("context", context);
+ graph.injectValue("outputStudioFilePath", outputStudioFilePath);
  if (callbacks) {
  callbacks.forEach((callback) => {
  graph.registerCallback(callback);
@@ -1,13 +1,13 @@
  import type { CallbackFunction } from "graphai";
- import { MulmoStudioContext, MulmoBeat, Text2ImageAgentInfo } from "../types/index.js";
+ import { MulmoStudioContext, MulmoBeat } from "../types/index.js";
  export declare const imagePreprocessAgent: (namedInputs: {
  context: MulmoStudioContext;
  beat: MulmoBeat;
  index: number;
- imageAgentInfo: Text2ImageAgentInfo;
  imageRefs: Record<string, string>;
  }) => Promise<{
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -42,6 +42,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  images: string[];
  imageFromMovie: boolean;
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -68,6 +69,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  } | {
  images: string[];
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -89,6 +91,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  }> | undefined;
  };
  movieFile: string | undefined;
+ imageAgentInfo: import("../types/type.js").Text2ImageAgentInfo;
  imagePath: string;
  referenceImage: string;
  prompt: string;
@@ -10,7 +10,7 @@ import { fileCacheAgentFilter } from "../utils/filters.js";
  import { imageGoogleAgent, imageOpenaiAgent, movieGoogleAgent, movieReplicateAgent, mediaMockAgent } from "../agents/index.js";
  import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../methods/index.js";
  import { findImagePlugin } from "../utils/image_plugins/index.js";
- import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
+ import { userAssert, settings2GraphAIConfig, getExtention } from "../utils/utils.js";
  import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
  import { defaultOpenAIImageModel } from "../utils/const.js";
  import { renderHTMLToImage } from "../utils/markdown.js";
@@ -25,11 +25,12 @@ const htmlStyle = (context, beat) => {
  };
  };
  export const imagePreprocessAgent = async (namedInputs) => {
- const { context, beat, index, imageAgentInfo, imageRefs } = namedInputs;
- const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
+ const { context, beat, index, imageRefs } = namedInputs;
+ const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle, beat);
+ // const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
  const imagePath = getBeatPngImagePath(context, index);
  const returnValue = {
- imageParams,
+ imageParams: imageAgentInfo.imageParams,
  movieFile: beat.moviePrompt ? getBeatMoviePath(context, index) : undefined,
  };
  if (beat.image) {
@@ -54,8 +55,8 @@ export const imagePreprocessAgent = async (namedInputs) => {
  if (beat.moviePrompt && !beat.imagePrompt) {
  return { ...returnValue, imagePath, images, imageFromMovie: true }; // no image prompt, only movie prompt
  }
- const prompt = imagePrompt(beat, imageParams.style);
- return { imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
+ const prompt = imagePrompt(beat, imageAgentInfo.imageParams.style);
+ return { imageAgentInfo, imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
  };
  export const imagePluginAgent = async (namedInputs) => {
  const { context, beat, index } = namedInputs;
@@ -87,7 +88,6 @@ const beat_graph_data = {
  concurrency: 4,
  nodes: {
  context: {},
- imageAgentInfo: {},
  htmlImageAgentInfo: {},
  movieAgentInfo: {},
  imageRefs: {},
@@ -99,7 +99,6 @@ const beat_graph_data = {
  context: ":context",
  beat: ":beat",
  index: ":__mapIndex",
- imageAgentInfo: ":imageAgentInfo",
  imageRefs: ":imageRefs",
  },
  },
@@ -142,7 +141,7 @@ const beat_graph_data = {
  },
  imageGenerator: {
  if: ":preprocessor.prompt",
- agent: ":imageAgentInfo.agent",
+ agent: ":preprocessor.imageAgentInfo.agent",
  retry: 2,
  inputs: {
  prompt: ":preprocessor.prompt",
@@ -213,7 +212,6 @@ const graph_data = {
  concurrency: 4,
  nodes: {
  context: {},
- imageAgentInfo: {},
  htmlImageAgentInfo: {},
  movieAgentInfo: {},
  outputStudioFilePath: {},
@@ -223,7 +221,6 @@ const graph_data = {
  inputs: {
  rows: ":context.studio.script.beats",
  context: ":context",
- imageAgentInfo: ":imageAgentInfo",
  htmlImageAgentInfo: ":htmlImageAgentInfo",
  movieAgentInfo: ":movieAgentInfo",
  imageRefs: ":imageRefs",
@@ -306,10 +303,10 @@ const graphOption = async (context, settings) => {
  agentFilters,
  taskManager,
  };
- const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
  const config = settings2GraphAIConfig(settings);
  // We need to get google's auth token only if the google is the text2image provider.
- if (imageAgentInfo.provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
+ if (provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
  userAssert(!!process.env.GOOGLE_PROJECT_ID, "GOOGLE_PROJECT_ID is not set");
  GraphAILogger.log("google was specified as text2image engine");
  const token = await googleAuth();
@@ -342,23 +339,7 @@ export const getImageRefs = async (context) => {
  }
  const buffer = Buffer.from(await response.arrayBuffer());
  // Detect file extension from Content-Type header or URL
- const extension = (() => {
- const contentType = response.headers.get("content-type");
- if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
- return "jpg";
- }
- else if (contentType?.includes("png")) {
- return "png";
- }
- else {
- // Fall back to URL extension
- const urlExtension = image.source.url.split(".").pop()?.toLowerCase();
- if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
- return urlExtension === "jpeg" ? "jpg" : urlExtension;
- }
- return "png"; // default
- }
- })();
+ const extension = getExtention(response.headers.get("content-type"), image.source.url);
  const imagePath = getReferenceImagePath(context, key, extension);
  await fs.promises.writeFile(imagePath, buffer);
  imageRefs[key] = imagePath;
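The inline extension detection removed above now sits behind `getExtention` in `utils/utils.js`, whose implementation is not part of this diff. A sketch reconstructed from the deleted logic, shown only to indicate what the helper presumably does:

```
// Reconstruction for orientation only; the real helper lives in utils/utils.js.
const getExtention = (contentType, url) => {
  if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
    return "jpg";
  }
  if (contentType?.includes("png")) {
    return "png";
  }
  // Fall back to the URL extension
  const urlExtension = url.split(".").pop()?.toLowerCase();
  if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
    return urlExtension === "jpeg" ? "jpg" : urlExtension;
  }
  return "png"; // default
};
```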
@@ -372,7 +353,7 @@ const prepareGenerateImages = async (context) => {
  const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
  const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
  mkdir(imageProjectDirPath);
- const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
  const htmlImageAgentInfo = MulmoPresentationStyleMethods.getHtmlImageAgentInfo(context.presentationStyle);
  const imageRefs = await getImageRefs(context);
  // Determine movie agent based on provider
@@ -386,10 +367,9 @@ const prepareGenerateImages = async (context) => {
  return "movieGoogleAgent";
  }
  };
- GraphAILogger.info(`text2image: provider=${imageAgentInfo.provider} model=${imageAgentInfo.imageParams.model}`);
+ GraphAILogger.info(`text2image: provider=${provider} model=${context.presentationStyle.imageParams?.model}`);
  const injections = {
  context,
- imageAgentInfo,
  htmlImageAgentInfo,
  movieAgentInfo: {
  agent: getMovieAgent(),
@@ -404,7 +384,7 @@ const getConcurrency = (context) => {
  return 4;
  }
  const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
- if (imageAgentInfo.provider === "openai") {
+ if (imageAgentInfo.imageParams.provider === "openai") {
  // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
  // dall-e-3: 7,500 RPM、15 images per minute (4 images for max resolution)
  // gpt-image-1:3,000,000 TPM、150 images per minute
@@ -1,5 +1,5 @@
  import { MulmoStudioContext, MulmoCanvasDimension, BeatMediaType, MulmoFillOption } from "../types/index.js";
- export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption) => {
+ export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption, speed: number) => {
  videoId: string;
  videoPart: string;
  };
@@ -2,24 +2,32 @@ import { GraphAILogger, assert } from "graphai";
  import { mulmoTransitionSchema, mulmoFillOptionSchema } from "../types/index.js";
  import { MulmoPresentationStyleMethods } from "../methods/index.js";
  import { getAudioArtifactFilePath, getOutputVideoFilePath, writingMessage } from "../utils/file.js";
- import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput } from "../utils/ffmpeg_utils.js";
+ import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput, } from "../utils/ffmpeg_utils.js";
  import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  // const isMac = process.platform === "darwin";
  const videoCodec = "libx264"; // "h264_videotoolbox" (macOS only) is too noisy
- export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption) => {
+ export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption, speed) => {
  const videoId = `v${inputIndex}`;
  const videoFilters = [];
  // Handle different media types
+ const originalDuration = duration * speed;
  if (mediaType === "image") {
  videoFilters.push("loop=loop=-1:size=1:start=0");
  }
  else if (mediaType === "movie") {
  // For videos, extend with last frame if shorter than required duration
  // tpad will extend the video by cloning the last frame, then trim will ensure exact duration
- videoFilters.push(`tpad=stop_mode=clone:stop_duration=${duration * 2}`); // Use 2x duration to ensure coverage
+ videoFilters.push(`tpad=stop_mode=clone:stop_duration=${originalDuration * 2}`); // Use 2x duration to ensure coverage
  }
  // Common filters for all media types
- videoFilters.push(`trim=duration=${duration}`, "fps=30", "setpts=PTS-STARTPTS");
+ videoFilters.push(`trim=duration=${originalDuration}`, "fps=30");
+ // Apply speed if specified
+ if (speed !== 1.0) {
+ videoFilters.push(`setpts=${1 / speed}*PTS`);
+ }
+ else {
+ videoFilters.push("setpts=PTS-STARTPTS");
+ }
  // Apply scaling based on fill option
  if (fillOption.style === "aspectFill") {
  // For aspect fill: scale to fill the canvas completely, cropping if necessary
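A hand-worked example of what the new `speed` handling produces, with illustrative numbers: a movie beat that must occupy 4 seconds of output with `movieParams.speed` set to 2.0 gives `originalDuration = 8`, so 8 seconds of source are trimmed and then compressed into 4:

```
// duration = 4, speed = 2.0  =>  originalDuration = 8
// videoFilters before the fill-option scaling step:
//   tpad=stop_mode=clone:stop_duration=16   (2x originalDuration, to guarantee coverage)
//   trim=duration=8
//   fps=30
//   setpts=0.5*PTS                          (1 / speed: 8s of source play back in 4s)
```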
@@ -69,32 +77,94 @@ const getOutputOption = (audioId, videoId) => {
  "-b:a 128k", // Audio bitrate
  ];
  };
+ const addCaptions = (ffmpegContext, concatVideoId, context, caption) => {
+ const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+ if (caption && beatsWithCaptions.length > 0) {
+ const introPadding = context.presentationStyle.audioParams.introPadding;
+ return beatsWithCaptions.reduce((acc, beat, index) => {
+ const { startAt, duration, captionFile } = beat;
+ if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
+ const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
+ const compositeVideoId = `oc${index}`;
+ ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
+ return compositeVideoId;
+ }
+ return acc;
+ }, concatVideoId);
+ }
+ return concatVideoId;
+ };
+ const addTransitionEffects = (ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps) => {
+ if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
+ const transition = mulmoTransitionSchema.parse(context.presentationStyle.movieParams.transition);
+ return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
+ const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
+ const processedVideoId = `${transitionVideoId}_f`;
+ let transitionFilter;
+ if (transition.type === "fade") {
+ transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+ }
+ else if (transition.type === "slideout_left") {
+ transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+ }
+ else {
+ throw new Error(`Unknown transition type: ${transition.type}`);
+ }
+ ffmpegContext.filterComplex.push(transitionFilter);
+ const outputId = `${transitionVideoId}_o`;
+ if (transition.type === "fade") {
+ ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+ }
+ else if (transition.type === "slideout_left") {
+ ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+ }
+ return outputId;
+ }, captionedVideoId);
+ }
+ return captionedVideoId;
+ };
+ const mixAudiosFromMovieBeats = (ffmpegContext, artifactAudioId, audioIdsFromMovieBeats) => {
+ if (audioIdsFromMovieBeats.length > 0) {
+ const mainAudioId = "mainaudio";
+ const compositeAudioId = "composite";
+ const audioIds = audioIdsFromMovieBeats.map((id) => `[${id}]`).join("");
+ FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
+ ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${audioIdsFromMovieBeats.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
+ return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
+ }
+ return artifactAudioId;
+ };
  const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const caption = MulmoStudioContextMethods.getCaption(context);
  const start = performance.now();
  const ffmpegContext = FfmpegContextInit();
- const missingIndex = context.studio.beats.findIndex((beat) => !beat.imageFile && !beat.movieFile);
+ const missingIndex = context.studio.beats.findIndex((studioBeat, index) => {
+ const beat = context.studio.script.beats[index];
+ if (beat.image?.type === "voice_over") {
+ return false; // Voice-over does not have either imageFile or movieFile.
+ }
+ return !studioBeat.imageFile && !studioBeat.movieFile;
+ });
  if (missingIndex !== -1) {
  GraphAILogger.info(`ERROR: beat.imageFile or beat.movieFile is not set on beat ${missingIndex}.`);
  return false;
  }
  const canvasInfo = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
  // Add each image input
- const filterComplexVideoIds = [];
- const filterComplexAudioIds = [];
+ const videoIdsForBeats = [];
+ const audioIdsFromMovieBeats = [];
  const transitionVideoIds = [];
  const beatTimestamps = [];
  context.studio.beats.reduce((timestamp, studioBeat, index) => {
  const beat = context.studio.script.beats[index];
- const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
- if (!sourceFile) {
- throw new Error(`studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
- }
- if (!studioBeat.duration) {
- throw new Error(`studioBeat.duration is not set: index=${index}`);
+ if (beat.image?.type === "voice_over") {
+ videoIdsForBeats.push(undefined);
+ beatTimestamps.push(timestamp);
+ return timestamp; // Skip voice-over beats.
  }
- const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
- const mediaType = studioBeat.movieFile ? "movie" : MulmoPresentationStyleMethods.getImageType(context.presentationStyle, beat);
+ const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
+ assert(!!sourceFile, `studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
+ assert(!!studioBeat.duration, `studioBeat.duration is not set: index=${index}`);
  const extraPadding = (() => {
  // We need to consider only intro and outro padding because the other paddings were already added to the beat.duration
  if (index === 0) {
@@ -105,93 +175,56 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  }
  return 0;
  })();
- const duration = studioBeat.duration + extraPadding;
+ // The movie duration is bigger in case of voice-over.
+ const duration = Math.max(studioBeat.duration + extraPadding, studioBeat.movieDuration ?? 0);
  // Get fillOption from merged imageParams (global + beat-specific)
  const globalFillOption = context.presentationStyle.movieParams?.fillOption;
  const beatFillOption = beat.movieParams?.fillOption;
  const defaultFillOption = mulmoFillOptionSchema.parse({}); // let the schema infer the default value
  const fillOption = { ...defaultFillOption, ...globalFillOption, ...beatFillOption };
- const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption);
+ const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
+ const mediaType = studioBeat.movieFile ? "movie" : MulmoPresentationStyleMethods.getImageType(context.presentationStyle, beat);
+ const speed = beat.movieParams?.speed ?? 1.0;
+ const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption, speed);
  ffmpegContext.filterComplex.push(videoPart);
- if (caption && studioBeat.captionFile) {
- const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
- const compositeVideoId = `c${index}`;
- ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
- filterComplexVideoIds.push(compositeVideoId);
- }
- else {
- filterComplexVideoIds.push(videoId);
- }
  if (context.presentationStyle.movieParams?.transition && index < context.studio.beats.length - 1) {
- const sourceId = filterComplexVideoIds.pop();
- ffmpegContext.filterComplex.push(`[${sourceId}]split=2[${sourceId}_0][${sourceId}_1]`);
- filterComplexVideoIds.push(`${sourceId}_0`);
+ // NOTE: We split the video into two parts for transition.
+ ffmpegContext.filterComplex.push(`[${videoId}]split=2[${videoId}_0][${videoId}_1]`);
+ videoIdsForBeats.push(`${videoId}_0`);
  if (mediaType === "movie") {
  // For movie beats, extract the last frame for transition
- ffmpegContext.filterComplex.push(`[${sourceId}_1]reverse,select='eq(n,0)',reverse,tpad=stop_mode=clone:stop_duration=${duration},fps=30,setpts=PTS-STARTPTS[${sourceId}_2]`);
- transitionVideoIds.push(`${sourceId}_2`);
+ ffmpegContext.filterComplex.push(`[${videoId}_1]reverse,select='eq(n,0)',reverse,tpad=stop_mode=clone:stop_duration=${duration},fps=30,setpts=PTS-STARTPTS[${videoId}_2]`);
+ transitionVideoIds.push(`${videoId}_2`);
  }
  else {
- transitionVideoIds.push(`${sourceId}_1`);
+ transitionVideoIds.push(`${videoId}_1`);
  }
  }
- if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0) {
+ else {
+ videoIdsForBeats.push(videoId);
+ }
+ // NOTE: We don't support audio if the speed is not 1.0.
+ if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0 && speed === 1.0) {
  const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
- filterComplexAudioIds.push(audioId);
+ audioIdsFromMovieBeats.push(audioId);
  ffmpegContext.filterComplex.push(audioPart);
  }
  beatTimestamps.push(timestamp);
  return timestamp + duration;
  }, 0);
- assert(filterComplexVideoIds.length === context.studio.beats.length, "videoIds.length !== studio.beats.length");
+ assert(videoIdsForBeats.length === context.studio.beats.length, "videoIds.length !== studio.beats.length");
  assert(beatTimestamps.length === context.studio.beats.length, "beatTimestamps.length !== studio.beats.length");
  // console.log("*** images", images.audioIds);
  // Concatenate the trimmed images
  const concatVideoId = "concat_video";
- ffmpegContext.filterComplex.push(`${filterComplexVideoIds.map((id) => `[${id}]`).join("")}concat=n=${context.studio.beats.length}:v=1:a=0[${concatVideoId}]`);
- // Add tranditions if needed
- const mixedVideoId = (() => {
- if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
- const transition = mulmoTransitionSchema.parse(context.presentationStyle.movieParams.transition);
- return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
- const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
- const processedVideoId = `${transitionVideoId}_f`;
- let transitionFilter;
- if (transition.type === "fade") {
- transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
- }
- else if (transition.type === "slideout_left") {
- transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
- }
- else {
- throw new Error(`Unknown transition type: ${transition.type}`);
- }
- ffmpegContext.filterComplex.push(transitionFilter);
- const outputId = `${transitionVideoId}_o`;
- if (transition.type === "fade") {
- ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
- }
- else if (transition.type === "slideout_left") {
- ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
- }
- return outputId;
- }, concatVideoId);
- }
- return concatVideoId;
- })();
+ const videoIds = videoIdsForBeats.filter((id) => id !== undefined); // filter out voice-over beats
+ ffmpegContext.filterComplex.push(`${videoIds.map((id) => `[${id}]`).join("")}concat=n=${videoIds.length}:v=1:a=0[${concatVideoId}]`);
+ const captionedVideoId = addCaptions(ffmpegContext, concatVideoId, context, caption);
+ const mixedVideoId = addTransitionEffects(ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps);
+ GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
  const audioIndex = FfmpegContextAddInput(ffmpegContext, audioArtifactFilePath); // Add audio input
  const artifactAudioId = `${audioIndex}:a`;
- const ffmpegContextAudioId = (() => {
- if (filterComplexAudioIds.length > 0) {
- const mainAudioId = "mainaudio";
- const compositeAudioId = "composite";
- const audioIds = filterComplexAudioIds.map((id) => `[${id}]`).join("");
- FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
- ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${filterComplexAudioIds.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
- return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
- }
- return artifactAudioId;
- })();
+ const ffmpegContextAudioId = mixAudiosFromMovieBeats(ffmpegContext, artifactAudioId, audioIdsFromMovieBeats);
  // GraphAILogger.debug("filterComplex", ffmpegContext.filterComplex);
  await FfmpegContextGenerateOutput(ffmpegContext, outputVideoPath, getOutputOption(ffmpegContextAudioId, mixedVideoId));
  const end = performance.now();
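To make the caption timing in the new `addCaptions` helper concrete, with illustrative numbers: for an `introPadding` of 1 and a beat whose `startAt` is 10 and `duration` is 5, the overlay for that beat is enabled from t=11 to t=16 on the concatenated video (the caption's ffmpeg input index depends on how many inputs were added before it):

```
// [concat_video][<captionInputIndex>:v]overlay=format=auto:enable='between(t,11,16)'[oc0]
```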
@@ -1,8 +1,15 @@
+ import fs from "fs";
  import { GraphAILogger } from "graphai";
  import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextGenerateOutput, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
  const addBGMAgent = async ({ namedInputs, params, }) => {
  const { voiceFile, outputFile, context } = namedInputs;
  const { musicFile } = params;
+ if (!fs.existsSync(voiceFile)) {
+ throw new Error(`AddBGMAgent voiceFile not exist: ${voiceFile}`);
+ }
+ if (!musicFile.match(/^http/) && !fs.existsSync(musicFile)) {
+ throw new Error(`AddBGMAgent musicFile not exist: ${musicFile}`);
+ }
  const speechDuration = await ffmpegGetMediaDuration(voiceFile);
  const introPadding = context.presentationStyle.audioParams.introPadding;
  const outroPadding = context.presentationStyle.audioParams.outroPadding;
@@ -16,8 +23,14 @@ const addBGMAgent = async ({ namedInputs, params, }) => {
  ffmpegContext.filterComplex.push(`[music][voice]amix=inputs=2:duration=longest[mixed]`);
  ffmpegContext.filterComplex.push(`[mixed]atrim=start=0:end=${totalDuration}[trimmed]`);
  ffmpegContext.filterComplex.push(`[trimmed]afade=t=out:st=${totalDuration - outroPadding}:d=${outroPadding}[faded]`);
- await FfmpegContextGenerateOutput(ffmpegContext, outputFile, ["-map", "[faded]"]);
- return outputFile;
+ try {
+ await FfmpegContextGenerateOutput(ffmpegContext, outputFile, ["-map", "[faded]"]);
+ return outputFile;
+ }
+ catch (e) {
+ GraphAILogger.log(e);
+ throw new Error(`AddBGMAgent ffmpeg run Error`);
+ }
  };
  const addBGMAgentInfo = {
  name: "addBGMAgent",