mulmocast 0.1.0 → 0.1.1

This diff shows the publicly available content of the two package versions as published to their public registry. It is provided for informational purposes only.
package/README.md CHANGED
@@ -82,6 +82,16 @@ brew install ffmpeg
  # Visit https://ffmpeg.org/download.html
  ```

+ You can also use the [`Dockerfile`](./Dockerfile), which helps you install the prerequisites.
+ ```
+ docker build -t mulmo-cli .
+ ```
+
+ You can use the Docker image like this:
+ ```
+ docker run -e OPENAI_API_KEY=<your_openai_api_key> -it mulmo-cli mulmo tool scripting -i -t children_book -o ./ -s story
+ ```
+

  ## Configuration

  Create a `.env` file in your project directory with the following API keys:
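A note on the Docker example above: `-o ./` resolves inside the container, so the generated files stay in the container's filesystem unless a host directory is mounted. A hedged variant is shown below; the `/work` mount point is an arbitrary assumption, not something this diff documents.

```
# Mount the current host directory so the generated output lands on the host;
# /work is an arbitrary, undocumented choice of mount point.
docker run -e OPENAI_API_KEY=<your_openai_api_key> -v "$(pwd)":/work -w /work -it mulmo-cli mulmo tool scripting -i -t children_book -o ./ -s story
```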
@@ -1,5 +1,5 @@
  {
- "title": "Ghibli comic style",
+ "title": "Ghibli style for YouTube Shorts",
  "description": "Template for Ghibli-style comic presentation.",
  "systemPrompt": "Generate a Japanese script for a Youtube shorts of the given topic. Another AI will generate comic strips for each beat based on the text description of that beat. Mention the reference in one of beats, if it exists. Use the JSON below as a template.",
  "presentationStyle": {
@@ -10,7 +10,7 @@ import { fileCacheAgentFilter } from "../utils/filters.js";
  import { imageGoogleAgent, imageOpenaiAgent, movieGoogleAgent, movieReplicateAgent, mediaMockAgent } from "../agents/index.js";
  import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../methods/index.js";
  import { findImagePlugin } from "../utils/image_plugins/index.js";
- import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
+ import { userAssert, settings2GraphAIConfig, getExtention } from "../utils/utils.js";
  import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
  import { defaultOpenAIImageModel } from "../utils/const.js";
  import { renderHTMLToImage } from "../utils/markdown.js";
@@ -339,23 +339,7 @@ export const getImageRefs = async (context) => {
  }
  const buffer = Buffer.from(await response.arrayBuffer());
  // Detect file extension from Content-Type header or URL
- const extension = (() => {
- const contentType = response.headers.get("content-type");
- if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
- return "jpg";
- }
- else if (contentType?.includes("png")) {
- return "png";
- }
- else {
- // Fall back to URL extension
- const urlExtension = image.source.url.split(".").pop()?.toLowerCase();
- if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
- return urlExtension === "jpeg" ? "jpg" : urlExtension;
- }
- return "png"; // default
- }
- })();
+ const extension = getExtention(response.headers.get("content-type"), image.source.url);
  const imagePath = getReferenceImagePath(context, key, extension);
  await fs.promises.writeFile(imagePath, buffer);
  imageRefs[key] = imagePath;
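The `getExtention` helper referenced above (identifier spelled as imported) is not shown in this diff; a minimal sketch of what it presumably looks like, reconstructed from the inline logic removed in this hunk, is below. The file location and exact signature are assumptions.

```typescript
// Assumed reconstruction of getExtention, based on the removed inline logic:
// prefer the Content-Type header, fall back to the URL extension, default to "png".
export const getExtention = (contentType: string | null, url: string): string => {
  if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
    return "jpg";
  }
  if (contentType?.includes("png")) {
    return "png";
  }
  // Fall back to the URL extension
  const urlExtension = url.split(".").pop()?.toLowerCase();
  if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
    return urlExtension === "jpeg" ? "jpg" : urlExtension;
  }
  return "png"; // default when nothing matches
};
```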
@@ -2,7 +2,7 @@ import { GraphAILogger, assert } from "graphai";
  import { mulmoTransitionSchema, mulmoFillOptionSchema } from "../types/index.js";
  import { MulmoPresentationStyleMethods } from "../methods/index.js";
  import { getAudioArtifactFilePath, getOutputVideoFilePath, writingMessage } from "../utils/file.js";
- import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput } from "../utils/ffmpeg_utils.js";
+ import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput, } from "../utils/ffmpeg_utils.js";
  import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  // const isMac = process.platform === "darwin";
  const videoCodec = "libx264"; // "h264_videotoolbox" (macOS only) is too noisy
@@ -77,6 +77,63 @@ const getOutputOption = (audioId, videoId) => {
  "-b:a 128k", // Audio bitrate
  ];
  };
+ const addCaptions = (ffmpegContext, concatVideoId, context, caption) => {
+ const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+ if (caption && beatsWithCaptions.length > 0) {
+ const introPadding = context.presentationStyle.audioParams.introPadding;
+ return beatsWithCaptions.reduce((acc, beat, index) => {
+ const { startAt, duration, captionFile } = beat;
+ if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
+ const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
+ const compositeVideoId = `oc${index}`;
+ ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
+ return compositeVideoId;
+ }
+ return acc;
+ }, concatVideoId);
+ }
+ return concatVideoId;
+ };
+ const addTransitionEffects = (ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps) => {
+ if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
+ const transition = mulmoTransitionSchema.parse(context.presentationStyle.movieParams.transition);
+ return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
+ const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
+ const processedVideoId = `${transitionVideoId}_f`;
+ let transitionFilter;
+ if (transition.type === "fade") {
+ transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+ }
+ else if (transition.type === "slideout_left") {
+ transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+ }
+ else {
+ throw new Error(`Unknown transition type: ${transition.type}`);
+ }
+ ffmpegContext.filterComplex.push(transitionFilter);
+ const outputId = `${transitionVideoId}_o`;
+ if (transition.type === "fade") {
+ ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+ }
+ else if (transition.type === "slideout_left") {
+ ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+ }
+ return outputId;
+ }, captionedVideoId);
+ }
+ return captionedVideoId;
+ };
+ const mixAudiosFromMovieBeats = (ffmpegContext, artifactAudioId, audioIdsFromMovieBeats) => {
+ if (audioIdsFromMovieBeats.length > 0) {
+ const mainAudioId = "mainaudio";
+ const compositeAudioId = "composite";
+ const audioIds = audioIdsFromMovieBeats.map((id) => `[${id}]`).join("");
+ FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
+ ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${audioIdsFromMovieBeats.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
+ return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
+ }
+ return artifactAudioId;
+ };
  const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const caption = MulmoStudioContextMethods.getCaption(context);
  const start = performance.now();
@@ -94,26 +151,20 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  }
  const canvasInfo = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
  // Add each image input
- const filterComplexVideoIds = [];
- const filterComplexAudioIds = [];
+ const videoIdsForBeats = [];
+ const audioIdsFromMovieBeats = [];
  const transitionVideoIds = [];
  const beatTimestamps = [];
  context.studio.beats.reduce((timestamp, studioBeat, index) => {
  const beat = context.studio.script.beats[index];
  if (beat.image?.type === "voice_over") {
- filterComplexVideoIds.push(undefined);
+ videoIdsForBeats.push(undefined);
  beatTimestamps.push(timestamp);
  return timestamp; // Skip voice-over beats.
  }
  const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
- if (!sourceFile) {
- throw new Error(`studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
- }
- if (!studioBeat.duration) {
- throw new Error(`studioBeat.duration is not set: index=${index}`);
- }
- const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
- const mediaType = studioBeat.movieFile ? "movie" : MulmoPresentationStyleMethods.getImageType(context.presentationStyle, beat);
+ assert(!!sourceFile, `studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
+ assert(!!studioBeat.duration, `studioBeat.duration is not set: index=${index}`);
  const extraPadding = (() => {
  // We need to consider only intro and outro padding because the other paddings were already added to the beat.duration
  if (index === 0) {
@@ -131,111 +182,49 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const beatFillOption = beat.movieParams?.fillOption;
  const defaultFillOption = mulmoFillOptionSchema.parse({}); // let the schema infer the default value
  const fillOption = { ...defaultFillOption, ...globalFillOption, ...beatFillOption };
+ const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
+ const mediaType = studioBeat.movieFile ? "movie" : MulmoPresentationStyleMethods.getImageType(context.presentationStyle, beat);
  const speed = beat.movieParams?.speed ?? 1.0;
  const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption, speed);
  ffmpegContext.filterComplex.push(videoPart);
- /*
- if (caption && studioBeat.captionFile) {
- // NOTE: This works for normal beats, but not for voice-over beats.
- const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
- const compositeVideoId = `c${index}`;
- ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
- filterComplexVideoIds.push(compositeVideoId);
- } else {
- }
- */
- filterComplexVideoIds.push(videoId);
  if (context.presentationStyle.movieParams?.transition && index < context.studio.beats.length - 1) {
- const sourceId = filterComplexVideoIds.pop();
- ffmpegContext.filterComplex.push(`[${sourceId}]split=2[${sourceId}_0][${sourceId}_1]`);
- filterComplexVideoIds.push(`${sourceId}_0`);
+ // NOTE: We split the video into two parts for transition.
+ ffmpegContext.filterComplex.push(`[${videoId}]split=2[${videoId}_0][${videoId}_1]`);
+ videoIdsForBeats.push(`${videoId}_0`);
  if (mediaType === "movie") {
  // For movie beats, extract the last frame for transition
- ffmpegContext.filterComplex.push(`[${sourceId}_1]reverse,select='eq(n,0)',reverse,tpad=stop_mode=clone:stop_duration=${duration},fps=30,setpts=PTS-STARTPTS[${sourceId}_2]`);
- transitionVideoIds.push(`${sourceId}_2`);
+ ffmpegContext.filterComplex.push(`[${videoId}_1]reverse,select='eq(n,0)',reverse,tpad=stop_mode=clone:stop_duration=${duration},fps=30,setpts=PTS-STARTPTS[${videoId}_2]`);
+ transitionVideoIds.push(`${videoId}_2`);
  }
  else {
- transitionVideoIds.push(`${sourceId}_1`);
+ transitionVideoIds.push(`${videoId}_1`);
  }
  }
+ else {
+ videoIdsForBeats.push(videoId);
+ }
  // NOTE: We don't support audio if the speed is not 1.0.
  if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0 && speed === 1.0) {
  const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
- filterComplexAudioIds.push(audioId);
+ audioIdsFromMovieBeats.push(audioId);
  ffmpegContext.filterComplex.push(audioPart);
  }
  beatTimestamps.push(timestamp);
  return timestamp + duration;
  }, 0);
- assert(filterComplexVideoIds.length === context.studio.beats.length, "videoIds.length !== studio.beats.length");
+ assert(videoIdsForBeats.length === context.studio.beats.length, "videoIds.length !== studio.beats.length");
  assert(beatTimestamps.length === context.studio.beats.length, "beatTimestamps.length !== studio.beats.length");
  // console.log("*** images", images.audioIds);
  // Concatenate the trimmed images
  const concatVideoId = "concat_video";
- const videoIds = filterComplexVideoIds.filter((id) => id !== undefined); // filter out voice-over beats
+ const videoIds = videoIdsForBeats.filter((id) => id !== undefined); // filter out voice-over beats
  ffmpegContext.filterComplex.push(`${videoIds.map((id) => `[${id}]`).join("")}concat=n=${videoIds.length}:v=1:a=0[${concatVideoId}]`);
- // Overlay voice-over captions
- const captionedVideoId = (() => {
- const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
- if (caption && beatsWithCaptions.length > 0) {
- const introPadding = context.presentationStyle.audioParams.introPadding;
- return beatsWithCaptions.reduce((acc, beat, index) => {
- const { startAt, duration, captionFile } = beat;
- if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
- const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
- const compositeVideoId = `oc${index}`;
- ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
- return compositeVideoId;
- }
- return acc;
- }, concatVideoId);
- }
- return concatVideoId;
- })();
- // Add tranditions if needed
- const mixedVideoId = (() => {
- if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
- const transition = mulmoTransitionSchema.parse(context.presentationStyle.movieParams.transition);
- return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
- const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
- const processedVideoId = `${transitionVideoId}_f`;
- let transitionFilter;
- if (transition.type === "fade") {
- transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
- }
- else if (transition.type === "slideout_left") {
- transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
- }
- else {
- throw new Error(`Unknown transition type: ${transition.type}`);
- }
- ffmpegContext.filterComplex.push(transitionFilter);
- const outputId = `${transitionVideoId}_o`;
- if (transition.type === "fade") {
- ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
- }
- else if (transition.type === "slideout_left") {
- ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
- }
- return outputId;
- }, captionedVideoId);
- }
- return captionedVideoId;
- })();
+ const captionedVideoId = addCaptions(ffmpegContext, concatVideoId, context, caption);
+ const mixedVideoId = addTransitionEffects(ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps);
  GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
  const audioIndex = FfmpegContextAddInput(ffmpegContext, audioArtifactFilePath); // Add audio input
  const artifactAudioId = `${audioIndex}:a`;
- const ffmpegContextAudioId = (() => {
- if (filterComplexAudioIds.length > 0) {
- const mainAudioId = "mainaudio";
- const compositeAudioId = "composite";
- const audioIds = filterComplexAudioIds.map((id) => `[${id}]`).join("");
- FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
- ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${filterComplexAudioIds.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
- return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
- }
- return artifactAudioId;
- })();
+ const ffmpegContextAudioId = mixAudiosFromMovieBeats(ffmpegContext, artifactAudioId, audioIdsFromMovieBeats);
  // GraphAILogger.debug("filterComplex", ffmpegContext.filterComplex);
  await FfmpegContextGenerateOutput(ffmpegContext, outputVideoPath, getOutputOption(ffmpegContextAudioId, mixedVideoId));
  const end = performance.now();
@@ -1,8 +1,15 @@
+ import fs from "fs";
  import { GraphAILogger } from "graphai";
  import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextGenerateOutput, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
  const addBGMAgent = async ({ namedInputs, params, }) => {
  const { voiceFile, outputFile, context } = namedInputs;
  const { musicFile } = params;
+ if (!fs.existsSync(voiceFile)) {
+ throw new Error(`AddBGMAgent voiceFile not exist: ${voiceFile}`);
+ }
+ if (!musicFile.match(/^http/) && !fs.existsSync(musicFile)) {
+ throw new Error(`AddBGMAgent musicFile not exist: ${musicFile}`);
+ }
  const speechDuration = await ffmpegGetMediaDuration(voiceFile);
  const introPadding = context.presentationStyle.audioParams.introPadding;
  const outroPadding = context.presentationStyle.audioParams.outroPadding;
@@ -16,8 +23,14 @@ const addBGMAgent = async ({ namedInputs, params, }) => {
  ffmpegContext.filterComplex.push(`[music][voice]amix=inputs=2:duration=longest[mixed]`);
  ffmpegContext.filterComplex.push(`[mixed]atrim=start=0:end=${totalDuration}[trimmed]`);
  ffmpegContext.filterComplex.push(`[trimmed]afade=t=out:st=${totalDuration - outroPadding}:d=${outroPadding}[faded]`);
- await FfmpegContextGenerateOutput(ffmpegContext, outputFile, ["-map", "[faded]"]);
- return outputFile;
+ try {
+ await FfmpegContextGenerateOutput(ffmpegContext, outputFile, ["-map", "[faded]"]);
+ return outputFile;
+ }
+ catch (e) {
+ GraphAILogger.log(e);
+ throw new Error(`AddBGMAgent ffmpeg run Error`);
+ }
  };
  const addBGMAgentInfo = {
  name: "addBGMAgent",
@@ -82,7 +82,7 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  if (group.length > 1) {
  group.reduce((remaining, idx, iGroup) => {
  const subBeatDurations = mediaDurations[idx];
- userAssert(subBeatDurations.audioDuration <= remaining, `subBeatDurations.audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
+ userAssert(subBeatDurations.audioDuration <= remaining, `Duration Overflow: At index(${idx}) audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
  if (iGroup === group.length - 1) {
  beatDurations.push(remaining);
  subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
@@ -94,10 +94,10 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  if (voiceStartAt) {
  const remainingDuration = movieDuration - voiceStartAt;
  const duration = remaining - remainingDuration;
- userAssert(duration >= 0, `duration(${duration}) < 0`);
+ userAssert(duration >= 0, `Invalid startAt: At index(${idx}), avaiable duration(${duration}) < 0`);
  beatDurations.push(duration);
  subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
- userAssert(subBeatDurations.silenceDuration >= 0, `subBeatDurations.silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
+ userAssert(subBeatDurations.silenceDuration >= 0, `Duration Overwrap: At index(${idx}), silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
  return remainingDuration;
  }
  beatDurations.push(subBeatDurations.audioDuration);