mulmocast 1.2.12 → 1.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
 import "dotenv/config";
 import { MulmoStudioContext, MulmoBeat, PublicAPIArgs } from "../types/index.js";
 export declare const getBeatAudioPath: (text: string, context: MulmoStudioContext, beat: MulmoBeat, lang?: string) => string | undefined;
+export declare const listLocalizedAudioPaths: (context: MulmoStudioContext) => (string | undefined)[];
 export declare const generateBeatAudio: (index: number, context: MulmoStudioContext, args?: PublicAPIArgs & {
     langs: string[];
 }) => Promise<void>;
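
A hedged usage sketch of the new `listLocalizedAudioPaths` export: it returns one resolved audio path per beat (or `undefined` where a beat has no audio), using the localized text for the context's language. The `loadContext` helper and the package-root re-export are illustrative assumptions, not part of this diff.

import { listLocalizedAudioPaths } from "mulmocast"; // root re-export is an assumption
import type { MulmoStudioContext } from "mulmocast";

// Hypothetical helper standing in for however the application builds its context.
declare function loadContext(scriptPath: string): Promise<MulmoStudioContext>;

const context = await loadContext("scripts/music_video.json");
const audioPaths = listLocalizedAudioPaths(context); // one entry per beat, in script order
audioPaths.forEach((path, i) => console.log(`beat ${i}: ${path ?? "(no audio)"}`));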
@@ -1,5 +1,5 @@
 import "dotenv/config";
-import { GraphAI, TaskManager } from "graphai";
+import { GraphAI, TaskManager, GraphAILogger } from "graphai";
 import * as agents from "@graphai/vanilla";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
 import { ttsNijivoiceAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsElevenlabsAgent, addBGMAgent, combineAudioFilesAgent, mediaMockAgent } from "../agents/index.js";
@@ -26,25 +26,34 @@ const getAudioPath = (context, beat, audioFile) => {
   }
   return audioFile;
 };
-const getAudioParam = (context, beat) => {
-  const speaker = MulmoPresentationStyleMethods.getSpeaker(context, beat);
+const getAudioParam = (context, beat, lang) => {
+  const speaker = MulmoPresentationStyleMethods.getSpeaker(context, beat, lang);
   const speechOptions = { ...speaker.speechOptions, ...beat.speechOptions };
   const provider = text2SpeechProviderSchema.parse(speaker.provider);
   return { voiceId: speaker.voiceId, provider, speechOptions, model: speaker.model };
 };
 export const getBeatAudioPath = (text, context, beat, lang) => {
   const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
-  const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat);
+  const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat, lang);
   const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider, model ?? ""].join(":");
+  GraphAILogger.log(`getBeatAudioPath [${hash_string}]`);
   const audioFileName = `${context.studio.filename}_${text2hash(hash_string)}`;
   const audioFile = getAudioFilePath(audioDirPath, context.studio.filename, audioFileName, lang);
   return getAudioPath(context, beat, audioFile);
 };
+export const listLocalizedAudioPaths = (context) => {
+  const lang = context.lang ?? context.studio.script.lang;
+  return context.studio.script.beats.map((beat, index) => {
+    const multiLingual = context.multiLingual[index];
+    const text = localizedText(beat, multiLingual, lang);
+    return getBeatAudioPath(text, context, beat, lang);
+  });
+};
 const preprocessorAgent = (namedInputs) => {
   const { beat, studioBeat, multiLingual, context, lang } = namedInputs;
   // const { lang } = context;
   const text = localizedText(beat, multiLingual, lang);
-  const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat);
+  const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat, lang);
   const audioPath = getBeatAudioPath(text, context, beat, lang);
   studioBeat.audioFile = audioPath; // TODO: Passing by reference is difficult to maintain, so pass it using graphai inputs
   const needsTTS = !beat.audio && audioPath !== undefined;
@@ -25,6 +25,10 @@ export declare const imagePreprocessAgent: (namedInputs: {
     lipSyncFile?: string;
     lipSyncModel?: string;
     lipSyncAgentName?: string;
+    lipSyncTrimAudio?: boolean;
+    bgmFile?: string | null;
+    startAt?: number;
+    duration?: number;
     audioFile?: string;
     beatDuration?: number;
     htmlPrompt?: undefined;
@@ -61,6 +65,10 @@ export declare const imagePreprocessAgent: (namedInputs: {
     lipSyncFile?: string;
     lipSyncModel?: string;
     lipSyncAgentName?: string;
+    lipSyncTrimAudio?: boolean;
+    bgmFile?: string | null;
+    startAt?: number;
+    duration?: number;
     audioFile?: string;
     beatDuration?: number;
     htmlPrompt?: undefined;
@@ -100,6 +108,10 @@ export declare const imagePreprocessAgent: (namedInputs: {
     lipSyncFile?: string;
     lipSyncModel?: string;
     lipSyncAgentName?: string;
+    lipSyncTrimAudio?: boolean;
+    bgmFile?: string | null;
+    startAt?: number;
+    duration?: number;
     audioFile?: string;
     beatDuration?: number;
     htmlPrompt?: undefined;
@@ -1,8 +1,9 @@
-import { MulmoPresentationStyleMethods, MulmoStudioContextMethods, MulmoBeatMethods } from "../methods/index.js";
-import { getBeatPngImagePath, getBeatMoviePaths } from "../utils/file.js";
+import { MulmoPresentationStyleMethods, MulmoStudioContextMethods, MulmoBeatMethods, MulmoMediaSourceMethods } from "../methods/index.js";
+import { getBeatPngImagePath, getBeatMoviePaths, getAudioFilePath } from "../utils/file.js";
 import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
 import { renderHTMLToImage } from "../utils/markdown.js";
 import { GraphAILogger } from "graphai";
+import { beatId } from "../utils/utils.js";
 const htmlStyle = (context, beat) => {
   return {
     canvasSize: MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle),
@@ -40,8 +41,20 @@ export const imagePreprocessAgent = async (namedInputs) => {
     returnValue.lipSyncAgentName = lipSyncAgentInfo.agentName;
     returnValue.lipSyncModel = beat.lipSyncParams?.model ?? context.presentationStyle.lipSyncParams?.model ?? lipSyncAgentInfo.defaultModel;
     returnValue.lipSyncFile = moviePaths.lipSyncFile;
-    // Audio file will be set from the beat's audio file when available
-    returnValue.audioFile = studioBeat?.audioFile;
+    if (context.studio.script.audioParams?.suppressSpeech) {
+      returnValue.startAt = studioBeat?.startAt ?? 0;
+      returnValue.duration = studioBeat?.duration ?? 0;
+      returnValue.lipSyncTrimAudio = true;
+      returnValue.bgmFile = MulmoMediaSourceMethods.resolve(context.studio.script.audioParams.bgm, context);
+      const folderName = MulmoStudioContextMethods.getFileName(context);
+      const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
+      const fileName = `${beatId(beat.id, index)}_trimmed.mp3`;
+      returnValue.audioFile = getAudioFilePath(audioDirPath, folderName, fileName);
+    }
+    else {
+      // Audio file will be set from the beat's audio file when available
+      returnValue.audioFile = studioBeat?.audioFile;
+    }
   }
   if (beat.image) {
     const plugin = MulmoBeatMethods.getPlugin(beat);
@@ -10,7 +10,7 @@ import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../met
 import { getOutputStudioFilePath, mkdir } from "../utils/file.js";
 import { fileCacheAgentFilter } from "../utils/filters.js";
 import { settings2GraphAIConfig } from "../utils/utils.js";
-import { extractImageFromMovie, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
+import { extractImageFromMovie, ffmpegGetMediaDuration, trimMusic } from "../utils/ffmpeg_utils.js";
 import { getImageRefs } from "./image_references.js";
 import { imagePreprocessAgent, imagePluginAgent, htmlImageGeneratorAgent } from "./image_agents.js";
 const vanillaAgents = vanilla.default ?? vanilla;
@@ -224,11 +224,33 @@ const beat_graph_data = {
     },
     defaultValue: {},
   },
+  AudioTrimmer: {
+    if: ":preprocessor.lipSyncTrimAudio",
+    agent: async (namedInputs) => {
+      const buffer = await trimMusic(namedInputs.bgmFile, namedInputs.startAt, namedInputs.duration);
+      return { buffer };
+    },
+    inputs: {
+      audioFile: ":preprocessor.audioFile",
+      bgmFile: ":preprocessor.bgmFile",
+      startAt: ":preprocessor.startAt",
+      duration: ":preprocessor.duration",
+      cache: {
+        force: [":context.force"],
+        file: ":preprocessor.audioFile",
+        index: ":__mapIndex",
+        id: ":beat.id",
+        sessionType: "audioTrimmer",
+        mulmoContext: ":context",
+      },
+    },
+    defaultValue: {},
+  },
   lipSyncGenerator: {
     if: ":beat.enableLipSync",
     agent: ":preprocessor.lipSyncAgentName",
     inputs: {
-      onComplete: [":soundEffectGenerator"], // to wait for soundEffectGenerator to finish
+      onComplete: [":soundEffectGenerator", ":AudioTrimmer"], // to wait for soundEffectGenerator and AudioTrimmer to finish
       movieFile: ":preprocessor.movieFile",
       imageFile: ":preprocessor.referenceImageForMovie",
       audioFile: ":preprocessor.audioFile",
@@ -344,7 +366,7 @@ export const graphOption = async (context, settings) => {
     {
       name: "fileCacheAgentFilter",
       agent: fileCacheAgentFilter,
-      nodeIds: ["imageGenerator", "movieGenerator", "htmlImageAgent", "soundEffectGenerator", "lipSyncGenerator"],
+      nodeIds: ["imageGenerator", "movieGenerator", "htmlImageAgent", "soundEffectGenerator", "lipSyncGenerator", "AudioTrimmer"],
     },
   ],
   taskManager: new TaskManager(MulmoPresentationStyleMethods.getConcurrency(context.presentationStyle)),
@@ -50,4 +50,6 @@ export declare const getOutputMultilingualFilePathAndMkdir: (context: MulmoStudi
     outDirPath: string;
 };
 export declare const translateBeat: (index: number, context: MulmoStudioContext, targetLangs: string[], args?: PublicAPIArgs) => Promise<void>;
-export declare const translate: (context: MulmoStudioContext, args?: PublicAPIArgs) => Promise<MulmoStudioContext>;
+export declare const translate: (context: MulmoStudioContext, args?: PublicAPIArgs & {
+    targetLangs?: string[];
+}) => Promise<MulmoStudioContext>;
@@ -270,7 +270,9 @@ export const translate = async (context, args) => {
   try {
     MulmoStudioContextMethods.setSessionState(context, "multiLingual", true);
     const { outputMultilingualFilePath, outDirPath } = getOutputMultilingualFilePathAndMkdir(context);
-    const targetLangs = [...new Set([context.lang, context.studio.script.captionParams?.lang].filter((x) => !isNull(x)))];
+    const targetLangs = args?.targetLangs
+      ? args?.targetLangs
+      : [...new Set([context.lang, context.studio.script.captionParams?.lang].filter((x) => !isNull(x)))];
     const config = settings2GraphAIConfig(settings, process.env);
     assert(!!config?.openAIAgent?.apiKey, "The OPENAI_API_KEY environment variable is missing or empty");
     const graph = new GraphAI(translateGraph, { ...vanillaAgents, fileWriteAgent, openAIAgent }, { agentFilters, config });
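
With the widened signature, API callers can now pin the translation targets explicitly instead of relying on the set derived from `context.lang` and `captionParams?.lang`. A minimal sketch, reusing the hypothetical `loadContext` helper from above:

import { translate } from "mulmocast"; // root re-export assumed, as above

const context = await loadContext("scripts/music_video.json");
// Explicit targets take precedence over the derived [context.lang, captionParams?.lang] set.
const translated = await translate(context, { targetLangs: ["ja", "fr"] });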
@@ -9,7 +9,7 @@ export declare const MulmoPresentationStyleMethods: {
     getAllSpeechProviders(presentationStyle: MulmoPresentationStyle): Set<Text2SpeechProvider>;
     getTextSlideStyle(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
     getDefaultSpeaker(presentationStyle: MulmoPresentationStyle): string;
-    getSpeaker(context: MulmoStudioContext, beat: MulmoBeat): SpeakerData;
+    getSpeaker(context: MulmoStudioContext, beat: MulmoBeat, targetLang: string | undefined): SpeakerData;
     getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
     getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
     getMovieAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): {
@@ -50,13 +50,14 @@ export const MulmoPresentationStyleMethods = {
     }
     return keys[0];
   },
-  getSpeaker(context, beat) {
+  getSpeaker(context, beat, targetLang) {
     userAssert(!!context.presentationStyle?.speechParams?.speakers, "presentationStyle.speechParams.speakers is not set!!");
     const speakerId = beat?.speaker ?? MulmoPresentationStyleMethods.getDefaultSpeaker(context.presentationStyle);
     const speaker = context.presentationStyle.speechParams.speakers[speakerId];
     userAssert(!!speaker, `speaker is not set: speaker "${speakerId}"`);
-    // Check if the speaker has a language-specific version
-    const lang = context.lang ?? context.studio.script.lang;
+    // Check if the speaker has a language-specific version.
+    // Normally, lang is determined by the context, but lang may be specified when using the API.
+    const lang = targetLang ?? context.lang ?? context.studio.script.lang;
     if (speaker.lang && lang && speaker.lang[lang]) {
       return speaker.lang[lang];
     }
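
The effect of the new `targetLang` parameter is easiest to see against a speaker that carries per-language overrides. The fragment below follows the `speaker.lang[lang]` lookup in the code above; the exact shape of the override entries is an assumption for illustration:

// Hypothetical speechParams fragment. With this in place,
// getSpeaker(context, beat, "ja") resolves to the "ja" entry
// even when context.lang is "en".
const speechParams = {
  speakers: {
    Presenter: {
      voiceId: "shimmer", // default voice
      lang: {
        ja: { voiceId: "alloy" }, // assumed shape of a language-specific SpeakerData
      },
    },
  },
};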
@@ -16,3 +16,4 @@ export declare const ffmpegGetMediaDuration: (filePath: string) => Promise<{
     hasAudio: boolean;
 }>;
 export declare const extractImageFromMovie: (movieFile: string, imagePath: string) => Promise<object>;
+export declare const trimMusic: (inputFile: string, startTime: number, duration: number) => Promise<Buffer>;
@@ -89,3 +89,36 @@ export const extractImageFromMovie = (movieFile, imagePath) => {
       .run();
   });
 };
+export const trimMusic = (inputFile, startTime, duration) => {
+  return new Promise((resolve, reject) => {
+    if (!inputFile.startsWith("http://") && !inputFile.startsWith("https://") && !fs.existsSync(inputFile)) {
+      reject(new Error(`File not found: ${inputFile}`));
+      return;
+    }
+    if (duration <= 0) {
+      reject(new Error(`Invalid duration: duration (${duration}) must be greater than 0`));
+      return;
+    }
+    const chunks = [];
+    ffmpeg(inputFile)
+      .seekInput(startTime)
+      .duration(duration)
+      .format("mp3")
+      .on("start", () => {
+        GraphAILogger.log(`Trimming audio from ${startTime}s for ${duration}s...`);
+      })
+      .on("error", (err) => {
+        GraphAILogger.error("Error occurred while trimming audio:", err);
+        reject(err);
+      })
+      .on("end", () => {
+        const buffer = Buffer.concat(chunks);
+        GraphAILogger.log(`Audio trimmed successfully, buffer size: ${buffer.length} bytes`);
+        resolve(buffer);
+      })
+      .pipe()
+      .on("data", (chunk) => {
+        chunks.push(chunk);
+      });
+  });
+};
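
`trimMusic` resolves with the trimmed MP3 as an in-memory `Buffer` rather than writing a file, so persistence is the caller's job. A minimal usage sketch; the import path and file names are illustrative assumptions:

import fs from "node:fs";
import { trimMusic } from "mulmocast/lib/utils/ffmpeg_utils.js"; // import path is an assumption

// Cut 8.5 seconds of BGM starting at the 12-second mark, then persist it.
const buffer = await trimMusic("assets/bgm.mp3", 12, 8.5);
await fs.promises.writeFile("output/audio/beat_2_trimmed.mp3", buffer);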
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "mulmocast",
-  "version": "1.2.12",
+  "version": "1.2.14",
   "description": "",
   "type": "module",
   "main": "lib/index.node.js",
@@ -0,0 +1,82 @@
+{
+  "$mulmocast": {
+    "version": "1.1",
+    "credit": "closing"
+  },
+  "canvasSize": {
+    "width": 1536,
+    "height": 1024
+  },
+  "speechParams": {
+    "speakers": {
+      "Presenter": {
+        "displayName": {
+          "en": "Presenter"
+        },
+        "voiceId": "shimmer"
+      }
+    }
+  },
+  "imageParams": {
+    "provider": "openai",
+    "style": "<style>Vibrant 3D animation style inspired by K-pop aesthetics, with glossy, stylized characters. The overall visual style combines elements of modern animation, game cinematics, and fashion-forward character design, with sleek outlines, glowing effects, and a polished, cinematic finish.</style>",
+    "images": {
+      "min": {
+        "type": "image",
+        "source": {
+          "kind": "url",
+          "url": "https://raw.githubusercontent.com/receptron/mulmocast-media/refs/heads/main/characters/min_anime.png"
+        }
+      }
+    }
+  },
+  "movieParams": {
+    "provider": "replicate"
+  },
+  "soundEffectParams": {
+    "provider": "replicate"
+  },
+  "captionParams": {
+    "lang": "en",
+    "styles": ["font-size: 64px", "width: 90%", "padding-left: 5%", "padding-right: 5%"]
+  },
+  "audioParams": {
+    "padding": 0,
+    "introPadding": 0,
+    "closingPadding": 0,
+    "outroPadding": 0,
+    "bgm": {
+      "kind": "url",
+      "url": "https://raw.githubusercontent.com/receptron/mulmocast-media/refs/heads/main/music/finetuning_with_you.mp3"
+    },
+    "bgmVolume": 1,
+    "audioVolume": 0,
+    "suppressSpeech": true
+  },
+  "title": "Music Video",
+  "lang": "en",
+  "beats": [
+    {
+      "text": "Finetuning with you",
+      "duration": 7.0,
+      "image": {
+        "type": "textSlide",
+        "slide": {
+          "title": "Finetuning with you"
+        }
+      }
+    },
+    {
+      "text": "Whispers hide in silver rain. Every shadow calls your name.",
+      "duration": 9.32,
+      "imagePrompt": "Singer walking alone at night in neon-lit rainy street, holding a clear umbrella, raindrops sparkling, wearing a black mini dress with thigh-high boots, reflective puddles surrounding her.",
+      "enableLipSync": true
+    },
+    {
+      "text": "I dissolve into the night. Just to echo what you liked.",
+      "duration": 8.28,
+      "imagePrompt": "Singer standing against a glowing city skyline at night, hair blowing in wind, long white trench coat fluttering, reaching out with one hand as if fading into the background lights.",
+      "enableLipSync": true
+    }
+  ]
+}