mulmocast 0.0.28 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -200,11 +200,18 @@ writing: /Users/username/path/to/output/story-1747834931950__ja.mp4
  # Generate script from web content (requires Browserless API KEY)
  mulmo tool scripting -u https://example.com

+ # Generate script from local file
+ mulmo tool scripting --input-file story.txt
+
  # Generate script with interactive mode
  mulmo tool scripting -i
  ```

- When using the `sensei_and_taro` template, a Nijivoice API key is required.
+ Note:
+ - When using the `sensei_and_taro` template, a Nijivoice API key is required
+ - When -i is specified, --input-file value will be ignored
+ - When --input-file is specified, -u value will be ignored
+

  ## Generate content from MulmoScript

@@ -308,7 +315,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -a, --audiodir Audio output directory [string]
  ```
@@ -329,7 +335,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -i, --imagedir Image output directory [string]
  ```
@@ -350,7 +355,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -a, --audiodir Audio output directory [string]
  -i, --imagedir Image output directory [string]
@@ -411,16 +415,19 @@ Options:
  -b, --basedir base dir [string]
  -u, --url URLs to reference (required when not in interactive mode)
  [array] [default: []]
+ --input-file input file name [string]
  -i, --interactive Generate script in interactive mode with user prompts
  [boolean]
  -t, --template Template name to use
- [string] [choices: "business", "children_book", "coding", "comic_strips",
- "ghibli_strips", "podcast_standard", "sensei_and_taro"]
+ [string] [choices: "akira_comic", "business", "children_book", "coding",
+ "comic_strips", "drslump_comic", "ghibli_comic", "ghibli_image_only",
+ "ghibli_shorts", "ghost_comic", "onepiece_comic", "podcast_standard",
+ "portrait_movie", "realistic_movie", "sensei_and_taro", "shorts",
+ "text_and_image", "text_only", "trailer"]
  -c, --cache cache dir [string]
  -s, --script script filename [string] [default: "script"]
  --llm llm
- [string] [choices: "openAIAgent", "anthropicAgent", "geminiAgent",
- "groqAgent"]
+ [string] [choices: "openai", "anthropic", "gemini", "groq"]
  --llm_model llm model [string]
  ```

@@ -14,7 +14,7 @@
  "speechParams": {
  "provider": "nijivoice",
  "speakers": {
- "Presenter": { "voiceId": "afd7df65-0fdc-4d31-ae8b-a29f0f5eed62", "speechOptions": { "speed": 1.5 } }
+ "Presenter": { "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c", "speechOptions": { "speed": 1.5 } }
  }
  },
  "imageParams": {
@@ -17,7 +17,7 @@
  "speechParams": {
  "provider": "nijivoice",
  "speakers": {
- "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "afd7df65-0fdc-4d31-ae8b-a29f0f5eed62" },
+ "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c" },
  "Student": { "displayName": { "ja": "太郎" }, "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f" },
  "Teacher": { "displayName": { "ja": "先生" }, "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae" }
  }
@@ -1,14 +1,16 @@
  import { mulmoCaptionParamsSchema } from "../types/index.js";
  import { GraphAI, GraphAILogger } from "graphai";
  import * as agents from "@graphai/vanilla";
- import { getHTMLFile, getCaptionImagePath } from "../utils/file.js";
+ import { getHTMLFile, getCaptionImagePath, getOutputStudioFilePath } from "../utils/file.js";
  import { renderHTMLToImage, interpolate } from "../utils/markdown.js";
  import { MulmoStudioContextMethods, MulmoPresentationStyleMethods } from "../methods/index.js";
+ import { fileWriteAgent } from "@graphai/vanilla_node_agents";
  const vanillaAgents = agents.default ?? agents;
  const graph_data = {
  version: 0.5,
  nodes: {
  context: {},
+ outputStudioFilePath: {},
  map: {
  agent: "mapAgent",
  inputs: { rows: ":context.studio.script.beats", context: ":context" },
@@ -60,14 +62,26 @@ const graph_data = {
  },
  },
  },
+ fileWrite: {
+ agent: "fileWriteAgent",
+ inputs: {
+ onComplete: ":map.generateCaption",
+ file: ":outputStudioFilePath",
+ text: ":context.studio.toJSON()",
+ },
+ },
  },
  };
  export const captions = async (context, callbacks) => {
  if (MulmoStudioContextMethods.getCaption(context)) {
  try {
  MulmoStudioContextMethods.setSessionState(context, "caption", true);
- const graph = new GraphAI(graph_data, { ...vanillaAgents });
+ const graph = new GraphAI(graph_data, { ...vanillaAgents, fileWriteAgent });
+ const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+ const fileName = MulmoStudioContextMethods.getFileName(context);
+ const outputStudioFilePath = getOutputStudioFilePath(outDirPath, fileName);
  graph.injectValue("context", context);
+ graph.injectValue("outputStudioFilePath", outputStudioFilePath);
  if (callbacks) {
  callbacks.forEach((callback) => {
  graph.registerCallback(callback);
@@ -1,13 +1,13 @@
  import type { CallbackFunction } from "graphai";
- import { MulmoStudioContext, MulmoBeat, Text2ImageAgentInfo } from "../types/index.js";
+ import { MulmoStudioContext, MulmoBeat } from "../types/index.js";
  export declare const imagePreprocessAgent: (namedInputs: {
  context: MulmoStudioContext;
  beat: MulmoBeat;
  index: number;
- imageAgentInfo: Text2ImageAgentInfo;
  imageRefs: Record<string, string>;
  }) => Promise<{
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -42,6 +42,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  images: string[];
  imageFromMovie: boolean;
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -68,6 +69,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  } | {
  images: string[];
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -89,6 +91,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  }> | undefined;
  };
  movieFile: string | undefined;
+ imageAgentInfo: import("../types/type.js").Text2ImageAgentInfo;
  imagePath: string;
  referenceImage: string;
  prompt: string;
@@ -25,11 +25,12 @@ const htmlStyle = (context, beat) => {
  };
  };
  export const imagePreprocessAgent = async (namedInputs) => {
- const { context, beat, index, imageAgentInfo, imageRefs } = namedInputs;
- const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
+ const { context, beat, index, imageRefs } = namedInputs;
+ const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle, beat);
+ // const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
  const imagePath = getBeatPngImagePath(context, index);
  const returnValue = {
- imageParams,
+ imageParams: imageAgentInfo.imageParams,
  movieFile: beat.moviePrompt ? getBeatMoviePath(context, index) : undefined,
  };
  if (beat.image) {
@@ -54,8 +55,8 @@ export const imagePreprocessAgent = async (namedInputs) => {
  if (beat.moviePrompt && !beat.imagePrompt) {
  return { ...returnValue, imagePath, images, imageFromMovie: true }; // no image prompt, only movie prompt
  }
- const prompt = imagePrompt(beat, imageParams.style);
- return { imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
+ const prompt = imagePrompt(beat, imageAgentInfo.imageParams.style);
+ return { imageAgentInfo, imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
  };
  export const imagePluginAgent = async (namedInputs) => {
  const { context, beat, index } = namedInputs;
@@ -87,7 +88,6 @@ const beat_graph_data = {
  concurrency: 4,
  nodes: {
  context: {},
- imageAgentInfo: {},
  htmlImageAgentInfo: {},
  movieAgentInfo: {},
  imageRefs: {},
@@ -99,7 +99,6 @@ const beat_graph_data = {
  context: ":context",
  beat: ":beat",
  index: ":__mapIndex",
- imageAgentInfo: ":imageAgentInfo",
  imageRefs: ":imageRefs",
  },
  },
@@ -142,7 +141,7 @@ const beat_graph_data = {
  },
  imageGenerator: {
  if: ":preprocessor.prompt",
- agent: ":imageAgentInfo.agent",
+ agent: ":preprocessor.imageAgentInfo.agent",
  retry: 2,
  inputs: {
  prompt: ":preprocessor.prompt",
@@ -213,7 +212,6 @@ const graph_data = {
  concurrency: 4,
  nodes: {
  context: {},
- imageAgentInfo: {},
  htmlImageAgentInfo: {},
  movieAgentInfo: {},
  outputStudioFilePath: {},
@@ -223,7 +221,6 @@ const graph_data = {
  inputs: {
  rows: ":context.studio.script.beats",
  context: ":context",
- imageAgentInfo: ":imageAgentInfo",
  htmlImageAgentInfo: ":htmlImageAgentInfo",
  movieAgentInfo: ":movieAgentInfo",
  imageRefs: ":imageRefs",
@@ -306,10 +303,10 @@ const graphOption = async (context, settings) => {
  agentFilters,
  taskManager,
  };
- const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
  const config = settings2GraphAIConfig(settings);
  // We need to get google's auth token only if the google is the text2image provider.
- if (imageAgentInfo.provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
+ if (provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
  userAssert(!!process.env.GOOGLE_PROJECT_ID, "GOOGLE_PROJECT_ID is not set");
  GraphAILogger.log("google was specified as text2image engine");
  const token = await googleAuth();
@@ -372,7 +369,7 @@ const prepareGenerateImages = async (context) => {
  const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
  const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
  mkdir(imageProjectDirPath);
- const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
  const htmlImageAgentInfo = MulmoPresentationStyleMethods.getHtmlImageAgentInfo(context.presentationStyle);
  const imageRefs = await getImageRefs(context);
  // Determine movie agent based on provider
@@ -386,10 +383,9 @@ const prepareGenerateImages = async (context) => {
  return "movieGoogleAgent";
  }
  };
- GraphAILogger.info(`text2image: provider=${imageAgentInfo.provider} model=${imageAgentInfo.imageParams.model}`);
+ GraphAILogger.info(`text2image: provider=${provider} model=${context.presentationStyle.imageParams?.model}`);
  const injections = {
  context,
- imageAgentInfo,
  htmlImageAgentInfo,
  movieAgentInfo: {
  agent: getMovieAgent(),
@@ -404,7 +400,7 @@ const getConcurrency = (context) => {
  return 4;
  }
  const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
- if (imageAgentInfo.provider === "openai") {
+ if (imageAgentInfo.imageParams.provider === "openai") {
  // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
  // dall-e-3: 7,500 RPM、15 images per minute (4 images for max resolution)
  // gpt-image-1:3,000,000 TPM、150 images per minute
@@ -1,5 +1,5 @@
  import { MulmoStudioContext, MulmoCanvasDimension, BeatMediaType, MulmoFillOption } from "../types/index.js";
- export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption) => {
+ export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption, speed: number) => {
  videoId: string;
  videoPart: string;
  };
@@ -6,20 +6,28 @@ import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAud
  import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  // const isMac = process.platform === "darwin";
  const videoCodec = "libx264"; // "h264_videotoolbox" (macOS only) is too noisy
- export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption) => {
+ export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption, speed) => {
  const videoId = `v${inputIndex}`;
  const videoFilters = [];
  // Handle different media types
+ const originalDuration = duration * speed;
  if (mediaType === "image") {
  videoFilters.push("loop=loop=-1:size=1:start=0");
  }
  else if (mediaType === "movie") {
  // For videos, extend with last frame if shorter than required duration
  // tpad will extend the video by cloning the last frame, then trim will ensure exact duration
- videoFilters.push(`tpad=stop_mode=clone:stop_duration=${duration * 2}`); // Use 2x duration to ensure coverage
+ videoFilters.push(`tpad=stop_mode=clone:stop_duration=${originalDuration * 2}`); // Use 2x duration to ensure coverage
  }
  // Common filters for all media types
- videoFilters.push(`trim=duration=${duration}`, "fps=30", "setpts=PTS-STARTPTS");
+ videoFilters.push(`trim=duration=${originalDuration}`, "fps=30");
+ // Apply speed if specified
+ if (speed !== 1.0) {
+ videoFilters.push(`setpts=${1 / speed}*PTS`);
+ }
+ else {
+ videoFilters.push("setpts=PTS-STARTPTS");
+ }
  // Apply scaling based on fill option
  if (fillOption.style === "aspectFill") {
  // For aspect fill: scale to fill the canvas completely, cropping if necessary
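The new `speed` argument above changes both how much source footage is consumed and how it is timestamped. A rough sketch of the filter arithmetic for a movie beat, with illustrative numbers that are not taken from the package:

```typescript
// Illustrative only: mirrors the filter math in getVideoPart for a movie beat.
// A 2x speed-up that has to fill a 5-second slot in the output video:
const speed = 2.0;  // beat.movieParams?.speed ?? 1.0
const duration = 5; // seconds this beat occupies in the final video
const originalDuration = duration * speed; // 10 seconds of source footage are needed

const videoFilters = [
  `tpad=stop_mode=clone:stop_duration=${originalDuration * 2}`, // pad with the last frame in case the source is short
  `trim=duration=${originalDuration}`,                          // keep 10s of (padded) source
  "fps=30",
  `setpts=${1 / speed}*PTS`,                                    // 0.5*PTS compresses those 10s into the 5s slot
];
```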
@@ -73,7 +81,13 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const caption = MulmoStudioContextMethods.getCaption(context);
  const start = performance.now();
  const ffmpegContext = FfmpegContextInit();
- const missingIndex = context.studio.beats.findIndex((beat) => !beat.imageFile && !beat.movieFile);
+ const missingIndex = context.studio.beats.findIndex((studioBeat, index) => {
+ const beat = context.studio.script.beats[index];
+ if (beat.image?.type === "voice_over") {
+ return false; // Voice-over does not have either imageFile or movieFile.
+ }
+ return !studioBeat.imageFile && !studioBeat.movieFile;
+ });
  if (missingIndex !== -1) {
  GraphAILogger.info(`ERROR: beat.imageFile or beat.movieFile is not set on beat ${missingIndex}.`);
  return false;
@@ -86,6 +100,11 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const beatTimestamps = [];
  context.studio.beats.reduce((timestamp, studioBeat, index) => {
  const beat = context.studio.script.beats[index];
+ if (beat.image?.type === "voice_over") {
+ filterComplexVideoIds.push(undefined);
+ beatTimestamps.push(timestamp);
+ return timestamp; // Skip voice-over beats.
+ }
  const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
  if (!sourceFile) {
  throw new Error(`studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
@@ -105,23 +124,27 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  }
  return 0;
  })();
- const duration = studioBeat.duration + extraPadding;
+ // The movie duration is bigger in case of voice-over.
+ const duration = Math.max(studioBeat.duration + extraPadding, studioBeat.movieDuration ?? 0);
  // Get fillOption from merged imageParams (global + beat-specific)
  const globalFillOption = context.presentationStyle.movieParams?.fillOption;
  const beatFillOption = beat.movieParams?.fillOption;
  const defaultFillOption = mulmoFillOptionSchema.parse({}); // let the schema infer the default value
  const fillOption = { ...defaultFillOption, ...globalFillOption, ...beatFillOption };
- const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption);
+ const speed = beat.movieParams?.speed ?? 1.0;
+ const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption, speed);
  ffmpegContext.filterComplex.push(videoPart);
+ /*
  if (caption && studioBeat.captionFile) {
- const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
- const compositeVideoId = `c${index}`;
- ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
- filterComplexVideoIds.push(compositeVideoId);
- }
- else {
- filterComplexVideoIds.push(videoId);
+ // NOTE: This works for normal beats, but not for voice-over beats.
+ const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
+ const compositeVideoId = `c${index}`;
+ ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
+ filterComplexVideoIds.push(compositeVideoId);
+ } else {
  }
+ */
+ filterComplexVideoIds.push(videoId);
  if (context.presentationStyle.movieParams?.transition && index < context.studio.beats.length - 1) {
  const sourceId = filterComplexVideoIds.pop();
  ffmpegContext.filterComplex.push(`[${sourceId}]split=2[${sourceId}_0][${sourceId}_1]`);
@@ -135,7 +158,8 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  transitionVideoIds.push(`${sourceId}_1`);
  }
  }
- if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0) {
+ // NOTE: We don't support audio if the speed is not 1.0.
+ if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0 && speed === 1.0) {
  const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
  filterComplexAudioIds.push(audioId);
  ffmpegContext.filterComplex.push(audioPart);
@@ -148,7 +172,26 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  // console.log("*** images", images.audioIds);
  // Concatenate the trimmed images
  const concatVideoId = "concat_video";
- ffmpegContext.filterComplex.push(`${filterComplexVideoIds.map((id) => `[${id}]`).join("")}concat=n=${context.studio.beats.length}:v=1:a=0[${concatVideoId}]`);
+ const videoIds = filterComplexVideoIds.filter((id) => id !== undefined); // filter out voice-over beats
+ ffmpegContext.filterComplex.push(`${videoIds.map((id) => `[${id}]`).join("")}concat=n=${videoIds.length}:v=1:a=0[${concatVideoId}]`);
+ // Overlay voice-over captions
+ const captionedVideoId = (() => {
+ const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+ if (caption && beatsWithCaptions.length > 0) {
+ const introPadding = context.presentationStyle.audioParams.introPadding;
+ return beatsWithCaptions.reduce((acc, beat, index) => {
+ const { startAt, duration, captionFile } = beat;
+ if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
+ const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
+ const compositeVideoId = `oc${index}`;
+ ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
+ return compositeVideoId;
+ }
+ return acc;
+ }, concatVideoId);
+ }
+ return concatVideoId;
+ })();
  // Add tranditions if needed
  const mixedVideoId = (() => {
  if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
@@ -175,10 +218,11 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
  }
  return outputId;
- }, concatVideoId);
+ }, captionedVideoId);
  }
- return concatVideoId;
+ return captionedVideoId;
  })();
+ GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
  const audioIndex = FfmpegContextAddInput(ffmpegContext, audioArtifactFilePath); // Add audio input
  const artifactAudioId = `${audioIndex}:a`;
  const ffmpegContextAudioId = (() => {
@@ -1,10 +1,12 @@
  import { assert, GraphAILogger } from "graphai";
  import { silent60secPath } from "../utils/file.js";
  import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
+ import { userAssert } from "../utils/utils.js";
  const getMovieDulation = async (beat) => {
  if (beat.image?.type === "movie" && (beat.image.source.kind === "url" || beat.image.source.kind === "path")) {
  const pathOrUrl = beat.image.source.kind === "url" ? beat.image.source.url : beat.image.source.path;
- return await ffmpegGetMediaDuration(pathOrUrl);
+ const speed = beat.movieParams?.speed ?? 1.0;
+ return (await ffmpegGetMediaDuration(pathOrUrl)) / speed;
  }
  return 0;
  };
@@ -65,7 +67,45 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  const mediaDurations = await getMediaDurations(context);
  const beatDurations = [];
  context.studio.script.beats.forEach((beat, index) => {
+ if (beatDurations.length > index) {
+ // The current beat has already been processed.
+ return;
+ }
+ assert(beatDurations.length === index, "beatDurations.length !== index");
  const { audioDuration, movieDuration } = mediaDurations[index];
+ // Check if we are processing a voice-over beat.
+ if (movieDuration > 0) {
+ const group = [index];
+ for (let i = index + 1; i < context.studio.beats.length && context.studio.script.beats[i].image?.type === "voice_over"; i++) {
+ group.push(i);
+ }
+ if (group.length > 1) {
+ group.reduce((remaining, idx, iGroup) => {
+ const subBeatDurations = mediaDurations[idx];
+ userAssert(subBeatDurations.audioDuration <= remaining, `subBeatDurations.audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
+ if (iGroup === group.length - 1) {
+ beatDurations.push(remaining);
+ subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
+ return 0;
+ }
+ const nextBeat = context.studio.script.beats[idx + 1];
+ assert(nextBeat.image?.type === "voice_over", "nextBeat.image.type !== voice_over");
+ const voiceStartAt = nextBeat.image?.startAt;
+ if (voiceStartAt) {
+ const remainingDuration = movieDuration - voiceStartAt;
+ const duration = remaining - remainingDuration;
+ userAssert(duration >= 0, `duration(${duration}) < 0`);
+ beatDurations.push(duration);
+ subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
+ userAssert(subBeatDurations.silenceDuration >= 0, `subBeatDurations.silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
+ return remainingDuration;
+ }
+ beatDurations.push(subBeatDurations.audioDuration);
+ return remaining - subBeatDurations.audioDuration;
+ }, movieDuration);
+ return;
+ }
+ }
  // Check if the current beat has media and the next beat does not have media.
  if (audioDuration > 0) {
  // Check if the current beat has spilled over audio.
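To make the voice-over grouping above concrete, here is a small worked example with hypothetical numbers that are not taken from the package: a 12-second movie beat followed by two `voice_over` beats whose `image.startAt` values are 4 s and 9 s.

```typescript
// Hypothetical values; the reduce above walks the group with `remaining` starting at movieDuration.
const movieDuration = 12;
// movie beat:   next voice-over starts at 4 -> duration = 12 - (12 - 4) = 4, remaining = 8
// voice-over 1: next voice-over starts at 9 -> duration = 8  - (12 - 9) = 5, remaining = 3
// voice-over 2: last beat in the group      -> duration = remaining     = 3
const beatDurations = [4, 5, 3]; // sums back to movieDuration, keeping the audio track aligned with the movie
// Each beat's silenceDuration is its allocated duration minus its own audioDuration.
```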
@@ -111,17 +151,15 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  }
  else if (movieDuration > 0) {
  // This beat has only a movie, not audio.
- assert(beatDurations.length === index, "beatDurations.length !== index");
  beatDurations.push(movieDuration);
  mediaDurations[index].silenceDuration = movieDuration;
  }
- else if (beatDurations.length === index) {
+ else {
  // The current beat has no audio, nor no spilled over audio
  const beatDuration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
  beatDurations.push(beatDuration);
  mediaDurations[index].silenceDuration = beatDuration;
  }
- // else { Skip this beat if the duration has been already added as a group }
  });
  assert(beatDurations.length === context.studio.beats.length, "beatDurations.length !== studio.beats.length");
  // We cannot reuse longSilentId. We need to explicitly split it for each beat.
@@ -152,9 +190,19 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  const result = {
  studio: {
  ...context.studio,
- beats: context.studio.beats.map((studioBeat, index) => ({ ...studioBeat, duration: beatDurations[index] })),
+ beats: context.studio.beats.map((studioBeat, index) => ({
+ ...studioBeat,
+ duration: beatDurations[index],
+ audioDuration: mediaDurations[index].audioDuration,
+ movieDuration: mediaDurations[index].movieDuration,
+ silenceDuration: mediaDurations[index].silenceDuration,
+ })),
  },
  };
+ result.studio.beats.reduce((acc, beat) => {
+ beat.startAt = acc;
+ return acc + beat.duration;
+ }, 0);
  // context.studio = result.studio; // TODO: removing this breaks test/test_movie.ts
  return {
  ...context,
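The `startAt` values written just above are a running sum of the final beat durations; a minimal sketch with hypothetical durations:

```typescript
// Hypothetical durations; each beat starts where the previous one ended.
const beats = [{ duration: 4, startAt: 0 }, { duration: 5, startAt: 0 }, { duration: 3, startAt: 0 }];
beats.reduce((acc, beat) => {
  beat.startAt = acc;         // 0, 4, 9
  return acc + beat.duration; // 4, 9, 12
}, 0);
// These startAt values (offset by audioParams.introPadding) drive the
// enable='between(t, ...)' window used to overlay caption images in the movie step above.
```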
@@ -29,12 +29,13 @@ export const ttsOpenaiAgent = async ({ namedInputs, params, config }) => {
  if (e && typeof e === "object" && "error" in e) {
  GraphAILogger.info("tts_openai_agent: ");
  GraphAILogger.info(e.error);
+ throw new Error("TTS OpenAI Error: " + JSON.stringify(e.error, null, 2));
  }
  else if (e instanceof Error) {
  GraphAILogger.info("tts_openai_agent: ");
  GraphAILogger.info(e.message);
+ throw new Error("TTS OpenAI Error: " + e.message);
  }
- throw new Error("TTS OpenAI Error");
  }
  };
  const ttsOpenaiAgentInfo = {
@@ -34,6 +34,7 @@ export const handler = async (argv) => {
  const context = { outDirPath, templateName: template, urls, filename: filename, cacheDirPath, llm_model, llm, verbose };
  if (interactive) {
  await createMulmoScriptInteractively(context);
+ return;
  }
  if (inputFile) {
  await createMulmoScriptFromFile(inputFile, context);
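With the added `return`, the script-source options become mutually exclusive in the order documented in the README notes above (-i over --input-file over -u). A simplified sketch of the resulting control flow; the URL branch is not shown in this diff and is only indicated as a comment:

```typescript
// Simplified sketch of the handler's precedence after this change.
if (interactive) {
  await createMulmoScriptInteractively(context);
  return; // newly added: file and URL handling are skipped entirely
}
if (inputFile) {
  await createMulmoScriptFromFile(inputFile, context);
} else {
  // URL-based scripting (-u) would run here; per the README notes it is only
  // reached when neither -i nor --input-file is given (branch not shown in this diff).
}
```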
@@ -1,5 +1,5 @@
  import "dotenv/config";
- import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData } from "../types/index.js";
+ import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData, Text2ImageProvider } from "../types/index.js";
  export declare const MulmoPresentationStyleMethods: {
  getCanvasSize(presentationStyle: MulmoPresentationStyle): MulmoCanvasDimension;
  getSpeechProvider(presentationStyle: MulmoPresentationStyle): Text2SpeechProvider;
@@ -9,7 +9,8 @@ export declare const MulmoPresentationStyleMethods: {
  getSpeaker(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeakerData;
  getProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
  getVoiceId(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
- getImageAgentInfo(presentationStyle: MulmoPresentationStyle): Text2ImageAgentInfo;
+ getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
+ getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
  getHtmlImageAgentInfo(presentationStyle: MulmoPresentationStyle): Text2HtmlAgentInfo;
  getImageType(_: MulmoPresentationStyle, beat: MulmoBeat): BeatMediaType;
  };
@@ -57,17 +57,21 @@ export const MulmoPresentationStyleMethods = {
  const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
  return speaker.voiceId;
  },
- getImageAgentInfo(presentationStyle) {
+ getText2ImageProvider(provider) {
+ return text2ImageProviderSchema.parse(provider);
+ },
+ getImageAgentInfo(presentationStyle, beat) {
  // Notice that we copy imageParams from presentationStyle and update
  // provider and model appropriately.
- const provider = text2ImageProviderSchema.parse(presentationStyle.imageParams?.provider);
+ const imageParams = { ...presentationStyle.imageParams, ...beat?.imageParams };
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(imageParams?.provider);
  const defaultImageParams = {
+ provider,
  model: provider === "openai" ? (process.env.DEFAULT_OPENAI_IMAGE_MODEL ?? defaultOpenAIImageModel) : undefined,
  };
  return {
- provider,
  agent: provider === "google" ? "imageGoogleAgent" : "imageOpenaiAgent",
- imageParams: { ...defaultImageParams, ...presentationStyle.imageParams },
+ imageParams: { ...defaultImageParams, ...imageParams },
  };
  },
  getHtmlImageAgentInfo(presentationStyle) {
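With `getImageAgentInfo` now accepting an optional beat, beat-level `imageParams` override the presentation-style defaults before the provider and agent are resolved. A minimal usage sketch; the style and model values are hypothetical:

```typescript
// Hypothetical inputs illustrating the merge order in getImageAgentInfo.
const presentationStyle = { imageParams: { provider: "openai", style: "watercolor" } };
const beat = { imageParams: { provider: "google", model: "some-imagen-model" } };

const info = MulmoPresentationStyleMethods.getImageAgentInfo(presentationStyle, beat);
// info.agent                -> "imageGoogleAgent"   (beat-level provider wins)
// info.imageParams.provider -> "google"
// info.imageParams.style    -> "watercolor"         (inherited from the presentation style)
// info.imageParams.model    -> "some-imagen-model"  (beat-level value)
```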