mulmocast 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/actions/audio.js +8 -5
- package/lib/actions/image_agents.d.ts +3 -3
- package/lib/actions/image_references.js +2 -1
- package/lib/actions/images.js +19 -11
- package/lib/actions/movie.js +3 -2
- package/lib/actions/translate.d.ts +4 -1
- package/lib/actions/translate.js +6 -3
- package/lib/agents/add_bgm_agent.js +1 -1
- package/lib/agents/combine_audio_files_agent.js +116 -93
- package/lib/agents/movie_replicate_agent.js +1 -1
- package/lib/agents/tts_elevenlabs_agent.d.ts +2 -1
- package/lib/agents/tts_elevenlabs_agent.js +5 -4
- package/lib/agents/tts_google_agent.d.ts +2 -9
- package/lib/agents/tts_nijivoice_agent.d.ts +2 -1
- package/lib/agents/tts_nijivoice_agent.js +4 -5
- package/lib/agents/tts_openai_agent.d.ts +2 -13
- package/lib/agents/tts_openai_agent.js +4 -3
- package/lib/index.browser.d.ts +1 -2
- package/lib/index.browser.js +2 -3
- package/lib/index.common.d.ts +2 -0
- package/lib/index.common.js +3 -0
- package/lib/index.js +1 -0
- package/lib/index.node.d.ts +7 -0
- package/lib/index.node.js +8 -0
- package/lib/methods/mulmo_presentation_style.d.ts +1 -0
- package/lib/methods/mulmo_presentation_style.js +4 -0
- package/lib/types/agent.d.ts +29 -2
- package/lib/types/agent.js +0 -1
- package/lib/types/schema.d.ts +344 -254
- package/lib/types/schema.js +5 -3
- package/lib/utils/context.d.ts +24 -19
- package/lib/utils/ffmpeg_utils.d.ts +4 -1
- package/lib/utils/ffmpeg_utils.js +2 -1
- package/lib/utils/image_plugins/image.d.ts +2 -2
- package/lib/utils/image_plugins/movie.d.ts +2 -2
- package/lib/utils/preprocess.d.ts +21 -18
- package/lib/utils/provider2agent.d.ts +4 -0
- package/lib/utils/provider2agent.js +6 -0
- package/lib/utils/utils.js +6 -0
- package/package.json +6 -6
package/lib/actions/audio.js
CHANGED

@@ -34,12 +34,13 @@ const getAudioParam = (presentationStyle, beat) => {
     // Use speaker-specific provider if available, otherwise fall back to script-level provider
     const provider = MulmoPresentationStyleMethods.getTTSProvider(presentationStyle, beat);
     const speechOptions = MulmoPresentationStyleMethods.getSpeechOptions(presentationStyle, beat);
-    …
+    const model = MulmoPresentationStyleMethods.getTTSModel(presentationStyle, beat);
+    return { voiceId, provider, speechOptions, model };
 };
 export const getBeatAudioPath = (text, context, beat, lang) => {
     const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
-    const { voiceId, provider, speechOptions } = getAudioParam(context.presentationStyle, beat);
-    const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider].join(":");
+    const { voiceId, provider, speechOptions, model } = getAudioParam(context.presentationStyle, beat);
+    const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider, model ?? ""].join(":");
     const audioFileName = `${context.studio.filename}_${text2hash(hash_string)}`;
     const audioFile = getAudioFilePath(audioDirPath, context.studio.filename, audioFileName, lang);
     return getAudioPath(context, beat, audioFile);
@@ -48,15 +49,16 @@ const preprocessor = (namedInputs) => {
     const { beat, studioBeat, multiLingual, context } = namedInputs;
     const { lang, presentationStyle } = context;
     const text = localizedText(beat, multiLingual, lang);
-    const { voiceId, provider, speechOptions } = getAudioParam(presentationStyle, beat);
+    const { voiceId, provider, speechOptions, model } = getAudioParam(presentationStyle, beat);
     const audioPath = getBeatAudioPath(text, context, beat, lang);
-    studioBeat.audioFile = audioPath; // TODO
+    studioBeat.audioFile = audioPath; // TODO: Passing by reference is difficult to maintain, so pass it using graphai inputs
     const needsTTS = !beat.audio && audioPath !== undefined;
     return {
         ttsAgent: provider2TTSAgent[provider].agentName,
         text,
         voiceId,
         speechOptions,
+        model,
         audioPath,
         studioBeat,
         needsTTS,
@@ -94,6 +96,7 @@ const graph_tts = {
             voice: ":preprocessor.voiceId",
             speed: ":preprocessor.speechOptions.speed",
             instructions: ":preprocessor.speechOptions.instruction",
+            model: ":preprocessor.model",
         },
     },
 },
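The audio.js change above folds the TTS model into the audio cache key, so switching models regenerates audio instead of silently reusing a file produced by a different model. A minimal sketch of that idea, using Node's crypto module rather than the package's own text2hash helper (which this diff does not show); names here are illustrative, not mulmocast code:

import { createHash } from "crypto";

// Hypothetical helper: derive a model-aware cache key for a TTS request.
const ttsCacheKey = (text: string, voiceId: string, provider: string, model?: string, instruction = "", speed = 1.0): string => {
  const hashString = [text, voiceId, instruction, speed, provider, model ?? ""].join(":");
  return createHash("sha256").update(hashString).digest("hex");
};

const keyA = ttsCacheKey("Hello world", "alloy", "openai", "tts-1");
const keyB = ttsCacheKey("Hello world", "alloy", "openai", "gpt-4o-mini-tts");
console.log(keyA !== keyB); // true: a different model now yields a different cached audio file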
package/lib/actions/image_agents.d.ts
CHANGED

@@ -14,8 +14,8 @@ export declare const imagePreprocessAgent: (namedInputs: {
     referenceImageForMovie: string | undefined;
     imageParams: {
         provider: string;
-        style?: string | undefined;
         model?: string | undefined;
+        style?: string | undefined;
         moderation?: string | undefined;
         images?: Record<string, {
             type: "image";
@@ -58,8 +58,8 @@ export declare const imagePreprocessAgent: (namedInputs: {
     };
     imageParams: {
         provider: string;
-        style?: string | undefined;
         model?: string | undefined;
+        style?: string | undefined;
         moderation?: string | undefined;
         images?: Record<string, {
             type: "image";
@@ -105,8 +105,8 @@ export declare const imagePreprocessAgent: (namedInputs: {
     };
     imageParams: {
         provider: string;
-        style?: string | undefined;
         model?: string | undefined;
+        style?: string | undefined;
         moderation?: string | undefined;
         images?: Record<string, {
             type: "image";
package/lib/actions/image_references.js
CHANGED

@@ -1,5 +1,5 @@
 import fs from "fs";
-import { GraphAI } from "graphai";
+import { GraphAI, GraphAILogger } from "graphai";
 import { getReferenceImagePath } from "../utils/file.js";
 import { getExtention } from "../utils/utils.js";
 import { graphOption } from "./images.js";
@@ -13,6 +13,7 @@ export const generateReferenceImage = async (inputs) => {
     // generate image
     const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
     const prompt = `${image.prompt}\n${imageAgentInfo.imageParams.style || ""}`;
+    GraphAILogger.info(`Generating reference image for ${key}: ${prompt}`);
     const image_graph_data = {
         version: 0.5,
         nodes: {
package/lib/actions/images.js
CHANGED

@@ -11,7 +11,7 @@ import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../met
 import { getOutputStudioFilePath, mkdir } from "../utils/file.js";
 import { fileCacheAgentFilter } from "../utils/filters.js";
 import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
-import { extractImageFromMovie } from "../utils/ffmpeg_utils.js";
+import { extractImageFromMovie, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
 import { getImageRefs } from "./image_references.js";
 import { imagePreprocessAgent, imagePluginAgent, htmlImageGeneratorAgent } from "./image_agents.js";
 const vanillaAgents = vanilla.default ?? vanilla;
@@ -166,16 +166,30 @@ const beat_graph_data = {
         },
         defaultValue: {},
     },
+    audioChecker: {
+        if: ":preprocessor.movieFile",
+        agent: async (namedInputs) => {
+            const { hasAudio } = await ffmpegGetMediaDuration(namedInputs.movieFile);
+            return { hasMovieAudio: hasAudio };
+        },
+        inputs: {
+            onComplete: [":movieGenerator"], // to wait for movieGenerator to finish
+            movieFile: ":preprocessor.movieFile",
+        },
+        defaultValue: {},
+    },
     output: {
         agent: "copyAgent",
         inputs: {
-            onComplete: [":imageFromMovie", ":htmlImageGenerator"], // to wait for imageFromMovie to finish
+            onComplete: [":imageFromMovie", ":htmlImageGenerator", ":audioChecker"], // to wait for imageFromMovie to finish
             imageFile: ":preprocessor.imagePath",
             movieFile: ":preprocessor.movieFile",
+            hasMovieAudio: ":audioChecker.hasMovieAudio",
         },
         output: {
             imageFile: ".imageFile",
             movieFile: ".movieFile",
+            hasMovieAudio: ".hasMovieAudio",
         },
         isResult: true,
     },
@@ -279,17 +293,11 @@ export const graphOption = async (context, settings) => {
     const config = settings2GraphAIConfig(settings, process.env);
     // We need to get google's auth token only if the google is the text2image provider.
     if (provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
-        userAssert(!! …
+        userAssert(!!config.movieGoogleAgent || !!config.imageGoogleAgent, "GOOGLE_PROJECT_ID is not set");
         GraphAILogger.log("google was specified as text2image engine");
         const token = await googleAuth();
-        config["imageGoogleAgent"] = …
-            …
-            token,
-        };
-        config["movieGoogleAgent"] = {
-            projectId: process.env.GOOGLE_PROJECT_ID,
-            token,
-        };
+        config["imageGoogleAgent"].token = token;
+        config["movieGoogleAgent"].token = token;
     }
     options.config = config;
     return options;
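Throughout this release, ffmpegGetMediaDuration returns an object ({ duration, hasAudio }) instead of a bare number, which is what lets the new audioChecker node above detect whether a generated or referenced movie carries an audio track. The utility's actual implementation is not part of this diff; a sketch of a helper with a compatible shape, assuming fluent-ffmpeg's ffprobe is available:

import ffmpeg from "fluent-ffmpeg";

// Hypothetical stand-in for the new contract: duration in seconds plus an audio-stream flag.
export const probeMedia = (pathOrUrl: string): Promise<{ duration: number; hasAudio: boolean }> =>
  new Promise((resolve, reject) => {
    ffmpeg.ffprobe(pathOrUrl, (err, metadata) => {
      if (err) return reject(err);
      const duration = metadata.format?.duration ?? 0;
      const hasAudio = (metadata.streams ?? []).some((stream) => stream.codec_type === "audio");
      resolve({ duration, hasAudio });
    });
  });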
package/lib/actions/movie.js
CHANGED

@@ -204,8 +204,9 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
         videoIdsForBeats.push(videoId);
     }
     // NOTE: We don't support audio if the speed is not 1.0.
-    …
-    …
+    const movieVolume = beat.audioParams?.movieVolume ?? 1.0;
+    if (studioBeat.hasMovieAudio && movieVolume > 0.0 && speed === 1.0) {
+        const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, movieVolume);
         audioIdsFromMovieBeats.push(audioId);
         ffmpegContext.filterComplex.push(audioPart);
     }
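The practical effect of the movie.js change: a beat's movie audio is only mixed into the final soundtrack when the clip actually contains an audio stream, its movieVolume is above zero, and playback speed is untouched. Expressed as a small predicate (a paraphrase of the added condition, not code from the package; argument names are illustrative):

const shouldMixMovieAudio = (hasMovieAudio: boolean, movieVolume: number, speed: number): boolean =>
  hasMovieAudio && movieVolume > 0.0 && speed === 1.0;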
package/lib/actions/translate.d.ts
CHANGED

@@ -1,4 +1,7 @@
 import "dotenv/config";
 import type { CallbackFunction } from "graphai";
 import { MulmoStudioContext } from "../types/index.js";
-export declare const translate: (context: MulmoStudioContext, …
+export declare const translate: (context: MulmoStudioContext, args?: {
+    callbacks?: CallbackFunction[];
+    settings?: Record<string, string>;
+}) => Promise<void>;
package/lib/actions/translate.js
CHANGED

@@ -4,6 +4,7 @@ import * as agents from "@graphai/vanilla";
 import { openAIAgent } from "@graphai/openai_agent";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
 import { recursiveSplitJa, replacementsJa, replacePairsJa } from "../utils/string.js";
+import { settings2GraphAIConfig } from "../utils/utils.js";
 import { getOutputMultilingualFilePath, mkdir, writingMessage } from "../utils/file.js";
 import { translateSystemPrompt, translatePrompts } from "../utils/prompt.js";
 import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
@@ -208,15 +209,17 @@ const agentFilters = [
 ];
 const defaultLang = "en";
 const targetLangs = ["ja", "en"];
-export const translate = async (context, …
+export const translate = async (context, args) => {
+    const { settings, callbacks } = args ?? {};
     try {
         MulmoStudioContextMethods.setSessionState(context, "multiLingual", true);
         const fileName = MulmoStudioContextMethods.getFileName(context);
         const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
         const outputMultilingualFilePath = getOutputMultilingualFilePath(outDirPath, fileName);
         mkdir(outDirPath);
-        …
-        …
+        const config = settings2GraphAIConfig(settings, process.env);
+        assert(!!config?.openAIAgent?.apiKey, "The OPENAI_API_KEY environment variable is missing or empty");
+        const graph = new GraphAI(translateGraph, { ...vanillaAgents, fileWriteAgent, openAIAgent }, { agentFilters, config });
         graph.injectValue("context", context);
         graph.injectValue("defaultLang", defaultLang);
         graph.injectValue("targetLangs", targetLangs);
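translate() now takes an optional argument object instead of a second positional parameter, and builds its GraphAI config from the supplied settings (falling back to process.env) via settings2GraphAIConfig. A hypothetical call under the new signature; whether translate is re-exported from the package entry point and which setting keys settings2GraphAIConfig honors are assumptions, not shown in this diff:

import { translate } from "mulmocast"; // assumed export path

await translate(context, {
  settings: { OPENAI_API_KEY: "sk-..." }, // forwarded into the GraphAI config instead of being read ad hoc from process.env
  callbacks: [],                          // optional GraphAI callbacks, as before
});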
package/lib/agents/add_bgm_agent.js
CHANGED

@@ -10,7 +10,7 @@ const addBGMAgent = async ({ namedInputs, params, }) => {
     if (!musicFile.match(/^http/) && !fs.existsSync(musicFile)) {
         throw new Error(`AddBGMAgent musicFile not exist: ${musicFile}`);
     }
-    const speechDuration = await ffmpegGetMediaDuration(voiceFile);
+    const { duration: speechDuration } = await ffmpegGetMediaDuration(voiceFile);
     const introPadding = context.presentationStyle.audioParams.introPadding;
     const outroPadding = context.presentationStyle.audioParams.outroPadding;
     const totalDuration = speechDuration + introPadding + outroPadding;
package/lib/agents/combine_audio_files_agent.js
CHANGED

@@ -1,14 +1,15 @@
 import { assert, GraphAILogger } from "graphai";
 import { silent60secPath } from "../utils/file.js";
-import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
+import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration, } from "../utils/ffmpeg_utils.js";
 import { userAssert } from "../utils/utils.js";
-const …
+const getMovieDuration = async (beat) => {
     if (beat.image?.type === "movie" && (beat.image.source.kind === "url" || beat.image.source.kind === "path")) {
         const pathOrUrl = beat.image.source.kind === "url" ? beat.image.source.url : beat.image.source.path;
         const speed = beat.movieParams?.speed ?? 1.0;
-        …
+        const { duration, hasAudio } = await ffmpegGetMediaDuration(pathOrUrl);
+        return { duration: duration / speed, hasAudio };
     }
-    return 0;
+    return { duration: 0, hasAudio: false };
 };
 const getPadding = (context, beat, index) => {
     if (beat.audioParams?.padding !== undefined) {
@@ -29,16 +30,17 @@ const getTotalPadding = (padding, movieDuration, audioDuration, duration) => {
     }
     return padding;
 };
-const …
+const getMediaDurationsOfAllBeats = (context) => {
     return Promise.all(context.studio.beats.map(async (studioBeat, index) => {
         const beat = context.studio.script.beats[index];
-        const movieDuration = await …
-        const audioDuration = studioBeat.audioFile ? await ffmpegGetMediaDuration(studioBeat.audioFile) : 0;
+        const { duration: movieDuration, hasAudio: hasMovieAudio } = await getMovieDuration(beat);
+        const audioDuration = studioBeat.audioFile ? (await ffmpegGetMediaDuration(studioBeat.audioFile)).duration : 0;
         return {
             movieDuration,
             audioDuration,
             hasMedia: movieDuration + audioDuration > 0,
             silenceDuration: 0,
+            hasMovieAudio,
         };
     }));
 };
@@ -60,11 +62,98 @@ const getGroupBeatDurations = (context, group, audioDuration) => {
     });
     return durations;
 };
+const getInputIds = (context, mediaDurations, ffmpegContext, silentIds) => {
+    const inputIds = [];
+    context.studio.beats.forEach((studioBeat, index) => {
+        const { silenceDuration } = mediaDurations[index];
+        const paddingId = `[padding_${index}]`;
+        if (studioBeat.audioFile) {
+            const audioId = FfmpegContextInputFormattedAudio(ffmpegContext, studioBeat.audioFile);
+            inputIds.push(audioId);
+        }
+        if (silenceDuration > 0) {
+            const silentId = silentIds.pop();
+            ffmpegContext.filterComplex.push(`${silentId}atrim=start=0:end=${silenceDuration}${paddingId}`);
+            inputIds.push(paddingId);
+        }
+    });
+    return inputIds;
+};
+const voiceOverProcess = (context, mediaDurations, movieDuration, beatDurations, groupLength) => {
+    return (remaining, idx, iGroup) => {
+        const subBeatDurations = mediaDurations[idx];
+        userAssert(subBeatDurations.audioDuration <= remaining, `Duration Overflow: At index(${idx}) audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
+        if (iGroup === groupLength - 1) {
+            beatDurations.push(remaining);
+            subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
+            return 0;
+        }
+        const nextBeat = context.studio.script.beats[idx + 1];
+        assert(nextBeat.image?.type === "voice_over", "nextBeat.image.type !== voice_over");
+        const voiceStartAt = nextBeat.image?.startAt;
+        if (voiceStartAt) {
+            const remainingDuration = movieDuration - voiceStartAt;
+            const duration = remaining - remainingDuration;
+            userAssert(duration >= 0, `Invalid startAt: At index(${idx}), avaiable duration(${duration}) < 0`);
+            beatDurations.push(duration);
+            subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
+            userAssert(subBeatDurations.silenceDuration >= 0, `Duration Overwrap: At index(${idx}), silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
+            return remainingDuration;
+        }
+        beatDurations.push(subBeatDurations.audioDuration);
+        return remaining - subBeatDurations.audioDuration;
+    };
+};
+const getVoiceOverGroup = (context, index) => {
+    const group = [index];
+    for (let i = index + 1; i < context.studio.beats.length && context.studio.script.beats[i].image?.type === "voice_over"; i++) {
+        group.push(i);
+    }
+    return group;
+};
+const getSpillOverGroup = (context, mediaDurations, index) => {
+    const group = [index];
+    for (let i = index + 1; i < context.studio.beats.length && !mediaDurations[i].hasMedia; i++) {
+        group.push(i);
+    }
+    return group;
+};
+const spilledOverAudio = (context, group, audioDuration, beatDurations, mediaDurations) => {
+    const groupBeatsDurations = getGroupBeatDurations(context, group, audioDuration);
+    // Yes, the current beat has spilled over audio.
+    const beatsTotalDuration = groupBeatsDurations.reduce((a, b) => a + b, 0);
+    if (beatsTotalDuration > audioDuration + 0.01) {
+        // 0.01 is a tolerance to avoid floating point precision issues
+        group.reduce((remaining, idx, iGroup) => {
+            if (remaining >= groupBeatsDurations[iGroup]) {
+                return remaining - groupBeatsDurations[iGroup];
+            }
+            mediaDurations[idx].silenceDuration = groupBeatsDurations[iGroup] - remaining;
+            return 0;
+        }, audioDuration);
+    }
+    else if (audioDuration > beatsTotalDuration) {
+        // Last beat gets the rest of the audio.
+        groupBeatsDurations[groupBeatsDurations.length - 1] += audioDuration - beatsTotalDuration;
+    }
+    beatDurations.push(...groupBeatsDurations);
+};
+const noSpilledOverAudio = (context, beat, index, movieDuration, audioDuration, beatDurations, mediaDurations) => {
+    // padding is the amount of audio padding specified in the script.
+    const padding = getPadding(context, beat, index);
+    // totalPadding is the amount of audio padding to be added to the audio file.
+    const totalPadding = Math.round(getTotalPadding(padding, movieDuration, audioDuration, beat.duration) * 100) / 100;
+    const beatDuration = audioDuration + totalPadding;
+    beatDurations.push(beatDuration);
+    if (totalPadding > 0) {
+        mediaDurations[index].silenceDuration = totalPadding;
+    }
+};
 const combineAudioFilesAgent = async ({ namedInputs, }) => {
     const { context, combinedFileName } = namedInputs;
     const ffmpegContext = FfmpegContextInit();
     // First, get the audio durations of all beats, taking advantage of multi-threading capability of ffmpeg.
-    const mediaDurations = await …
+    const mediaDurations = await getMediaDurationsOfAllBeats(context);
     const beatDurations = [];
     context.studio.script.beats.forEach((beat, index) => {
         if (beatDurations.length > index) {
@@ -75,91 +164,37 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
         const { audioDuration, movieDuration } = mediaDurations[index];
         // Check if we are processing a voice-over beat.
         if (movieDuration > 0) {
-            const group = …
-            for (let i = index + 1; i < context.studio.beats.length && context.studio.script.beats[i].image?.type === "voice_over"; i++) {
-                group.push(i);
-            }
+            const group = getVoiceOverGroup(context, index);
             if (group.length > 1) {
-                …
-                …
-                userAssert(subBeatDurations.audioDuration <= remaining, `Duration Overflow: At index(${idx}) audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
-                if (iGroup === group.length - 1) {
-                    beatDurations.push(remaining);
-                    subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
-                    return 0;
-                }
-                const nextBeat = context.studio.script.beats[idx + 1];
-                assert(nextBeat.image?.type === "voice_over", "nextBeat.image.type !== voice_over");
-                const voiceStartAt = nextBeat.image?.startAt;
-                if (voiceStartAt) {
-                    const remainingDuration = movieDuration - voiceStartAt;
-                    const duration = remaining - remainingDuration;
-                    userAssert(duration >= 0, `Invalid startAt: At index(${idx}), avaiable duration(${duration}) < 0`);
-                    beatDurations.push(duration);
-                    subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
-                    userAssert(subBeatDurations.silenceDuration >= 0, `Duration Overwrap: At index(${idx}), silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
-                    return remainingDuration;
-                }
-                beatDurations.push(subBeatDurations.audioDuration);
-                return remaining - subBeatDurations.audioDuration;
-                }, movieDuration);
+                GraphAILogger.log(`Voice over group: ${group.length}`);
+                group.reduce(voiceOverProcess(context, mediaDurations, movieDuration, beatDurations, group.length), movieDuration);
                 return;
             }
         }
         // Check if the current beat has media and the next beat does not have media.
         if (audioDuration > 0) {
             // Check if the current beat has spilled over audio.
-            const group = …
-            for (let i = index + 1; i < context.studio.beats.length && !mediaDurations[i].hasMedia; i++) {
-                group.push(i);
-            }
+            const group = getSpillOverGroup(context, mediaDurations, index);
             if (group.length > 1) {
-                …
-                …
-                …
-                if (beatsTotalDuration > audioDuration + 0.01) {
-                    // 0.01 is a tolerance to avoid floating point precision issues
-                    group.reduce((remaining, idx, iGroup) => {
-                        if (remaining >= groupBeatsDurations[iGroup]) {
-                            return remaining - groupBeatsDurations[iGroup];
-                        }
-                        mediaDurations[idx].silenceDuration = groupBeatsDurations[iGroup] - remaining;
-                        return 0;
-                    }, audioDuration);
-                }
-                else {
-                    // Last beat gets the rest of the audio.
-                    if (audioDuration > beatsTotalDuration) {
-                        groupBeatsDurations[groupBeatsDurations.length - 1] += audioDuration - beatsTotalDuration;
-                    }
-                }
-                beatDurations.push(...groupBeatsDurations);
-            }
-            else {
-                // No spilled over audio.
-                assert(beatDurations.length === index, "beatDurations.length !== index");
-                // padding is the amount of audio padding specified in the script.
-                const padding = getPadding(context, beat, index);
-                // totalPadding is the amount of audio padding to be added to the audio file.
-                const totalPadding = Math.round(getTotalPadding(padding, movieDuration, audioDuration, beat.duration) * 100) / 100;
-                const beatDuration = audioDuration + totalPadding;
-                beatDurations.push(beatDuration);
-                if (totalPadding > 0) {
-                    mediaDurations[index].silenceDuration = totalPadding;
-                }
+                GraphAILogger.log(`Spill over group: ${group.length}`);
+                spilledOverAudio(context, group, audioDuration, beatDurations, mediaDurations);
+                return;
             }
+            // No spilled over audio.
+            assert(beatDurations.length === index, "beatDurations.length !== index");
+            noSpilledOverAudio(context, beat, index, movieDuration, audioDuration, beatDurations, mediaDurations);
+            return;
         }
-        …
+        if (movieDuration > 0) {
             // This beat has only a movie, not audio.
             beatDurations.push(movieDuration);
             mediaDurations[index].silenceDuration = movieDuration;
+            return;
         }
-        …
-        …
-        …
-        …
-            mediaDurations[index].silenceDuration = beatDuration;
-        }
+        // The current beat has no audio, nor no spilled over audio
+        const beatDuration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
+        beatDurations.push(beatDuration);
+        mediaDurations[index].silenceDuration = beatDuration;
     });
     assert(beatDurations.length === context.studio.beats.length, "beatDurations.length !== studio.beats.length");
     // We cannot reuse longSilentId. We need to explicitly split it for each beat.
@@ -168,20 +203,7 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
     const longSilentId = FfmpegContextInputFormattedAudio(ffmpegContext, silent60secPath(), undefined, ["-stream_loop", "-1"]);
     ffmpegContext.filterComplex.push(`${longSilentId}asplit=${silentIds.length}${silentIds.join("")}`);
     }
-    const inputIds = …
-    context.studio.beats.forEach((studioBeat, index) => {
-        const { silenceDuration } = mediaDurations[index];
-        const paddingId = `[padding_${index}]`;
-        if (studioBeat.audioFile) {
-            const audioId = FfmpegContextInputFormattedAudio(ffmpegContext, studioBeat.audioFile);
-            inputIds.push(audioId);
-        }
-        if (silenceDuration > 0) {
-            const silentId = silentIds.pop();
-            ffmpegContext.filterComplex.push(`${silentId}atrim=start=0:end=${silenceDuration}${paddingId}`);
-            inputIds.push(paddingId);
-        }
-    });
+    const inputIds = getInputIds(context, mediaDurations, ffmpegContext, silentIds);
     assert(silentIds.length === 0, "silentIds.length !== 0");
     GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
     // Finally, combine all audio files.
@@ -196,6 +218,7 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
             audioDuration: mediaDurations[index].audioDuration,
             movieDuration: mediaDurations[index].movieDuration,
             silenceDuration: mediaDurations[index].silenceDuration,
+            hasMovieAudio: mediaDurations[index].hasMovieAudio,
         })),
     },
 };
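The refactor above mostly extracts the existing beat-duration logic into named helpers (getVoiceOverGroup, getSpillOverGroup, voiceOverProcess, spilledOverAudio, noSpilledOverAudio, getInputIds) without changing the arithmetic, and threads the new hasMovieAudio flag through the studio beats. The spill-over rule is easiest to see with numbers; a toy illustration of the branch where the narration is longer than the beats it spans (values invented, mirroring spilledOverAudio's "last beat gets the rest" branch):

const groupBeatDurations = [3, 4]; // seconds requested by the beats in the spill-over group
const audioDuration = 10;          // narration audio that spills across them
const total = groupBeatDurations.reduce((a, b) => a + b, 0); // 7
if (audioDuration > total) {
  // the last beat absorbs the remaining audio
  groupBeatDurations[groupBeatDurations.length - 1] += audioDuration - total; // -> [3, 7]
}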
package/lib/agents/movie_replicate_agent.js
CHANGED

@@ -62,7 +62,7 @@ export const movieReplicateAgent = async ({ namedInputs, params, config, }) => {
     const { prompt, imagePath } = namedInputs;
     const aspectRatio = getAspectRatio(params.canvasSize);
     const duration = params.duration ?? 5;
-    const apiKey = config?.apiKey …
+    const apiKey = config?.apiKey;
     if (!apiKey) {
         throw new Error("REPLICATE_API_TOKEN environment variable is required");
     }
package/lib/agents/tts_elevenlabs_agent.d.ts
CHANGED

@@ -1,4 +1,5 @@
 import type { AgentFunction, AgentFunctionInfo } from "graphai";
-…
+import type { ElevenlabsTTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult, AgentConfig } from "../types/agent.js";
+export declare const ttsElevenlabsAgent: AgentFunction<ElevenlabsTTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs, AgentConfig>;
 declare const ttsElevenlabsAgentInfo: AgentFunctionInfo;
 export default ttsElevenlabsAgentInfo;
package/lib/agents/tts_elevenlabs_agent.js
CHANGED

@@ -1,18 +1,19 @@
 import { GraphAILogger } from "graphai";
-…
+import { provider2TTSAgent } from "../utils/provider2agent.js";
+export const ttsElevenlabsAgent = async ({ namedInputs, params, config, }) => {
     const { text } = namedInputs;
     const { voice, model, stability, similarityBoost, suppressError } = params;
-    const apiKey = config?.apiKey …
+    const apiKey = config?.apiKey;
     if (!apiKey) {
         throw new Error("ELEVENLABS_API_KEY environment variable is required");
     }
     if (!voice) {
-        throw new Error("Voice ID is required");
+        throw new Error("ELEVENLABS Voice ID is required");
     }
     try {
         const requestBody = {
             text,
-            model_id: model ?? …
+            model_id: model ?? provider2TTSAgent.elevenlabs.defaultModel,
             voice_settings: {
                 stability: stability ?? 0.5,
                 similarity_boost: similarityBoost ?? 0.75,
package/lib/agents/tts_google_agent.d.ts
CHANGED

@@ -1,12 +1,5 @@
 import type { AgentFunction, AgentFunctionInfo } from "graphai";
-…
-…
-    speed: number;
-    suppressError: boolean;
-}, {
-    buffer?: Buffer | null;
-}, {
-    text: string;
-}>;
+import type { GoogleTTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult } from "../types/agent.js";
+export declare const ttsGoogleAgent: AgentFunction<GoogleTTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs>;
 declare const ttsGoogleAgentInfo: AgentFunctionInfo;
 export default ttsGoogleAgentInfo;
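The per-agent declaration files no longer inline their parameter, input, and result shapes; they import shared types from lib/types/agent.d.ts (which this diff lists as growing by 29 lines but does not reproduce). Judging from the inline declarations that were removed, the shared shapes are roughly the following sketch; the exact definitions are an assumption, not copied from types/agent.d.ts:

type AgentTextInputs = { text: string };            // previously inlined as { text: string }
type AgentBufferResult = { buffer?: Buffer | null }; // previously inlined as { buffer?: Buffer | null }
type AgentConfig = { apiKey?: string };              // assumed: the common per-agent config is an API key; the OpenAI variant also took baseURL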
package/lib/agents/tts_nijivoice_agent.d.ts
CHANGED

@@ -1,4 +1,5 @@
 import type { AgentFunction, AgentFunctionInfo } from "graphai";
-…
+import type { NijivoiceTTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult, AgentConfig } from "../types/agent.js";
+export declare const ttsNijivoiceAgent: AgentFunction<NijivoiceTTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs, AgentConfig>;
 declare const ttsNijivoiceAgentInfo: AgentFunctionInfo;
 export default ttsNijivoiceAgentInfo;
package/lib/agents/tts_nijivoice_agent.js
CHANGED

@@ -1,21 +1,20 @@
 import { GraphAILogger, assert } from "graphai";
-const nijovoiceApiKey = process.env.NIJIVOICE_API_KEY ?? "";
 const errorMessage = [
     "TTS NijiVoice: No API key. ",
     "You have the following options:",
     "1. Obtain an API key from Niji Voice (https://platform.nijivoice.com/) and set it as the NIJIVOICE_API_KEY environment variable.",
     '2. Use OpenAI\'s TTS instead of Niji Voice by changing speechParams.provider from "nijivoice" to "openai".',
 ].join("\n");
-export const ttsNijivoiceAgent = async ({ params, namedInputs, config }) => {
+export const ttsNijivoiceAgent = async ({ params, namedInputs, config, }) => {
     const { suppressError, voice, speed, speed_global } = params;
     const { apiKey } = config ?? {};
     const { text } = namedInputs;
-    assert(apiKey …
+    assert(!!apiKey, errorMessage);
     const url = `https://api.nijivoice.com/api/platform/v1/voice-actors/${voice}/generate-voice`;
     const options = {
         method: "POST",
         headers: {
-            "x-api-key": apiKey …
+            "x-api-key": apiKey,
             accept: "application/json",
             "content-type": "application/json",
         },
@@ -31,7 +30,7 @@ export const ttsNijivoiceAgent = async ({ params, namedInputs, config }) => {
     if (voiceJson && voiceJson.generatedVoice && voiceJson.generatedVoice.audioFileDownloadUrl) {
         const audioRes = await fetch(voiceJson.generatedVoice.audioFileDownloadUrl);
         const buffer = Buffer.from(await audioRes.arrayBuffer());
-        return { buffer …
+        return { buffer };
     }
     if (suppressError) {
         return {
package/lib/agents/tts_openai_agent.d.ts
CHANGED

@@ -1,16 +1,5 @@
 import type { AgentFunction, AgentFunctionInfo } from "graphai";
-…
-…
-    voice: string;
-    instructions: string;
-    suppressError: boolean;
-}, {
-    buffer?: Buffer;
-}, {
-    text: string;
-}, {
-    baseURL?: string;
-    apiKey?: string;
-}>;
+import type { OpenAITTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult, OpenAIImageAgentConfig } from "../types/agent.js";
+export declare const ttsOpenaiAgent: AgentFunction<OpenAITTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs, OpenAIImageAgentConfig>;
 declare const ttsOpenaiAgentInfo: AgentFunctionInfo;
 export default ttsOpenaiAgentInfo;
package/lib/agents/tts_openai_agent.js
CHANGED

@@ -1,14 +1,15 @@
 import { GraphAILogger } from "graphai";
 import OpenAI from "openai";
-…
+import { provider2TTSAgent } from "../utils/provider2agent.js";
+export const ttsOpenaiAgent = async ({ namedInputs, params, config, }) => {
     const { text } = namedInputs;
     const { model, voice, suppressError, instructions } = params;
     const { apiKey, baseURL } = config ?? {};
     const openai = new OpenAI({ apiKey, baseURL });
     try {
         const tts_options = {
-            model: model ?? …
-            voice: voice ?? …
+            model: model ?? provider2TTSAgent.openai.defaultModel,
+            voice: voice ?? provider2TTSAgent.openai.defaultVoice,
             input: text,
         };
         if (instructions) {
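A pattern repeated across the TTS agents in this release: API keys no longer come from module-level process.env reads inside each agent, and model/voice fallbacks move to provider2TTSAgent; credentials instead arrive through the GraphAI config object. A sketch of what a caller-side config might look like; the per-agent key names below are assumptions modeled on the openAIAgent / imageGoogleAgent keys visible elsewhere in this diff:

// Hypothetical GraphAI config keyed by agent name (key names assumed, not taken from the diff).
const config = {
  ttsOpenaiAgent: { apiKey: process.env.OPENAI_API_KEY },
  ttsElevenlabsAgent: { apiKey: process.env.ELEVENLABS_API_KEY },
  ttsNijivoiceAgent: { apiKey: process.env.NIJIVOICE_API_KEY },
};
// Passed as GraphAI options, e.g. new GraphAI(graphData, agents, { config }); each agent then
// reads its own entry via the `config` argument seen in the agent signatures above.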
package/lib/index.browser.d.ts
CHANGED