mulmocast 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,18 +12,10 @@ import { MulmoPresentationStyleMethods } from "../methods/index.js";
 import { fileCacheAgentFilter } from "../utils/filters.js";
 import { getAudioArtifactFilePath, getAudioFilePath, getOutputStudioFilePath, resolveDirPath, defaultBGMPath, mkdir, writingMessage } from "../utils/file.js";
 import { text2hash, localizedText, settings2GraphAIConfig } from "../utils/utils.js";
+import { provider2TTSAgent } from "../utils/provider2agent.js";
 import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
 import { MulmoMediaSourceMethods } from "../methods/mulmo_media_source.js";
 const vanillaAgents = agents.default ?? agents;
-// const rion_takanashi_voice = "b9277ce3-ba1c-4f6f-9a65-c05ca102ded0"; // たかなし りおん
-// const ben_carter_voice = "bc06c63f-fef6-43b6-92f7-67f919bd5dae"; // ベン・カーター
-const provider_to_agent = {
-    nijivoice: "ttsNijivoiceAgent",
-    openai: "ttsOpenaiAgent",
-    google: "ttsGoogleAgent",
-    elevenlabs: "ttsElevenlabsAgent",
-    mock: "mediaMockAgent",
-};
 const getAudioPath = (context, beat, audioFile) => {
     if (beat.audio?.type === "audio") {
         const path = MulmoMediaSourceMethods.resolve(beat.audio.source, context);
@@ -40,7 +32,7 @@ const getAudioPath = (context, beat, audioFile) => {
 const getAudioParam = (presentationStyle, beat) => {
     const voiceId = MulmoPresentationStyleMethods.getVoiceId(presentationStyle, beat);
     // Use speaker-specific provider if available, otherwise fall back to script-level provider
-    const provider = MulmoPresentationStyleMethods.getProvider(presentationStyle, beat);
+    const provider = MulmoPresentationStyleMethods.getTTSProvider(presentationStyle, beat);
     const speechOptions = MulmoPresentationStyleMethods.getSpeechOptions(presentationStyle, beat);
     return { voiceId, provider, speechOptions };
 };
@@ -61,7 +53,7 @@ const preprocessor = (namedInputs) => {
     studioBeat.audioFile = audioPath; // TODO
     const needsTTS = !beat.audio && audioPath !== undefined;
     return {
-        ttsAgent: provider_to_agent[provider],
+        ttsAgent: provider2TTSAgent[provider].agentName,
         text,
         voiceId,
         speechOptions,
@@ -186,8 +178,8 @@ export const audioFilePath = (context) => {
 const getConcurrency = (context) => {
     // Check if any speaker uses nijivoice or elevenlabs (providers that require concurrency = 1)
     const hasLimitedConcurrencyProvider = Object.values(context.presentationStyle.speechParams.speakers).some((speaker) => {
-        const provider = speaker.provider ?? context.presentationStyle.speechParams.provider;
-        return provider === "nijivoice" || provider === "elevenlabs";
+        const provider = (speaker.provider ?? context.presentationStyle.speechParams.provider);
+        return provider2TTSAgent[provider].hasLimitedConcurrency;
     });
     return hasLimitedConcurrencyProvider ? 1 : 8;
 };
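
The provider2agent module itself is not part of this diff, but the call sites above pin down the shape of provider2TTSAgent: agentName feeds the GraphAI agent lookup, and hasLimitedConcurrency replaces the hardcoded nijivoice/elevenlabs check. A minimal sketch, assuming the entries of the removed provider_to_agent map carried over and that nijivoice and elevenlabs remain the concurrency-limited providers:

    // Hypothetical reconstruction of lib/utils/provider2agent.js (not shown in this diff)
    export const provider2TTSAgent = {
        nijivoice: { agentName: "ttsNijivoiceAgent", hasLimitedConcurrency: true },
        openai: { agentName: "ttsOpenaiAgent", hasLimitedConcurrency: false },
        google: { agentName: "ttsGoogleAgent", hasLimitedConcurrency: false },
        elevenlabs: { agentName: "ttsElevenlabsAgent", hasLimitedConcurrency: true },
        mock: { agentName: "mediaMockAgent", hasLimitedConcurrency: false },
    };
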
@@ -13,7 +13,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
     imagePath: string | undefined;
     referenceImageForMovie: string | undefined;
     imageParams: {
-        provider: "openai" | "google";
+        provider: string;
         style?: string | undefined;
         model?: string | undefined;
         moderation?: string | undefined;
@@ -44,8 +44,20 @@ export declare const imagePreprocessAgent: (namedInputs: {
 } | {
     imagePath: string;
     imageFromMovie: boolean;
+    movieParams: {
+        speed?: number | undefined;
+        model?: string | undefined;
+        fillOption?: {
+            style: "aspectFit" | "aspectFill";
+        } | undefined;
+        provider?: string | undefined;
+        transition?: {
+            type: "fade" | "slideout_left";
+            duration: number;
+        } | undefined;
+    };
     imageParams: {
-        provider: "openai" | "google";
+        provider: string;
         style?: string | undefined;
         model?: string | undefined;
         moderation?: string | undefined;
@@ -79,8 +91,20 @@ export declare const imagePreprocessAgent: (namedInputs: {
     imageAgentInfo: import("../types/type.js").Text2ImageAgentInfo;
     prompt: string;
     referenceImages: string[];
+    movieParams: {
+        speed?: number | undefined;
+        model?: string | undefined;
+        fillOption?: {
+            style: "aspectFit" | "aspectFill";
+        } | undefined;
+        provider?: string | undefined;
+        transition?: {
+            type: "fade" | "slideout_left";
+            duration: number;
+        } | undefined;
+    };
     imageParams: {
-        provider: "openai" | "google";
+        provider: string;
         style?: string | undefined;
         model?: string | undefined;
         moderation?: string | undefined;
@@ -2,6 +2,7 @@ import { MulmoPresentationStyleMethods, MulmoStudioContextMethods, MulmoBeatMeth
 import { getBeatPngImagePath, getBeatMoviePath } from "../utils/file.js";
 import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
 import { renderHTMLToImage } from "../utils/markdown.js";
+import { GraphAILogger } from "graphai";
 const htmlStyle = (context, beat) => {
     return {
         canvasSize: MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle),
@@ -27,13 +28,15 @@ export const imagePreprocessAgent = async (namedInputs) => {
         // undefined prompt indicates that image generation is not needed
         return { ...returnValue, imagePath: pluginPath, referenceImageForMovie: pluginPath };
     }
+    const movieParams = { ...context.presentationStyle.movieParams, ...beat.movieParams };
+    GraphAILogger.log(`movieParams: ${index}`, movieParams, beat.moviePrompt);
     if (beat.moviePrompt && !beat.imagePrompt) {
-        return { ...returnValue, imagePath, imageFromMovie: true }; // no image prompt, only movie prompt
+        return { ...returnValue, imagePath, imageFromMovie: true, movieParams }; // no image prompt, only movie prompt
     }
     // referenceImages for "edit_image", openai agent.
     const referenceImages = MulmoBeatMethods.getImageReferenceForImageGenerator(beat, imageRefs);
     const prompt = imagePrompt(beat, imageAgentInfo.imageParams.style);
-    return { ...returnValue, imagePath, referenceImageForMovie: imagePath, imageAgentInfo, prompt, referenceImages };
+    return { ...returnValue, imagePath, referenceImageForMovie: imagePath, imageAgentInfo, prompt, referenceImages, movieParams };
 };
 export const imagePluginAgent = async (namedInputs) => {
     const { context, beat, index } = namedInputs;
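
Note the spread order in the new movieParams merge: beat-level values override presentation-level ones. A small illustration with hypothetical values (only kwaivgi/kling-v2.1 appears elsewhere in this diff):

    // Beat-level movieParams win over presentation-level ones (the later spread takes precedence).
    const presentationLevel = { provider: "google", speed: 1 };                   // hypothetical
    const beatLevel = { provider: "replicate", model: "kwaivgi/kling-v2.1" };     // hypothetical
    const movieParams = { ...presentationLevel, ...beatLevel };
    // => { provider: "replicate", speed: 1, model: "kwaivgi/kling-v2.1" }
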
@@ -1,7 +1,14 @@
 import type { GraphOptions, CallbackFunction } from "graphai";
 import { MulmoStudioContext } from "../types/index.js";
 export declare const graphOption: (context: MulmoStudioContext, settings?: Record<string, string>) => Promise<GraphOptions>;
-export declare const images: (context: MulmoStudioContext, settings?: Record<string, string>, callbacks?: CallbackFunction[]) => Promise<MulmoStudioContext>;
+type ImageOptions = {
+    imageAgents: Record<string, unknown>;
+};
+export declare const images: (context: MulmoStudioContext, args?: {
+    settings?: Record<string, string>;
+    callbacks?: CallbackFunction[];
+    options?: ImageOptions;
+}) => Promise<MulmoStudioContext>;
 export declare const generateBeatImage: (inputs: {
     index: number;
     context: MulmoStudioContext;
@@ -10,3 +17,4 @@ export declare const generateBeatImage: (inputs: {
     forceMovie?: boolean;
     forceImage?: boolean;
 }) => Promise<void>;
+export {};
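
This is a breaking change to the images entry point: settings and callbacks move from positional parameters into a single optional args object, and the new options.imageAgents lets callers inject or override GraphAI agents. A hedged call-site sketch (myImageAgent is a hypothetical custom agent; the settings key is illustrative):

    import { images } from "mulmocast";

    // 0.1.3: images(context, settings, callbacks)
    // 0.1.4: one args object
    const newContext = await images(context, {
        settings: { OPENAI_API_KEY: "sk-..." },                        // illustrative key
        options: { imageAgents: { imageOpenaiAgent: myImageAgent } },  // shadows the built-in agent
    });

Because optionImageAgents is spread after defaultAgents in generateImages (see below), an entry with the same name, here imageOpenaiAgent, replaces the built-in one.
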
@@ -2,7 +2,7 @@ import dotenv from "dotenv";
 import fs from "fs";
 import { GraphAI, GraphAILogger, TaskManager } from "graphai";
 import { GoogleAuth } from "google-auth-library";
-import * as agents from "@graphai/vanilla";
+import * as vanilla from "@graphai/vanilla";
 import { openAIAgent } from "@graphai/openai_agent";
 import { anthropicAgent } from "@graphai/anthropic_agent";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
@@ -14,13 +14,19 @@ import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
 import { extractImageFromMovie } from "../utils/ffmpeg_utils.js";
 import { getImageRefs } from "./image_references.js";
 import { imagePreprocessAgent, imagePluginAgent, htmlImageGeneratorAgent } from "./image_agents.js";
-const vanillaAgents = agents.default ?? agents;
+const vanillaAgents = vanilla.default ?? vanilla;
 const imageAgents = {
-    ...vanillaAgents,
     imageGoogleAgent,
+    imageOpenaiAgent,
+};
+const movieAgents = {
     movieGoogleAgent,
     movieReplicateAgent,
-    imageOpenaiAgent,
+};
+const defaultAgents = {
+    ...vanillaAgents,
+    ...imageAgents,
+    ...movieAgents,
     mediaMockAgent,
     fileWriteAgent,
     openAIAgent,
@@ -141,7 +147,7 @@ const beat_graph_data = {
                 mulmoContext: ":context",
             },
             params: {
-                model: ":context.presentationStyle.movieParams.model",
+                model: ":preprocessor.movieParams.model",
                 duration: ":beat.duration",
                 canvasSize: ":context.presentationStyle.canvasSize",
             },
@@ -308,10 +314,14 @@ const prepareGenerateImages = async (context) => {
     };
     return injections;
 };
-const generateImages = async (context, settings, callbacks) => {
-    const options = await graphOption(context, settings);
+const generateImages = async (context, settings, callbacks, options) => {
+    const optionImageAgents = options?.imageAgents ?? {};
     const injections = await prepareGenerateImages(context);
-    const graph = new GraphAI(graph_data, imageAgents, options);
+    const graphaiAgent = {
+        ...defaultAgents,
+        ...optionImageAgents,
+    };
+    const graph = new GraphAI(graph_data, graphaiAgent, await graphOption(context, settings));
     Object.keys(injections).forEach((key) => {
         graph.injectValue(key, injections[key]);
     });
@@ -324,10 +334,11 @@ const generateImages = async (context, settings, callbacks) => {
     return res.mergeResult;
 };
 // public api
-export const images = async (context, settings, callbacks) => {
+export const images = async (context, args) => {
+    const { settings, callbacks, options } = args ?? {};
     try {
         MulmoStudioContextMethods.setSessionState(context, "image", true);
-        const newContext = await generateImages(context, settings, callbacks);
+        const newContext = await generateImages(context, settings, callbacks, options);
         MulmoStudioContextMethods.setSessionState(context, "image", false);
         return newContext;
     }
@@ -341,7 +352,7 @@ export const generateBeatImage = async (inputs) => {
     const { index, context, settings, callbacks, forceMovie, forceImage } = inputs;
     const options = await graphOption(context, settings);
     const injections = await prepareGenerateImages(context);
-    const graph = new GraphAI(beat_graph_data, imageAgents, options);
+    const graph = new GraphAI(beat_graph_data, defaultAgents, options);
     Object.keys(injections).forEach((key) => {
         if ("outputStudioFilePath" !== key) {
             graph.injectValue(key, injections[key]);
@@ -1,5 +1,6 @@
 import { GraphAILogger } from "graphai";
 import { getAspectRatio } from "./movie_google_agent.js";
+import { provider2ImageAgent } from "../utils/provider2agent.js";
 async function generateImage(projectId, model, token, prompt, aspectRatio) {
     const GOOGLE_IMAGEN_ENDPOINT = `https://us-central1-aiplatform.googleapis.com/v1/projects/${projectId}/locations/us-central1/publishers/google/models/${model}:predict`;
     try {
@@ -54,8 +55,7 @@ async function generateImage(projectId, model, token, prompt, aspectRatio) {
 export const imageGoogleAgent = async ({ namedInputs, params, config, }) => {
     const { prompt } = namedInputs;
     const aspectRatio = getAspectRatio(params.canvasSize);
-    const model = params.model ?? "imagen-3.0-fast-generate-001";
-    //const projectId = process.env.GOOGLE_PROJECT_ID; // Your Google Cloud Project ID
+    const model = params.model ?? provider2ImageAgent["google"].defaultModel;
     const projectId = config?.projectId;
     const token = config?.token;
     try {
@@ -2,13 +2,13 @@ import fs from "fs";
 import path from "path";
 import { GraphAILogger } from "graphai";
 import OpenAI, { toFile } from "openai";
-import { defaultOpenAIImageModel } from "../utils/const.js";
+import { provider2ImageAgent } from "../utils/provider2agent.js";
 // https://platform.openai.com/docs/guides/image-generation
 export const imageOpenaiAgent = async ({ namedInputs, params, config, }) => {
     const { prompt, referenceImages } = namedInputs;
     const { moderation, canvasSize } = params;
     const { apiKey, baseURL } = { ...config };
-    const model = params.model ?? defaultOpenAIImageModel;
+    const model = params.model ?? provider2ImageAgent["openai"].defaultModel;
     const openai = new OpenAI({ apiKey, baseURL });
     const size = (() => {
         if (model === "gpt-image-1") {
@@ -21,7 +21,7 @@ async function generateMovie(model, apiKey, prompt, imagePath, aspectRatio, dura
     if (imagePath) {
         const buffer = readFileSync(imagePath);
         const base64Image = `data:image/png;base64,${buffer.toString("base64")}`;
-        if (model === "kwaivgi/kling-v2.1") {
+        if (model === "kwaivgi/kling-v2.1" || model === "kwaivgi/kling-v1.6-pro") {
            input.start_image = base64Image;
        }
        else {
package/lib/index.d.ts CHANGED
@@ -2,6 +2,7 @@ export * from "./actions/index.js";
 export * from "./cli/helpers.js";
 export * from "./utils/file.js";
 export * from "./utils/ffmpeg_utils.js";
+export * from "./utils/provider2agent.js";
 export * from "./methods/index.js";
 export * from "./agents/index.js";
 export * from "./types/index.js";
package/lib/index.js CHANGED
@@ -2,6 +2,7 @@ export * from "./actions/index.js";
 export * from "./cli/helpers.js";
 export * from "./utils/file.js";
 export * from "./utils/ffmpeg_utils.js";
+export * from "./utils/provider2agent.js";
 export * from "./methods/index.js";
 export * from "./agents/index.js";
 export * from "./types/index.js";
@@ -7,7 +7,7 @@ export declare const MulmoPresentationStyleMethods: {
     getTextSlideStyle(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
     getSpeechOptions(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeechOptions | undefined;
     getSpeaker(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeakerData;
-    getProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
+    getTTSProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
     getVoiceId(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
     getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
     getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
@@ -1,7 +1,7 @@
 import "dotenv/config";
-import { userAssert, llmConfig } from "../utils/utils.js";
+import { userAssert } from "../utils/utils.js";
 import { text2ImageProviderSchema, text2HtmlImageProviderSchema, text2SpeechProviderSchema, mulmoCanvasDimensionSchema } from "../types/schema.js";
-import { defaultOpenAIImageModel } from "../utils/const.js";
+import { defaultProviders, provider2ImageAgent, provider2MovieAgent, provider2LLMAgent } from "../utils/provider2agent.js";
 const defaultTextSlideStyles = [
     '*,*::before,*::after{box-sizing:border-box}body,h1,h2,h3,h4,p,figure,blockquote,dl,dd{margin:0}ul[role="list"],ol[role="list"]{list-style:none}html:focus-within{scroll-behavior:smooth}body{min-height:100vh;text-rendering:optimizeSpeed;line-height:1.5}a:not([class]){text-decoration-skip-ink:auto}img,picture{max-width:100%;display:block}input,button,textarea,select{font:inherit}@media(prefers-reduced-motion:reduce){html:focus-within{scroll-behavior:auto}*,*::before,*::after{animation-duration:.01ms !important;animation-iteration-count:1 !important;transition-duration:.01ms !important;scroll-behavior:auto !important}}',
     "body { margin: 60px; margin-top: 40px; color:#333; font-size: 30px; font-family: Arial, sans-serif; box-sizing: border-box; height: 100vh }",
@@ -49,7 +49,7 @@ export const MulmoPresentationStyleMethods = {
         userAssert(!!speaker, `speaker is not set: speaker "${beat.speaker}"`);
         return speaker;
     },
-    getProvider(presentationStyle, beat) {
+    getTTSProvider(presentationStyle, beat) {
         const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
         return speaker.provider ?? presentationStyle.speechParams.provider;
     },
@@ -65,46 +65,46 @@ export const MulmoPresentationStyleMethods = {
         // provider and model appropriately.
         const imageParams = { ...presentationStyle.imageParams, ...beat?.imageParams };
         const provider = MulmoPresentationStyleMethods.getText2ImageProvider(imageParams?.provider);
+        const agentInfo = provider2ImageAgent[provider];
+        // The default text2image model is gpt-image-1 from OpenAI, and to use it you must have an OpenAI account and have verified your identity. If this is not possible, please specify dall-e-3 as the model.
         const defaultImageParams = {
             provider,
-            model: provider === "openai" ? (process.env.DEFAULT_OPENAI_IMAGE_MODEL ?? defaultOpenAIImageModel) : undefined,
+            model: agentInfo.defaultModel,
         };
         return {
-            agent: provider === "google" ? "imageGoogleAgent" : "imageOpenaiAgent",
+            agent: agentInfo.agentName,
             imageParams: { ...defaultImageParams, ...imageParams },
         };
     },
     // Determine movie agent based on provider
     getMovieAgent(presentationStyle) {
-        const movieProvider = presentationStyle.movieParams?.provider ?? "google";
-        switch (movieProvider) {
-            case "replicate":
-                return "movieReplicateAgent";
-            case "google":
-            default:
-                return "movieGoogleAgent";
-        }
+        const movieProvider = (presentationStyle.movieParams?.provider ?? defaultProviders.text2movie);
+        return provider2MovieAgent[movieProvider].agentName;
     },
     getConcurrency(presentationStyle) {
+        /*
         if (presentationStyle.movieParams?.provider === "replicate") {
-            return 4;
+          return 4;
         }
+        */
         const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(presentationStyle);
         if (imageAgentInfo.imageParams.provider === "openai") {
             // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
             // dall-e-3: 7,500 RPM, 15 images per minute (4 images for max resolution)
             // gpt-image-1: 3,000,000 TPM, 150 images per minute
-            return imageAgentInfo.imageParams.model === defaultOpenAIImageModel ? 4 : 16;
+            if (imageAgentInfo.imageParams.model === provider2ImageAgent.openai.defaultModel) {
+                return 16;
+            }
         }
         return 4;
     },
     getHtmlImageAgentInfo(presentationStyle) {
         const provider = text2HtmlImageProviderSchema.parse(presentationStyle.htmlImageParams?.provider);
-        const defaultConfig = llmConfig[provider];
+        const defaultConfig = provider2LLMAgent[provider];
         const model = presentationStyle.htmlImageParams?.model ? presentationStyle.htmlImageParams?.model : defaultConfig.defaultModel;
         return {
             provider,
-            agent: defaultConfig.agent,
+            agent: defaultConfig.agentName,
             model,
             max_tokens: defaultConfig.max_tokens,
         };
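
The remaining provider2agent tables are likewise absent from this diff, but the surrounding changes fix their field names: agentName and defaultModel for image and movie providers, max_tokens for LLM providers, and a defaultProviders map whose text2movie entry replaces the old hardcoded "google" fallback in getMovieAgent. A hedged sketch; values not recoverable from the removed code are marked as guesses:

    // Hypothetical reconstruction of the rest of lib/utils/provider2agent.js (not shown in this diff)
    export const defaultProviders = {
        text2movie: "google", // replaces the old `?? "google"` fallback in getMovieAgent
    };
    export const provider2ImageAgent = {
        // defaults recovered from the code this diff removes / the comment in getImageAgentInfo
        openai: { agentName: "imageOpenaiAgent", defaultModel: "gpt-image-1" },
        google: { agentName: "imageGoogleAgent", defaultModel: "imagen-3.0-fast-generate-001" },
    };
    export const provider2MovieAgent = {
        // agent names recovered from the removed switch statement
        google: { agentName: "movieGoogleAgent" },
        replicate: { agentName: "movieReplicateAgent" },
    };
    export const provider2LLMAgent = {
        // agent names match the imports in the images pipeline; models and max_tokens are guesses
        openai: { agentName: "openAIAgent", defaultModel: "gpt-4o", max_tokens: 8192 },
        anthropic: { agentName: "anthropicAgent", defaultModel: "claude-3-7-sonnet-latest", max_tokens: 8192 },
    };
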