mulmocast 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,18 +12,10 @@ import { MulmoPresentationStyleMethods } from "../methods/index.js";
 import { fileCacheAgentFilter } from "../utils/filters.js";
 import { getAudioArtifactFilePath, getAudioFilePath, getOutputStudioFilePath, resolveDirPath, defaultBGMPath, mkdir, writingMessage } from "../utils/file.js";
 import { text2hash, localizedText, settings2GraphAIConfig } from "../utils/utils.js";
+import { provider2TTSAgent } from "../utils/provider2agent.js";
 import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
 import { MulmoMediaSourceMethods } from "../methods/mulmo_media_source.js";
 const vanillaAgents = agents.default ?? agents;
-// const rion_takanashi_voice = "b9277ce3-ba1c-4f6f-9a65-c05ca102ded0"; // たかなし りおん
-// const ben_carter_voice = "bc06c63f-fef6-43b6-92f7-67f919bd5dae"; // ベン・カーター
-const provider_to_agent = {
-    nijivoice: "ttsNijivoiceAgent",
-    openai: "ttsOpenaiAgent",
-    google: "ttsGoogleAgent",
-    elevenlabs: "ttsElevenlabsAgent",
-    mock: "mediaMockAgent",
-};
 const getAudioPath = (context, beat, audioFile) => {
     if (beat.audio?.type === "audio") {
         const path = MulmoMediaSourceMethods.resolve(beat.audio.source, context);
@@ -40,7 +32,7 @@ const getAudioPath = (context, beat, audioFile) => {
 const getAudioParam = (presentationStyle, beat) => {
     const voiceId = MulmoPresentationStyleMethods.getVoiceId(presentationStyle, beat);
     // Use speaker-specific provider if available, otherwise fall back to script-level provider
-    const provider = MulmoPresentationStyleMethods.getProvider(presentationStyle, beat);
+    const provider = MulmoPresentationStyleMethods.getTTSProvider(presentationStyle, beat);
     const speechOptions = MulmoPresentationStyleMethods.getSpeechOptions(presentationStyle, beat);
     return { voiceId, provider, speechOptions };
 };
@@ -61,7 +53,7 @@ const preprocessor = (namedInputs) => {
     studioBeat.audioFile = audioPath; // TODO
     const needsTTS = !beat.audio && audioPath !== undefined;
     return {
-        ttsAgent: provider_to_agent[provider],
+        ttsAgent: provider2TTSAgent[provider].agentName,
         text,
         voiceId,
         speechOptions,
@@ -186,8 +178,8 @@ export const audioFilePath = (context) => {
 const getConcurrency = (context) => {
     // Check if any speaker uses nijivoice or elevenlabs (providers that require concurrency = 1)
     const hasLimitedConcurrencyProvider = Object.values(context.presentationStyle.speechParams.speakers).some((speaker) => {
-        const provider = speaker.provider ?? context.presentationStyle.speechParams.provider;
-        return provider === "nijivoice" || provider === "elevenlabs";
+        const provider = (speaker.provider ?? context.presentationStyle.speechParams.provider);
+        return provider2TTSAgent[provider].hasLimitedConcurrency;
     });
     return hasLimitedConcurrencyProvider ? 1 : 8;
 };
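
The provider2agent module itself is not part of this diff, but the call sites above pin down the shape of provider2TTSAgent: agentName feeds the GraphAI agent lookup, and hasLimitedConcurrency replaces the hardcoded nijivoice/elevenlabs check. A minimal sketch, assuming the entries of the removed provider_to_agent map carried over and that nijivoice and elevenlabs remain the concurrency-limited providers:

    // Hypothetical reconstruction of lib/utils/provider2agent.js (not shown in this diff)
    export const provider2TTSAgent = {
        nijivoice: { agentName: "ttsNijivoiceAgent", hasLimitedConcurrency: true },
        openai: { agentName: "ttsOpenaiAgent", hasLimitedConcurrency: false },
        google: { agentName: "ttsGoogleAgent", hasLimitedConcurrency: false },
        elevenlabs: { agentName: "ttsElevenlabsAgent", hasLimitedConcurrency: true },
        mock: { agentName: "mediaMockAgent", hasLimitedConcurrency: false },
    };
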
@@ -13,7 +13,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
     imagePath: string | undefined;
     referenceImageForMovie: string | undefined;
     imageParams: {
-        provider: "openai" | "google";
+        provider: string;
         style?: string | undefined;
         model?: string | undefined;
         moderation?: string | undefined;
@@ -44,8 +44,20 @@ export declare const imagePreprocessAgent: (namedInputs: {
 } | {
     imagePath: string;
     imageFromMovie: boolean;
+    movieParams: {
+        speed?: number | undefined;
+        model?: string | undefined;
+        fillOption?: {
+            style: "aspectFit" | "aspectFill";
+        } | undefined;
+        provider?: string | undefined;
+        transition?: {
+            type: "fade" | "slideout_left";
+            duration: number;
+        } | undefined;
+    };
     imageParams: {
-        provider: "openai" | "google";
+        provider: string;
         style?: string | undefined;
         model?: string | undefined;
         moderation?: string | undefined;
@@ -79,8 +91,20 @@ export declare const imagePreprocessAgent: (namedInputs: {
     imageAgentInfo: import("../types/type.js").Text2ImageAgentInfo;
     prompt: string;
     referenceImages: string[];
+    movieParams: {
+        speed?: number | undefined;
+        model?: string | undefined;
+        fillOption?: {
+            style: "aspectFit" | "aspectFill";
+        } | undefined;
+        provider?: string | undefined;
+        transition?: {
+            type: "fade" | "slideout_left";
+            duration: number;
+        } | undefined;
+    };
     imageParams: {
-        provider: "openai" | "google";
+        provider: string;
         style?: string | undefined;
         model?: string | undefined;
         moderation?: string | undefined;
@@ -2,6 +2,7 @@ import { MulmoPresentationStyleMethods, MulmoStudioContextMethods, MulmoBeatMeth
 import { getBeatPngImagePath, getBeatMoviePath } from "../utils/file.js";
 import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
 import { renderHTMLToImage } from "../utils/markdown.js";
+import { GraphAILogger } from "graphai";
 const htmlStyle = (context, beat) => {
     return {
         canvasSize: MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle),
@@ -27,13 +28,15 @@ export const imagePreprocessAgent = async (namedInputs) => {
         // undefined prompt indicates that image generation is not needed
         return { ...returnValue, imagePath: pluginPath, referenceImageForMovie: pluginPath };
     }
+    const movieParams = { ...context.presentationStyle.movieParams, ...beat.movieParams };
+    GraphAILogger.log(`movieParams: ${index}`, movieParams, beat.moviePrompt);
     if (beat.moviePrompt && !beat.imagePrompt) {
-        return { ...returnValue, imagePath, imageFromMovie: true }; // no image prompt, only movie prompt
+        return { ...returnValue, imagePath, imageFromMovie: true, movieParams }; // no image prompt, only movie prompt
     }
     // referenceImages for "edit_image", openai agent.
     const referenceImages = MulmoBeatMethods.getImageReferenceForImageGenerator(beat, imageRefs);
     const prompt = imagePrompt(beat, imageAgentInfo.imageParams.style);
-    return { ...returnValue, imagePath, referenceImageForMovie: imagePath, imageAgentInfo, prompt, referenceImages };
+    return { ...returnValue, imagePath, referenceImageForMovie: imagePath, imageAgentInfo, prompt, referenceImages, movieParams };
 };
 export const imagePluginAgent = async (namedInputs) => {
     const { context, beat, index } = namedInputs;
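
Note the spread order in the new movieParams merge: beat-level values override presentation-level ones. A small illustration with hypothetical values (only kwaivgi/kling-v2.1 appears elsewhere in this diff):

    // Beat-level movieParams win over presentation-level ones (the later spread takes precedence).
    const presentationLevel = { provider: "google", speed: 1 };                   // hypothetical
    const beatLevel = { provider: "replicate", model: "kwaivgi/kling-v2.1" };     // hypothetical
    const movieParams = { ...presentationLevel, ...beatLevel };
    // => { provider: "replicate", speed: 1, model: "kwaivgi/kling-v2.1" }
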
@@ -1,7 +1,14 @@
 import type { GraphOptions, CallbackFunction } from "graphai";
 import { MulmoStudioContext } from "../types/index.js";
 export declare const graphOption: (context: MulmoStudioContext, settings?: Record<string, string>) => Promise<GraphOptions>;
-export declare const images: (context: MulmoStudioContext, settings?: Record<string, string>, callbacks?: CallbackFunction[]) => Promise<MulmoStudioContext>;
+type ImageOptions = {
+    imageAgents: Record<string, unknown>;
+};
+export declare const images: (context: MulmoStudioContext, args?: {
+    settings?: Record<string, string>;
+    callbacks?: CallbackFunction[];
+    options?: ImageOptions;
+}) => Promise<MulmoStudioContext>;
 export declare const generateBeatImage: (inputs: {
     index: number;
     context: MulmoStudioContext;
@@ -10,3 +17,4 @@ export declare const generateBeatImage: (inputs: {
     forceMovie?: boolean;
     forceImage?: boolean;
 }) => Promise<void>;
+export {};
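
This is a breaking change to the images entry point: settings and callbacks move from positional parameters into a single optional args object, and the new options.imageAgents lets callers inject or override GraphAI agents. A hedged call-site sketch (myImageAgent is a hypothetical custom agent; the settings key is illustrative):

    import { images } from "mulmocast";

    // 0.1.3: images(context, settings, callbacks)
    // 0.1.4: one args object
    const newContext = await images(context, {
        settings: { OPENAI_API_KEY: "sk-..." },                        // illustrative key
        options: { imageAgents: { imageOpenaiAgent: myImageAgent } },  // shadows the built-in agent
    });

Because optionImageAgents is spread after defaultAgents in generateImages (see below), an entry with the same name, here imageOpenaiAgent, replaces the built-in one.
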
@@ -2,7 +2,7 @@ import dotenv from "dotenv";
 import fs from "fs";
 import { GraphAI, GraphAILogger, TaskManager } from "graphai";
 import { GoogleAuth } from "google-auth-library";
-import * as agents from "@graphai/vanilla";
+import * as vanilla from "@graphai/vanilla";
 import { openAIAgent } from "@graphai/openai_agent";
 import { anthropicAgent } from "@graphai/anthropic_agent";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
@@ -14,13 +14,19 @@ import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
 import { extractImageFromMovie } from "../utils/ffmpeg_utils.js";
 import { getImageRefs } from "./image_references.js";
 import { imagePreprocessAgent, imagePluginAgent, htmlImageGeneratorAgent } from "./image_agents.js";
-const vanillaAgents = agents.default ?? agents;
+const vanillaAgents = vanilla.default ?? vanilla;
 const imageAgents = {
-    ...vanillaAgents,
     imageGoogleAgent,
+    imageOpenaiAgent,
+};
+const movieAgents = {
     movieGoogleAgent,
     movieReplicateAgent,
-    imageOpenaiAgent,
+};
+const defaultAgents = {
+    ...vanillaAgents,
+    ...imageAgents,
+    ...movieAgents,
     mediaMockAgent,
     fileWriteAgent,
     openAIAgent,
@@ -141,7 +147,7 @@ const beat_graph_data = {
                 mulmoContext: ":context",
             },
             params: {
-                model: ":context.presentationStyle.movieParams.model",
+                model: ":preprocessor.movieParams.model",
                 duration: ":beat.duration",
                 canvasSize: ":context.presentationStyle.canvasSize",
             },
@@ -308,10 +314,14 @@ const prepareGenerateImages = async (context) => {
     };
     return injections;
 };
-const generateImages = async (context, settings, callbacks) => {
-    const options = await graphOption(context, settings);
+const generateImages = async (context, settings, callbacks, options) => {
+    const optionImageAgents = options?.imageAgents ?? {};
     const injections = await prepareGenerateImages(context);
-    const graph = new GraphAI(graph_data, imageAgents, options);
+    const graphaiAgent = {
+        ...defaultAgents,
+        ...optionImageAgents,
+    };
+    const graph = new GraphAI(graph_data, graphaiAgent, await graphOption(context, settings));
     Object.keys(injections).forEach((key) => {
         graph.injectValue(key, injections[key]);
     });
@@ -324,10 +334,11 @@ const generateImages = async (context, settings, callbacks) => {
     return res.mergeResult;
 };
 // public api
-export const images = async (context, settings, callbacks) => {
+export const images = async (context, args) => {
+    const { settings, callbacks, options } = args ?? {};
     try {
         MulmoStudioContextMethods.setSessionState(context, "image", true);
-        const newContext = await generateImages(context, settings, callbacks);
+        const newContext = await generateImages(context, settings, callbacks, options);
         MulmoStudioContextMethods.setSessionState(context, "image", false);
         return newContext;
     }
@@ -341,7 +352,7 @@ export const generateBeatImage = async (inputs) => {
     const { index, context, settings, callbacks, forceMovie, forceImage } = inputs;
     const options = await graphOption(context, settings);
     const injections = await prepareGenerateImages(context);
-    const graph = new GraphAI(beat_graph_data, imageAgents, options);
+    const graph = new GraphAI(beat_graph_data, defaultAgents, options);
     Object.keys(injections).forEach((key) => {
         if ("outputStudioFilePath" !== key) {
             graph.injectValue(key, injections[key]);
@@ -1,5 +1,6 @@
 import { GraphAILogger } from "graphai";
 import { getAspectRatio } from "./movie_google_agent.js";
+import { provider2ImageAgent } from "../utils/provider2agent.js";
 async function generateImage(projectId, model, token, prompt, aspectRatio) {
     const GOOGLE_IMAGEN_ENDPOINT = `https://us-central1-aiplatform.googleapis.com/v1/projects/${projectId}/locations/us-central1/publishers/google/models/${model}:predict`;
     try {
@@ -54,8 +55,7 @@ async function generateImage(projectId, model, token, prompt, aspectRatio) {
 export const imageGoogleAgent = async ({ namedInputs, params, config, }) => {
     const { prompt } = namedInputs;
     const aspectRatio = getAspectRatio(params.canvasSize);
-    const model = params.model ?? "imagen-3.0-fast-generate-001";
-    //const projectId = process.env.GOOGLE_PROJECT_ID; // Your Google Cloud Project ID
+    const model = params.model ?? provider2ImageAgent["google"].defaultModel;
     const projectId = config?.projectId;
     const token = config?.token;
     try {
@@ -2,13 +2,13 @@ import fs from "fs";
 import path from "path";
 import { GraphAILogger } from "graphai";
 import OpenAI, { toFile } from "openai";
-import { defaultOpenAIImageModel } from "../utils/const.js";
+import { provider2ImageAgent } from "../utils/provider2agent.js";
 // https://platform.openai.com/docs/guides/image-generation
 export const imageOpenaiAgent = async ({ namedInputs, params, config, }) => {
     const { prompt, referenceImages } = namedInputs;
     const { moderation, canvasSize } = params;
     const { apiKey, baseURL } = { ...config };
-    const model = params.model ?? defaultOpenAIImageModel;
+    const model = params.model ?? provider2ImageAgent["openai"].defaultModel;
     const openai = new OpenAI({ apiKey, baseURL });
     const size = (() => {
         if (model === "gpt-image-1") {
@@ -21,7 +21,7 @@ async function generateMovie(model, apiKey, prompt, imagePath, aspectRatio, dura
     if (imagePath) {
         const buffer = readFileSync(imagePath);
         const base64Image = `data:image/png;base64,${buffer.toString("base64")}`;
-        if (model === "kwaivgi/kling-v2.1") {
+        if (model === "kwaivgi/kling-v2.1" || model === "kwaivgi/kling-v1.6-pro") {
            input.start_image = base64Image;
        }
        else {
package/lib/index.d.ts CHANGED
@@ -2,6 +2,7 @@ export * from "./actions/index.js";
 export * from "./cli/helpers.js";
 export * from "./utils/file.js";
 export * from "./utils/ffmpeg_utils.js";
+export * from "./utils/provider2agent.js";
 export * from "./methods/index.js";
 export * from "./agents/index.js";
 export * from "./types/index.js";
package/lib/index.js CHANGED
@@ -2,6 +2,7 @@ export * from "./actions/index.js";
 export * from "./cli/helpers.js";
 export * from "./utils/file.js";
 export * from "./utils/ffmpeg_utils.js";
+export * from "./utils/provider2agent.js";
 export * from "./methods/index.js";
 export * from "./agents/index.js";
 export * from "./types/index.js";
@@ -7,7 +7,7 @@ export declare const MulmoPresentationStyleMethods: {
     getTextSlideStyle(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
     getSpeechOptions(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeechOptions | undefined;
     getSpeaker(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeakerData;
-    getProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
+    getTTSProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
     getVoiceId(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
     getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
     getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
@@ -1,7 +1,7 @@
 import "dotenv/config";
-import { userAssert, llmConfig } from "../utils/utils.js";
+import { userAssert } from "../utils/utils.js";
 import { text2ImageProviderSchema, text2HtmlImageProviderSchema, text2SpeechProviderSchema, mulmoCanvasDimensionSchema } from "../types/schema.js";
-import { defaultOpenAIImageModel } from "../utils/const.js";
+import { defaultProviders, provider2ImageAgent, provider2MovieAgent, provider2LLMAgent } from "../utils/provider2agent.js";
 const defaultTextSlideStyles = [
     '*,*::before,*::after{box-sizing:border-box}body,h1,h2,h3,h4,p,figure,blockquote,dl,dd{margin:0}ul[role="list"],ol[role="list"]{list-style:none}html:focus-within{scroll-behavior:smooth}body{min-height:100vh;text-rendering:optimizeSpeed;line-height:1.5}a:not([class]){text-decoration-skip-ink:auto}img,picture{max-width:100%;display:block}input,button,textarea,select{font:inherit}@media(prefers-reduced-motion:reduce){html:focus-within{scroll-behavior:auto}*,*::before,*::after{animation-duration:.01ms !important;animation-iteration-count:1 !important;transition-duration:.01ms !important;scroll-behavior:auto !important}}',
     "body { margin: 60px; margin-top: 40px; color:#333; font-size: 30px; font-family: Arial, sans-serif; box-sizing: border-box; height: 100vh }",
@@ -49,7 +49,7 @@ export const MulmoPresentationStyleMethods = {
         userAssert(!!speaker, `speaker is not set: speaker "${beat.speaker}"`);
         return speaker;
     },
-    getProvider(presentationStyle, beat) {
+    getTTSProvider(presentationStyle, beat) {
         const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
         return speaker.provider ?? presentationStyle.speechParams.provider;
     },
@@ -65,46 +65,46 @@ export const MulmoPresentationStyleMethods = {
         // provider and model appropriately.
         const imageParams = { ...presentationStyle.imageParams, ...beat?.imageParams };
         const provider = MulmoPresentationStyleMethods.getText2ImageProvider(imageParams?.provider);
+        const agentInfo = provider2ImageAgent[provider];
+        // The default text2image model is gpt-image-1 from OpenAI, and to use it you must have an OpenAI account and have verified your identity. If this is not possible, please specify dall-e-3 as the model.
         const defaultImageParams = {
             provider,
-            model: provider === "openai" ? (process.env.DEFAULT_OPENAI_IMAGE_MODEL ?? defaultOpenAIImageModel) : undefined,
+            model: agentInfo.defaultModel,
         };
         return {
-            agent: provider === "google" ? "imageGoogleAgent" : "imageOpenaiAgent",
+            agent: agentInfo.agentName,
             imageParams: { ...defaultImageParams, ...imageParams },
         };
     },
     // Determine movie agent based on provider
     getMovieAgent(presentationStyle) {
-        const movieProvider = presentationStyle.movieParams?.provider ?? "google";
-        switch (movieProvider) {
-            case "replicate":
-                return "movieReplicateAgent";
-            case "google":
-            default:
-                return "movieGoogleAgent";
-        }
+        const movieProvider = (presentationStyle.movieParams?.provider ?? defaultProviders.text2movie);
+        return provider2MovieAgent[movieProvider].agentName;
     },
     getConcurrency(presentationStyle) {
+        /*
         if (presentationStyle.movieParams?.provider === "replicate") {
-            return 4;
+          return 4;
         }
+        */
         const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(presentationStyle);
         if (imageAgentInfo.imageParams.provider === "openai") {
             // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
             // dall-e-3: 7,500 RPM, 15 images per minute (4 images for max resolution)
             // gpt-image-1: 3,000,000 TPM, 150 images per minute
-            return imageAgentInfo.imageParams.model === defaultOpenAIImageModel ? 4 : 16;
+            if (imageAgentInfo.imageParams.model === provider2ImageAgent.openai.defaultModel) {
+                return 16;
+            }
         }
         return 4;
     },
     getHtmlImageAgentInfo(presentationStyle) {
         const provider = text2HtmlImageProviderSchema.parse(presentationStyle.htmlImageParams?.provider);
-        const defaultConfig = llmConfig[provider];
+        const defaultConfig = provider2LLMAgent[provider];
         const model = presentationStyle.htmlImageParams?.model ? presentationStyle.htmlImageParams?.model : defaultConfig.defaultModel;
         return {
             provider,
-            agent: defaultConfig.agent,
+            agent: defaultConfig.agentName,
             model,
             max_tokens: defaultConfig.max_tokens,
         };
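
The remaining provider2agent tables are likewise absent from this diff, but the surrounding changes fix their field names: agentName and defaultModel for image and movie providers, max_tokens for LLM providers, and a defaultProviders map whose text2movie entry replaces the old hardcoded "google" fallback in getMovieAgent. A hedged sketch; values not recoverable from the removed code are marked as guesses:

    // Hypothetical reconstruction of the rest of lib/utils/provider2agent.js (not shown in this diff)
    export const defaultProviders = {
        text2movie: "google", // replaces the old `?? "google"` fallback in getMovieAgent
    };
    export const provider2ImageAgent = {
        // defaults recovered from the code this diff removes / the comment in getImageAgentInfo
        openai: { agentName: "imageOpenaiAgent", defaultModel: "gpt-image-1" },
        google: { agentName: "imageGoogleAgent", defaultModel: "imagen-3.0-fast-generate-001" },
    };
    export const provider2MovieAgent = {
        // agent names recovered from the removed switch statement
        google: { agentName: "movieGoogleAgent" },
        replicate: { agentName: "movieReplicateAgent" },
    };
    export const provider2LLMAgent = {
        // agent names match the imports in the images pipeline; models and max_tokens are guesses
        openai: { agentName: "openAIAgent", defaultModel: "gpt-4o", max_tokens: 8192 },
        anthropic: { agentName: "anthropicAgent", defaultModel: "claude-3-7-sonnet-latest", max_tokens: 8192 },
    };
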