mulmocast 0.0.15 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/assets/templates/text_and_image.json +6 -0
  2. package/assets/templates/text_only.json +6 -0
  3. package/lib/actions/audio.d.ts +4 -2
  4. package/lib/actions/audio.js +89 -48
  5. package/lib/actions/captions.d.ts +1 -1
  6. package/lib/actions/captions.js +17 -14
  7. package/lib/actions/images.d.ts +6 -3
  8. package/lib/actions/images.js +64 -39
  9. package/lib/actions/movie.js +19 -19
  10. package/lib/actions/pdf.js +3 -4
  11. package/lib/actions/translate.js +11 -11
  12. package/lib/agents/add_bgm_agent.js +3 -3
  13. package/lib/agents/combine_audio_files_agent.js +88 -42
  14. package/lib/agents/index.d.ts +2 -1
  15. package/lib/agents/index.js +2 -1
  16. package/lib/agents/tavily_agent.d.ts +15 -0
  17. package/lib/agents/tavily_agent.js +130 -0
  18. package/lib/cli/commands/audio/builder.d.ts +2 -0
  19. package/lib/cli/commands/image/builder.d.ts +2 -0
  20. package/lib/cli/commands/movie/builder.d.ts +2 -0
  21. package/lib/cli/commands/movie/handler.js +1 -6
  22. package/lib/cli/commands/pdf/builder.d.ts +2 -0
  23. package/lib/cli/commands/translate/builder.d.ts +2 -0
  24. package/lib/cli/common.d.ts +2 -0
  25. package/lib/cli/common.js +6 -0
  26. package/lib/cli/helpers.d.ts +7 -1
  27. package/lib/cli/helpers.js +30 -3
  28. package/lib/methods/index.d.ts +1 -1
  29. package/lib/methods/index.js +1 -1
  30. package/lib/methods/mulmo_presentation_style.d.ts +14 -0
  31. package/lib/methods/mulmo_presentation_style.js +70 -0
  32. package/lib/methods/mulmo_studio_context.d.ts +17 -0
  33. package/lib/methods/mulmo_studio_context.js +30 -2
  34. package/lib/tools/deep_research.d.ts +2 -0
  35. package/lib/tools/deep_research.js +265 -0
  36. package/lib/types/index.d.ts +0 -1
  37. package/lib/types/index.js +0 -1
  38. package/lib/types/schema.d.ts +101 -55
  39. package/lib/types/schema.js +3 -3
  40. package/lib/types/type.d.ts +5 -1
  41. package/lib/utils/ffmpeg_utils.d.ts +1 -0
  42. package/lib/utils/ffmpeg_utils.js +10 -0
  43. package/lib/utils/file.d.ts +7 -4
  44. package/lib/utils/file.js +24 -12
  45. package/lib/utils/preprocess.d.ts +0 -9
  46. package/lib/utils/preprocess.js +4 -10
  47. package/lib/utils/prompt.d.ts +3 -0
  48. package/lib/utils/prompt.js +52 -0
  49. package/package.json +11 -10
  50. package/assets/music/StarsBeyondEx.mp3 +0 -0
package/assets/templates/text_and_image.json
@@ -0,0 +1,6 @@
+ {
+ "title": "Text and Image",
+ "description": "Template for Text and Image Script.",
+ "systemPrompt": "Generate a script for a presentation of the given topic. Another AI will generate comic strips for each beat based on the imagePrompt of that beat. Mention the reference in one of beats, if it exists. Use the JSON below as a template.",
+ "scriptName": "image_prompts_template.json"
+ }
package/assets/templates/text_only.json
@@ -0,0 +1,6 @@
+ {
+ "title": "Text Only",
+ "description": "Template for Text Only Script.",
+ "systemPrompt": "Generate a script for a presentation of the given topic. Another AI will generate comic strips for each beat based on the text description of that beat. Mention the reference in one of beats, if it exists. Use the JSON below as a template.",
+ "scriptName": "text_only_template.json"
+ }
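
Note: both templates share the same shape (title, description, systemPrompt, scriptName). A minimal sketch of loading one of them; the loadTemplate helper and its path resolution are illustrative assumptions, not part of the package:

    import fs from "fs";
    import path from "path";

    // Shape of the template files added above.
    type ScriptTemplate = {
      title: string;
      description: string;
      systemPrompt: string;
      scriptName: string;
    };

    // Hypothetical loader; the package may resolve template paths differently.
    const loadTemplate = (name: string): ScriptTemplate => {
      const file = path.resolve("assets/templates", `${name}.json`);
      return JSON.parse(fs.readFileSync(file, "utf8")) as ScriptTemplate;
    };

    // The systemPrompt is meant to be fed to an LLM together with the
    // script JSON named by scriptName.
    const template = loadTemplate("text_only");
    console.log(template.title, "->", template.scriptName);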
package/lib/actions/audio.d.ts
@@ -1,5 +1,7 @@
  import "dotenv/config";
  import type { CallbackFunction } from "graphai";
- import { MulmoStudioContext } from "../types/index.js";
+ import { MulmoStudioContext, MulmoBeat } from "../types/index.js";
+ export declare const getBeatAudioPath: (text: string, context: MulmoStudioContext, beat: MulmoBeat, lang?: string) => string | undefined;
  export declare const audioFilePath: (context: MulmoStudioContext) => string;
- export declare const audio: (context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<void>;
+ export declare const generateBeatAudio: (index: number, context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<void>;
+ export declare const audio: (context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<MulmoStudioContext>;
package/lib/actions/audio.js
@@ -1,5 +1,6 @@
  import "dotenv/config";
  import { GraphAI } from "graphai";
+ import { TaskManager } from "graphai/lib/task_manager.js";
  import * as agents from "@graphai/vanilla";
  import ttsNijivoiceAgent from "../agents/tts_nijivoice_agent.js";
  import addBGMAgent from "../agents/add_bgm_agent.js";
@@ -8,9 +9,9 @@ import ttsOpenaiAgent from "../agents/tts_openai_agent.js";
  import ttsGoogleAgent from "../agents/tts_google_agent.js";
  import ttsElevenlabsAgent from "../agents/tts_elevenlabs_agent.js";
  import { fileWriteAgent } from "@graphai/vanilla_node_agents";
- import { MulmoScriptMethods } from "../methods/index.js";
+ import { MulmoPresentationStyleMethods } from "../methods/index.js";
  import { fileCacheAgentFilter } from "../utils/filters.js";
- import { getAudioArtifactFilePath, getAudioSegmentDirPath, getAudioCombinedFilePath, getOutputStudioFilePath, defaultBGMPath, mkdir, writingMessage, getAudioSegmentFilePath, } from "../utils/file.js";
+ import { getAudioArtifactFilePath, getAudioFilePath, getOutputStudioFilePath, resolveDirPath, defaultBGMPath, mkdir, writingMessage } from "../utils/file.js";
  import { text2hash, localizedText } from "../utils/utils.js";
  import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  import { MulmoMediaSourceMethods } from "../methods/mulmo_media_source.js";
@@ -24,7 +25,7 @@ const provider_to_agent = {
  elevenlabs: "ttsElevenlabsAgent",
  mock: "mediaMockAgent",
  };
- const getAudioPath = (context, beat, audioFile, audioDirPath) => {
+ const getAudioPath = (context, beat, audioFile) => {
  if (beat.audio?.type === "audio") {
  const path = MulmoMediaSourceMethods.resolve(beat.audio.source, context);
  if (path) {
@@ -35,34 +36,48 @@ const getAudioPath = (context, beat, audioFile, audioDirPath) => {
  if (beat.text === undefined || beat.text === "") {
  return undefined; // It indicates that the audio is not needed.
  }
- return getAudioSegmentFilePath(audioDirPath, context.studio.filename, audioFile);
+ return audioFile;
+ };
+ const getAudioParam = (presentationStyle, beat) => {
+ const voiceId = MulmoPresentationStyleMethods.getVoiceId(presentationStyle, beat);
+ // Use speaker-specific provider if available, otherwise fall back to script-level provider
+ const provider = MulmoPresentationStyleMethods.getProvider(presentationStyle, beat);
+ const speechOptions = MulmoPresentationStyleMethods.getSpeechOptions(presentationStyle, beat);
+ return { voiceId, provider, speechOptions };
+ };
+ export const getBeatAudioPath = (text, context, beat, lang) => {
+ const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
+ const { voiceId, provider, speechOptions } = getAudioParam(context.presentationStyle, beat);
+ const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider].join(":");
+ const audioFileName = `${context.studio.filename}_${text2hash(hash_string)}`;
+ const audioFile = getAudioFilePath(audioDirPath, context.studio.filename, audioFileName, lang);
+ return getAudioPath(context, beat, audioFile);
  };
  const preprocessor = (namedInputs) => {
- const { beat, studioBeat, multiLingual, context, audioDirPath } = namedInputs;
- const { lang } = context;
- const speaker = context.studio.script.speechParams.speakers[beat.speaker];
- const voiceId = speaker.voiceId;
- const speechOptions = MulmoScriptMethods.getSpeechOptions(context.studio.script, beat);
+ const { beat, studioBeat, multiLingual, context } = namedInputs;
+ const { lang, presentationStyle } = context;
  const text = localizedText(beat, multiLingual, lang);
- // Use speaker-specific provider if available, otherwise fall back to script-level provider
- const provider = speaker.provider ?? context.studio.script.speechParams.provider;
- const hash_string = `${text}${voiceId}${speechOptions?.instruction ?? ""}${speechOptions?.speed ?? 1.0}${provider}`;
- const audioFile = `${context.studio.filename}_${text2hash(hash_string)}` + (lang ? `_${lang}` : "");
- const audioPath = getAudioPath(context, beat, audioFile, audioDirPath);
- studioBeat.audioFile = audioPath;
+ const { voiceId, provider, speechOptions } = getAudioParam(presentationStyle, beat);
+ const audioPath = getBeatAudioPath(text, context, beat, lang);
+ studioBeat.audioFile = audioPath; // TODO
  const needsTTS = !beat.audio && audioPath !== undefined;
  return {
  ttsAgent: provider_to_agent[provider],
- studioBeat,
+ text,
  voiceId,
  speechOptions,
  audioPath,
- text,
+ studioBeat,
  needsTTS,
  };
  };
  const graph_tts = {
  nodes: {
+ beat: {},
+ studioBeat: {},
+ multiLingual: {},
+ context: {},
+ __mapIndex: {},
  preprocessor: {
  agent: preprocessor,
  inputs: {
@@ -70,7 +85,6 @@ const graph_tts = {
  studioBeat: ":studioBeat",
  multiLingual: ":multiLingual",
  context: ":context",
- audioDirPath: ":audioDirPath",
  },
  },
  tts: {
@@ -100,17 +114,13 @@ const graph_data = {
  audioArtifactFilePath: {},
  audioCombinedFilePath: {},
  outputStudioFilePath: {},
- audioDirPath: {},
- audioSegmentDirPath: {},
  musicFile: {},
  map: {
  agent: "mapAgent",
  inputs: {
  rows: ":context.studio.script.beats",
  studioBeat: ":context.studio.beats",
- multiLingual: ":context.studio.multiLingual",
- audioDirPath: ":audioDirPath",
- audioSegmentDirPath: ":audioSegmentDirPath",
+ multiLingual: ":context.multiLingual",
  context: ":context",
  },
  params: {
@@ -122,7 +132,7 @@ const graph_data = {
  combineFiles: {
  agent: "combineAudioFilesAgent",
  inputs: {
- map: ":map",
+ onComplete: ":map",
  context: ":context",
  combinedFileName: ":audioCombinedFilePath",
  },
@@ -141,7 +151,7 @@ const graph_data = {
  wait: ":combineFiles",
  voiceFile: ":audioCombinedFilePath",
  outputFile: ":audioArtifactFilePath",
- script: ":context.studio.script",
+ context: ":context",
  params: {
  musicFile: ":musicFile",
  },
@@ -172,49 +182,80 @@ export const audioFilePath = (context) => {
  const { outDirPath } = fileDirs;
  return getAudioArtifactFilePath(outDirPath, studio.filename);
  };
+ const getConcurrency = (context) => {
+ // Check if any speaker uses nijivoice or elevenlabs (providers that require concurrency = 1)
+ const hasLimitedConcurrencyProvider = Object.values(context.presentationStyle.speechParams.speakers).some((speaker) => {
+ const provider = speaker.provider ?? context.presentationStyle.speechParams.provider;
+ return provider === "nijivoice" || provider === "elevenlabs";
+ });
+ return hasLimitedConcurrencyProvider ? 1 : 8;
+ };
+ const audioAgents = {
+ ...vanillaAgents,
+ fileWriteAgent,
+ ttsOpenaiAgent,
+ ttsNijivoiceAgent,
+ ttsGoogleAgent,
+ ttsElevenlabsAgent,
+ addBGMAgent,
+ combineAudioFilesAgent,
+ };
+ export const generateBeatAudio = async (index, context, callbacks) => {
+ try {
+ MulmoStudioContextMethods.setSessionState(context, "audio", true);
+ const { studio, fileDirs } = context;
+ const { outDirPath, audioDirPath } = fileDirs;
+ const audioSegmentDirPath = resolveDirPath(audioDirPath, studio.filename);
+ mkdir(outDirPath);
+ mkdir(audioSegmentDirPath);
+ const taskManager = new TaskManager(getConcurrency(context));
+ const graph = new GraphAI(graph_tts, audioAgents, { agentFilters, taskManager });
+ graph.injectValue("__mapIndex", index);
+ graph.injectValue("beat", context.studio.script.beats[index]);
+ graph.injectValue("studioBeat", context.studio.beats[index]);
+ graph.injectValue("multiLingual", context.multiLingual);
+ graph.injectValue("context", context);
+ if (callbacks) {
+ callbacks.forEach((callback) => {
+ graph.registerCallback(callback);
+ });
+ }
+ await graph.run();
+ }
+ finally {
+ MulmoStudioContextMethods.setSessionState(context, "audio", false);
+ }
+ };
  export const audio = async (context, callbacks) => {
  try {
  MulmoStudioContextMethods.setSessionState(context, "audio", true);
  const { studio, fileDirs, lang } = context;
  const { outDirPath, audioDirPath } = fileDirs;
  const audioArtifactFilePath = audioFilePath(context);
- const audioSegmentDirPath = getAudioSegmentDirPath(audioDirPath, studio.filename);
- const audioCombinedFilePath = getAudioCombinedFilePath(audioDirPath, studio.filename, lang);
+ const audioSegmentDirPath = resolveDirPath(audioDirPath, studio.filename);
+ const audioCombinedFilePath = getAudioFilePath(audioDirPath, studio.filename, studio.filename, lang);
  const outputStudioFilePath = getOutputStudioFilePath(outDirPath, studio.filename);
  mkdir(outDirPath);
  mkdir(audioSegmentDirPath);
- // Check if any speaker uses nijivoice or elevenlabs (providers that require concurrency = 1)
- const hasLimitedConcurrencyProvider = Object.values(studio.script.speechParams.speakers).some((speaker) => {
- const provider = speaker.provider ?? studio.script.speechParams.provider;
- return provider === "nijivoice" || provider === "elevenlabs";
- });
- graph_data.concurrency = hasLimitedConcurrencyProvider ? 1 : 8;
- const graph = new GraphAI(graph_data, {
- ...vanillaAgents,
- fileWriteAgent,
- ttsOpenaiAgent,
- ttsNijivoiceAgent,
- ttsGoogleAgent,
- ttsElevenlabsAgent,
- addBGMAgent,
- combineAudioFilesAgent,
- }, { agentFilters });
+ const taskManager = new TaskManager(getConcurrency(context));
+ const graph = new GraphAI(graph_data, audioAgents, { agentFilters, taskManager });
  graph.injectValue("context", context);
  graph.injectValue("audioArtifactFilePath", audioArtifactFilePath);
  graph.injectValue("audioCombinedFilePath", audioCombinedFilePath);
  graph.injectValue("outputStudioFilePath", outputStudioFilePath);
- graph.injectValue("audioSegmentDirPath", audioSegmentDirPath);
- graph.injectValue("audioDirPath", audioDirPath);
- graph.injectValue("musicFile", MulmoMediaSourceMethods.resolve(studio.script.audioParams.bgm, context) ?? process.env.PATH_BGM ?? defaultBGMPath());
+ graph.injectValue("musicFile", MulmoMediaSourceMethods.resolve(context.presentationStyle.audioParams.bgm, context) ?? process.env.PATH_BGM ?? defaultBGMPath());
  if (callbacks) {
  callbacks.forEach((callback) => {
  graph.registerCallback(callback);
  });
  }
- await graph.run();
+ const result = await graph.run();
  writingMessage(audioCombinedFilePath);
+ MulmoStudioContextMethods.setSessionState(context, "audio", false);
+ return result.combineFiles;
  }
- finally {
+ catch (__error) {
  MulmoStudioContextMethods.setSessionState(context, "audio", false);
+ throw __error;
  }
  };
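
Note: two behavioral changes in audio.js above are easy to miss. First, the TTS cache key is now built with [text, voiceId, instruction, speed, provider].join(":") instead of bare template-literal concatenation, and path handling moved into getAudioFilePath/resolveDirPath. Second, concurrency is now supplied through a GraphAI TaskManager (1 for nijivoice/elevenlabs, 8 otherwise) rather than by mutating the module-level graph_data.concurrency, so concurrent runs no longer share mutable graph state. A minimal sketch of the key derivation; the sha256-based text2hash stand-in is an assumption, since utils/utils.js is not shown in this diff:

    import { createHash } from "crypto";

    // Assumed stand-in for the package's text2hash (implementation not in this diff).
    const text2hash = (input: string): string =>
      createHash("sha256").update(input).digest("hex");

    // Mirrors getBeatAudioPath's hash_string from the diff above.
    const audioCacheKey = (text: string, voiceId: string, provider: string, instruction = "", speed = 1.0): string => {
      // join(":") delimits the fields, so ("ab", "c") and ("a", "bc") no longer
      // collapse to the same key as they could under the old
      // `${text}${voiceId}...` concatenation.
      const hash_string = [text, voiceId, instruction, speed, provider].join(":");
      return text2hash(hash_string);
    };

    console.log(audioCacheKey("Hello world", "alloy", "openai"));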
package/lib/actions/captions.d.ts
@@ -1,3 +1,3 @@
  import { MulmoStudioContext } from "../types/index.js";
  import type { CallbackFunction } from "graphai";
- export declare const captions: (context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<void>;
+ export declare const captions: (context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<MulmoStudioContext>;
package/lib/actions/captions.js
@@ -26,11 +26,11 @@ const graph_data = {
  const { fileDirs } = namedInputs.context;
  const { caption } = context;
  const { imageDirPath } = fileDirs;
- const { canvasSize } = context.studio.script;
+ const { canvasSize } = context.presentationStyle;
  const imagePath = `${imageDirPath}/${context.studio.filename}/${index}_caption.png`;
  const template = getHTMLFile("caption");
  const text = (() => {
- const multiLingual = context.studio.multiLingual;
+ const multiLingual = context.multiLingual;
  if (caption && multiLingual) {
  return multiLingual[index].multiLingualTexts[caption].text;
  }
@@ -63,18 +63,21 @@ const graph_data = {
  },
  };
  export const captions = async (context, callbacks) => {
- try {
- MulmoStudioContextMethods.setSessionState(context, "caption", true);
- const graph = new GraphAI(graph_data, { ...vanillaAgents });
- graph.injectValue("context", context);
- if (callbacks) {
- callbacks.forEach((callback) => {
- graph.registerCallback(callback);
- });
+ if (context.caption) {
+ try {
+ MulmoStudioContextMethods.setSessionState(context, "caption", true);
+ const graph = new GraphAI(graph_data, { ...vanillaAgents });
+ graph.injectValue("context", context);
+ if (callbacks) {
+ callbacks.forEach((callback) => {
+ graph.registerCallback(callback);
+ });
+ }
+ await graph.run();
+ }
+ finally {
+ MulmoStudioContextMethods.setSessionState(context, "caption", false);
  }
- await graph.run();
- }
- finally {
- MulmoStudioContextMethods.setSessionState(context, "caption", false);
  }
+ return context;
  };
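
Note: audio, captions, and images now resolve to a MulmoStudioContext instead of void, and captions returns immediately (without building a graph) when context.caption is unset. That makes the actions composable; a sketch of the call pattern this enables (the pipeline wrapper and the deep-import paths are illustrative, not part of the package's documented API):

    import { audio } from "mulmocast/lib/actions/audio.js";
    import { captions } from "mulmocast/lib/actions/captions.js";
    import { images } from "mulmocast/lib/actions/images.js";
    import type { MulmoStudioContext } from "mulmocast/lib/types/index.js";

    // Each action returns the (possibly updated) context, so the steps chain.
    const pipeline = async (context: MulmoStudioContext): Promise<MulmoStudioContext> => {
      const withImages = await images(context);
      const withAudio = await audio(withImages);
      return captions(withAudio); // returns context unchanged unless context.caption is set
    };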
package/lib/actions/images.d.ts
@@ -4,8 +4,6 @@ export declare const imagePreprocessAgent: (namedInputs: {
  context: MulmoStudioContext;
  beat: MulmoBeat;
  index: number;
- suffix: string;
- imageDirPath: string;
  imageAgentInfo: Text2ImageAgentInfo;
  imageRefs: Record<string, string>;
  }) => Promise<{
@@ -32,8 +30,11 @@ export declare const imagePreprocessAgent: (namedInputs: {
  };
  movieFile: string | undefined;
  imagePath: string | undefined;
+ referenceImage: string | undefined;
  } | {
+ imagePath: string;
  images: string[];
+ imageFromMovie: boolean;
  imageParams: {
  model?: string | undefined;
  style?: string | undefined;
@@ -81,7 +82,9 @@ export declare const imagePreprocessAgent: (namedInputs: {
  };
  movieFile: string | undefined;
  imagePath: string;
+ referenceImage: string;
  prompt: string;
  }>;
- export declare const images: (context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<void>;
+ export declare const getImageRefs: (context: MulmoStudioContext) => Promise<Record<string, string>>;
+ export declare const images: (context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<MulmoStudioContext>;
  export declare const generateBeatImage: (index: number, context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<void>;
package/lib/actions/images.js
@@ -1,41 +1,43 @@
  import dotenv from "dotenv";
  import fs from "fs";
  import { GraphAI, GraphAILogger } from "graphai";
+ import { TaskManager } from "graphai/lib/task_manager.js";
  import * as agents from "@graphai/vanilla";
  import { fileWriteAgent } from "@graphai/vanilla_node_agents";
- import { getOutputStudioFilePath, mkdir } from "../utils/file.js";
+ import { getOutputStudioFilePath, getBeatPngImagePath, getBeatMoviePath, getReferenceImagePath, mkdir } from "../utils/file.js";
  import { fileCacheAgentFilter } from "../utils/filters.js";
  import { imageGoogleAgent, imageOpenaiAgent, movieGoogleAgent, mediaMockAgent } from "../agents/index.js";
- import { MulmoScriptMethods, MulmoStudioContextMethods } from "../methods/index.js";
+ import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../methods/index.js";
  import { imagePlugins } from "../utils/image_plugins/index.js";
  import { imagePrompt } from "../utils/prompt.js";
  const vanillaAgents = agents.default ?? agents;
  dotenv.config();
  // const openai = new OpenAI();
  import { GoogleAuth } from "google-auth-library";
- const htmlStyle = (script, beat) => {
+ import { extractImageFromMovie } from "../utils/ffmpeg_utils.js";
+ const htmlStyle = (context, beat) => {
  return {
- canvasSize: MulmoScriptMethods.getCanvasSize(script),
- textSlideStyle: MulmoScriptMethods.getTextSlideStyle(script, beat),
+ canvasSize: MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle),
+ textSlideStyle: MulmoPresentationStyleMethods.getTextSlideStyle(context.presentationStyle, beat),
  };
  };
  export const imagePreprocessAgent = async (namedInputs) => {
- const { context, beat, index, suffix, imageDirPath, imageAgentInfo, imageRefs } = namedInputs;
+ const { context, beat, index, imageAgentInfo, imageRefs } = namedInputs;
  const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
- const imagePath = `${imageDirPath}/${context.studio.filename}/${index}${suffix}.png`;
+ const imagePath = getBeatPngImagePath(context, index);
  const returnValue = {
  imageParams,
- movieFile: beat.moviePrompt ? `${imageDirPath}/${context.studio.filename}/${index}.mov` : undefined,
+ movieFile: beat.moviePrompt ? getBeatMoviePath(context, index) : undefined,
  };
  if (beat.image) {
  const plugin = imagePlugins.find((plugin) => plugin.imageType === beat?.image?.type);
  if (plugin) {
  try {
  MulmoStudioContextMethods.setBeatSessionState(context, "image", index, true);
- const processorParams = { beat, context, imagePath, ...htmlStyle(context.studio.script, beat) };
+ const processorParams = { beat, context, imagePath, ...htmlStyle(context, beat) };
  const path = await plugin.process(processorParams);
  // undefined prompt indicates that image generation is not needed
- return { imagePath: path, ...returnValue };
+ return { imagePath: path, referenceImage: path, ...returnValue };
  }
  finally {
  MulmoStudioContextMethods.setBeatSessionState(context, "image", index, false);
@@ -49,17 +51,16 @@ export const imagePreprocessAgent = async (namedInputs) => {
  return sources.filter((source) => source !== undefined);
  })();
  if (beat.moviePrompt && !beat.imagePrompt) {
- return { ...returnValue, images }; // no image prompt, only movie prompt
+ return { ...returnValue, imagePath, images, imageFromMovie: true }; // no image prompt, only movie prompt
  }
  const prompt = imagePrompt(beat, imageParams.style);
- return { imagePath, prompt, ...returnValue, images };
+ return { imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
  };
  const beat_graph_data = {
  version: 0.5,
  concurrency: 4,
  nodes: {
  context: {},
- imageDirPath: {},
  imageAgentInfo: {},
  movieAgentInfo: {},
  imageRefs: {},
@@ -71,8 +72,6 @@ const beat_graph_data = {
  context: ":context",
  beat: ":beat",
  index: ":__mapIndex",
- suffix: "p",
- imageDirPath: ":imageDirPath",
  imageAgentInfo: ":imageAgentInfo",
  imageRefs: ":imageRefs",
  },
@@ -93,7 +92,7 @@ const beat_graph_data = {
  params: {
  model: ":preprocessor.imageParams.model",
  moderation: ":preprocessor.imageParams.moderation",
- canvasSize: ":context.studio.script.canvasSize",
+ canvasSize: ":context.presentationStyle.canvasSize",
  },
  },
  defaultValue: {},
@@ -104,24 +103,37 @@ const beat_graph_data = {
  inputs: {
  onComplete: ":imageGenerator", // to wait for imageGenerator to finish
  prompt: ":beat.moviePrompt",
- imagePath: ":preprocessor.imagePath",
+ imagePath: ":preprocessor.referenceImage",
  file: ":preprocessor.movieFile",
  studio: ":context.studio", // for cache
  mulmoContext: ":context", // for fileCacheAgentFilter
  index: ":__mapIndex", // for cache
  sessionType: "movie", // for cache
  params: {
- model: ":context.studio.script.movieParams.model",
+ model: ":context.presentationStyle.movieParams.model",
  duration: ":beat.duration",
- canvasSize: ":context.studio.script.canvasSize",
+ canvasSize: ":context.presentationStyle.canvasSize",
  },
  },
  defaultValue: {},
  },
+ imageFromMovie: {
+ if: ":preprocessor.imageFromMovie",
+ agent: async (namedInputs) => {
+ await extractImageFromMovie(namedInputs.movieFile, namedInputs.imageFile);
+ return { generatedImage: true };
+ },
+ inputs: {
+ onComplete: ":movieGenerator", // to wait for movieGenerator to finish
+ imageFile: ":preprocessor.imagePath",
+ movieFile: ":preprocessor.movieFile",
+ },
+ defaultValue: { generatedImage: false },
+ },
  output: {
  agent: "copyAgent",
  inputs: {
- onComplete: ":movieGenerator", // to wait for movieGenerator to finish
+ onComplete: ":imageFromMovie", // to wait for imageFromMovie to finish
  imageFile: ":preprocessor.imagePath",
  movieFile: ":preprocessor.movieFile",
  },
@@ -138,7 +150,6 @@ const graph_data = {
  concurrency: 4,
  nodes: {
  context: {},
- imageDirPath: {},
  imageAgentInfo: {},
  movieAgentInfo: {},
  outputStudioFilePath: {},
@@ -150,7 +161,6 @@ const graph_data = {
  context: ":context",
  imageAgentInfo: ":imageAgentInfo",
  movieAgentInfo: ":movieAgentInfo",
- imageDirPath: ":imageDirPath",
  imageRefs: ":imageRefs",
  },
  isResult: true,
@@ -185,7 +195,10 @@ const graph_data = {
  }
  }
  });
- return { studio };
+ return {
+ ...context,
+ studio,
+ };
  },
  inputs: {
  array: ":map.output",
@@ -217,7 +230,6 @@ const googleAuth = async () => {
  }
  };
  const graphOption = async (context) => {
- const { studio } = context;
  const agentFilters = [
  {
  name: "fileCacheAgentFilter",
@@ -225,12 +237,14 @@ const graphOption = async (context) => {
  nodeIds: ["imageGenerator", "movieGenerator"],
  },
  ];
+ const taskManager = new TaskManager(getConcurrency(context));
  const options = {
  agentFilters,
+ taskManager,
  };
- const imageAgentInfo = MulmoScriptMethods.getImageAgentInfo(studio.script);
+ const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
  // We need to get google's auth token only if the google is the text2image provider.
- if (imageAgentInfo.provider === "google" || studio.script.movieParams?.provider === "google") {
+ if (imageAgentInfo.provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
  GraphAILogger.log("google was specified as text2image engine");
  const token = await googleAuth();
  options.config = {
@@ -246,13 +260,10 @@ const graphOption = async (context) => {
  }
  return options;
  };
- const prepareGenerateImages = async (context) => {
- const { studio, fileDirs } = context;
- const { outDirPath, imageDirPath } = fileDirs;
- mkdir(`${imageDirPath}/${studio.filename}`);
- const imageAgentInfo = MulmoScriptMethods.getImageAgentInfo(studio.script, context.dryRun);
+ // TODO: unit test
+ export const getImageRefs = async (context) => {
  const imageRefs = {};
- const images = studio.script.imageParams?.images;
+ const images = context.presentationStyle.imageParams?.images;
  if (images) {
  await Promise.all(Object.keys(images).map(async (key) => {
  const image = images[key];
@@ -283,12 +294,21 @@ const prepareGenerateImages = async (context) => {
  return "png"; // default
  }
  })();
- const imagePath = `${imageDirPath}/${context.studio.filename}/${key}.${extension}`;
+ const imagePath = getReferenceImagePath(context, key, extension);
  await fs.promises.writeFile(imagePath, buffer);
  imageRefs[key] = imagePath;
  }
  }));
  }
+ return imageRefs;
+ };
+ const prepareGenerateImages = async (context) => {
+ const { studio } = context;
+ const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
+ const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+ mkdir(imageProjectDirPath);
+ const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle, context.dryRun);
+ const imageRefs = await getImageRefs(context);
  GraphAILogger.info(`text2image: provider=${imageAgentInfo.provider} model=${imageAgentInfo.imageParams.model}`);
  const injections = {
  context,
297
317
  agent: context.dryRun ? "mediaMockAgent" : "movieGoogleAgent",
298
318
  },
299
319
  outputStudioFilePath: getOutputStudioFilePath(outDirPath, studio.filename),
300
- imageDirPath,
301
320
  imageRefs,
302
321
  };
303
322
  return injections;
304
323
  };
305
- const generateImages = async (context, callbacks) => {
306
- const imageAgentInfo = MulmoScriptMethods.getImageAgentInfo(context.studio.script);
324
+ const getConcurrency = (context) => {
325
+ const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
307
326
  if (imageAgentInfo.provider === "openai") {
308
327
  // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
309
328
  // dall-e-3: 7,500 RPM、15 images per minute (4 images for max resolution)
310
329
  // gpt-image-1:3,000,000 TPM、150 images per minute
311
- graph_data.concurrency = imageAgentInfo.imageParams.model === "dall-e-3" ? 4 : 16;
330
+ return imageAgentInfo.imageParams.model === "dall-e-3" ? 4 : 16;
312
331
  }
332
+ return 4;
333
+ };
334
+ const generateImages = async (context, callbacks) => {
313
335
  const options = await graphOption(context);
314
336
  const injections = await prepareGenerateImages(context);
315
337
  const graph = new GraphAI(graph_data, { ...vanillaAgents, imageGoogleAgent, movieGoogleAgent, imageOpenaiAgent, mediaMockAgent, fileWriteAgent }, options);
@@ -327,10 +349,13 @@ const generateImages = async (context, callbacks) => {
  export const images = async (context, callbacks) => {
  try {
  MulmoStudioContextMethods.setSessionState(context, "image", true);
- await generateImages(context, callbacks);
+ const newContext = await generateImages(context, callbacks);
+ MulmoStudioContextMethods.setSessionState(context, "image", false);
+ return newContext;
  }
- finally {
+ catch (error) {
  MulmoStudioContextMethods.setSessionState(context, "image", false);
+ throw error;
  }
  };
  export const generateBeatImage = async (index, context, callbacks) => {
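
Note: the new imageFromMovie node in images.js fills in a beat's still image by extracting a frame from its generated movie via extractImageFromMovie (utils/ffmpeg_utils.js, +10 lines in this release, body not shown above). A minimal sketch of what such a helper could look like, shelling out to the ffmpeg CLI; this is an assumption about the implementation, not the package's actual code:

    import { execFile } from "child_process";
    import { promisify } from "util";

    const execFileAsync = promisify(execFile);

    // Assumed shape of extractImageFromMovie(movieFile, imageFile):
    // write the first video frame of the movie out as a still image.
    export const extractImageFromMovie = async (movieFile: string, imageFile: string): Promise<void> => {
      await execFileAsync("ffmpeg", [
        "-y", // overwrite the output file if it already exists
        "-i", movieFile, // input movie generated by movieGenerator
        "-frames:v", "1", // emit exactly one video frame
        imageFile, // output still image path
      ]);
    };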