mulmocast 0.0.18 → 0.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/README.md +44 -36
  2. package/assets/templates/ghibli_image_only.json +28 -0
  3. package/lib/actions/audio.js +13 -11
  4. package/lib/actions/captions.js +2 -3
  5. package/lib/actions/images.d.ts +5 -0
  6. package/lib/actions/images.js +41 -17
  7. package/lib/actions/movie.js +17 -3
  8. package/lib/actions/translate.js +3 -3
  9. package/lib/agents/add_bgm_agent.js +2 -2
  10. package/lib/agents/combine_audio_files_agent.js +96 -53
  11. package/lib/agents/image_openai_agent.js +2 -1
  12. package/lib/agents/validate_schema_agent.d.ts +1 -1
  13. package/lib/agents/validate_schema_agent.js +3 -3
  14. package/lib/cli/helpers.js +6 -1
  15. package/lib/index.browser.d.ts +2 -0
  16. package/lib/index.browser.js +3 -0
  17. package/lib/index.d.ts +1 -0
  18. package/lib/index.js +1 -0
  19. package/lib/methods/mulmo_presentation_style.js +2 -1
  20. package/lib/types/schema.d.ts +197 -129
  21. package/lib/types/schema.js +9 -5
  22. package/lib/utils/const.d.ts +1 -0
  23. package/lib/utils/const.js +1 -0
  24. package/lib/utils/file.d.ts +1 -0
  25. package/lib/utils/file.js +4 -0
  26. package/lib/utils/image_plugins/beat.d.ts +1 -0
  27. package/lib/utils/image_plugins/beat.js +3 -0
  28. package/lib/utils/image_plugins/chart.d.ts +1 -0
  29. package/lib/utils/image_plugins/chart.js +2 -0
  30. package/lib/utils/image_plugins/html_tailwind.d.ts +1 -0
  31. package/lib/utils/image_plugins/html_tailwind.js +2 -0
  32. package/lib/utils/image_plugins/image.d.ts +1 -0
  33. package/lib/utils/image_plugins/image.js +1 -0
  34. package/lib/utils/image_plugins/index.d.ts +3 -3
  35. package/lib/utils/image_plugins/index.js +6 -3
  36. package/lib/utils/image_plugins/markdown.d.ts +1 -0
  37. package/lib/utils/image_plugins/markdown.js +2 -0
  38. package/lib/utils/image_plugins/mermaid.d.ts +1 -0
  39. package/lib/utils/image_plugins/mermaid.js +3 -1
  40. package/lib/utils/image_plugins/movie.d.ts +1 -0
  41. package/lib/utils/image_plugins/movie.js +1 -0
  42. package/lib/utils/image_plugins/source.js +1 -1
  43. package/lib/utils/image_plugins/text_slide.d.ts +1 -0
  44. package/lib/utils/image_plugins/text_slide.js +2 -0
  45. package/lib/utils/image_plugins/utils.d.ts +2 -0
  46. package/lib/utils/image_plugins/utils.js +3 -0
  47. package/lib/utils/preprocess.d.ts +3 -1
  48. package/package.json +13 -3
  49. package/scripts/templates/image_prompt_only_template.json +27 -0
  50. package/lib/agents/image_mock_agent.d.ts +0 -4
  51. package/lib/agents/image_mock_agent.js +0 -18
  52. package/lib/agents/mulmo_prompts_agent.d.ts +0 -7
  53. package/lib/agents/mulmo_prompts_agent.js +0 -37
  54. package/lib/agents/prompts_data.d.ts +0 -15
  55. package/lib/agents/prompts_data.js +0 -16
  56. package/lib/agents/validate_mulmo_script_agent.d.ts +0 -17
  57. package/lib/agents/validate_mulmo_script_agent.js +0 -34
  58. package/lib/cli/args.d.ts +0 -15
  59. package/lib/cli/args.js +0 -62
  60. package/lib/cli/cli.d.ts +0 -17
  61. package/lib/cli/cli.js +0 -117
  62. package/lib/cli/run.d.ts +0 -2
  63. package/lib/cli/run.js +0 -3
  64. package/lib/cli/tool-args.d.ts +0 -16
  65. package/lib/cli/tool-args.js +0 -64
  66. package/lib/cli/tool-cli.d.ts +0 -2
  67. package/lib/cli/tool-cli.js +0 -69
  68. package/lib/methods/mulmo_script.d.ts +0 -11
  69. package/lib/methods/mulmo_script.js +0 -59
  70. package/lib/methods/mulmo_studio.d.ts +0 -8
  71. package/lib/methods/mulmo_studio.js +0 -24
  72. package/lib/tools/prompt.d.ts +0 -1
  73. package/lib/tools/prompt.js +0 -18
  74. package/lib/utils/image_plugins/tailwind.d.ts +0 -3
  75. package/lib/utils/image_plugins/tailwind.js +0 -18
  76. package/lib/utils/pdf.d.ts +0 -9
  77. package/lib/utils/pdf.js +0 -77
package/README.md CHANGED
@@ -288,14 +288,16 @@ Positionals:
  file Mulmo Script File [string] [required]

  Options:
- --version Show version number [boolean]
- -v, --verbose verbose log [boolean] [required] [default: false]
- -h, --help Show help [boolean]
- -o, --outdir output dir [string]
- -b, --basedir base dir [string]
- -l, --lang target language [string] [choices: "en", "ja"]
- -f, --force Force regenerate [boolean] [default: false]
- -a, --audiodir Audio output directory [string]
+ --version Show version number [boolean]
+ -v, --verbose verbose log [boolean] [required] [default: false]
+ -h, --help Show help [boolean]
+ -o, --outdir output dir [string]
+ -b, --basedir base dir [string]
+ -l, --lang target language [string] [choices: "en", "ja"]
+ -f, --force Force regenerate [boolean] [default: false]
+ --dryRun Dry run [boolean] [default: false]
+ -p, --presentationStyle Presentation Style [string]
+ -a, --audiodir Audio output directory [string]
  ```

  ```
@@ -307,14 +309,16 @@ Positionals:
  file Mulmo Script File [string] [required]

  Options:
- --version Show version number [boolean]
- -v, --verbose verbose log [boolean] [required] [default: false]
- -h, --help Show help [boolean]
- -o, --outdir output dir [string]
- -b, --basedir base dir [string]
- -l, --lang target language [string] [choices: "en", "ja"]
- -f, --force Force regenerate [boolean] [default: false]
- -i, --imagedir Image output directory [string]
+ --version Show version number [boolean]
+ -v, --verbose verbose log [boolean] [required] [default: false]
+ -h, --help Show help [boolean]
+ -o, --outdir output dir [string]
+ -b, --basedir base dir [string]
+ -l, --lang target language [string] [choices: "en", "ja"]
+ -f, --force Force regenerate [boolean] [default: false]
+ --dryRun Dry run [boolean] [default: false]
+ -p, --presentationStyle Presentation Style [string]
+ -i, --imagedir Image output directory [string]
  ```

  ```
@@ -326,16 +330,18 @@ Positionals:
  file Mulmo Script File [string] [required]

  Options:
- --version Show version number [boolean]
- -v, --verbose verbose log [boolean] [required] [default: false]
- -h, --help Show help [boolean]
- -o, --outdir output dir [string]
- -b, --basedir base dir [string]
- -l, --lang target language [string] [choices: "en", "ja"]
- -f, --force Force regenerate [boolean] [default: false]
- -a, --audiodir Audio output directory [string]
- -i, --imagedir Image output directory [string]
- -c, --caption Video captions [string] [choices: "en", "ja"]
+ --version Show version number [boolean]
+ -v, --verbose verbose log [boolean] [required] [default: false]
+ -h, --help Show help [boolean]
+ -o, --outdir output dir [string]
+ -b, --basedir base dir [string]
+ -l, --lang target language [string] [choices: "en", "ja"]
+ -f, --force Force regenerate [boolean] [default: false]
+ --dryRun Dry run [boolean] [default: false]
+ -p, --presentationStyle Presentation Style [string]
+ -a, --audiodir Audio output directory [string]
+ -i, --imagedir Image output directory [string]
+ -c, --caption Video captions [string] [choices: "en", "ja"]
  ```

  ```
@@ -347,17 +353,19 @@ Positionals:
  file Mulmo Script File [string] [required]

  Options:
- --version Show version number [boolean]
- -v, --verbose verbose log [boolean] [required] [default: false]
- -h, --help Show help [boolean]
- -o, --outdir output dir [string]
- -b, --basedir base dir [string]
- -l, --lang target language [string] [choices: "en", "ja"]
- -f, --force Force regenerate [boolean] [default: false]
- -i, --imagedir Image output directory [string]
- --pdf_mode PDF mode
+ --version Show version number [boolean]
+ -v, --verbose verbose log [boolean] [required] [default: false]
+ -h, --help Show help [boolean]
+ -o, --outdir output dir [string]
+ -b, --basedir base dir [string]
+ -l, --lang target language [string] [choices: "en", "ja"]
+ -f, --force Force regenerate [boolean] [default: false]
+ --dryRun Dry run [boolean] [default: false]
+ -p, --presentationStyle Presentation Style [string]
+ -i, --imagedir Image output directory [string]
+ --pdf_mode PDF mode
  [string] [choices: "slide", "talk", "handout"] [default: "slide"]
- --pdf_size PDF paper size (default: letter)
+ --pdf_size PDF paper size (default: letter)
  [choices: "letter", "a4"] [default: "letter"]
  ```

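Two options are new in every subcommand's help above: `--dryRun` and `-p, --presentationStyle`. Assuming the package's `mulmo` CLI entry point, an invocation such as `mulmo movie script.json --dryRun -p ghibli_style.json` (file names hypothetical) would exercise both: `dryRun` swaps the media generators for `mediaMockAgent` (see `lib/actions/images.js` below), and `-p` supplies an external presentation style such as the new Ghibli template that follows.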
package/assets/templates/ghibli_image_only.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "title": "Ghibli comic image-only",
+   "description": "Template for Ghibli-style image-only comic presentation.",
+   "systemPrompt": "Generate a script for a presentation of the given topic. Another AI will generate an image for each beat based on the text description of that beat. Use the JSON below as a template.",
+   "presentationStyle": {
+     "$mulmocast": {
+       "version": "1.0",
+       "credit": "closing"
+     },
+     "canvasSize": {
+       "width": 1536,
+       "height": 1024
+     },
+     "imageParams": {
+       "style": "<style>Ghibli style</style>",
+       "images": {
+         "presenter": {
+           "type": "image",
+           "source": {
+             "kind": "url",
+             "url": "https://raw.githubusercontent.com/receptron/mulmocast-media/refs/heads/main/characters/ghibli_presenter.png"
+           }
+         }
+       }
+     }
+   },
+   "scriptName": "image_prompt_only_template.json"
+ }
package/lib/actions/audio.js CHANGED
@@ -178,9 +178,9 @@ const agentFilters = [
    },
  ];
  export const audioFilePath = (context) => {
-   const { studio, fileDirs } = context;
-   const { outDirPath } = fileDirs;
-   return getAudioArtifactFilePath(outDirPath, studio.filename);
+   const fileName = MulmoStudioContextMethods.getFileName(context);
+   const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+   return getAudioArtifactFilePath(outDirPath, fileName);
  };
  const getConcurrency = (context) => {
    // Check if any speaker uses nijivoice or elevenlabs (providers that require concurrency = 1)
@@ -203,9 +203,10 @@ const audioAgents = {
  export const generateBeatAudio = async (index, context, callbacks) => {
    try {
      MulmoStudioContextMethods.setSessionState(context, "audio", true);
-     const { studio, fileDirs } = context;
-     const { outDirPath, audioDirPath } = fileDirs;
-     const audioSegmentDirPath = resolveDirPath(audioDirPath, studio.filename);
+     const fileName = MulmoStudioContextMethods.getFileName(context);
+     const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
+     const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+     const audioSegmentDirPath = resolveDirPath(audioDirPath, fileName);
      mkdir(outDirPath);
      mkdir(audioSegmentDirPath);
      const taskManager = new TaskManager(getConcurrency(context));
@@ -229,12 +230,13 @@ export const generateBeatAudio = async (index, context, callbacks) => {
  export const audio = async (context, callbacks) => {
    try {
      MulmoStudioContextMethods.setSessionState(context, "audio", true);
-     const { studio, fileDirs, lang } = context;
-     const { outDirPath, audioDirPath } = fileDirs;
+     const fileName = MulmoStudioContextMethods.getFileName(context);
+     const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
+     const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
      const audioArtifactFilePath = audioFilePath(context);
-     const audioSegmentDirPath = resolveDirPath(audioDirPath, studio.filename);
-     const audioCombinedFilePath = getAudioFilePath(audioDirPath, studio.filename, studio.filename, lang);
-     const outputStudioFilePath = getOutputStudioFilePath(outDirPath, studio.filename);
+     const audioSegmentDirPath = resolveDirPath(audioDirPath, fileName);
+     const audioCombinedFilePath = getAudioFilePath(audioDirPath, fileName, fileName, context.lang);
+     const outputStudioFilePath = getOutputStudioFilePath(outDirPath, fileName);
      mkdir(outDirPath);
      mkdir(audioSegmentDirPath);
      const taskManager = new TaskManager(getConcurrency(context));
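This hunk, like the ones in `translate.js` and `images.js` below, replaces direct destructuring of `context.studio`/`context.fileDirs` with `MulmoStudioContextMethods` accessors. Judging only from the before/after pairs in this diff, the accessors behave roughly like the following sketch (hypothetical; the real methods may add indirection, e.g. for the new `presentationStyle` input):

```typescript
// Hypothetical shapes inferred from the before/after pairs in this diff; not the real module.
type MulmoStudioContext = {
  studio: { filename: string };
  fileDirs: { outDirPath: string; audioDirPath: string };
};
const MulmoStudioContextMethods = {
  getFileName: (context: MulmoStudioContext) => context.studio.filename,
  getOutDirPath: (context: MulmoStudioContext) => context.fileDirs.outDirPath,
  getAudioDirPath: (context: MulmoStudioContext) => context.fileDirs.audioDirPath,
};
```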
package/lib/actions/captions.js CHANGED
@@ -1,6 +1,6 @@
  import { GraphAI, GraphAILogger } from "graphai";
  import * as agents from "@graphai/vanilla";
- import { getHTMLFile } from "../utils/file.js";
+ import { getHTMLFile, getCaptionImagePath } from "../utils/file.js";
  import { renderHTMLToImage, interpolate } from "../utils/markdown.js";
  import { MulmoStudioContextMethods, MulmoPresentationStyleMethods } from "../methods/index.js";
  const vanillaAgents = agents.default ?? agents;
@@ -23,10 +23,9 @@ const graph_data = {
      const { beat, context, index } = namedInputs;
      try {
        MulmoStudioContextMethods.setBeatSessionState(context, "caption", index, true);
-       const imageDirPath = MulmoStudioContextMethods.getImageDirPath(context);
        const caption = MulmoStudioContextMethods.getCaption(context);
        const canvasSize = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
-       const imagePath = `${imageDirPath}/${context.studio.filename}/${index}_caption.png`;
+       const imagePath = getCaptionImagePath(context, index);
        const template = getHTMLFile("caption");
        const text = (() => {
          const multiLingual = context.multiLingual;
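`getCaptionImagePath` is new in `lib/utils/file.js` (the `utils/file.*` +1/+4 entries in the file list). A plausible reconstruction from the inline expression it replaces, with the parameters flattened for the sketch:

```typescript
// Hypothetical reconstruction from the removed inline code; the real helper takes (context, index)
// and presumably derives imageDirPath via MulmoStudioContextMethods.getImageDirPath(context).
const getCaptionImagePath = (imageDirPath: string, filename: string, index: number): string =>
  `${imageDirPath}/${filename}/${index}_caption.png`;
// e.g. getCaptionImagePath("out/images", "my_script", 2) => "out/images/my_script/2_caption.png"
```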
package/lib/actions/images.d.ts CHANGED
@@ -85,6 +85,11 @@ export declare const imagePreprocessAgent: (namedInputs: {
    referenceImage: string;
    prompt: string;
  }>;
+ export declare const imagePluginAgent: (namedInputs: {
+   context: MulmoStudioContext;
+   beat: MulmoBeat;
+   index: number;
+ }) => Promise<void>;
  export declare const getImageRefs: (context: MulmoStudioContext) => Promise<Record<string, string>>;
  export declare const images: (context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<MulmoStudioContext>;
  export declare const generateBeatImage: (index: number, context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<void>;
package/lib/actions/images.js CHANGED
@@ -8,8 +8,9 @@ import { getOutputStudioFilePath, getBeatPngImagePath, getBeatMoviePath, getRefe
  import { fileCacheAgentFilter } from "../utils/filters.js";
  import { imageGoogleAgent, imageOpenaiAgent, movieGoogleAgent, mediaMockAgent } from "../agents/index.js";
  import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../methods/index.js";
- import { imagePlugins } from "../utils/image_plugins/index.js";
+ import { findImagePlugin } from "../utils/image_plugins/index.js";
  import { imagePrompt } from "../utils/prompt.js";
+ import { defaultOpenAIImageModel } from "../utils/const.js";
  const vanillaAgents = agents.default ?? agents;
  dotenv.config();
  // const openai = new OpenAI();
@@ -30,19 +31,13 @@ export const imagePreprocessAgent = async (namedInputs) => {
      movieFile: beat.moviePrompt ? getBeatMoviePath(context, index) : undefined,
    };
    if (beat.image) {
-     const plugin = imagePlugins.find((plugin) => plugin.imageType === beat?.image?.type);
-     if (plugin) {
-       try {
-         MulmoStudioContextMethods.setBeatSessionState(context, "image", index, true);
-         const processorParams = { beat, context, imagePath, ...htmlStyle(context, beat) };
-         const path = await plugin.process(processorParams);
-         // undefined prompt indicates that image generation is not needed
-         return { imagePath: path, referenceImage: path, ...returnValue };
-       }
-       finally {
-         MulmoStudioContextMethods.setBeatSessionState(context, "image", index, false);
-       }
+     const plugin = findImagePlugin(beat?.image?.type);
+     if (!plugin) {
+       throw new Error(`invalid beat image type: ${beat.image}`);
      }
+     const path = plugin.path({ beat, context, imagePath, ...htmlStyle(context, beat) });
+     // undefined prompt indicates that image generation is not needed
+     return { imagePath: path, referenceImage: path, ...returnValue };
    }
    // images for "edit_image"
    const images = (() => {
@@ -56,6 +51,24 @@ export const imagePreprocessAgent = async (namedInputs) => {
    const prompt = imagePrompt(beat, imageParams.style);
    return { imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
  };
+ export const imagePluginAgent = async (namedInputs) => {
+   const { context, beat, index } = namedInputs;
+   const imagePath = getBeatPngImagePath(context, index);
+   const plugin = findImagePlugin(beat?.image?.type);
+   if (!plugin) {
+     throw new Error(`invalid beat image type: ${beat.image}`);
+   }
+   try {
+     MulmoStudioContextMethods.setBeatSessionState(context, "image", index, true);
+     const processorParams = { beat, context, imagePath, ...htmlStyle(context, beat) };
+     await plugin.process(processorParams);
+     MulmoStudioContextMethods.setBeatSessionState(context, "image", index, false);
+   }
+   catch (error) {
+     MulmoStudioContextMethods.setBeatSessionState(context, "image", index, false);
+     throw error;
+   }
+ };
  const beat_graph_data = {
    version: 0.5,
    concurrency: 4,
@@ -76,6 +89,17 @@ const beat_graph_data = {
          imageRefs: ":imageRefs",
        },
      },
+     imagePlugin: {
+       if: ":beat.image",
+       defaultValue: {},
+       agent: imagePluginAgent,
+       inputs: {
+         context: ":context",
+         beat: ":beat",
+         index: ":__mapIndex",
+         onComplete: ":preprocessor",
+       },
+     },
      imageGenerator: {
        if: ":preprocessor.prompt",
        agent: ":imageAgentInfo.agent",
@@ -101,7 +125,7 @@ const beat_graph_data = {
        if: ":preprocessor.movieFile",
        agent: ":movieAgentInfo.agent",
        inputs: {
-         onComplete: ":imageGenerator", // to wait for imageGenerator to finish
+         onComplete: [":imageGenerator", ":imagePlugin"], // to wait for imageGenerator to finish
          prompt: ":beat.moviePrompt",
          imagePath: ":preprocessor.referenceImage",
          file: ":preprocessor.movieFile",
@@ -303,7 +327,7 @@ export const getImageRefs = async (context) => {
    return imageRefs;
  };
  const prepareGenerateImages = async (context) => {
-   const { studio } = context;
+   const fileName = MulmoStudioContextMethods.getFileName(context);
    const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
    const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
    mkdir(imageProjectDirPath);
@@ -316,7 +340,7 @@ const prepareGenerateImages = async (context) => {
      movieAgentInfo: {
        agent: context.dryRun ? "mediaMockAgent" : "movieGoogleAgent",
      },
-     outputStudioFilePath: getOutputStudioFilePath(outDirPath, studio.filename),
+     outputStudioFilePath: getOutputStudioFilePath(outDirPath, fileName),
      imageRefs,
    };
    return injections;
@@ -327,7 +351,7 @@ const getConcurrency = (context) => {
    // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
    // dall-e-3: 7,500 RPM、15 images per minute (4 images for max resolution)
    // gpt-image-1:3,000,000 TPM、150 images per minute
-   return imageAgentInfo.imageParams.model === "dall-e-3" ? 4 : 16;
+   return imageAgentInfo.imageParams.model === defaultOpenAIImageModel ? 4 : 16;
  }
  return 4;
};
package/lib/actions/movie.js CHANGED
@@ -140,10 +140,24 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context, capt
    return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
      const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
      const processedVideoId = `${transitionVideoId}_f`;
-     // If we can to add other transition types than fade, we need to add them here.
-     ffmpegContext.filterComplex.push(`[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`);
+     let transitionFilter;
+     if (transition.type === "fade") {
+       transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+     }
+     else if (transition.type === "slideout_left") {
+       transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+     }
+     else {
+       throw new Error(`Unknown transition type: ${transition.type}`);
+     }
+     ffmpegContext.filterComplex.push(transitionFilter);
      const outputId = `${transitionVideoId}_o`;
-     ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+     if (transition.type === "fade") {
+       ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+     }
+     else if (transition.type === "slideout_left") {
+       ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+     }
      return outputId;
    }, concatVideoId);
  }
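To make the new `slideout_left` math concrete (values hypothetical): with `transitionStartTime = 10` and `transition.duration = 0.5`, the overlay's x expression becomes `-(t-10)*W/0.5`, so x ramps from 0 at t=10 to -W at t=10.5, sliding the outgoing frame fully off the left edge while the next beat shows through underneath. A standalone sketch of the strings being assembled:

```typescript
// Standalone sketch of the slideout_left filter pair built above (not the module itself).
const slideoutLeftFilters = (videoId: string, acc: string, start: number, duration: number): string[] => {
  const processed = `${videoId}_f`;
  const output = `${videoId}_o`;
  return [
    // Keep the outgoing frame available past its own timeline, in an alpha-capable pixel format.
    `[${videoId}]format=yuva420p,setpts=PTS-STARTPTS+${start}/TB[${processed}]`,
    // x: 0 at t=start, -W at t=start+duration; enabled only during the transition window.
    `[${acc}][${processed}]overlay=x='-(t-${start})*W/${duration}':y=0:enable='between(t,${start},${start + duration})'[${output}]`,
  ];
};
// slideoutLeftFilters("tv1", "concat0", 10, 0.5)[1] contains overlay=x='-(t-10)*W/0.5':y=0:...
```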
package/lib/actions/translate.js CHANGED
@@ -211,9 +211,9 @@ const targetLangs = ["ja", "en"];
  export const translate = async (context, callbacks) => {
    try {
      MulmoStudioContextMethods.setSessionState(context, "multiLingual", true);
-     const { studio, fileDirs } = context;
-     const { outDirPath } = fileDirs;
-     const outputMultilingualFilePath = getOutputMultilingualFilePath(outDirPath, studio.filename);
+     const fileName = MulmoStudioContextMethods.getFileName(context);
+     const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+     const outputMultilingualFilePath = getOutputMultilingualFilePath(outDirPath, fileName);
      mkdir(outDirPath);
      assert(!!process.env.OPENAI_API_KEY, "The OPENAI_API_KEY environment variable is missing or empty");
      const graph = new GraphAI(translateGraph, { ...vanillaAgents, fileWriteAgent, openAIAgent }, { agentFilters });
package/lib/agents/add_bgm_agent.js CHANGED
@@ -11,8 +11,8 @@ const addBGMAgent = async ({ namedInputs, params, }) => {
    const ffmpegContext = FfmpegContextInit();
    const musicInputIndex = FfmpegContextAddInput(ffmpegContext, musicFile);
    const voiceInputIndex = FfmpegContextAddInput(ffmpegContext, voiceFile);
-   ffmpegContext.filterComplex.push(`[${musicInputIndex}:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo, volume=0.2[music]`);
-   ffmpegContext.filterComplex.push(`[${voiceInputIndex}:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo, volume=2, adelay=${introPadding * 1000}|${introPadding * 1000}[voice]`);
+   ffmpegContext.filterComplex.push(`[${musicInputIndex}:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo, volume=${context.presentationStyle.audioParams.bgmVolume}[music]`);
+   ffmpegContext.filterComplex.push(`[${voiceInputIndex}:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo, volume=${context.presentationStyle.audioParams.audioVolume}, adelay=${introPadding * 1000}|${introPadding * 1000}[voice]`);
    ffmpegContext.filterComplex.push(`[music][voice]amix=inputs=2:duration=longest[mixed]`);
    ffmpegContext.filterComplex.push(`[mixed]atrim=start=0:end=${totalDuration}[trimmed]`);
    ffmpegContext.filterComplex.push(`[trimmed]afade=t=out:st=${totalDuration - outroPadding}:d=${outroPadding}[faded]`);
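The mix levels that were hard-coded (0.2 for music, 2 for voice) are now read from `context.presentationStyle.audioParams`, so a script or presentation style can override them. A hypothetical snippet (field names taken from the diff; the schema's actual defaults are unverified, though the old literals are the obvious candidates):

```json
{
  "audioParams": {
    "bgmVolume": 0.2,
    "audioVolume": 2
  }
}
```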
package/lib/agents/combine_audio_files_agent.js CHANGED
@@ -18,87 +18,130 @@ const getPadding = (context, beat, index) => {
    const isClosingGap = index === context.studio.beats.length - 2;
    return isClosingGap ? context.presentationStyle.audioParams.closingPadding : context.presentationStyle.audioParams.padding;
  };
- const getTotalPadding = (padding, movieDuration, audioDuration, duration, canSpillover = false) => {
+ const getTotalPadding = (padding, movieDuration, audioDuration, duration) => {
    if (movieDuration > 0) {
      return padding + (movieDuration - audioDuration);
    }
    else if (duration && duration > audioDuration) {
      return padding + (duration - audioDuration);
    }
-   else if (canSpillover && duration && audioDuration > duration) {
-     return duration - audioDuration; // negative value to indicate that there is a spill over.
-   }
    return padding;
  };
- const combineAudioFilesAgent = async ({ namedInputs, }) => {
-   const { context, combinedFileName } = namedInputs;
-   const ffmpegContext = FfmpegContextInit();
-   const longSilentId = FfmpegContextInputFormattedAudio(ffmpegContext, silent60secPath());
-   // We cannot reuse longSilentId. We need to explicitly split it for each beat.
-   const silentIds = context.studio.beats.map((_, index) => `[ls_${index}]`);
-   ffmpegContext.filterComplex.push(`${longSilentId}asplit=${silentIds.length}${silentIds.join("")}`);
-   // First, get the audio durations of all beats, taking advantage of multi-threading capability of ffmpeg.
-   const mediaDurations = await Promise.all(context.studio.beats.map(async (studioBeat, index) => {
+ const getMediaDurations = (context) => {
+   return Promise.all(context.studio.beats.map(async (studioBeat, index) => {
      const beat = context.studio.script.beats[index];
      const movieDuration = await getMovieDulation(beat);
      const audioDuration = studioBeat.audioFile ? await ffmpegGetMediaDuration(studioBeat.audioFile) : 0;
      return {
        movieDuration,
        audioDuration,
+       hasMadia: movieDuration + audioDuration > 0,
+       silenceDuration: 0,
      };
    }));
- const inputIds = [];
+ };
+ const getGroupBeatDurations = (context, group, audioDuration) => {
+   const specifiedSum = group
+     .map((idx) => context.studio.script.beats[idx].duration)
+     .filter((d) => d !== undefined)
+     .reduce((a, b) => a + b, 0);
+   const unspecified = group.filter((idx) => context.studio.script.beats[idx].duration === undefined);
+   const minTotal = 1.0 * unspecified.length;
+   const rest = Math.max(audioDuration - specifiedSum, minTotal);
+   const durationForUnspecified = rest / (unspecified.length || 1);
+   const durations = group.map((idx) => {
+     const duration = context.studio.script.beats[idx].duration;
+     if (duration === undefined) {
+       return durationForUnspecified;
+     }
+     return duration;
+   });
+   return durations;
+ };
+ const combineAudioFilesAgent = async ({ namedInputs, }) => {
+   const { context, combinedFileName } = namedInputs;
+   const ffmpegContext = FfmpegContextInit();
+   // First, get the audio durations of all beats, taking advantage of multi-threading capability of ffmpeg.
+   const mediaDurations = await getMediaDurations(context);
    const beatDurations = [];
-   context.studio.beats.reduce((spillover, studioBeat, index) => {
-     const beat = context.studio.script.beats[index];
+   context.studio.script.beats.forEach((beat, index) => {
      const { audioDuration, movieDuration } = mediaDurations[index];
-     const paddingId = `[padding_${index}]`;
-     const canSpillover = index < context.studio.beats.length - 1 && mediaDurations[index + 1].movieDuration + mediaDurations[index + 1].audioDuration === 0;
-     if (studioBeat.audioFile) {
-       const audioId = FfmpegContextInputFormattedAudio(ffmpegContext, studioBeat.audioFile);
-       // padding is the amount of audio padding specified in the script.
-       const padding = getPadding(context, beat, index);
-       // totalPadding is the amount of audio padding to be added to the audio file.
-       const totalPadding = getTotalPadding(padding, movieDuration, audioDuration, beat.duration, canSpillover);
-       beatDurations.push(audioDuration + totalPadding);
-       if (totalPadding > 0) {
-         const silentId = silentIds.pop();
-         ffmpegContext.filterComplex.push(`${silentId}atrim=start=0:end=${totalPadding}${paddingId}`);
-         inputIds.push(audioId, paddingId);
+     // Check if the current beat has media and the next beat does not have media.
+     if (audioDuration > 0) {
+       // Check if the current beat has spilled over audio.
+       const group = [index];
+       for (let i = index + 1; i < context.studio.beats.length && !mediaDurations[i].hasMadia; i++) {
+         group.push(i);
+       }
+       if (group.length > 1) {
+         const groupBeatsDurations = getGroupBeatDurations(context, group, audioDuration);
+         // Yes, the current beat has spilled over audio.
+         const beatsTotalDuration = groupBeatsDurations.reduce((a, b) => a + b, 0);
+         if (beatsTotalDuration > audioDuration) {
+           group.reduce((remaining, idx, iGroup) => {
+             if (remaining >= groupBeatsDurations[iGroup]) {
+               return remaining - groupBeatsDurations[iGroup];
+             }
+             mediaDurations[idx].silenceDuration = groupBeatsDurations[iGroup] - remaining;
+             return 0;
+           }, audioDuration);
+         }
+         else {
+           // Last beat gets the rest of the audio.
+           groupBeatsDurations[groupBeatsDurations.length - 1] += audioDuration - beatsTotalDuration;
+         }
+         beatDurations.push(...groupBeatsDurations);
        }
        else {
-         inputIds.push(audioId);
-         if (totalPadding < 0) {
-           return -totalPadding;
+         // No spilled over audio.
+         assert(beatDurations.length === index, "beatDurations.length !== index");
+         // padding is the amount of audio padding specified in the script.
+         const padding = getPadding(context, beat, index);
+         // totalPadding is the amount of audio padding to be added to the audio file.
+         const totalPadding = getTotalPadding(padding, movieDuration, audioDuration, beat.duration);
+         const beatDuration = audioDuration + totalPadding;
+         beatDurations.push(beatDuration);
+         if (totalPadding > 0) {
+           mediaDurations[index].silenceDuration = totalPadding;
          }
        }
      }
-     else {
-       // NOTE: We come here when the text is empty and no audio property is specified.
-       const beatDuration = (() => {
-         const duration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
-         if (!canSpillover && duration < spillover) {
-           return spillover; // We need to consume the spillover here.
-         }
-         return duration;
-       })();
+     else if (movieDuration > 0) {
+       // This beat has only a movie, not audio.
+       assert(beatDurations.length === index, "beatDurations.length !== index");
+       beatDurations.push(movieDuration);
+       mediaDurations[index].silenceDuration = movieDuration;
+     }
+     else if (beatDurations.length === index) {
+       // The current beat has no audio, nor no spilled over audio
+       const beatDuration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
        beatDurations.push(beatDuration);
-       if (beatDuration <= spillover) {
-         return spillover - beatDuration;
-       }
+       mediaDurations[index].silenceDuration = beatDuration;
+     }
+     // else { Skip this beat if the duration has been already added as a group }
+   });
+   assert(beatDurations.length === context.studio.beats.length, "beatDurations.length !== studio.beats.length");
+   // We cannot reuse longSilentId. We need to explicitly split it for each beat.
+   const silentIds = mediaDurations.filter((md) => md.silenceDuration > 0).map((_, index) => `[ls_${index}]`);
+   if (silentIds.length > 0) {
+     const longSilentId = FfmpegContextInputFormattedAudio(ffmpegContext, silent60secPath());
+     ffmpegContext.filterComplex.push(`${longSilentId}asplit=${silentIds.length}${silentIds.join("")}`);
+   }
+   const inputIds = [];
+   context.studio.beats.forEach((studioBeat, index) => {
+     const { silenceDuration } = mediaDurations[index];
+     const paddingId = `[padding_${index}]`;
+     if (studioBeat.audioFile) {
+       const audioId = FfmpegContextInputFormattedAudio(ffmpegContext, studioBeat.audioFile);
+       inputIds.push(audioId);
+     }
+     if (silenceDuration > 0) {
        const silentId = silentIds.pop();
-       ffmpegContext.filterComplex.push(`${silentId}atrim=start=0:end=${beatDuration - spillover}${paddingId}`);
+       ffmpegContext.filterComplex.push(`${silentId}atrim=start=0:end=${silenceDuration}${paddingId}`);
        inputIds.push(paddingId);
      }
-     return 0;
-   }, 0);
-   assert(beatDurations.length === context.studio.beats.length, "beatDurations.length !== studio.beats.length");
-   // We need to "consume" extra silentIds.
-   silentIds.forEach((silentId, index) => {
-     const extraId = `[silent_extra_${index}]`;
-     ffmpegContext.filterComplex.push(`${silentId}atrim=start=0:end=${0.01}${extraId}`);
-     inputIds.push(extraId);
    });
+   assert(silentIds.length === 0, "silentIds.length !== 0");
    // Finally, combine all audio files.
    ffmpegContext.filterComplex.push(`${inputIds.join("")}concat=n=${inputIds.length}:v=0:a=1[aout]`);
    await FfmpegContextGenerateOutput(ffmpegContext, combinedFileName, ["-map", "[aout]"]);
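The heart of this rewrite is the spillover grouping: a beat with audio absorbs the following media-less beats, and `getGroupBeatDurations` splits its audio across the group. A minimal standalone re-implementation of that splitting rule with a worked example (the real function reads durations out of `context.studio.script.beats`):

```typescript
// Minimal sketch of the splitting rule above: beats with an explicit duration keep it, and the
// remaining audio (at least 1s per unspecified beat) is divided evenly among the rest.
const splitDurations = (durations: (number | undefined)[], audioDuration: number): number[] => {
  const specifiedSum = durations.filter((d): d is number => d !== undefined).reduce((a, b) => a + b, 0);
  const unspecified = durations.filter((d) => d === undefined).length;
  const rest = Math.max(audioDuration - specifiedSum, 1.0 * unspecified);
  const each = rest / (unspecified || 1);
  return durations.map((d) => d ?? each);
};
console.log(splitDurations([4, undefined, undefined], 10)); // [4, 3, 3]: 10s of audio spread over 3 beats
```

When the group's total exceeds the audio (explicit durations summing past it), the `reduce` over the group assigns the shortfall to `silenceDuration`; when the audio is longer, the last beat in the group absorbs the remainder.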
package/lib/agents/image_openai_agent.js CHANGED
@@ -1,11 +1,12 @@
  import fs from "fs";
  import path from "path";
  import OpenAI, { toFile } from "openai";
+ import { defaultOpenAIImageModel } from "../utils/const.js";
  // https://platform.openai.com/docs/guides/image-generation
  export const imageOpenaiAgent = async ({ namedInputs, params }) => {
    const { prompt, images } = namedInputs;
    const { apiKey, moderation, canvasSize } = params;
-   const model = params.model ?? "dall-e-3";
+   const model = params.model ?? defaultOpenAIImageModel;
    const openai = new OpenAI({ apiKey });
    const size = (() => {
      if (model === "gpt-image-1") {
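Both call sites that previously used the literal `"dall-e-3"` (the default here and the concurrency check in `images.js` above) now reference `defaultOpenAIImageModel` from `utils/const.js` (+1 line per the file list). Since both substitutions preserve the old behavior, the constant is presumably:

```typescript
// Presumed content of the new export in lib/utils/const.js; inferred from the substitutions, not verified.
export const defaultOpenAIImageModel = "dall-e-3";
```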
package/lib/agents/validate_schema_agent.d.ts CHANGED
@@ -1,4 +1,4 @@
- import type { AgentFunction, AgentFunctionInfo, DefaultConfigData } from "graphai";
+ import { type AgentFunction, type AgentFunctionInfo, type DefaultConfigData } from "graphai";
  import { MulmoScript } from "../types/index.js";
  import { ZodSchema } from "zod";
  interface ValidateMulmoScriptInputs {
package/lib/agents/validate_schema_agent.js CHANGED
@@ -1,4 +1,4 @@
- import assert from "node:assert";
+ import { assert } from "graphai";
  /**
   * Zod schema validation agent
   * Validates if a JSON string conforms to the Zod schema
@@ -6,8 +6,8 @@ import assert from "node:assert";
  export const validateSchemaAgent = async ({ namedInputs, }) => {
    const { text, schema } = namedInputs;
    try {
-     assert(schema, "schema is required");
-     assert(text, "text is required");
+     assert(!!schema, "schema is required");
+     assert(!!text, "text is required");
      const jsonData = JSON.parse(text);
      const parsed = schema.parse(jsonData);
      return {
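A plausible reading of the added `!!`: graphai's `assert`, unlike `node:assert`, is typed against a strict boolean, so truthy non-boolean values must be coerced. Hypothetical signature:

```typescript
// Assumed signature; graphai's actual typing is not verified here.
declare function assert(condition: boolean, message: string): asserts condition;
```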
package/lib/cli/helpers.js CHANGED
@@ -83,7 +83,12 @@ export const fetchScript = async (isHttpPath, mulmoFilePath, fileOrUrl) => {
  export const getMultiLingual = (multilingualFilePath, beatsLength) => {
    if (fs.existsSync(multilingualFilePath)) {
      const jsonData = readMulmoScriptFile(multilingualFilePath, "ERROR: File does not exist " + multilingualFilePath)?.mulmoData ?? null;
-     return mulmoStudioMultiLingualSchema.parse(jsonData);
+     const dataSet = mulmoStudioMultiLingualSchema.parse(jsonData);
+     while (dataSet.length < beatsLength) {
+       dataSet.push({ multiLingualTexts: {} });
+     }
+     dataSet.length = beatsLength;
+     return dataSet;
    }
    return [...Array(beatsLength)].map(() => ({ multiLingualTexts: {} }));
  };
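The added lines make `getMultiLingual` tolerant of beat-count drift between the script and a previously saved multilingual file: the parsed array is padded with empty entries if the script gained beats, then truncated via the `length` assignment if it lost some. The same behavior as a standalone helper:

```typescript
// Standalone sketch of the pad-then-truncate behavior added above.
const resizeTo = <T>(arr: T[], len: number, make: () => T): T[] => {
  while (arr.length < len) arr.push(make());
  arr.length = len; // drops extra entries when the file has more than the script's beats
  return arr;
};
resizeTo([{ multiLingualTexts: {} }], 3, () => ({ multiLingualTexts: {} })); // => 3 entries
```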