mulmocast 0.0.28 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -82,6 +82,16 @@ brew install ffmpeg
  # Visit https://ffmpeg.org/download.html
  ```

+ You can also use the [`Dockerfile`](./Dockerfile), which installs the prerequisites for you.
+ ```
+ docker build -t mulmo-cli .
+ ```
+
+ You can use the Docker image like this:
+ ```
+ docker run -e OPENAI_API_KEY=<your_openai_api_key> -it mulmo-cli mulmo tool scripting -i -t children_book -o ./ -s story
+ ```
+
  ## Configuration

  Create a `.env` file in your project directory with the following API keys:
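A possible variation of the `docker run` command above, sketched with Docker's standard `--env-file` and volume-mount flags so that the `.env` file described under Configuration is reused and generated files stay on the host; the `/work` mount point is an arbitrary choice, not something the image defines:

```
docker run --env-file .env -v "$(pwd)":/work -w /work -it mulmo-cli mulmo tool scripting -i -t children_book -o ./ -s story
```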
@@ -200,11 +210,18 @@ writing: /Users/username/path/to/output/story-1747834931950__ja.mp4
  # Generate script from web content (requires Browserless API KEY)
  mulmo tool scripting -u https://example.com

+ # Generate script from local file
+ mulmo tool scripting --input-file story.txt
+
  # Generate script with interactive mode
  mulmo tool scripting -i
  ```

- When using the `sensei_and_taro` template, a Nijivoice API key is required.
+ Note:
+ - When using the `sensei_and_taro` template, a Nijivoice API key is required
+ - When `-i` is specified, the `--input-file` value is ignored
+ - When `--input-file` is specified, the `-u` value is ignored
+
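Read together, these notes give a simple precedence order: `-i` beats `--input-file`, which beats `-u`. For example (file name and URL are illustrative):

```
# --input-file takes precedence, so the URL is ignored
mulmo tool scripting --input-file story.txt -u https://example.com

# -i takes precedence, so story.txt is ignored
mulmo tool scripting -i --input-file story.txt
```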

  ## Generate content from MulmoScript

@@ -308,7 +325,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -a, --audiodir Audio output directory [string]
  ```
@@ -329,7 +345,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -i, --imagedir Image output directory [string]
  ```
@@ -350,7 +365,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -a, --audiodir Audio output directory [string]
  -i, --imagedir Image output directory [string]
@@ -411,16 +425,19 @@ Options:
  -b, --basedir base dir [string]
  -u, --url URLs to reference (required when not in interactive mode)
  [array] [default: []]
+ --input-file input file name [string]
  -i, --interactive Generate script in interactive mode with user prompts
  [boolean]
  -t, --template Template name to use
- [string] [choices: "business", "children_book", "coding", "comic_strips",
- "ghibli_strips", "podcast_standard", "sensei_and_taro"]
+ [string] [choices: "akira_comic", "business", "children_book", "coding",
+ "comic_strips", "drslump_comic", "ghibli_comic", "ghibli_image_only",
+ "ghibli_shorts", "ghost_comic", "onepiece_comic", "podcast_standard",
+ "portrait_movie", "realistic_movie", "sensei_and_taro", "shorts",
+ "text_and_image", "text_only", "trailer"]
  -c, --cache cache dir [string]
  -s, --script script filename [string] [default: "script"]
  --llm llm
- [string] [choices: "openAIAgent", "anthropicAgent", "geminiAgent",
- "groqAgent"]
+ [string] [choices: "openai", "anthropic", "gemini", "groq"]
  --llm_model llm model [string]
  ```
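The expanded choice lists above can be combined in a single invocation; a sketch using only values documented in this help text (the script name is illustrative):

```
mulmo tool scripting -u https://example.com -t shorts --llm anthropic -s my_story
```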
 
@@ -1,5 +1,5 @@
  {
- "title": "Ghibli comic style",
+ "title": "Ghibli style for YouTube Shorts",
  "description": "Template for Ghibli-style comic presentation.",
  "systemPrompt": "Generate a Japanese script for a Youtube shorts of the given topic. Another AI will generate comic strips for each beat based on the text description of that beat. Mention the reference in one of beats, if it exists. Use the JSON below as a template.",
  "presentationStyle": {
@@ -14,7 +14,7 @@
  "speechParams": {
  "provider": "nijivoice",
  "speakers": {
- "Presenter": { "voiceId": "afd7df65-0fdc-4d31-ae8b-a29f0f5eed62", "speechOptions": { "speed": 1.5 } }
+ "Presenter": { "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c", "speechOptions": { "speed": 1.5 } }
  }
  },
  "imageParams": {
@@ -17,7 +17,7 @@
  "speechParams": {
  "provider": "nijivoice",
  "speakers": {
- "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "afd7df65-0fdc-4d31-ae8b-a29f0f5eed62" },
+ "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c" },
  "Student": { "displayName": { "ja": "太郎" }, "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f" },
  "Teacher": { "displayName": { "ja": "先生" }, "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae" }
  }
@@ -1,14 +1,16 @@
  import { mulmoCaptionParamsSchema } from "../types/index.js";
  import { GraphAI, GraphAILogger } from "graphai";
  import * as agents from "@graphai/vanilla";
- import { getHTMLFile, getCaptionImagePath } from "../utils/file.js";
+ import { getHTMLFile, getCaptionImagePath, getOutputStudioFilePath } from "../utils/file.js";
  import { renderHTMLToImage, interpolate } from "../utils/markdown.js";
  import { MulmoStudioContextMethods, MulmoPresentationStyleMethods } from "../methods/index.js";
+ import { fileWriteAgent } from "@graphai/vanilla_node_agents";
  const vanillaAgents = agents.default ?? agents;
  const graph_data = {
  version: 0.5,
  nodes: {
  context: {},
+ outputStudioFilePath: {},
  map: {
  agent: "mapAgent",
  inputs: { rows: ":context.studio.script.beats", context: ":context" },
@@ -60,14 +62,26 @@ const graph_data = {
  },
  },
  },
+ fileWrite: {
+ agent: "fileWriteAgent",
+ inputs: {
+ onComplete: ":map.generateCaption",
+ file: ":outputStudioFilePath",
+ text: ":context.studio.toJSON()",
+ },
+ },
  },
  };
  export const captions = async (context, callbacks) => {
  if (MulmoStudioContextMethods.getCaption(context)) {
  try {
  MulmoStudioContextMethods.setSessionState(context, "caption", true);
- const graph = new GraphAI(graph_data, { ...vanillaAgents });
+ const graph = new GraphAI(graph_data, { ...vanillaAgents, fileWriteAgent });
+ const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+ const fileName = MulmoStudioContextMethods.getFileName(context);
+ const outputStudioFilePath = getOutputStudioFilePath(outDirPath, fileName);
  graph.injectValue("context", context);
+ graph.injectValue("outputStudioFilePath", outputStudioFilePath);
  if (callbacks) {
  callbacks.forEach((callback) => {
  graph.registerCallback(callback);
@@ -1,13 +1,13 @@
  import type { CallbackFunction } from "graphai";
- import { MulmoStudioContext, MulmoBeat, Text2ImageAgentInfo } from "../types/index.js";
+ import { MulmoStudioContext, MulmoBeat } from "../types/index.js";
  export declare const imagePreprocessAgent: (namedInputs: {
  context: MulmoStudioContext;
  beat: MulmoBeat;
  index: number;
- imageAgentInfo: Text2ImageAgentInfo;
  imageRefs: Record<string, string>;
  }) => Promise<{
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -42,6 +42,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  images: string[];
  imageFromMovie: boolean;
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -68,6 +69,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  } | {
  images: string[];
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -89,6 +91,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  }> | undefined;
  };
  movieFile: string | undefined;
+ imageAgentInfo: import("../types/type.js").Text2ImageAgentInfo;
  imagePath: string;
  referenceImage: string;
  prompt: string;
@@ -10,7 +10,7 @@ import { fileCacheAgentFilter } from "../utils/filters.js";
  import { imageGoogleAgent, imageOpenaiAgent, movieGoogleAgent, movieReplicateAgent, mediaMockAgent } from "../agents/index.js";
  import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../methods/index.js";
  import { findImagePlugin } from "../utils/image_plugins/index.js";
- import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
+ import { userAssert, settings2GraphAIConfig, getExtention } from "../utils/utils.js";
  import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
  import { defaultOpenAIImageModel } from "../utils/const.js";
  import { renderHTMLToImage } from "../utils/markdown.js";
@@ -25,11 +25,12 @@ const htmlStyle = (context, beat) => {
  };
  };
  export const imagePreprocessAgent = async (namedInputs) => {
- const { context, beat, index, imageAgentInfo, imageRefs } = namedInputs;
- const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
+ const { context, beat, index, imageRefs } = namedInputs;
+ const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle, beat);
+ // const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
  const imagePath = getBeatPngImagePath(context, index);
  const returnValue = {
- imageParams,
+ imageParams: imageAgentInfo.imageParams,
  movieFile: beat.moviePrompt ? getBeatMoviePath(context, index) : undefined,
  };
  if (beat.image) {
@@ -54,8 +55,8 @@ export const imagePreprocessAgent = async (namedInputs) => {
  if (beat.moviePrompt && !beat.imagePrompt) {
  return { ...returnValue, imagePath, images, imageFromMovie: true }; // no image prompt, only movie prompt
  }
- const prompt = imagePrompt(beat, imageParams.style);
- return { imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
+ const prompt = imagePrompt(beat, imageAgentInfo.imageParams.style);
+ return { imageAgentInfo, imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
  };
  export const imagePluginAgent = async (namedInputs) => {
  const { context, beat, index } = namedInputs;
@@ -87,7 +88,6 @@ const beat_graph_data = {
  concurrency: 4,
  nodes: {
  context: {},
- imageAgentInfo: {},
  htmlImageAgentInfo: {},
  movieAgentInfo: {},
  imageRefs: {},
@@ -99,7 +99,6 @@ const beat_graph_data = {
  context: ":context",
  beat: ":beat",
  index: ":__mapIndex",
- imageAgentInfo: ":imageAgentInfo",
  imageRefs: ":imageRefs",
  },
  },
@@ -142,7 +141,7 @@ const beat_graph_data = {
  },
  imageGenerator: {
  if: ":preprocessor.prompt",
- agent: ":imageAgentInfo.agent",
+ agent: ":preprocessor.imageAgentInfo.agent",
  retry: 2,
  inputs: {
  prompt: ":preprocessor.prompt",
@@ -213,7 +212,6 @@ const graph_data = {
  concurrency: 4,
  nodes: {
  context: {},
- imageAgentInfo: {},
  htmlImageAgentInfo: {},
  movieAgentInfo: {},
  outputStudioFilePath: {},
@@ -223,7 +221,6 @@ const graph_data = {
  inputs: {
  rows: ":context.studio.script.beats",
  context: ":context",
- imageAgentInfo: ":imageAgentInfo",
  htmlImageAgentInfo: ":htmlImageAgentInfo",
  movieAgentInfo: ":movieAgentInfo",
  imageRefs: ":imageRefs",
@@ -306,10 +303,10 @@ const graphOption = async (context, settings) => {
  agentFilters,
  taskManager,
  };
- const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
  const config = settings2GraphAIConfig(settings);
  // We need to get google's auth token only if the google is the text2image provider.
- if (imageAgentInfo.provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
+ if (provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
  userAssert(!!process.env.GOOGLE_PROJECT_ID, "GOOGLE_PROJECT_ID is not set");
  GraphAILogger.log("google was specified as text2image engine");
  const token = await googleAuth();
@@ -342,23 +339,7 @@ export const getImageRefs = async (context) => {
  }
  const buffer = Buffer.from(await response.arrayBuffer());
  // Detect file extension from Content-Type header or URL
- const extension = (() => {
- const contentType = response.headers.get("content-type");
- if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
- return "jpg";
- }
- else if (contentType?.includes("png")) {
- return "png";
- }
- else {
- // Fall back to URL extension
- const urlExtension = image.source.url.split(".").pop()?.toLowerCase();
- if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
- return urlExtension === "jpeg" ? "jpg" : urlExtension;
- }
- return "png"; // default
- }
- })();
+ const extension = getExtention(response.headers.get("content-type"), image.source.url);
  const imagePath = getReferenceImagePath(context, key, extension);
  await fs.promises.writeFile(imagePath, buffer);
  imageRefs[key] = imagePath;
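The inline extension detection removed above now sits behind `getExtention` in `utils/utils.js`, whose implementation is not part of this diff. A sketch reconstructed from the deleted logic, shown only to indicate what the helper presumably does:

```
// Reconstruction for orientation only; the real helper lives in utils/utils.js.
const getExtention = (contentType, url) => {
  if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
    return "jpg";
  }
  if (contentType?.includes("png")) {
    return "png";
  }
  // Fall back to the URL extension
  const urlExtension = url.split(".").pop()?.toLowerCase();
  if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
    return urlExtension === "jpeg" ? "jpg" : urlExtension;
  }
  return "png"; // default
};
```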
@@ -372,7 +353,7 @@ const prepareGenerateImages = async (context) => {
  const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
  const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
  mkdir(imageProjectDirPath);
- const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
  const htmlImageAgentInfo = MulmoPresentationStyleMethods.getHtmlImageAgentInfo(context.presentationStyle);
  const imageRefs = await getImageRefs(context);
  // Determine movie agent based on provider
@@ -386,10 +367,9 @@ const prepareGenerateImages = async (context) => {
  return "movieGoogleAgent";
  }
  };
- GraphAILogger.info(`text2image: provider=${imageAgentInfo.provider} model=${imageAgentInfo.imageParams.model}`);
+ GraphAILogger.info(`text2image: provider=${provider} model=${context.presentationStyle.imageParams?.model}`);
  const injections = {
  context,
- imageAgentInfo,
  htmlImageAgentInfo,
  movieAgentInfo: {
  agent: getMovieAgent(),
@@ -404,7 +384,7 @@ const getConcurrency = (context) => {
  return 4;
  }
  const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
- if (imageAgentInfo.provider === "openai") {
+ if (imageAgentInfo.imageParams.provider === "openai") {
  // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
  // dall-e-3: 7,500 RPM、15 images per minute (4 images for max resolution)
  // gpt-image-1:3,000,000 TPM、150 images per minute
@@ -1,5 +1,5 @@
  import { MulmoStudioContext, MulmoCanvasDimension, BeatMediaType, MulmoFillOption } from "../types/index.js";
- export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption) => {
+ export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption, speed: number) => {
  videoId: string;
  videoPart: string;
  };
@@ -2,24 +2,32 @@ import { GraphAILogger, assert } from "graphai";
  import { mulmoTransitionSchema, mulmoFillOptionSchema } from "../types/index.js";
  import { MulmoPresentationStyleMethods } from "../methods/index.js";
  import { getAudioArtifactFilePath, getOutputVideoFilePath, writingMessage } from "../utils/file.js";
- import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput } from "../utils/ffmpeg_utils.js";
+ import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput, } from "../utils/ffmpeg_utils.js";
  import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  // const isMac = process.platform === "darwin";
  const videoCodec = "libx264"; // "h264_videotoolbox" (macOS only) is too noisy
- export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption) => {
+ export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption, speed) => {
  const videoId = `v${inputIndex}`;
  const videoFilters = [];
  // Handle different media types
+ const originalDuration = duration * speed;
  if (mediaType === "image") {
  videoFilters.push("loop=loop=-1:size=1:start=0");
  }
  else if (mediaType === "movie") {
  // For videos, extend with last frame if shorter than required duration
  // tpad will extend the video by cloning the last frame, then trim will ensure exact duration
- videoFilters.push(`tpad=stop_mode=clone:stop_duration=${duration * 2}`); // Use 2x duration to ensure coverage
+ videoFilters.push(`tpad=stop_mode=clone:stop_duration=${originalDuration * 2}`); // Use 2x duration to ensure coverage
  }
  // Common filters for all media types
- videoFilters.push(`trim=duration=${duration}`, "fps=30", "setpts=PTS-STARTPTS");
+ videoFilters.push(`trim=duration=${originalDuration}`, "fps=30");
+ // Apply speed if specified
+ if (speed !== 1.0) {
+ videoFilters.push(`setpts=${1 / speed}*PTS`);
+ }
+ else {
+ videoFilters.push("setpts=PTS-STARTPTS");
+ }
  // Apply scaling based on fill option
  if (fillOption.style === "aspectFill") {
  // For aspect fill: scale to fill the canvas completely, cropping if necessary
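A hand-worked example of what the new `speed` handling produces, with illustrative numbers: a movie beat that must occupy 4 seconds of output with `movieParams.speed` set to 2.0 gives `originalDuration = 8`, so 8 seconds of source are trimmed and then compressed into 4:

```
// duration = 4, speed = 2.0  =>  originalDuration = 8
// videoFilters before the fill-option scaling step:
//   tpad=stop_mode=clone:stop_duration=16   (2x originalDuration, to guarantee coverage)
//   trim=duration=8
//   fps=30
//   setpts=0.5*PTS                          (1 / speed: 8s of source play back in 4s)
```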
@@ -69,32 +77,94 @@ const getOutputOption = (audioId, videoId) => {
  "-b:a 128k", // Audio bitrate
  ];
  };
+ const addCaptions = (ffmpegContext, concatVideoId, context, caption) => {
+ const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+ if (caption && beatsWithCaptions.length > 0) {
+ const introPadding = context.presentationStyle.audioParams.introPadding;
+ return beatsWithCaptions.reduce((acc, beat, index) => {
+ const { startAt, duration, captionFile } = beat;
+ if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
+ const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
+ const compositeVideoId = `oc${index}`;
+ ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
+ return compositeVideoId;
+ }
+ return acc;
+ }, concatVideoId);
+ }
+ return concatVideoId;
+ };
+ const addTransitionEffects = (ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps) => {
+ if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
+ const transition = mulmoTransitionSchema.parse(context.presentationStyle.movieParams.transition);
+ return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
+ const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
+ const processedVideoId = `${transitionVideoId}_f`;
+ let transitionFilter;
+ if (transition.type === "fade") {
+ transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+ }
+ else if (transition.type === "slideout_left") {
+ transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+ }
+ else {
+ throw new Error(`Unknown transition type: ${transition.type}`);
+ }
+ ffmpegContext.filterComplex.push(transitionFilter);
+ const outputId = `${transitionVideoId}_o`;
+ if (transition.type === "fade") {
+ ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+ }
+ else if (transition.type === "slideout_left") {
+ ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+ }
+ return outputId;
+ }, captionedVideoId);
+ }
+ return captionedVideoId;
+ };
+ const mixAudiosFromMovieBeats = (ffmpegContext, artifactAudioId, audioIdsFromMovieBeats) => {
+ if (audioIdsFromMovieBeats.length > 0) {
+ const mainAudioId = "mainaudio";
+ const compositeAudioId = "composite";
+ const audioIds = audioIdsFromMovieBeats.map((id) => `[${id}]`).join("");
+ FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
+ ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${audioIdsFromMovieBeats.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
+ return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
+ }
+ return artifactAudioId;
+ };
  const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const caption = MulmoStudioContextMethods.getCaption(context);
  const start = performance.now();
  const ffmpegContext = FfmpegContextInit();
- const missingIndex = context.studio.beats.findIndex((beat) => !beat.imageFile && !beat.movieFile);
+ const missingIndex = context.studio.beats.findIndex((studioBeat, index) => {
+ const beat = context.studio.script.beats[index];
+ if (beat.image?.type === "voice_over") {
+ return false; // Voice-over does not have either imageFile or movieFile.
+ }
+ return !studioBeat.imageFile && !studioBeat.movieFile;
+ });
  if (missingIndex !== -1) {
  GraphAILogger.info(`ERROR: beat.imageFile or beat.movieFile is not set on beat ${missingIndex}.`);
  return false;
  }
  const canvasInfo = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
  // Add each image input
- const filterComplexVideoIds = [];
- const filterComplexAudioIds = [];
+ const videoIdsForBeats = [];
+ const audioIdsFromMovieBeats = [];
  const transitionVideoIds = [];
  const beatTimestamps = [];
  context.studio.beats.reduce((timestamp, studioBeat, index) => {
  const beat = context.studio.script.beats[index];
- const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
- if (!sourceFile) {
- throw new Error(`studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
- }
- if (!studioBeat.duration) {
- throw new Error(`studioBeat.duration is not set: index=${index}`);
+ if (beat.image?.type === "voice_over") {
+ videoIdsForBeats.push(undefined);
+ beatTimestamps.push(timestamp);
+ return timestamp; // Skip voice-over beats.
  }
- const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
- const mediaType = studioBeat.movieFile ? "movie" : MulmoPresentationStyleMethods.getImageType(context.presentationStyle, beat);
+ const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
+ assert(!!sourceFile, `studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
+ assert(!!studioBeat.duration, `studioBeat.duration is not set: index=${index}`);
  const extraPadding = (() => {
  // We need to consider only intro and outro padding because the other paddings were already added to the beat.duration
  if (index === 0) {
@@ -105,93 +175,56 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  }
  return 0;
  })();
- const duration = studioBeat.duration + extraPadding;
+ // The movie duration is bigger in case of voice-over.
+ const duration = Math.max(studioBeat.duration + extraPadding, studioBeat.movieDuration ?? 0);
  // Get fillOption from merged imageParams (global + beat-specific)
  const globalFillOption = context.presentationStyle.movieParams?.fillOption;
  const beatFillOption = beat.movieParams?.fillOption;
  const defaultFillOption = mulmoFillOptionSchema.parse({}); // let the schema infer the default value
  const fillOption = { ...defaultFillOption, ...globalFillOption, ...beatFillOption };
- const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption);
+ const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
+ const mediaType = studioBeat.movieFile ? "movie" : MulmoPresentationStyleMethods.getImageType(context.presentationStyle, beat);
+ const speed = beat.movieParams?.speed ?? 1.0;
+ const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption, speed);
  ffmpegContext.filterComplex.push(videoPart);
- if (caption && studioBeat.captionFile) {
- const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
- const compositeVideoId = `c${index}`;
- ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
- filterComplexVideoIds.push(compositeVideoId);
- }
- else {
- filterComplexVideoIds.push(videoId);
- }
  if (context.presentationStyle.movieParams?.transition && index < context.studio.beats.length - 1) {
- const sourceId = filterComplexVideoIds.pop();
- ffmpegContext.filterComplex.push(`[${sourceId}]split=2[${sourceId}_0][${sourceId}_1]`);
- filterComplexVideoIds.push(`${sourceId}_0`);
+ // NOTE: We split the video into two parts for transition.
+ ffmpegContext.filterComplex.push(`[${videoId}]split=2[${videoId}_0][${videoId}_1]`);
+ videoIdsForBeats.push(`${videoId}_0`);
  if (mediaType === "movie") {
  // For movie beats, extract the last frame for transition
- ffmpegContext.filterComplex.push(`[${sourceId}_1]reverse,select='eq(n,0)',reverse,tpad=stop_mode=clone:stop_duration=${duration},fps=30,setpts=PTS-STARTPTS[${sourceId}_2]`);
- transitionVideoIds.push(`${sourceId}_2`);
+ ffmpegContext.filterComplex.push(`[${videoId}_1]reverse,select='eq(n,0)',reverse,tpad=stop_mode=clone:stop_duration=${duration},fps=30,setpts=PTS-STARTPTS[${videoId}_2]`);
+ transitionVideoIds.push(`${videoId}_2`);
  }
  else {
- transitionVideoIds.push(`${sourceId}_1`);
+ transitionVideoIds.push(`${videoId}_1`);
  }
  }
- if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0) {
+ else {
+ videoIdsForBeats.push(videoId);
+ }
+ // NOTE: We don't support audio if the speed is not 1.0.
+ if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0 && speed === 1.0) {
  const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
- filterComplexAudioIds.push(audioId);
+ audioIdsFromMovieBeats.push(audioId);
  ffmpegContext.filterComplex.push(audioPart);
  }
  beatTimestamps.push(timestamp);
  return timestamp + duration;
  }, 0);
- assert(filterComplexVideoIds.length === context.studio.beats.length, "videoIds.length !== studio.beats.length");
+ assert(videoIdsForBeats.length === context.studio.beats.length, "videoIds.length !== studio.beats.length");
  assert(beatTimestamps.length === context.studio.beats.length, "beatTimestamps.length !== studio.beats.length");
  // console.log("*** images", images.audioIds);
  // Concatenate the trimmed images
  const concatVideoId = "concat_video";
- ffmpegContext.filterComplex.push(`${filterComplexVideoIds.map((id) => `[${id}]`).join("")}concat=n=${context.studio.beats.length}:v=1:a=0[${concatVideoId}]`);
- // Add tranditions if needed
- const mixedVideoId = (() => {
- if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
- const transition = mulmoTransitionSchema.parse(context.presentationStyle.movieParams.transition);
- return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
- const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
- const processedVideoId = `${transitionVideoId}_f`;
- let transitionFilter;
- if (transition.type === "fade") {
- transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
- }
- else if (transition.type === "slideout_left") {
- transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
- }
- else {
- throw new Error(`Unknown transition type: ${transition.type}`);
- }
- ffmpegContext.filterComplex.push(transitionFilter);
- const outputId = `${transitionVideoId}_o`;
- if (transition.type === "fade") {
- ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
- }
- else if (transition.type === "slideout_left") {
- ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
- }
- return outputId;
- }, concatVideoId);
- }
- return concatVideoId;
- })();
+ const videoIds = videoIdsForBeats.filter((id) => id !== undefined); // filter out voice-over beats
+ ffmpegContext.filterComplex.push(`${videoIds.map((id) => `[${id}]`).join("")}concat=n=${videoIds.length}:v=1:a=0[${concatVideoId}]`);
+ const captionedVideoId = addCaptions(ffmpegContext, concatVideoId, context, caption);
+ const mixedVideoId = addTransitionEffects(ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps);
+ GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
  const audioIndex = FfmpegContextAddInput(ffmpegContext, audioArtifactFilePath); // Add audio input
  const artifactAudioId = `${audioIndex}:a`;
- const ffmpegContextAudioId = (() => {
- if (filterComplexAudioIds.length > 0) {
- const mainAudioId = "mainaudio";
- const compositeAudioId = "composite";
- const audioIds = filterComplexAudioIds.map((id) => `[${id}]`).join("");
- FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
- ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${filterComplexAudioIds.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
- return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
- }
- return artifactAudioId;
- })();
+ const ffmpegContextAudioId = mixAudiosFromMovieBeats(ffmpegContext, artifactAudioId, audioIdsFromMovieBeats);
  // GraphAILogger.debug("filterComplex", ffmpegContext.filterComplex);
  await FfmpegContextGenerateOutput(ffmpegContext, outputVideoPath, getOutputOption(ffmpegContextAudioId, mixedVideoId));
  const end = performance.now();
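To make the caption timing in the new `addCaptions` helper concrete, with illustrative numbers: for an `introPadding` of 1 and a beat whose `startAt` is 10 and `duration` is 5, the overlay for that beat is enabled from t=11 to t=16 on the concatenated video (the caption's ffmpeg input index depends on how many inputs were added before it):

```
// [concat_video][<captionInputIndex>:v]overlay=format=auto:enable='between(t,11,16)'[oc0]
```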
@@ -1,8 +1,15 @@
+ import fs from "fs";
  import { GraphAILogger } from "graphai";
  import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextGenerateOutput, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
  const addBGMAgent = async ({ namedInputs, params, }) => {
  const { voiceFile, outputFile, context } = namedInputs;
  const { musicFile } = params;
+ if (!fs.existsSync(voiceFile)) {
+ throw new Error(`AddBGMAgent voiceFile not exist: ${voiceFile}`);
+ }
+ if (!musicFile.match(/^http/) && !fs.existsSync(musicFile)) {
+ throw new Error(`AddBGMAgent musicFile not exist: ${musicFile}`);
+ }
  const speechDuration = await ffmpegGetMediaDuration(voiceFile);
  const introPadding = context.presentationStyle.audioParams.introPadding;
  const outroPadding = context.presentationStyle.audioParams.outroPadding;
@@ -16,8 +23,14 @@ const addBGMAgent = async ({ namedInputs, params, }) => {
  ffmpegContext.filterComplex.push(`[music][voice]amix=inputs=2:duration=longest[mixed]`);
  ffmpegContext.filterComplex.push(`[mixed]atrim=start=0:end=${totalDuration}[trimmed]`);
  ffmpegContext.filterComplex.push(`[trimmed]afade=t=out:st=${totalDuration - outroPadding}:d=${outroPadding}[faded]`);
- await FfmpegContextGenerateOutput(ffmpegContext, outputFile, ["-map", "[faded]"]);
- return outputFile;
+ try {
+ await FfmpegContextGenerateOutput(ffmpegContext, outputFile, ["-map", "[faded]"]);
+ return outputFile;
+ }
+ catch (e) {
+ GraphAILogger.log(e);
+ throw new Error(`AddBGMAgent ffmpeg run Error`);
+ }
  };
  const addBGMAgentInfo = {
  name: "addBGMAgent",