mulmocast 0.1.0 → 0.1.1

This diff shows the publicly available content of the two package versions as published to their public registry. It is provided for informational purposes only.
package/README.md CHANGED
@@ -82,6 +82,16 @@ brew install ffmpeg
  # Visit https://ffmpeg.org/download.html
  ```

+ You can also use the [`Dockerfile`](./Dockerfile), which helps you install the prerequisites.
+ ```
+ docker build -t mulmo-cli .
+ ```
+
+ You can use the Docker image like this:
+ ```
+ docker run -e OPENAI_API_KEY=<your_openai_api_key> -it mulmo-cli mulmo tool scripting -i -t children_book -o ./ -s story
+ ```
+

  ## Configuration

  Create a `.env` file in your project directory with the following API keys:
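A note on the Docker example above: `-o ./` resolves inside the container, so the generated files stay in the container's filesystem unless a host directory is mounted. A hedged variant is shown below; the `/work` mount point is an arbitrary assumption, not something this diff documents.

```
# Mount the current host directory so the generated output lands on the host;
# /work is an arbitrary, undocumented choice of mount point.
docker run -e OPENAI_API_KEY=<your_openai_api_key> -v "$(pwd)":/work -w /work -it mulmo-cli mulmo tool scripting -i -t children_book -o ./ -s story
```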
@@ -1,5 +1,5 @@
  {
- "title": "Ghibli comic style",
+ "title": "Ghibli style for YouTube Shorts",
  "description": "Template for Ghibli-style comic presentation.",
  "systemPrompt": "Generate a Japanese script for a Youtube shorts of the given topic. Another AI will generate comic strips for each beat based on the text description of that beat. Mention the reference in one of beats, if it exists. Use the JSON below as a template.",
  "presentationStyle": {
@@ -10,7 +10,7 @@ import { fileCacheAgentFilter } from "../utils/filters.js";
  import { imageGoogleAgent, imageOpenaiAgent, movieGoogleAgent, movieReplicateAgent, mediaMockAgent } from "../agents/index.js";
  import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../methods/index.js";
  import { findImagePlugin } from "../utils/image_plugins/index.js";
- import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
+ import { userAssert, settings2GraphAIConfig, getExtention } from "../utils/utils.js";
  import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
  import { defaultOpenAIImageModel } from "../utils/const.js";
  import { renderHTMLToImage } from "../utils/markdown.js";
@@ -339,23 +339,7 @@ export const getImageRefs = async (context) => {
  }
  const buffer = Buffer.from(await response.arrayBuffer());
  // Detect file extension from Content-Type header or URL
- const extension = (() => {
- const contentType = response.headers.get("content-type");
- if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
- return "jpg";
- }
- else if (contentType?.includes("png")) {
- return "png";
- }
- else {
- // Fall back to URL extension
- const urlExtension = image.source.url.split(".").pop()?.toLowerCase();
- if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
- return urlExtension === "jpeg" ? "jpg" : urlExtension;
- }
- return "png"; // default
- }
- })();
+ const extension = getExtention(response.headers.get("content-type"), image.source.url);
  const imagePath = getReferenceImagePath(context, key, extension);
  await fs.promises.writeFile(imagePath, buffer);
  imageRefs[key] = imagePath;
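The `getExtention` helper referenced above (identifier spelled as imported) is not shown in this diff; a minimal sketch of what it presumably looks like, reconstructed from the inline logic removed in this hunk, is below. The file location and exact signature are assumptions.

```typescript
// Assumed reconstruction of getExtention, based on the removed inline logic:
// prefer the Content-Type header, fall back to the URL extension, default to "png".
export const getExtention = (contentType: string | null, url: string): string => {
  if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
    return "jpg";
  }
  if (contentType?.includes("png")) {
    return "png";
  }
  // Fall back to the URL extension
  const urlExtension = url.split(".").pop()?.toLowerCase();
  if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
    return urlExtension === "jpeg" ? "jpg" : urlExtension;
  }
  return "png"; // default when nothing matches
};
```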
@@ -2,7 +2,7 @@ import { GraphAILogger, assert } from "graphai";
  import { mulmoTransitionSchema, mulmoFillOptionSchema } from "../types/index.js";
  import { MulmoPresentationStyleMethods } from "../methods/index.js";
  import { getAudioArtifactFilePath, getOutputVideoFilePath, writingMessage } from "../utils/file.js";
- import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput } from "../utils/ffmpeg_utils.js";
+ import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput, } from "../utils/ffmpeg_utils.js";
  import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  // const isMac = process.platform === "darwin";
  const videoCodec = "libx264"; // "h264_videotoolbox" (macOS only) is too noisy
@@ -77,6 +77,63 @@ const getOutputOption = (audioId, videoId) => {
  "-b:a 128k", // Audio bitrate
  ];
  };
+ const addCaptions = (ffmpegContext, concatVideoId, context, caption) => {
+ const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+ if (caption && beatsWithCaptions.length > 0) {
+ const introPadding = context.presentationStyle.audioParams.introPadding;
+ return beatsWithCaptions.reduce((acc, beat, index) => {
+ const { startAt, duration, captionFile } = beat;
+ if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
+ const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
+ const compositeVideoId = `oc${index}`;
+ ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
+ return compositeVideoId;
+ }
+ return acc;
+ }, concatVideoId);
+ }
+ return concatVideoId;
+ };
+ const addTransitionEffects = (ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps) => {
+ if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
+ const transition = mulmoTransitionSchema.parse(context.presentationStyle.movieParams.transition);
+ return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
+ const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
+ const processedVideoId = `${transitionVideoId}_f`;
+ let transitionFilter;
+ if (transition.type === "fade") {
+ transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+ }
+ else if (transition.type === "slideout_left") {
+ transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+ }
+ else {
+ throw new Error(`Unknown transition type: ${transition.type}`);
+ }
+ ffmpegContext.filterComplex.push(transitionFilter);
+ const outputId = `${transitionVideoId}_o`;
+ if (transition.type === "fade") {
+ ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+ }
+ else if (transition.type === "slideout_left") {
+ ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+ }
+ return outputId;
+ }, captionedVideoId);
+ }
+ return captionedVideoId;
+ };
+ const mixAudiosFromMovieBeats = (ffmpegContext, artifactAudioId, audioIdsFromMovieBeats) => {
+ if (audioIdsFromMovieBeats.length > 0) {
+ const mainAudioId = "mainaudio";
+ const compositeAudioId = "composite";
+ const audioIds = audioIdsFromMovieBeats.map((id) => `[${id}]`).join("");
+ FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
+ ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${audioIdsFromMovieBeats.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
+ return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
+ }
+ return artifactAudioId;
+ };
  const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const caption = MulmoStudioContextMethods.getCaption(context);
  const start = performance.now();
@@ -94,26 +151,20 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  }
  const canvasInfo = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
  // Add each image input
- const filterComplexVideoIds = [];
- const filterComplexAudioIds = [];
+ const videoIdsForBeats = [];
+ const audioIdsFromMovieBeats = [];
  const transitionVideoIds = [];
  const beatTimestamps = [];
  context.studio.beats.reduce((timestamp, studioBeat, index) => {
  const beat = context.studio.script.beats[index];
  if (beat.image?.type === "voice_over") {
- filterComplexVideoIds.push(undefined);
+ videoIdsForBeats.push(undefined);
  beatTimestamps.push(timestamp);
  return timestamp; // Skip voice-over beats.
  }
  const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
- if (!sourceFile) {
- throw new Error(`studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
- }
- if (!studioBeat.duration) {
- throw new Error(`studioBeat.duration is not set: index=${index}`);
- }
- const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
- const mediaType = studioBeat.movieFile ? "movie" : MulmoPresentationStyleMethods.getImageType(context.presentationStyle, beat);
+ assert(!!sourceFile, `studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
+ assert(!!studioBeat.duration, `studioBeat.duration is not set: index=${index}`);
  const extraPadding = (() => {
  // We need to consider only intro and outro padding because the other paddings were already added to the beat.duration
  if (index === 0) {
@@ -131,111 +182,49 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const beatFillOption = beat.movieParams?.fillOption;
  const defaultFillOption = mulmoFillOptionSchema.parse({}); // let the schema infer the default value
  const fillOption = { ...defaultFillOption, ...globalFillOption, ...beatFillOption };
+ const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
+ const mediaType = studioBeat.movieFile ? "movie" : MulmoPresentationStyleMethods.getImageType(context.presentationStyle, beat);
  const speed = beat.movieParams?.speed ?? 1.0;
  const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption, speed);
  ffmpegContext.filterComplex.push(videoPart);
- /*
- if (caption && studioBeat.captionFile) {
- // NOTE: This works for normal beats, but not for voice-over beats.
- const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
- const compositeVideoId = `c${index}`;
- ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
- filterComplexVideoIds.push(compositeVideoId);
- } else {
- }
- */
- filterComplexVideoIds.push(videoId);
  if (context.presentationStyle.movieParams?.transition && index < context.studio.beats.length - 1) {
- const sourceId = filterComplexVideoIds.pop();
- ffmpegContext.filterComplex.push(`[${sourceId}]split=2[${sourceId}_0][${sourceId}_1]`);
- filterComplexVideoIds.push(`${sourceId}_0`);
+ // NOTE: We split the video into two parts for transition.
+ ffmpegContext.filterComplex.push(`[${videoId}]split=2[${videoId}_0][${videoId}_1]`);
+ videoIdsForBeats.push(`${videoId}_0`);
  if (mediaType === "movie") {
  // For movie beats, extract the last frame for transition
- ffmpegContext.filterComplex.push(`[${sourceId}_1]reverse,select='eq(n,0)',reverse,tpad=stop_mode=clone:stop_duration=${duration},fps=30,setpts=PTS-STARTPTS[${sourceId}_2]`);
- transitionVideoIds.push(`${sourceId}_2`);
+ ffmpegContext.filterComplex.push(`[${videoId}_1]reverse,select='eq(n,0)',reverse,tpad=stop_mode=clone:stop_duration=${duration},fps=30,setpts=PTS-STARTPTS[${videoId}_2]`);
+ transitionVideoIds.push(`${videoId}_2`);
  }
  else {
- transitionVideoIds.push(`${sourceId}_1`);
+ transitionVideoIds.push(`${videoId}_1`);
  }
  }
+ else {
+ videoIdsForBeats.push(videoId);
+ }
  // NOTE: We don't support audio if the speed is not 1.0.
  if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0 && speed === 1.0) {
  const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
- filterComplexAudioIds.push(audioId);
+ audioIdsFromMovieBeats.push(audioId);
  ffmpegContext.filterComplex.push(audioPart);
  }
  beatTimestamps.push(timestamp);
  return timestamp + duration;
  }, 0);
- assert(filterComplexVideoIds.length === context.studio.beats.length, "videoIds.length !== studio.beats.length");
+ assert(videoIdsForBeats.length === context.studio.beats.length, "videoIds.length !== studio.beats.length");
  assert(beatTimestamps.length === context.studio.beats.length, "beatTimestamps.length !== studio.beats.length");
  // console.log("*** images", images.audioIds);
  // Concatenate the trimmed images
  const concatVideoId = "concat_video";
- const videoIds = filterComplexVideoIds.filter((id) => id !== undefined); // filter out voice-over beats
+ const videoIds = videoIdsForBeats.filter((id) => id !== undefined); // filter out voice-over beats
  ffmpegContext.filterComplex.push(`${videoIds.map((id) => `[${id}]`).join("")}concat=n=${videoIds.length}:v=1:a=0[${concatVideoId}]`);
- // Overlay voice-over captions
- const captionedVideoId = (() => {
- const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
- if (caption && beatsWithCaptions.length > 0) {
- const introPadding = context.presentationStyle.audioParams.introPadding;
- return beatsWithCaptions.reduce((acc, beat, index) => {
- const { startAt, duration, captionFile } = beat;
- if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
- const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
- const compositeVideoId = `oc${index}`;
- ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
- return compositeVideoId;
- }
- return acc;
- }, concatVideoId);
- }
- return concatVideoId;
- })();
- // Add tranditions if needed
- const mixedVideoId = (() => {
- if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
- const transition = mulmoTransitionSchema.parse(context.presentationStyle.movieParams.transition);
- return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
- const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
- const processedVideoId = `${transitionVideoId}_f`;
- let transitionFilter;
- if (transition.type === "fade") {
- transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
- }
- else if (transition.type === "slideout_left") {
- transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
- }
- else {
- throw new Error(`Unknown transition type: ${transition.type}`);
- }
- ffmpegContext.filterComplex.push(transitionFilter);
- const outputId = `${transitionVideoId}_o`;
- if (transition.type === "fade") {
- ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
- }
- else if (transition.type === "slideout_left") {
- ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
- }
- return outputId;
- }, captionedVideoId);
- }
- return captionedVideoId;
- })();
+ const captionedVideoId = addCaptions(ffmpegContext, concatVideoId, context, caption);
+ const mixedVideoId = addTransitionEffects(ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps);
  GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
  const audioIndex = FfmpegContextAddInput(ffmpegContext, audioArtifactFilePath); // Add audio input
  const artifactAudioId = `${audioIndex}:a`;
- const ffmpegContextAudioId = (() => {
- if (filterComplexAudioIds.length > 0) {
- const mainAudioId = "mainaudio";
- const compositeAudioId = "composite";
- const audioIds = filterComplexAudioIds.map((id) => `[${id}]`).join("");
- FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
- ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${filterComplexAudioIds.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
- return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
- }
- return artifactAudioId;
- })();
+ const ffmpegContextAudioId = mixAudiosFromMovieBeats(ffmpegContext, artifactAudioId, audioIdsFromMovieBeats);
  // GraphAILogger.debug("filterComplex", ffmpegContext.filterComplex);
  await FfmpegContextGenerateOutput(ffmpegContext, outputVideoPath, getOutputOption(ffmpegContextAudioId, mixedVideoId));
  const end = performance.now();
@@ -1,8 +1,15 @@
+ import fs from "fs";
  import { GraphAILogger } from "graphai";
  import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextGenerateOutput, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
  const addBGMAgent = async ({ namedInputs, params, }) => {
  const { voiceFile, outputFile, context } = namedInputs;
  const { musicFile } = params;
+ if (!fs.existsSync(voiceFile)) {
+ throw new Error(`AddBGMAgent voiceFile not exist: ${voiceFile}`);
+ }
+ if (!musicFile.match(/^http/) && !fs.existsSync(musicFile)) {
+ throw new Error(`AddBGMAgent musicFile not exist: ${musicFile}`);
+ }
  const speechDuration = await ffmpegGetMediaDuration(voiceFile);
  const introPadding = context.presentationStyle.audioParams.introPadding;
  const outroPadding = context.presentationStyle.audioParams.outroPadding;
@@ -16,8 +23,14 @@ const addBGMAgent = async ({ namedInputs, params, }) => {
  ffmpegContext.filterComplex.push(`[music][voice]amix=inputs=2:duration=longest[mixed]`);
  ffmpegContext.filterComplex.push(`[mixed]atrim=start=0:end=${totalDuration}[trimmed]`);
  ffmpegContext.filterComplex.push(`[trimmed]afade=t=out:st=${totalDuration - outroPadding}:d=${outroPadding}[faded]`);
- await FfmpegContextGenerateOutput(ffmpegContext, outputFile, ["-map", "[faded]"]);
- return outputFile;
+ try {
+ await FfmpegContextGenerateOutput(ffmpegContext, outputFile, ["-map", "[faded]"]);
+ return outputFile;
+ }
+ catch (e) {
+ GraphAILogger.log(e);
+ throw new Error(`AddBGMAgent ffmpeg run Error`);
+ }
  };
  const addBGMAgentInfo = {
  name: "addBGMAgent",
@@ -82,7 +82,7 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  if (group.length > 1) {
  group.reduce((remaining, idx, iGroup) => {
  const subBeatDurations = mediaDurations[idx];
- userAssert(subBeatDurations.audioDuration <= remaining, `subBeatDurations.audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
+ userAssert(subBeatDurations.audioDuration <= remaining, `Duration Overflow: At index(${idx}) audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
  if (iGroup === group.length - 1) {
  beatDurations.push(remaining);
  subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
@@ -94,10 +94,10 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  if (voiceStartAt) {
  const remainingDuration = movieDuration - voiceStartAt;
  const duration = remaining - remainingDuration;
- userAssert(duration >= 0, `duration(${duration}) < 0`);
+ userAssert(duration >= 0, `Invalid startAt: At index(${idx}), avaiable duration(${duration}) < 0`);
  beatDurations.push(duration);
  subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
- userAssert(subBeatDurations.silenceDuration >= 0, `subBeatDurations.silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
+ userAssert(subBeatDurations.silenceDuration >= 0, `Duration Overwrap: At index(${idx}), silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
  return remainingDuration;
  }
  beatDurations.push(subBeatDurations.audioDuration);