mulmocast 0.0.28 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -8
- package/assets/templates/ghibli_shorts.json +2 -2
- package/assets/templates/sensei_and_taro.json +1 -1
- package/lib/actions/captions.js +16 -2
- package/lib/actions/images.d.ts +5 -2
- package/lib/actions/images.js +14 -34
- package/lib/actions/movie.d.ts +1 -1
- package/lib/actions/movie.js +110 -77
- package/lib/agents/add_bgm_agent.js +15 -2
- package/lib/agents/combine_audio_files_agent.js +53 -5
- package/lib/agents/tts_openai_agent.js +2 -1
- package/lib/cli/commands/tool/scripting/handler.js +1 -0
- package/lib/methods/mulmo_presentation_style.d.ts +3 -2
- package/lib/methods/mulmo_presentation_style.js +8 -4
- package/lib/types/schema.d.ts +309 -115
- package/lib/types/schema.js +48 -11
- package/lib/types/type.d.ts +5 -2
- package/lib/utils/context.d.ts +13 -4
- package/lib/utils/file.js +8 -0
- package/lib/utils/image_plugins/index.d.ts +2 -1
- package/lib/utils/image_plugins/index.js +2 -1
- package/lib/utils/image_plugins/voice_over.d.ts +5 -0
- package/lib/utils/image_plugins/voice_over.js +9 -0
- package/lib/utils/preprocess.d.ts +12 -3
- package/lib/utils/utils.d.ts +1 -0
- package/lib/utils/utils.js +14 -0
- package/package.json +12 -12
- package/scripts/templates/voice_over.json +60 -0
package/README.md
CHANGED
@@ -82,6 +82,16 @@ brew install ffmpeg
 # Visit https://ffmpeg.org/download.html
 ```
 
+You can also use [`Dockerfile`](./Dockerfile) which helps you install the pre-requisits.
+```
+docker build -t mulmo-cli .
+```
+
+You can use the Docker image like this:
+```
+docker run -e OPENAI_API_KEY=<your_openai_api_key> -it mulmo-cli mulmo tool scripting -i -t children_book -o ./ -s story
+```
+
 ## Configuration
 
 Create a `.env` file in your project directory with the following API keys:
@@ -200,11 +210,18 @@ writing: /Users/username/path/to/output/story-1747834931950__ja.mp4
 # Generate script from web content (requires Browserless API KEY)
 mulmo tool scripting -u https://example.com
 
+# Generate script from local file
+mulmo tool scripting --input-file story.txt
+
 # Generate script with interactive mode
 mulmo tool scripting -i
 ```
 
-
+Note:
+- When using the `sensei_and_taro` template, a Nijivoice API key is required
+- When -i is specified, --input-file value will be ignored
+- When --input-file is specified, -u value will be ignored
+
 
 ## Generate content from MulmoScript
 
@@ -308,7 +325,6 @@ Options:
 -b, --basedir base dir [string]
 -l, --lang target language [string] [choices: "en", "ja"]
 -f, --force Force regenerate [boolean] [default: false]
-    --dryRun Dry run [boolean] [default: false]
 -p, --presentationStyle Presentation Style [string]
 -a, --audiodir Audio output directory [string]
 ```
@@ -329,7 +345,6 @@ Options:
 -b, --basedir base dir [string]
 -l, --lang target language [string] [choices: "en", "ja"]
 -f, --force Force regenerate [boolean] [default: false]
-    --dryRun Dry run [boolean] [default: false]
 -p, --presentationStyle Presentation Style [string]
 -i, --imagedir Image output directory [string]
 ```
@@ -350,7 +365,6 @@ Options:
 -b, --basedir base dir [string]
 -l, --lang target language [string] [choices: "en", "ja"]
 -f, --force Force regenerate [boolean] [default: false]
-    --dryRun Dry run [boolean] [default: false]
 -p, --presentationStyle Presentation Style [string]
 -a, --audiodir Audio output directory [string]
 -i, --imagedir Image output directory [string]
@@ -411,16 +425,19 @@ Options:
 -b, --basedir base dir [string]
 -u, --url URLs to reference (required when not in interactive mode)
         [array] [default: []]
+    --input-file input file name [string]
 -i, --interactive Generate script in interactive mode with user prompts
         [boolean]
 -t, --template Template name to use
-
-
+        [string] [choices: "akira_comic", "business", "children_book", "coding",
+        "comic_strips", "drslump_comic", "ghibli_comic", "ghibli_image_only",
+        "ghibli_shorts", "ghost_comic", "onepiece_comic", "podcast_standard",
+        "portrait_movie", "realistic_movie", "sensei_and_taro", "shorts",
+        "text_and_image", "text_only", "trailer"]
 -c, --cache cache dir [string]
 -s, --script script filename [string] [default: "script"]
 --llm llm
-
-        "groqAgent"]
+        [string] [choices: "openai", "anthropic", "gemini", "groq"]
 --llm_model llm model [string]
 ```
package/assets/templates/ghibli_shorts.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "title": "Ghibli
+  "title": "Ghibli style for YouTube Shorts",
   "description": "Template for Ghibli-style comic presentation.",
   "systemPrompt": "Generate a Japanese script for a Youtube shorts of the given topic. Another AI will generate comic strips for each beat based on the text description of that beat. Mention the reference in one of beats, if it exists. Use the JSON below as a template.",
   "presentationStyle": {
@@ -14,7 +14,7 @@
     "speechParams": {
       "provider": "nijivoice",
       "speakers": {
-        "Presenter": { "voiceId": "
+        "Presenter": { "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c", "speechOptions": { "speed": 1.5 } }
       }
     },
     "imageParams": {
package/assets/templates/sensei_and_taro.json
CHANGED
@@ -17,7 +17,7 @@
     "speechParams": {
       "provider": "nijivoice",
       "speakers": {
-        "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "
+        "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c" },
         "Student": { "displayName": { "ja": "太郎" }, "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f" },
         "Teacher": { "displayName": { "ja": "先生" }, "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae" }
       }
package/lib/actions/captions.js
CHANGED
@@ -1,14 +1,16 @@
 import { mulmoCaptionParamsSchema } from "../types/index.js";
 import { GraphAI, GraphAILogger } from "graphai";
 import * as agents from "@graphai/vanilla";
-import { getHTMLFile, getCaptionImagePath } from "../utils/file.js";
+import { getHTMLFile, getCaptionImagePath, getOutputStudioFilePath } from "../utils/file.js";
 import { renderHTMLToImage, interpolate } from "../utils/markdown.js";
 import { MulmoStudioContextMethods, MulmoPresentationStyleMethods } from "../methods/index.js";
+import { fileWriteAgent } from "@graphai/vanilla_node_agents";
 const vanillaAgents = agents.default ?? agents;
 const graph_data = {
     version: 0.5,
     nodes: {
         context: {},
+        outputStudioFilePath: {},
         map: {
             agent: "mapAgent",
             inputs: { rows: ":context.studio.script.beats", context: ":context" },
@@ -60,14 +62,26 @@ const graph_data = {
             },
         },
         },
+        fileWrite: {
+            agent: "fileWriteAgent",
+            inputs: {
+                onComplete: ":map.generateCaption",
+                file: ":outputStudioFilePath",
+                text: ":context.studio.toJSON()",
+            },
+        },
     },
 };
 export const captions = async (context, callbacks) => {
     if (MulmoStudioContextMethods.getCaption(context)) {
         try {
             MulmoStudioContextMethods.setSessionState(context, "caption", true);
-            const graph = new GraphAI(graph_data, { ...vanillaAgents });
+            const graph = new GraphAI(graph_data, { ...vanillaAgents, fileWriteAgent });
+            const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+            const fileName = MulmoStudioContextMethods.getFileName(context);
+            const outputStudioFilePath = getOutputStudioFilePath(outDirPath, fileName);
             graph.injectValue("context", context);
+            graph.injectValue("outputStudioFilePath", outputStudioFilePath);
             if (callbacks) {
                 callbacks.forEach((callback) => {
                     graph.registerCallback(callback);
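The notable change above is the new `fileWrite` node: once every beat's caption has been rendered, the updated studio state is persisted to disk. A minimal sketch (hypothetical file path and stand-in data, not the package's code) of the same dependency pattern:

```
import { GraphAI } from "graphai";
import * as agents from "@graphai/vanilla";
import { fileWriteAgent } from "@graphai/vanilla_node_agents";

const vanillaAgents = agents.default ?? agents;

// "onComplete" is an ordinary data dependency: fileWrite cannot run until the
// captions node has produced a result, so the write happens strictly after it.
const graph_data = {
  version: 0.5,
  nodes: {
    captions: { value: { beats: [] } }, // stands in for :map.generateCaption
    outputStudioFilePath: {}, // provided via injectValue below
    fileWrite: {
      agent: "fileWriteAgent",
      inputs: {
        onComplete: ":captions",
        file: ":outputStudioFilePath",
        text: "{}", // the real code serializes the studio via toJSON()
      },
    },
  },
};

const main = async () => {
  const graph = new GraphAI(graph_data, { ...vanillaAgents, fileWriteAgent });
  graph.injectValue("outputStudioFilePath", "./output/studio.json");
  await graph.run(); // writes ./output/studio.json after captions resolves
};
main();
```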
package/lib/actions/images.d.ts
CHANGED
@@ -1,13 +1,13 @@
 import type { CallbackFunction } from "graphai";
-import { MulmoStudioContext, MulmoBeat
+import { MulmoStudioContext, MulmoBeat } from "../types/index.js";
 export declare const imagePreprocessAgent: (namedInputs: {
     context: MulmoStudioContext;
     beat: MulmoBeat;
     index: number;
-    imageAgentInfo: Text2ImageAgentInfo;
     imageRefs: Record<string, string>;
 }) => Promise<{
     imageParams: {
+        provider: "openai" | "google";
         style?: string | undefined;
         model?: string | undefined;
         moderation?: string | undefined;
@@ -42,6 +42,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
     images: string[];
     imageFromMovie: boolean;
     imageParams: {
+        provider: "openai" | "google";
         style?: string | undefined;
         model?: string | undefined;
         moderation?: string | undefined;
@@ -68,6 +69,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
 } | {
     images: string[];
     imageParams: {
+        provider: "openai" | "google";
         style?: string | undefined;
         model?: string | undefined;
         moderation?: string | undefined;
@@ -89,6 +91,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
     }> | undefined;
     };
     movieFile: string | undefined;
+    imageAgentInfo: import("../types/type.js").Text2ImageAgentInfo;
     imagePath: string;
     referenceImage: string;
     prompt: string;
package/lib/actions/images.js
CHANGED
@@ -10,7 +10,7 @@ import { fileCacheAgentFilter } from "../utils/filters.js";
 import { imageGoogleAgent, imageOpenaiAgent, movieGoogleAgent, movieReplicateAgent, mediaMockAgent } from "../agents/index.js";
 import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../methods/index.js";
 import { findImagePlugin } from "../utils/image_plugins/index.js";
-import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
+import { userAssert, settings2GraphAIConfig, getExtention } from "../utils/utils.js";
 import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
 import { defaultOpenAIImageModel } from "../utils/const.js";
 import { renderHTMLToImage } from "../utils/markdown.js";
@@ -25,11 +25,12 @@ const htmlStyle = (context, beat) => {
     };
 };
 export const imagePreprocessAgent = async (namedInputs) => {
-    const { context, beat, index,
-    const
+    const { context, beat, index, imageRefs } = namedInputs;
+    const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle, beat);
+    // const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
     const imagePath = getBeatPngImagePath(context, index);
     const returnValue = {
-        imageParams,
+        imageParams: imageAgentInfo.imageParams,
         movieFile: beat.moviePrompt ? getBeatMoviePath(context, index) : undefined,
     };
     if (beat.image) {
@@ -54,8 +55,8 @@ export const imagePreprocessAgent = async (namedInputs) => {
     if (beat.moviePrompt && !beat.imagePrompt) {
         return { ...returnValue, imagePath, images, imageFromMovie: true }; // no image prompt, only movie prompt
     }
-    const prompt = imagePrompt(beat, imageParams.style);
-    return { imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
+    const prompt = imagePrompt(beat, imageAgentInfo.imageParams.style);
+    return { imageAgentInfo, imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
 };
 export const imagePluginAgent = async (namedInputs) => {
     const { context, beat, index } = namedInputs;
@@ -87,7 +88,6 @@ const beat_graph_data = {
     concurrency: 4,
     nodes: {
         context: {},
-        imageAgentInfo: {},
         htmlImageAgentInfo: {},
         movieAgentInfo: {},
         imageRefs: {},
@@ -99,7 +99,6 @@ const beat_graph_data = {
                 context: ":context",
                 beat: ":beat",
                 index: ":__mapIndex",
-                imageAgentInfo: ":imageAgentInfo",
                 imageRefs: ":imageRefs",
             },
         },
@@ -142,7 +141,7 @@ const beat_graph_data = {
         },
         imageGenerator: {
             if: ":preprocessor.prompt",
-            agent: ":imageAgentInfo.agent",
+            agent: ":preprocessor.imageAgentInfo.agent",
             retry: 2,
             inputs: {
                 prompt: ":preprocessor.prompt",
@@ -213,7 +212,6 @@ const graph_data = {
     concurrency: 4,
     nodes: {
         context: {},
-        imageAgentInfo: {},
         htmlImageAgentInfo: {},
         movieAgentInfo: {},
         outputStudioFilePath: {},
@@ -223,7 +221,6 @@ const graph_data = {
             inputs: {
                 rows: ":context.studio.script.beats",
                 context: ":context",
-                imageAgentInfo: ":imageAgentInfo",
                 htmlImageAgentInfo: ":htmlImageAgentInfo",
                 movieAgentInfo: ":movieAgentInfo",
                 imageRefs: ":imageRefs",
@@ -306,10 +303,10 @@ const graphOption = async (context, settings) => {
         agentFilters,
         taskManager,
     };
-    const
+    const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
     const config = settings2GraphAIConfig(settings);
     // We need to get google's auth token only if the google is the text2image provider.
-    if (
+    if (provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
         userAssert(!!process.env.GOOGLE_PROJECT_ID, "GOOGLE_PROJECT_ID is not set");
         GraphAILogger.log("google was specified as text2image engine");
         const token = await googleAuth();
@@ -342,23 +339,7 @@ export const getImageRefs = async (context) => {
     }
     const buffer = Buffer.from(await response.arrayBuffer());
     // Detect file extension from Content-Type header or URL
-    const extension = (()
-        const contentType = response.headers.get("content-type");
-        if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
-            return "jpg";
-        }
-        else if (contentType?.includes("png")) {
-            return "png";
-        }
-        else {
-            // Fall back to URL extension
-            const urlExtension = image.source.url.split(".").pop()?.toLowerCase();
-            if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
-                return urlExtension === "jpeg" ? "jpg" : urlExtension;
-            }
-            return "png"; // default
-        }
-    })();
+    const extension = getExtention(response.headers.get("content-type"), image.source.url);
     const imagePath = getReferenceImagePath(context, key, extension);
     await fs.promises.writeFile(imagePath, buffer);
     imageRefs[key] = imagePath;
@@ -372,7 +353,7 @@ const prepareGenerateImages = async (context) => {
     const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
     const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
     mkdir(imageProjectDirPath);
-    const
+    const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
     const htmlImageAgentInfo = MulmoPresentationStyleMethods.getHtmlImageAgentInfo(context.presentationStyle);
     const imageRefs = await getImageRefs(context);
     // Determine movie agent based on provider
@@ -386,10 +367,9 @@ const prepareGenerateImages = async (context) => {
         return "movieGoogleAgent";
     }
     };
-    GraphAILogger.info(`text2image: provider=${
+    GraphAILogger.info(`text2image: provider=${provider} model=${context.presentationStyle.imageParams?.model}`);
     const injections = {
         context,
-        imageAgentInfo,
         htmlImageAgentInfo,
         movieAgentInfo: {
             agent: getMovieAgent(),
@@ -404,7 +384,7 @@ const getConcurrency = (context) => {
         return 4;
     }
     const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
-    if (imageAgentInfo.provider === "openai") {
+    if (imageAgentInfo.imageParams.provider === "openai") {
     // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
     // dall-e-3: 7,500 RPM、15 images per minute (4 images for max resolution)
    // gpt-image-1:3,000,000 TPM、150 images per minute
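The inline extension-detection IIFE removed above moved into `lib/utils/utils.js` (the `+14` in the file list) as the `getExtention` helper named in the new import (spelling as in the code). A reconstruction inferred purely from the deleted lines; the shipped implementation may differ:

```
// Inferred shape of getExtention(contentType, url) in lib/utils/utils.js.
export const getExtention = (contentType, url) => {
  // Prefer the Content-Type header when it names a known image format
  if (contentType?.includes("jpeg") || contentType?.includes("jpg")) {
    return "jpg";
  }
  if (contentType?.includes("png")) {
    return "png";
  }
  // Fall back to the URL extension
  const urlExtension = url.split(".").pop()?.toLowerCase();
  if (urlExtension && ["jpg", "jpeg", "png"].includes(urlExtension)) {
    return urlExtension === "jpeg" ? "jpg" : urlExtension;
  }
  return "png"; // default
};
```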
package/lib/actions/movie.d.ts
CHANGED
@@ -1,5 +1,5 @@
 import { MulmoStudioContext, MulmoCanvasDimension, BeatMediaType, MulmoFillOption } from "../types/index.js";
-export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption) => {
+export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption, speed: number) => {
     videoId: string;
     videoPart: string;
 };
package/lib/actions/movie.js
CHANGED
@@ -2,24 +2,32 @@ import { GraphAILogger, assert } from "graphai";
 import { mulmoTransitionSchema, mulmoFillOptionSchema } from "../types/index.js";
 import { MulmoPresentationStyleMethods } from "../methods/index.js";
 import { getAudioArtifactFilePath, getOutputVideoFilePath, writingMessage } from "../utils/file.js";
-import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput } from "../utils/ffmpeg_utils.js";
+import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput, } from "../utils/ffmpeg_utils.js";
 import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
 // const isMac = process.platform === "darwin";
 const videoCodec = "libx264"; // "h264_videotoolbox" (macOS only) is too noisy
-export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption) => {
+export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption, speed) => {
     const videoId = `v${inputIndex}`;
     const videoFilters = [];
     // Handle different media types
+    const originalDuration = duration * speed;
     if (mediaType === "image") {
         videoFilters.push("loop=loop=-1:size=1:start=0");
     }
     else if (mediaType === "movie") {
         // For videos, extend with last frame if shorter than required duration
         // tpad will extend the video by cloning the last frame, then trim will ensure exact duration
-        videoFilters.push(`tpad=stop_mode=clone:stop_duration=${
+        videoFilters.push(`tpad=stop_mode=clone:stop_duration=${originalDuration * 2}`); // Use 2x duration to ensure coverage
     }
     // Common filters for all media types
-    videoFilters.push(`trim=duration=${
+    videoFilters.push(`trim=duration=${originalDuration}`, "fps=30");
+    // Apply speed if specified
+    if (speed !== 1.0) {
+        videoFilters.push(`setpts=${1 / speed}*PTS`);
+    }
+    else {
+        videoFilters.push("setpts=PTS-STARTPTS");
+    }
     // Apply scaling based on fill option
     if (fillOption.style === "aspectFill") {
         // For aspect fill: scale to fill the canvas completely, cropping if necessary
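The speed math is easiest to see with numbers (hypothetical beat, not from the package): to fill a 4-second slot at 2x speed, `getVideoPart` must consume 8 seconds of source footage, then compress it back to 4 seconds with `setpts`:

```
const duration = 4; // seconds this beat occupies in the final timeline
const speed = 2.0; // beat.movieParams.speed
const originalDuration = duration * speed; // 8 seconds of source are needed

const videoFilters = [
  `trim=duration=${originalDuration}`, // keep 8s of source material
  "fps=30",
  `setpts=${1 / speed}*PTS`, // 0.5*PTS plays those 8s in 4s
];
console.log(videoFilters.join(","));
// trim=duration=8,fps=30,setpts=0.5*PTS
```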
@@ -69,32 +77,94 @@ const getOutputOption = (audioId, videoId) => {
         "-b:a 128k", // Audio bitrate
     ];
 };
+const addCaptions = (ffmpegContext, concatVideoId, context, caption) => {
+    const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+    if (caption && beatsWithCaptions.length > 0) {
+        const introPadding = context.presentationStyle.audioParams.introPadding;
+        return beatsWithCaptions.reduce((acc, beat, index) => {
+            const { startAt, duration, captionFile } = beat;
+            if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
+                const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
+                const compositeVideoId = `oc${index}`;
+                ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
+                return compositeVideoId;
+            }
+            return acc;
+        }, concatVideoId);
+    }
+    return concatVideoId;
+};
+const addTransitionEffects = (ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps) => {
+    if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
+        const transition = mulmoTransitionSchema.parse(context.presentationStyle.movieParams.transition);
+        return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
+            const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
+            const processedVideoId = `${transitionVideoId}_f`;
+            let transitionFilter;
+            if (transition.type === "fade") {
+                transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+            }
+            else if (transition.type === "slideout_left") {
+                transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
+            }
+            else {
+                throw new Error(`Unknown transition type: ${transition.type}`);
+            }
+            ffmpegContext.filterComplex.push(transitionFilter);
+            const outputId = `${transitionVideoId}_o`;
+            if (transition.type === "fade") {
+                ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+            }
+            else if (transition.type === "slideout_left") {
+                ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+            }
+            return outputId;
+        }, captionedVideoId);
+    }
+    return captionedVideoId;
+};
+const mixAudiosFromMovieBeats = (ffmpegContext, artifactAudioId, audioIdsFromMovieBeats) => {
+    if (audioIdsFromMovieBeats.length > 0) {
+        const mainAudioId = "mainaudio";
+        const compositeAudioId = "composite";
+        const audioIds = audioIdsFromMovieBeats.map((id) => `[${id}]`).join("");
+        FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
+        ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${audioIdsFromMovieBeats.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
+        return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
+    }
+    return artifactAudioId;
+};
 const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
     const caption = MulmoStudioContextMethods.getCaption(context);
     const start = performance.now();
     const ffmpegContext = FfmpegContextInit();
-    const missingIndex = context.studio.beats.findIndex((
+    const missingIndex = context.studio.beats.findIndex((studioBeat, index) => {
+        const beat = context.studio.script.beats[index];
+        if (beat.image?.type === "voice_over") {
+            return false; // Voice-over does not have either imageFile or movieFile.
+        }
+        return !studioBeat.imageFile && !studioBeat.movieFile;
+    });
     if (missingIndex !== -1) {
         GraphAILogger.info(`ERROR: beat.imageFile or beat.movieFile is not set on beat ${missingIndex}.`);
         return false;
     }
     const canvasInfo = MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle);
     // Add each image input
-    const
-    const
+    const videoIdsForBeats = [];
+    const audioIdsFromMovieBeats = [];
     const transitionVideoIds = [];
     const beatTimestamps = [];
     context.studio.beats.reduce((timestamp, studioBeat, index) => {
         const beat = context.studio.script.beats[index];
-
-
-
-
-        if (!studioBeat.duration) {
-            throw new Error(`studioBeat.duration is not set: index=${index}`);
+        if (beat.image?.type === "voice_over") {
+            videoIdsForBeats.push(undefined);
+            beatTimestamps.push(timestamp);
+            return timestamp; // Skip voice-over beats.
         }
-        const
-
+        const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
+        assert(!!sourceFile, `studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
+        assert(!!studioBeat.duration, `studioBeat.duration is not set: index=${index}`);
         const extraPadding = (() => {
             // We need to consider only intro and outro padding because the other paddings were already added to the beat.duration
             if (index === 0) {
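The caption overlay window in `addCaptions` is also clearer with concrete numbers (hypothetical): a caption whose beat starts 10s into the narration and lasts 3s, with a 1s `introPadding`, is only visible between t=11 and t=14 of the final timeline:

```
const startAt = 10, duration = 3, introPadding = 1;
const enable = `enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'`;
console.log(`overlay=format=auto:${enable}`);
// overlay=format=auto:enable='between(t,11,14)'
```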
@@ -105,93 +175,56 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
             }
             return 0;
         })();
-
+        // The movie duration is bigger in case of voice-over.
+        const duration = Math.max(studioBeat.duration + extraPadding, studioBeat.movieDuration ?? 0);
         // Get fillOption from merged imageParams (global + beat-specific)
         const globalFillOption = context.presentationStyle.movieParams?.fillOption;
         const beatFillOption = beat.movieParams?.fillOption;
         const defaultFillOption = mulmoFillOptionSchema.parse({}); // let the schema infer the default value
         const fillOption = { ...defaultFillOption, ...globalFillOption, ...beatFillOption };
-        const
+        const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
+        const mediaType = studioBeat.movieFile ? "movie" : MulmoPresentationStyleMethods.getImageType(context.presentationStyle, beat);
+        const speed = beat.movieParams?.speed ?? 1.0;
+        const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption, speed);
         ffmpegContext.filterComplex.push(videoPart);
-        if (caption && studioBeat.captionFile) {
-            const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
-            const compositeVideoId = `c${index}`;
-            ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
-            filterComplexVideoIds.push(compositeVideoId);
-        }
-        else {
-            filterComplexVideoIds.push(videoId);
-        }
         if (context.presentationStyle.movieParams?.transition && index < context.studio.beats.length - 1) {
-
-            ffmpegContext.filterComplex.push(`[${
-
+            // NOTE: We split the video into two parts for transition.
+            ffmpegContext.filterComplex.push(`[${videoId}]split=2[${videoId}_0][${videoId}_1]`);
+            videoIdsForBeats.push(`${videoId}_0`);
             if (mediaType === "movie") {
                 // For movie beats, extract the last frame for transition
-                ffmpegContext.filterComplex.push(`[${
-                transitionVideoIds.push(`${
+                ffmpegContext.filterComplex.push(`[${videoId}_1]reverse,select='eq(n,0)',reverse,tpad=stop_mode=clone:stop_duration=${duration},fps=30,setpts=PTS-STARTPTS[${videoId}_2]`);
+                transitionVideoIds.push(`${videoId}_2`);
             }
             else {
-                transitionVideoIds.push(`${
+                transitionVideoIds.push(`${videoId}_1`);
             }
         }
-
+        else {
+            videoIdsForBeats.push(videoId);
+        }
+        // NOTE: We don't support audio if the speed is not 1.0.
+        if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0 && speed === 1.0) {
             const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
-
+            audioIdsFromMovieBeats.push(audioId);
             ffmpegContext.filterComplex.push(audioPart);
         }
         beatTimestamps.push(timestamp);
         return timestamp + duration;
     }, 0);
-    assert(
+    assert(videoIdsForBeats.length === context.studio.beats.length, "videoIds.length !== studio.beats.length");
     assert(beatTimestamps.length === context.studio.beats.length, "beatTimestamps.length !== studio.beats.length");
     // console.log("*** images", images.audioIds);
     // Concatenate the trimmed images
     const concatVideoId = "concat_video";
-
-
-    const
-
-
-    return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
-        const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
-        const processedVideoId = `${transitionVideoId}_f`;
-        let transitionFilter;
-        if (transition.type === "fade") {
-            transitionFilter = `[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
-        }
-        else if (transition.type === "slideout_left") {
-            transitionFilter = `[${transitionVideoId}]format=yuva420p,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`;
-        }
-        else {
-            throw new Error(`Unknown transition type: ${transition.type}`);
-        }
-        ffmpegContext.filterComplex.push(transitionFilter);
-        const outputId = `${transitionVideoId}_o`;
-        if (transition.type === "fade") {
-            ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
-        }
-        else if (transition.type === "slideout_left") {
-            ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
-        }
-        return outputId;
-    }, concatVideoId);
-    }
-    return concatVideoId;
-    })();
+    const videoIds = videoIdsForBeats.filter((id) => id !== undefined); // filter out voice-over beats
+    ffmpegContext.filterComplex.push(`${videoIds.map((id) => `[${id}]`).join("")}concat=n=${videoIds.length}:v=1:a=0[${concatVideoId}]`);
+    const captionedVideoId = addCaptions(ffmpegContext, concatVideoId, context, caption);
+    const mixedVideoId = addTransitionEffects(ffmpegContext, captionedVideoId, context, transitionVideoIds, beatTimestamps);
+    GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
     const audioIndex = FfmpegContextAddInput(ffmpegContext, audioArtifactFilePath); // Add audio input
     const artifactAudioId = `${audioIndex}:a`;
-    const ffmpegContextAudioId = (
-    if (filterComplexAudioIds.length > 0) {
-        const mainAudioId = "mainaudio";
-        const compositeAudioId = "composite";
-        const audioIds = filterComplexAudioIds.map((id) => `[${id}]`).join("");
-        FfmpegContextPushFormattedAudio(ffmpegContext, `[${artifactAudioId}]`, `[${mainAudioId}]`);
-        ffmpegContext.filterComplex.push(`[${mainAudioId}]${audioIds}amix=inputs=${filterComplexAudioIds.length + 1}:duration=first:dropout_transition=2[${compositeAudioId}]`);
-        return `[${compositeAudioId}]`; // notice that we need to use [mainaudio] instead of mainaudio
-    }
-    return artifactAudioId;
-    })();
+    const ffmpegContextAudioId = mixAudiosFromMovieBeats(ffmpegContext, artifactAudioId, audioIdsFromMovieBeats);
     // GraphAILogger.debug("filterComplex", ffmpegContext.filterComplex);
     await FfmpegContextGenerateOutput(ffmpegContext, outputVideoPath, getOutputOption(ffmpegContextAudioId, mixedVideoId));
     const end = performance.now();
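Since voice-over beats push `undefined` into `videoIdsForBeats`, they drop out before the concat filter is built. A worked example with hypothetical ids (three beats, the middle one a voice_over):

```
const videoIdsForBeats = ["v0", undefined, "v2"]; // beat 1 is a voice_over
const videoIds = videoIdsForBeats.filter((id) => id !== undefined);
const concatVideoId = "concat_video";
console.log(`${videoIds.map((id) => `[${id}]`).join("")}concat=n=${videoIds.length}:v=1:a=0[${concatVideoId}]`);
// [v0][v2]concat=n=2:v=1:a=0[concat_video]
```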
package/lib/agents/add_bgm_agent.js
CHANGED
@@ -1,8 +1,15 @@
+import fs from "fs";
 import { GraphAILogger } from "graphai";
 import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextGenerateOutput, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
 const addBGMAgent = async ({ namedInputs, params, }) => {
     const { voiceFile, outputFile, context } = namedInputs;
     const { musicFile } = params;
+    if (!fs.existsSync(voiceFile)) {
+        throw new Error(`AddBGMAgent voiceFile not exist: ${voiceFile}`);
+    }
+    if (!musicFile.match(/^http/) && !fs.existsSync(musicFile)) {
+        throw new Error(`AddBGMAgent musicFile not exist: ${musicFile}`);
+    }
     const speechDuration = await ffmpegGetMediaDuration(voiceFile);
     const introPadding = context.presentationStyle.audioParams.introPadding;
     const outroPadding = context.presentationStyle.audioParams.outroPadding;
@@ -16,8 +23,14 @@ const addBGMAgent = async ({ namedInputs, params, }) => {
     ffmpegContext.filterComplex.push(`[music][voice]amix=inputs=2:duration=longest[mixed]`);
     ffmpegContext.filterComplex.push(`[mixed]atrim=start=0:end=${totalDuration}[trimmed]`);
     ffmpegContext.filterComplex.push(`[trimmed]afade=t=out:st=${totalDuration - outroPadding}:d=${outroPadding}[faded]`);
-
-
+    try {
+        await FfmpegContextGenerateOutput(ffmpegContext, outputFile, ["-map", "[faded]"]);
+        return outputFile;
+    }
+    catch (e) {
+        GraphAILogger.log(e);
+        throw new Error(`AddBGMAgent ffmpeg run Error`);
+    }
 };
 const addBGMAgentInfo = {
     name: "addBGMAgent",
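The filter chain shown in the hunk mixes BGM and narration, trims to the total length, and fades the result out over the outro. With hypothetical numbers (60s total, 5s `outroPadding`) the fade starts at t=55:

```
const totalDuration = 60, outroPadding = 5;
const filters = [
  "[music][voice]amix=inputs=2:duration=longest[mixed]",
  `[mixed]atrim=start=0:end=${totalDuration}[trimmed]`,
  `[trimmed]afade=t=out:st=${totalDuration - outroPadding}:d=${outroPadding}[faded]`,
];
console.log(filters[2]);
// [trimmed]afade=t=out:st=55:d=5[faded]
```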