mulmocast 0.0.28 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -8
- package/assets/templates/ghibli_shorts.json +1 -1
- package/assets/templates/sensei_and_taro.json +1 -1
- package/lib/actions/captions.js +16 -2
- package/lib/actions/images.d.ts +5 -2
- package/lib/actions/images.js +12 -16
- package/lib/actions/movie.d.ts +1 -1
- package/lib/actions/movie.js +61 -17
- package/lib/agents/combine_audio_files_agent.js +53 -5
- package/lib/agents/tts_openai_agent.js +2 -1
- package/lib/cli/commands/tool/scripting/handler.js +1 -0
- package/lib/methods/mulmo_presentation_style.d.ts +3 -2
- package/lib/methods/mulmo_presentation_style.js +8 -4
- package/lib/types/schema.d.ts +227 -53
- package/lib/types/schema.js +30 -7
- package/lib/types/type.d.ts +3 -2
- package/lib/utils/context.d.ts +11 -2
- package/lib/utils/image_plugins/index.d.ts +2 -1
- package/lib/utils/image_plugins/index.js +2 -1
- package/lib/utils/image_plugins/voice_over.d.ts +5 -0
- package/lib/utils/image_plugins/voice_over.js +9 -0
- package/lib/utils/preprocess.d.ts +11 -2
- package/package.json +1 -1
package/README.md
CHANGED
@@ -200,11 +200,18 @@ writing: /Users/username/path/to/output/story-1747834931950__ja.mp4
 # Generate script from web content (requires Browserless API KEY)
 mulmo tool scripting -u https://example.com
 
+# Generate script from local file
+mulmo tool scripting --input-file story.txt
+
 # Generate script with interactive mode
 mulmo tool scripting -i
 ```
 
-
+Note:
+- When using the `sensei_and_taro` template, a Nijivoice API key is required
+- When -i is specified, --input-file value will be ignored
+- When --input-file is specified, -u value will be ignored
+
 
 ## Generate content from MulmoScript
 
@@ -308,7 +315,6 @@ Options:
 -b, --basedir base dir [string]
 -l, --lang target language [string] [choices: "en", "ja"]
 -f, --force Force regenerate [boolean] [default: false]
-    --dryRun Dry run [boolean] [default: false]
 -p, --presentationStyle Presentation Style [string]
 -a, --audiodir Audio output directory [string]
 ```
@@ -329,7 +335,6 @@ Options:
 -b, --basedir base dir [string]
 -l, --lang target language [string] [choices: "en", "ja"]
 -f, --force Force regenerate [boolean] [default: false]
-    --dryRun Dry run [boolean] [default: false]
 -p, --presentationStyle Presentation Style [string]
 -i, --imagedir Image output directory [string]
 ```
@@ -350,7 +355,6 @@ Options:
 -b, --basedir base dir [string]
 -l, --lang target language [string] [choices: "en", "ja"]
 -f, --force Force regenerate [boolean] [default: false]
-    --dryRun Dry run [boolean] [default: false]
 -p, --presentationStyle Presentation Style [string]
 -a, --audiodir Audio output directory [string]
 -i, --imagedir Image output directory [string]
@@ -411,16 +415,19 @@ Options:
 -b, --basedir base dir [string]
 -u, --url URLs to reference (required when not in interactive mode)
     [array] [default: []]
+    --input-file input file name [string]
 -i, --interactive Generate script in interactive mode with user prompts
     [boolean]
 -t, --template Template name to use
-
-
+    [string] [choices: "akira_comic", "business", "children_book", "coding",
+    "comic_strips", "drslump_comic", "ghibli_comic", "ghibli_image_only",
+    "ghibli_shorts", "ghost_comic", "onepiece_comic", "podcast_standard",
+    "portrait_movie", "realistic_movie", "sensei_and_taro", "shorts",
+    "text_and_image", "text_only", "trailer"]
 -c, --cache cache dir [string]
 -s, --script script filename [string] [default: "script"]
     --llm llm
-
-    "groqAgent"]
+    [string] [choices: "openai", "anthropic", "gemini", "groq"]
     --llm_model llm model [string]
 ```
 
package/assets/templates/ghibli_shorts.json
CHANGED
@@ -14,7 +14,7 @@
   "speechParams": {
     "provider": "nijivoice",
     "speakers": {
-      "Presenter": { "voiceId": "
+      "Presenter": { "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c", "speechOptions": { "speed": 1.5 } }
     }
   },
   "imageParams": {
package/assets/templates/sensei_and_taro.json
CHANGED
@@ -17,7 +17,7 @@
   "speechParams": {
     "provider": "nijivoice",
     "speakers": {
-      "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "
+      "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c" },
       "Student": { "displayName": { "ja": "太郎" }, "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f" },
       "Teacher": { "displayName": { "ja": "先生" }, "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae" }
     }
package/lib/actions/captions.js
CHANGED
@@ -1,14 +1,16 @@
 import { mulmoCaptionParamsSchema } from "../types/index.js";
 import { GraphAI, GraphAILogger } from "graphai";
 import * as agents from "@graphai/vanilla";
-import { getHTMLFile, getCaptionImagePath } from "../utils/file.js";
+import { getHTMLFile, getCaptionImagePath, getOutputStudioFilePath } from "../utils/file.js";
 import { renderHTMLToImage, interpolate } from "../utils/markdown.js";
 import { MulmoStudioContextMethods, MulmoPresentationStyleMethods } from "../methods/index.js";
+import { fileWriteAgent } from "@graphai/vanilla_node_agents";
 const vanillaAgents = agents.default ?? agents;
 const graph_data = {
   version: 0.5,
   nodes: {
     context: {},
+    outputStudioFilePath: {},
     map: {
       agent: "mapAgent",
       inputs: { rows: ":context.studio.script.beats", context: ":context" },
@@ -60,14 +62,26 @@ const graph_data = {
         },
       },
     },
+    fileWrite: {
+      agent: "fileWriteAgent",
+      inputs: {
+        onComplete: ":map.generateCaption",
+        file: ":outputStudioFilePath",
+        text: ":context.studio.toJSON()",
+      },
+    },
   },
 };
 export const captions = async (context, callbacks) => {
   if (MulmoStudioContextMethods.getCaption(context)) {
     try {
       MulmoStudioContextMethods.setSessionState(context, "caption", true);
-      const graph = new GraphAI(graph_data, { ...vanillaAgents });
+      const graph = new GraphAI(graph_data, { ...vanillaAgents, fileWriteAgent });
+      const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+      const fileName = MulmoStudioContextMethods.getFileName(context);
+      const outputStudioFilePath = getOutputStudioFilePath(outDirPath, fileName);
       graph.injectValue("context", context);
+      graph.injectValue("outputStudioFilePath", outputStudioFilePath);
       if (callbacks) {
         callbacks.forEach((callback) => {
           graph.registerCallback(callback);
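The caption pass now persists the updated studio record: the new `fileWrite` node (backed by `fileWriteAgent` from `@graphai/vanilla_node_agents`) waits on the caption map via `onComplete` and writes `:context.studio.toJSON()` to the path from `getOutputStudioFilePath`. A minimal sketch of the same wiring, assuming an illustrative payload and output path rather than the package's real values:

```
// Sketch only: persist a JSON payload with fileWriteAgent, mirroring the fileWrite
// node added to captions.js. The payload and output path are illustrative.
import { GraphAI } from "graphai";
import * as agents from "@graphai/vanilla";
import { fileWriteAgent } from "@graphai/vanilla_node_agents";

const vanillaAgents = agents.default ?? agents;

const graphData = {
  version: 0.5,
  nodes: {
    studioJson: {},            // static node; value injected below (same pattern as ":context")
    outputStudioFilePath: {},  // static node; value injected below
    fileWrite: {
      agent: "fileWriteAgent",
      inputs: { file: ":outputStudioFilePath", text: ":studioJson" },
    },
  },
};

const graph = new GraphAI(graphData, { ...vanillaAgents, fileWriteAgent });
graph.injectValue("studioJson", JSON.stringify({ beats: [] }, null, 2));
graph.injectValue("outputStudioFilePath", "./output/demo_studio.json");
await graph.run();
```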
package/lib/actions/images.d.ts
CHANGED
@@ -1,13 +1,13 @@
 import type { CallbackFunction } from "graphai";
-import { MulmoStudioContext, MulmoBeat
+import { MulmoStudioContext, MulmoBeat } from "../types/index.js";
 export declare const imagePreprocessAgent: (namedInputs: {
   context: MulmoStudioContext;
   beat: MulmoBeat;
   index: number;
-  imageAgentInfo: Text2ImageAgentInfo;
   imageRefs: Record<string, string>;
 }) => Promise<{
   imageParams: {
+    provider: "openai" | "google";
     style?: string | undefined;
     model?: string | undefined;
     moderation?: string | undefined;
@@ -42,6 +42,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
   images: string[];
   imageFromMovie: boolean;
   imageParams: {
+    provider: "openai" | "google";
     style?: string | undefined;
     model?: string | undefined;
     moderation?: string | undefined;
@@ -68,6 +69,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
 } | {
   images: string[];
   imageParams: {
+    provider: "openai" | "google";
     style?: string | undefined;
     model?: string | undefined;
     moderation?: string | undefined;
@@ -89,6 +91,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
   }> | undefined;
 };
 movieFile: string | undefined;
+imageAgentInfo: import("../types/type.js").Text2ImageAgentInfo;
 imagePath: string;
 referenceImage: string;
 prompt: string;
package/lib/actions/images.js
CHANGED
@@ -25,11 +25,12 @@ const htmlStyle = (context, beat) => {
   };
 };
 export const imagePreprocessAgent = async (namedInputs) => {
-  const { context, beat, index,
-  const
+  const { context, beat, index, imageRefs } = namedInputs;
+  const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle, beat);
+  // const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
   const imagePath = getBeatPngImagePath(context, index);
   const returnValue = {
-    imageParams,
+    imageParams: imageAgentInfo.imageParams,
     movieFile: beat.moviePrompt ? getBeatMoviePath(context, index) : undefined,
   };
   if (beat.image) {
@@ -54,8 +55,8 @@ export const imagePreprocessAgent = async (namedInputs) => {
   if (beat.moviePrompt && !beat.imagePrompt) {
     return { ...returnValue, imagePath, images, imageFromMovie: true }; // no image prompt, only movie prompt
   }
-  const prompt = imagePrompt(beat, imageParams.style);
-  return { imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
+  const prompt = imagePrompt(beat, imageAgentInfo.imageParams.style);
+  return { imageAgentInfo, imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
 };
 export const imagePluginAgent = async (namedInputs) => {
   const { context, beat, index } = namedInputs;
@@ -87,7 +88,6 @@ const beat_graph_data = {
   concurrency: 4,
   nodes: {
     context: {},
-    imageAgentInfo: {},
     htmlImageAgentInfo: {},
     movieAgentInfo: {},
     imageRefs: {},
@@ -99,7 +99,6 @@
         context: ":context",
         beat: ":beat",
         index: ":__mapIndex",
-        imageAgentInfo: ":imageAgentInfo",
         imageRefs: ":imageRefs",
       },
     },
@@ -142,7 +141,7 @@
     },
     imageGenerator: {
       if: ":preprocessor.prompt",
-      agent: ":imageAgentInfo.agent",
+      agent: ":preprocessor.imageAgentInfo.agent",
       retry: 2,
       inputs: {
         prompt: ":preprocessor.prompt",
@@ -213,7 +212,6 @@ const graph_data = {
   concurrency: 4,
   nodes: {
     context: {},
-    imageAgentInfo: {},
     htmlImageAgentInfo: {},
     movieAgentInfo: {},
     outputStudioFilePath: {},
@@ -223,7 +221,6 @@
       inputs: {
         rows: ":context.studio.script.beats",
         context: ":context",
-        imageAgentInfo: ":imageAgentInfo",
         htmlImageAgentInfo: ":htmlImageAgentInfo",
         movieAgentInfo: ":movieAgentInfo",
         imageRefs: ":imageRefs",
@@ -306,10 +303,10 @@ const graphOption = async (context, settings) => {
     agentFilters,
     taskManager,
   };
-  const
+  const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
   const config = settings2GraphAIConfig(settings);
   // We need to get google's auth token only if the google is the text2image provider.
-  if (
+  if (provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
     userAssert(!!process.env.GOOGLE_PROJECT_ID, "GOOGLE_PROJECT_ID is not set");
     GraphAILogger.log("google was specified as text2image engine");
     const token = await googleAuth();
@@ -372,7 +369,7 @@ const prepareGenerateImages = async (context) => {
   const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
   const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
   mkdir(imageProjectDirPath);
-  const
+  const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
   const htmlImageAgentInfo = MulmoPresentationStyleMethods.getHtmlImageAgentInfo(context.presentationStyle);
   const imageRefs = await getImageRefs(context);
   // Determine movie agent based on provider
@@ -386,10 +383,9 @@
       return "movieGoogleAgent";
     }
   };
-  GraphAILogger.info(`text2image: provider=${
+  GraphAILogger.info(`text2image: provider=${provider} model=${context.presentationStyle.imageParams?.model}`);
   const injections = {
     context,
-    imageAgentInfo,
     htmlImageAgentInfo,
     movieAgentInfo: {
       agent: getMovieAgent(),
@@ -404,7 +400,7 @@ const getConcurrency = (context) => {
     return 4;
   }
   const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
-  if (imageAgentInfo.provider === "openai") {
+  if (imageAgentInfo.imageParams.provider === "openai") {
     // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
     // dall-e-3: 7,500 RPM、15 images per minute (4 images for max resolution)
     // gpt-image-1:3,000,000 TPM、150 images per minute
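`imageAgentInfo` is no longer injected graph-wide; `imagePreprocessAgent` now derives it per beat via `getImageAgentInfo(context.presentationStyle, beat)`, and the generator node reads `:preprocessor.imageAgentInfo.agent`, so a single beat can switch image provider. A small sketch of the merge and agent choice under assumed provider/style values (the real logic is `MulmoPresentationStyleMethods.getImageAgentInfo`, shown near the end of this diff):

```
// Sketch only: the merge order now used when resolving the image agent for one beat.
// "watercolor" and the provider values are illustrative, not package defaults.
const styleImageParams = { provider: "openai", style: "watercolor" }; // presentationStyle.imageParams
const beatImageParams = { provider: "google" };                       // beat.imageParams override

const imageParams = { ...styleImageParams, ...beatImageParams };      // beat-level values win
const agent = imageParams.provider === "google" ? "imageGoogleAgent" : "imageOpenaiAgent";

console.log(imageParams); // { provider: 'google', style: 'watercolor' }
console.log(agent);       // 'imageGoogleAgent': only this beat is rendered by the Google agent
```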
package/lib/actions/movie.d.ts
CHANGED
@@ -1,5 +1,5 @@
 import { MulmoStudioContext, MulmoCanvasDimension, BeatMediaType, MulmoFillOption } from "../types/index.js";
-export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption) => {
+export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption, speed: number) => {
   videoId: string;
   videoPart: string;
 };
package/lib/actions/movie.js
CHANGED
@@ -6,20 +6,28 @@ import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAud
 import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
 // const isMac = process.platform === "darwin";
 const videoCodec = "libx264"; // "h264_videotoolbox" (macOS only) is too noisy
-export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption) => {
+export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption, speed) => {
   const videoId = `v${inputIndex}`;
   const videoFilters = [];
   // Handle different media types
+  const originalDuration = duration * speed;
   if (mediaType === "image") {
     videoFilters.push("loop=loop=-1:size=1:start=0");
   }
   else if (mediaType === "movie") {
     // For videos, extend with last frame if shorter than required duration
     // tpad will extend the video by cloning the last frame, then trim will ensure exact duration
-    videoFilters.push(`tpad=stop_mode=clone:stop_duration=${
+    videoFilters.push(`tpad=stop_mode=clone:stop_duration=${originalDuration * 2}`); // Use 2x duration to ensure coverage
   }
   // Common filters for all media types
-  videoFilters.push(`trim=duration=${
+  videoFilters.push(`trim=duration=${originalDuration}`, "fps=30");
+  // Apply speed if specified
+  if (speed !== 1.0) {
+    videoFilters.push(`setpts=${1 / speed}*PTS`);
+  }
+  else {
+    videoFilters.push("setpts=PTS-STARTPTS");
+  }
   // Apply scaling based on fill option
   if (fillOption.style === "aspectFill") {
     // For aspect fill: scale to fill the canvas completely, cropping if necessary
@@ -73,7 +81,13 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
   const caption = MulmoStudioContextMethods.getCaption(context);
   const start = performance.now();
   const ffmpegContext = FfmpegContextInit();
-  const missingIndex = context.studio.beats.findIndex((
+  const missingIndex = context.studio.beats.findIndex((studioBeat, index) => {
+    const beat = context.studio.script.beats[index];
+    if (beat.image?.type === "voice_over") {
+      return false; // Voice-over does not have either imageFile or movieFile.
+    }
+    return !studioBeat.imageFile && !studioBeat.movieFile;
+  });
   if (missingIndex !== -1) {
     GraphAILogger.info(`ERROR: beat.imageFile or beat.movieFile is not set on beat ${missingIndex}.`);
     return false;
@@ -86,6 +100,11 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
   const beatTimestamps = [];
   context.studio.beats.reduce((timestamp, studioBeat, index) => {
     const beat = context.studio.script.beats[index];
+    if (beat.image?.type === "voice_over") {
+      filterComplexVideoIds.push(undefined);
+      beatTimestamps.push(timestamp);
+      return timestamp; // Skip voice-over beats.
+    }
     const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
     if (!sourceFile) {
       throw new Error(`studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
@@ -105,23 +124,27 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
       }
       return 0;
     })();
-
+    // The movie duration is bigger in case of voice-over.
+    const duration = Math.max(studioBeat.duration + extraPadding, studioBeat.movieDuration ?? 0);
     // Get fillOption from merged imageParams (global + beat-specific)
     const globalFillOption = context.presentationStyle.movieParams?.fillOption;
     const beatFillOption = beat.movieParams?.fillOption;
     const defaultFillOption = mulmoFillOptionSchema.parse({}); // let the schema infer the default value
     const fillOption = { ...defaultFillOption, ...globalFillOption, ...beatFillOption };
-    const
+    const speed = beat.movieParams?.speed ?? 1.0;
+    const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption, speed);
     ffmpegContext.filterComplex.push(videoPart);
+    /*
     if (caption && studioBeat.captionFile) {
-
-
-
-
-
-    else {
-      filterComplexVideoIds.push(videoId);
+      // NOTE: This works for normal beats, but not for voice-over beats.
+      const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
+      const compositeVideoId = `c${index}`;
+      ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
+      filterComplexVideoIds.push(compositeVideoId);
+    } else {
     }
+    */
+    filterComplexVideoIds.push(videoId);
     if (context.presentationStyle.movieParams?.transition && index < context.studio.beats.length - 1) {
       const sourceId = filterComplexVideoIds.pop();
       ffmpegContext.filterComplex.push(`[${sourceId}]split=2[${sourceId}_0][${sourceId}_1]`);
@@ -135,7 +158,8 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
       transitionVideoIds.push(`${sourceId}_1`);
     }
   }
-  if
+  // NOTE: We don't support audio if the speed is not 1.0.
+  if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0 && speed === 1.0) {
     const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
     filterComplexAudioIds.push(audioId);
     ffmpegContext.filterComplex.push(audioPart);
@@ -148,7 +172,26 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
   // console.log("*** images", images.audioIds);
   // Concatenate the trimmed images
   const concatVideoId = "concat_video";
-
+  const videoIds = filterComplexVideoIds.filter((id) => id !== undefined); // filter out voice-over beats
+  ffmpegContext.filterComplex.push(`${videoIds.map((id) => `[${id}]`).join("")}concat=n=${videoIds.length}:v=1:a=0[${concatVideoId}]`);
+  // Overlay voice-over captions
+  const captionedVideoId = (() => {
+    const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+    if (caption && beatsWithCaptions.length > 0) {
+      const introPadding = context.presentationStyle.audioParams.introPadding;
+      return beatsWithCaptions.reduce((acc, beat, index) => {
+        const { startAt, duration, captionFile } = beat;
+        if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
+          const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
+          const compositeVideoId = `oc${index}`;
+          ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
+          return compositeVideoId;
+        }
+        return acc;
+      }, concatVideoId);
+    }
+    return concatVideoId;
+  })();
   // Add tranditions if needed
   const mixedVideoId = (() => {
     if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
@@ -175,10 +218,11 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
       ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
     }
     return outputId;
-  },
+  }, captionedVideoId);
   }
-  return
+  return captionedVideoId;
 })();
+GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
 const audioIndex = FfmpegContextAddInput(ffmpegContext, audioArtifactFilePath); // Add audio input
 const artifactAudioId = `${audioIndex}:a`;
 const ffmpegContextAudioId = (() => {
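`getVideoPart` now receives the beat's playback `speed` (`beat.movieParams?.speed`, defaulting to 1.0). The tpad/trim lengths are computed against the source-speed footage (`duration * speed`) and the rate change is applied with `setpts`; per the note in the diff, a movie beat's own audio (`mixAudio`) is only mixed in when the speed is 1.0. A small sketch of the filter list this yields for a movie beat, using illustrative numbers (the scaling and crop filters sit outside this hunk and are omitted):

```
// Sketch: the duration/speed handling now in getVideoPart (scaling/crop filters omitted).
// "duration" is the beat's length on the output timeline; "speed" comes from
// beat.movieParams?.speed ?? 1.0, so the source footage must cover duration * speed seconds.
const buildSpeedFilters = (mediaType, duration, speed) => {
  const filters = [];
  const originalDuration = duration * speed;
  if (mediaType === "image") {
    filters.push("loop=loop=-1:size=1:start=0");
  } else if (mediaType === "movie") {
    // Clone the last frame well past the needed length, then trim to the exact duration.
    filters.push(`tpad=stop_mode=clone:stop_duration=${originalDuration * 2}`);
  }
  filters.push(`trim=duration=${originalDuration}`, "fps=30");
  // Retime: setpts=0.5*PTS plays the trimmed footage back at 2x.
  filters.push(speed !== 1.0 ? `setpts=${1 / speed}*PTS` : "setpts=PTS-STARTPTS");
  return filters;
};

console.log(buildSpeedFilters("movie", 4, 2.0));
// [ 'tpad=stop_mode=clone:stop_duration=16', 'trim=duration=8', 'fps=30', 'setpts=0.5*PTS' ]
```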
package/lib/agents/combine_audio_files_agent.js
CHANGED
@@ -1,10 +1,12 @@
 import { assert, GraphAILogger } from "graphai";
 import { silent60secPath } from "../utils/file.js";
 import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
+import { userAssert } from "../utils/utils.js";
 const getMovieDulation = async (beat) => {
   if (beat.image?.type === "movie" && (beat.image.source.kind === "url" || beat.image.source.kind === "path")) {
     const pathOrUrl = beat.image.source.kind === "url" ? beat.image.source.url : beat.image.source.path;
-
+    const speed = beat.movieParams?.speed ?? 1.0;
+    return (await ffmpegGetMediaDuration(pathOrUrl)) / speed;
   }
   return 0;
 };
@@ -65,7 +67,45 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
   const mediaDurations = await getMediaDurations(context);
   const beatDurations = [];
   context.studio.script.beats.forEach((beat, index) => {
+    if (beatDurations.length > index) {
+      // The current beat has already been processed.
+      return;
+    }
+    assert(beatDurations.length === index, "beatDurations.length !== index");
     const { audioDuration, movieDuration } = mediaDurations[index];
+    // Check if we are processing a voice-over beat.
+    if (movieDuration > 0) {
+      const group = [index];
+      for (let i = index + 1; i < context.studio.beats.length && context.studio.script.beats[i].image?.type === "voice_over"; i++) {
+        group.push(i);
+      }
+      if (group.length > 1) {
+        group.reduce((remaining, idx, iGroup) => {
+          const subBeatDurations = mediaDurations[idx];
+          userAssert(subBeatDurations.audioDuration <= remaining, `subBeatDurations.audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
+          if (iGroup === group.length - 1) {
+            beatDurations.push(remaining);
+            subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
+            return 0;
+          }
+          const nextBeat = context.studio.script.beats[idx + 1];
+          assert(nextBeat.image?.type === "voice_over", "nextBeat.image.type !== voice_over");
+          const voiceStartAt = nextBeat.image?.startAt;
+          if (voiceStartAt) {
+            const remainingDuration = movieDuration - voiceStartAt;
+            const duration = remaining - remainingDuration;
+            userAssert(duration >= 0, `duration(${duration}) < 0`);
+            beatDurations.push(duration);
+            subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
+            userAssert(subBeatDurations.silenceDuration >= 0, `subBeatDurations.silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
+            return remainingDuration;
+          }
+          beatDurations.push(subBeatDurations.audioDuration);
+          return remaining - subBeatDurations.audioDuration;
+        }, movieDuration);
+        return;
+      }
+    }
     // Check if the current beat has media and the next beat does not have media.
     if (audioDuration > 0) {
       // Check if the current beat has spilled over audio.
@@ -111,17 +151,15 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
     }
     else if (movieDuration > 0) {
       // This beat has only a movie, not audio.
-      assert(beatDurations.length === index, "beatDurations.length !== index");
       beatDurations.push(movieDuration);
       mediaDurations[index].silenceDuration = movieDuration;
     }
-    else
+    else {
       // The current beat has no audio, nor no spilled over audio
       const beatDuration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
       beatDurations.push(beatDuration);
       mediaDurations[index].silenceDuration = beatDuration;
     }
-    // else { Skip this beat if the duration has been already added as a group }
   });
   assert(beatDurations.length === context.studio.beats.length, "beatDurations.length !== studio.beats.length");
   // We cannot reuse longSilentId. We need to explicitly split it for each beat.
@@ -152,9 +190,19 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
   const result = {
     studio: {
       ...context.studio,
-      beats: context.studio.beats.map((studioBeat, index) => ({
+      beats: context.studio.beats.map((studioBeat, index) => ({
+        ...studioBeat,
+        duration: beatDurations[index],
+        audioDuration: mediaDurations[index].audioDuration,
+        movieDuration: mediaDurations[index].movieDuration,
+        silenceDuration: mediaDurations[index].silenceDuration,
+      })),
     },
   };
+  result.studio.beats.reduce((acc, beat) => {
+    beat.startAt = acc;
+    return acc + beat.duration;
+  }, 0);
   // context.studio = result.studio; // TODO: removing this breaks test/test_movie.ts
   return {
     ...context,
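The new grouping logic splits a movie beat's running time across any `voice_over` beats that follow it: when the next voice-over declares `image.startAt`, the current beat ends exactly where that cue begins, and the last beat in the group absorbs whatever remains (without a `startAt`, a beat just keeps its own narration length). Each slice must still fit that beat's audio (`userAssert`), and the leftover becomes `silenceDuration`. A worked sketch with illustrative numbers:

```
// Worked sketch (illustrative numbers): how a movie beat plus trailing voice_over
// beats now share the movie's duration. startAt values are offsets into the movie.
const movieDuration = 20;          // seconds of source movie (illustrative)
const voiceOverStartAts = [8, 14]; // image.startAt of the 2nd and 3rd beats in the group

const durations = [];
let remaining = movieDuration;
voiceOverStartAts.forEach((startAt) => {
  const remainingDuration = movieDuration - startAt; // portion of the movie after this cue
  durations.push(remaining - remainingDuration);     // current beat ends where the next voice-over starts
  remaining = remainingDuration;
});
durations.push(remaining);         // the last voice-over beat absorbs what is left

console.log(durations); // [ 8, 6, 6 ]  (beats cover 0-8s, 8-14s, and 14-20s of the movie)
```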
package/lib/agents/tts_openai_agent.js
CHANGED
@@ -29,12 +29,13 @@ export const ttsOpenaiAgent = async ({ namedInputs, params, config }) => {
   if (e && typeof e === "object" && "error" in e) {
     GraphAILogger.info("tts_openai_agent: ");
     GraphAILogger.info(e.error);
+    throw new Error("TTS OpenAI Error: " + JSON.stringify(e.error, null, 2));
   }
   else if (e instanceof Error) {
     GraphAILogger.info("tts_openai_agent: ");
     GraphAILogger.info(e.message);
+    throw new Error("TTS OpenAI Error: " + e.message);
   }
-  throw new Error("TTS OpenAI Error");
 }
 };
 const ttsOpenaiAgentInfo = {
package/lib/cli/commands/tool/scripting/handler.js
CHANGED
@@ -34,6 +34,7 @@ export const handler = async (argv) => {
   const context = { outDirPath, templateName: template, urls, filename: filename, cacheDirPath, llm_model, llm, verbose };
   if (interactive) {
     await createMulmoScriptInteractively(context);
+    return;
   }
   if (inputFile) {
     await createMulmoScriptFromFile(inputFile, context);
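The early `return` added here backs the new README note: interactive mode wins over `--input-file`, which in turn wins over `-u`. A condensed, self-contained sketch of that precedence; the stub functions stand in for `createMulmoScriptInteractively`, `createMulmoScriptFromFile`, and the URL flow, which lies outside this hunk:

```
// Sketch of the input precedence after this change: -i > --input-file > -u.
// The stubs are placeholders; only the branch order and the early return come from the diff.
const fromInteractive = async () => console.log("interactive prompts");
const fromFile = async (file) => console.log(`read ${file}`);
const fromUrls = async (urls) => console.log(`fetch ${urls.join(", ")}`);

const generateScript = async ({ interactive, inputFile, urls }) => {
  if (interactive) {
    await fromInteractive();
    return; // new in 0.1.0: --input-file and -u are ignored in interactive mode
  }
  if (inputFile) {
    await fromFile(inputFile);
    return; // --input-file takes precedence over -u (per the README note above)
  }
  await fromUrls(urls);
};

await generateScript({ interactive: false, inputFile: "story.txt", urls: [] });
```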
package/lib/methods/mulmo_presentation_style.d.ts
CHANGED
@@ -1,5 +1,5 @@
 import "dotenv/config";
-import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData } from "../types/index.js";
+import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData, Text2ImageProvider } from "../types/index.js";
 export declare const MulmoPresentationStyleMethods: {
   getCanvasSize(presentationStyle: MulmoPresentationStyle): MulmoCanvasDimension;
   getSpeechProvider(presentationStyle: MulmoPresentationStyle): Text2SpeechProvider;
@@ -9,7 +9,8 @@ export declare const MulmoPresentationStyleMethods: {
   getSpeaker(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeakerData;
   getProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
   getVoiceId(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
-
+  getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
+  getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
   getHtmlImageAgentInfo(presentationStyle: MulmoPresentationStyle): Text2HtmlAgentInfo;
   getImageType(_: MulmoPresentationStyle, beat: MulmoBeat): BeatMediaType;
 };
package/lib/methods/mulmo_presentation_style.js
CHANGED
@@ -57,17 +57,21 @@ export const MulmoPresentationStyleMethods = {
     const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
     return speaker.voiceId;
   },
-
+  getText2ImageProvider(provider) {
+    return text2ImageProviderSchema.parse(provider);
+  },
+  getImageAgentInfo(presentationStyle, beat) {
     // Notice that we copy imageParams from presentationStyle and update
     // provider and model appropriately.
-    const
+    const imageParams = { ...presentationStyle.imageParams, ...beat?.imageParams };
+    const provider = MulmoPresentationStyleMethods.getText2ImageProvider(imageParams?.provider);
     const defaultImageParams = {
+      provider,
       model: provider === "openai" ? (process.env.DEFAULT_OPENAI_IMAGE_MODEL ?? defaultOpenAIImageModel) : undefined,
     };
     return {
-      provider,
       agent: provider === "google" ? "imageGoogleAgent" : "imageOpenaiAgent",
-      imageParams: { ...defaultImageParams, ...
+      imageParams: { ...defaultImageParams, ...imageParams },
     };
   },
   getHtmlImageAgentInfo(presentationStyle) {
|