mulmocast 0.0.28 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -8
- package/assets/templates/ghibli_shorts.json +2 -2
- package/assets/templates/sensei_and_taro.json +1 -1
- package/lib/actions/captions.js +16 -2
- package/lib/actions/images.d.ts +5 -2
- package/lib/actions/images.js +14 -34
- package/lib/actions/movie.d.ts +1 -1
- package/lib/actions/movie.js +110 -77
- package/lib/agents/add_bgm_agent.js +15 -2
- package/lib/agents/combine_audio_files_agent.js +53 -5
- package/lib/agents/tts_openai_agent.js +2 -1
- package/lib/cli/commands/tool/scripting/handler.js +1 -0
- package/lib/methods/mulmo_presentation_style.d.ts +3 -2
- package/lib/methods/mulmo_presentation_style.js +8 -4
- package/lib/types/schema.d.ts +309 -115
- package/lib/types/schema.js +48 -11
- package/lib/types/type.d.ts +5 -2
- package/lib/utils/context.d.ts +13 -4
- package/lib/utils/file.js +8 -0
- package/lib/utils/image_plugins/index.d.ts +2 -1
- package/lib/utils/image_plugins/index.js +2 -1
- package/lib/utils/image_plugins/voice_over.d.ts +5 -0
- package/lib/utils/image_plugins/voice_over.js +9 -0
- package/lib/utils/preprocess.d.ts +12 -3
- package/lib/utils/utils.d.ts +1 -0
- package/lib/utils/utils.js +14 -0
- package/package.json +12 -12
- package/scripts/templates/voice_over.json +60 -0
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import { assert, GraphAILogger } from "graphai";
|
|
2
2
|
import { silent60secPath } from "../utils/file.js";
|
|
3
3
|
import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
|
|
4
|
+
import { userAssert } from "../utils/utils.js";
|
|
4
5
|
const getMovieDulation = async (beat) => {
|
|
5
6
|
if (beat.image?.type === "movie" && (beat.image.source.kind === "url" || beat.image.source.kind === "path")) {
|
|
6
7
|
const pathOrUrl = beat.image.source.kind === "url" ? beat.image.source.url : beat.image.source.path;
|
|
7
|
-
|
|
8
|
+
const speed = beat.movieParams?.speed ?? 1.0;
|
|
9
|
+
return (await ffmpegGetMediaDuration(pathOrUrl)) / speed;
|
|
8
10
|
}
|
|
9
11
|
return 0;
|
|
10
12
|
};
|
|
@@ -65,7 +67,45 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
|
|
|
65
67
|
const mediaDurations = await getMediaDurations(context);
|
|
66
68
|
const beatDurations = [];
|
|
67
69
|
context.studio.script.beats.forEach((beat, index) => {
|
|
70
|
+
if (beatDurations.length > index) {
|
|
71
|
+
// The current beat has already been processed.
|
|
72
|
+
return;
|
|
73
|
+
}
|
|
74
|
+
assert(beatDurations.length === index, "beatDurations.length !== index");
|
|
68
75
|
const { audioDuration, movieDuration } = mediaDurations[index];
|
|
76
|
+
// Check if we are processing a voice-over beat.
|
|
77
|
+
if (movieDuration > 0) {
|
|
78
|
+
const group = [index];
|
|
79
|
+
for (let i = index + 1; i < context.studio.beats.length && context.studio.script.beats[i].image?.type === "voice_over"; i++) {
|
|
80
|
+
group.push(i);
|
|
81
|
+
}
|
|
82
|
+
if (group.length > 1) {
|
|
83
|
+
group.reduce((remaining, idx, iGroup) => {
|
|
84
|
+
const subBeatDurations = mediaDurations[idx];
|
|
85
|
+
userAssert(subBeatDurations.audioDuration <= remaining, `Duration Overflow: At index(${idx}) audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
|
|
86
|
+
if (iGroup === group.length - 1) {
|
|
87
|
+
beatDurations.push(remaining);
|
|
88
|
+
subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
|
|
89
|
+
return 0;
|
|
90
|
+
}
|
|
91
|
+
const nextBeat = context.studio.script.beats[idx + 1];
|
|
92
|
+
assert(nextBeat.image?.type === "voice_over", "nextBeat.image.type !== voice_over");
|
|
93
|
+
const voiceStartAt = nextBeat.image?.startAt;
|
|
94
|
+
if (voiceStartAt) {
|
|
95
|
+
const remainingDuration = movieDuration - voiceStartAt;
|
|
96
|
+
const duration = remaining - remainingDuration;
|
|
97
|
+
userAssert(duration >= 0, `Invalid startAt: At index(${idx}), avaiable duration(${duration}) < 0`);
|
|
98
|
+
beatDurations.push(duration);
|
|
99
|
+
subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
|
|
100
|
+
userAssert(subBeatDurations.silenceDuration >= 0, `Duration Overwrap: At index(${idx}), silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
|
|
101
|
+
return remainingDuration;
|
|
102
|
+
}
|
|
103
|
+
beatDurations.push(subBeatDurations.audioDuration);
|
|
104
|
+
return remaining - subBeatDurations.audioDuration;
|
|
105
|
+
}, movieDuration);
|
|
106
|
+
return;
|
|
107
|
+
}
|
|
108
|
+
}
|
|
69
109
|
// Check if the current beat has media and the next beat does not have media.
|
|
70
110
|
if (audioDuration > 0) {
|
|
71
111
|
// Check if the current beat has spilled over audio.
|
|
@@ -111,17 +151,15 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
|
|
|
111
151
|
}
|
|
112
152
|
else if (movieDuration > 0) {
|
|
113
153
|
// This beat has only a movie, not audio.
|
|
114
|
-
assert(beatDurations.length === index, "beatDurations.length !== index");
|
|
115
154
|
beatDurations.push(movieDuration);
|
|
116
155
|
mediaDurations[index].silenceDuration = movieDuration;
|
|
117
156
|
}
|
|
118
|
-
else
|
|
157
|
+
else {
|
|
119
158
|
// The current beat has neither audio nor spilled-over audio.
|
|
120
159
|
const beatDuration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
|
|
121
160
|
beatDurations.push(beatDuration);
|
|
122
161
|
mediaDurations[index].silenceDuration = beatDuration;
|
|
123
162
|
}
|
|
124
|
-
// else { Skip this beat if the duration has been already added as a group }
|
|
125
163
|
});
|
|
126
164
|
assert(beatDurations.length === context.studio.beats.length, "beatDurations.length !== studio.beats.length");
|
|
127
165
|
// We cannot reuse longSilentId. We need to explicitly split it for each beat.
|
|
@@ -152,9 +190,19 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
|
|
|
152
190
|
const result = {
|
|
153
191
|
studio: {
|
|
154
192
|
...context.studio,
|
|
155
|
-
beats: context.studio.beats.map((studioBeat, index) => ({
|
|
193
|
+
beats: context.studio.beats.map((studioBeat, index) => ({
|
|
194
|
+
...studioBeat,
|
|
195
|
+
duration: beatDurations[index],
|
|
196
|
+
audioDuration: mediaDurations[index].audioDuration,
|
|
197
|
+
movieDuration: mediaDurations[index].movieDuration,
|
|
198
|
+
silenceDuration: mediaDurations[index].silenceDuration,
|
|
199
|
+
})),
|
|
156
200
|
},
|
|
157
201
|
};
|
|
202
|
+
result.studio.beats.reduce((acc, beat) => {
|
|
203
|
+
beat.startAt = acc;
|
|
204
|
+
return acc + beat.duration;
|
|
205
|
+
}, 0);
|
|
158
206
|
// context.studio = result.studio; // TODO: removing this breaks test/test_movie.ts
|
|
159
207
|
return {
|
|
160
208
|
...context,
|
|
@@ -29,12 +29,13 @@ export const ttsOpenaiAgent = async ({ namedInputs, params, config }) => {
|
|
|
29
29
|
if (e && typeof e === "object" && "error" in e) {
|
|
30
30
|
GraphAILogger.info("tts_openai_agent: ");
|
|
31
31
|
GraphAILogger.info(e.error);
|
|
32
|
+
throw new Error("TTS OpenAI Error: " + JSON.stringify(e.error, null, 2));
|
|
32
33
|
}
|
|
33
34
|
else if (e instanceof Error) {
|
|
34
35
|
GraphAILogger.info("tts_openai_agent: ");
|
|
35
36
|
GraphAILogger.info(e.message);
|
|
37
|
+
throw new Error("TTS OpenAI Error: " + e.message);
|
|
36
38
|
}
|
|
37
|
-
throw new Error("TTS OpenAI Error");
|
|
38
39
|
}
|
|
39
40
|
};
|
|
40
41
|
const ttsOpenaiAgentInfo = {
|
|
@@ -34,6 +34,7 @@ export const handler = async (argv) => {
|
|
|
34
34
|
const context = { outDirPath, templateName: template, urls, filename: filename, cacheDirPath, llm_model, llm, verbose };
|
|
35
35
|
if (interactive) {
|
|
36
36
|
await createMulmoScriptInteractively(context);
|
|
37
|
+
return;
|
|
37
38
|
}
|
|
38
39
|
if (inputFile) {
|
|
39
40
|
await createMulmoScriptFromFile(inputFile, context);
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import "dotenv/config";
|
|
2
|
-
import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData } from "../types/index.js";
|
|
2
|
+
import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData, Text2ImageProvider } from "../types/index.js";
|
|
3
3
|
export declare const MulmoPresentationStyleMethods: {
|
|
4
4
|
getCanvasSize(presentationStyle: MulmoPresentationStyle): MulmoCanvasDimension;
|
|
5
5
|
getSpeechProvider(presentationStyle: MulmoPresentationStyle): Text2SpeechProvider;
|
|
@@ -9,7 +9,8 @@ export declare const MulmoPresentationStyleMethods: {
|
|
|
9
9
|
getSpeaker(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeakerData;
|
|
10
10
|
getProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
|
|
11
11
|
getVoiceId(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
|
|
12
|
-
|
|
12
|
+
getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
|
|
13
|
+
getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
|
|
13
14
|
getHtmlImageAgentInfo(presentationStyle: MulmoPresentationStyle): Text2HtmlAgentInfo;
|
|
14
15
|
getImageType(_: MulmoPresentationStyle, beat: MulmoBeat): BeatMediaType;
|
|
15
16
|
};
|
|
@@ -57,17 +57,21 @@ export const MulmoPresentationStyleMethods = {
|
|
|
57
57
|
const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
|
|
58
58
|
return speaker.voiceId;
|
|
59
59
|
},
|
|
60
|
-
|
|
60
|
+
getText2ImageProvider(provider) {
|
|
61
|
+
return text2ImageProviderSchema.parse(provider);
|
|
62
|
+
},
|
|
63
|
+
getImageAgentInfo(presentationStyle, beat) {
|
|
61
64
|
// Notice that we copy imageParams from presentationStyle and update
|
|
62
65
|
// provider and model appropriately.
|
|
63
|
-
const
|
|
66
|
+
const imageParams = { ...presentationStyle.imageParams, ...beat?.imageParams };
|
|
67
|
+
const provider = MulmoPresentationStyleMethods.getText2ImageProvider(imageParams?.provider);
|
|
64
68
|
const defaultImageParams = {
|
|
69
|
+
provider,
|
|
65
70
|
model: provider === "openai" ? (process.env.DEFAULT_OPENAI_IMAGE_MODEL ?? defaultOpenAIImageModel) : undefined,
|
|
66
71
|
};
|
|
67
72
|
return {
|
|
68
|
-
provider,
|
|
69
73
|
agent: provider === "google" ? "imageGoogleAgent" : "imageOpenaiAgent",
|
|
70
|
-
imageParams: { ...defaultImageParams, ...
|
|
74
|
+
imageParams: { ...defaultImageParams, ...imageParams },
|
|
71
75
|
};
|
|
72
76
|
},
|
|
73
77
|
getHtmlImageAgentInfo(presentationStyle) {
|