mulmocast 0.0.28 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -200,11 +200,18 @@ writing: /Users/username/path/to/output/story-1747834931950__ja.mp4
  # Generate script from web content (requires Browserless API KEY)
  mulmo tool scripting -u https://example.com

+ # Generate script from local file
+ mulmo tool scripting --input-file story.txt
+
  # Generate script with interactive mode
  mulmo tool scripting -i
  ```

- When using the `sensei_and_taro` template, a Nijivoice API key is required.
+ Note:
+ - When using the `sensei_and_taro` template, a Nijivoice API key is required
+ - When -i is specified, --input-file value will be ignored
+ - When --input-file is specified, -u value will be ignored
+

  ## Generate content from MulmoScript

@@ -308,7 +315,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -a, --audiodir Audio output directory [string]
  ```
@@ -329,7 +335,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -i, --imagedir Image output directory [string]
  ```
@@ -350,7 +355,6 @@ Options:
  -b, --basedir base dir [string]
  -l, --lang target language [string] [choices: "en", "ja"]
  -f, --force Force regenerate [boolean] [default: false]
- --dryRun Dry run [boolean] [default: false]
  -p, --presentationStyle Presentation Style [string]
  -a, --audiodir Audio output directory [string]
  -i, --imagedir Image output directory [string]
@@ -411,16 +415,19 @@ Options:
  -b, --basedir base dir [string]
  -u, --url URLs to reference (required when not in interactive mode)
  [array] [default: []]
+ --input-file input file name [string]
  -i, --interactive Generate script in interactive mode with user prompts
  [boolean]
  -t, --template Template name to use
- [string] [choices: "business", "children_book", "coding", "comic_strips",
- "ghibli_strips", "podcast_standard", "sensei_and_taro"]
+ [string] [choices: "akira_comic", "business", "children_book", "coding",
+ "comic_strips", "drslump_comic", "ghibli_comic", "ghibli_image_only",
+ "ghibli_shorts", "ghost_comic", "onepiece_comic", "podcast_standard",
+ "portrait_movie", "realistic_movie", "sensei_and_taro", "shorts",
+ "text_and_image", "text_only", "trailer"]
  -c, --cache cache dir [string]
  -s, --script script filename [string] [default: "script"]
  --llm llm
- [string] [choices: "openAIAgent", "anthropicAgent", "geminiAgent",
- "groqAgent"]
+ [string] [choices: "openai", "anthropic", "gemini", "groq"]
  --llm_model llm model [string]
  ```

@@ -14,7 +14,7 @@
  "speechParams": {
  "provider": "nijivoice",
  "speakers": {
- "Presenter": { "voiceId": "afd7df65-0fdc-4d31-ae8b-a29f0f5eed62", "speechOptions": { "speed": 1.5 } }
+ "Presenter": { "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c", "speechOptions": { "speed": 1.5 } }
  }
  },
  "imageParams": {
@@ -17,7 +17,7 @@
  "speechParams": {
  "provider": "nijivoice",
  "speakers": {
- "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "afd7df65-0fdc-4d31-ae8b-a29f0f5eed62" },
+ "Announcer": { "displayName": { "ja": "アナウンサー" }, "voiceId": "3708ad43-cace-486c-a4ca-8fe41186e20c" },
  "Student": { "displayName": { "ja": "太郎" }, "voiceId": "a7619e48-bf6a-4f9f-843f-40485651257f" },
  "Teacher": { "displayName": { "ja": "先生" }, "voiceId": "bc06c63f-fef6-43b6-92f7-67f919bd5dae" }
  }
@@ -1,14 +1,16 @@
  import { mulmoCaptionParamsSchema } from "../types/index.js";
  import { GraphAI, GraphAILogger } from "graphai";
  import * as agents from "@graphai/vanilla";
- import { getHTMLFile, getCaptionImagePath } from "../utils/file.js";
+ import { getHTMLFile, getCaptionImagePath, getOutputStudioFilePath } from "../utils/file.js";
  import { renderHTMLToImage, interpolate } from "../utils/markdown.js";
  import { MulmoStudioContextMethods, MulmoPresentationStyleMethods } from "../methods/index.js";
+ import { fileWriteAgent } from "@graphai/vanilla_node_agents";
  const vanillaAgents = agents.default ?? agents;
  const graph_data = {
  version: 0.5,
  nodes: {
  context: {},
+ outputStudioFilePath: {},
  map: {
  agent: "mapAgent",
  inputs: { rows: ":context.studio.script.beats", context: ":context" },
@@ -60,14 +62,26 @@ const graph_data = {
  },
  },
  },
+ fileWrite: {
+ agent: "fileWriteAgent",
+ inputs: {
+ onComplete: ":map.generateCaption",
+ file: ":outputStudioFilePath",
+ text: ":context.studio.toJSON()",
+ },
+ },
  },
  };
  export const captions = async (context, callbacks) => {
  if (MulmoStudioContextMethods.getCaption(context)) {
  try {
  MulmoStudioContextMethods.setSessionState(context, "caption", true);
- const graph = new GraphAI(graph_data, { ...vanillaAgents });
+ const graph = new GraphAI(graph_data, { ...vanillaAgents, fileWriteAgent });
+ const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
+ const fileName = MulmoStudioContextMethods.getFileName(context);
+ const outputStudioFilePath = getOutputStudioFilePath(outDirPath, fileName);
  graph.injectValue("context", context);
+ graph.injectValue("outputStudioFilePath", outputStudioFilePath);
  if (callbacks) {
  callbacks.forEach((callback) => {
  graph.registerCallback(callback);
@@ -1,13 +1,13 @@
  import type { CallbackFunction } from "graphai";
- import { MulmoStudioContext, MulmoBeat, Text2ImageAgentInfo } from "../types/index.js";
+ import { MulmoStudioContext, MulmoBeat } from "../types/index.js";
  export declare const imagePreprocessAgent: (namedInputs: {
  context: MulmoStudioContext;
  beat: MulmoBeat;
  index: number;
- imageAgentInfo: Text2ImageAgentInfo;
  imageRefs: Record<string, string>;
  }) => Promise<{
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -42,6 +42,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  images: string[];
  imageFromMovie: boolean;
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -68,6 +69,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  } | {
  images: string[];
  imageParams: {
+ provider: "openai" | "google";
  style?: string | undefined;
  model?: string | undefined;
  moderation?: string | undefined;
@@ -89,6 +91,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  }> | undefined;
  };
  movieFile: string | undefined;
+ imageAgentInfo: import("../types/type.js").Text2ImageAgentInfo;
  imagePath: string;
  referenceImage: string;
  prompt: string;
@@ -25,11 +25,12 @@ const htmlStyle = (context, beat) => {
  };
  };
  export const imagePreprocessAgent = async (namedInputs) => {
- const { context, beat, index, imageAgentInfo, imageRefs } = namedInputs;
- const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
+ const { context, beat, index, imageRefs } = namedInputs;
+ const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle, beat);
+ // const imageParams = { ...imageAgentInfo.imageParams, ...beat.imageParams };
  const imagePath = getBeatPngImagePath(context, index);
  const returnValue = {
- imageParams,
+ imageParams: imageAgentInfo.imageParams,
  movieFile: beat.moviePrompt ? getBeatMoviePath(context, index) : undefined,
  };
  if (beat.image) {
@@ -54,8 +55,8 @@ export const imagePreprocessAgent = async (namedInputs) => {
  if (beat.moviePrompt && !beat.imagePrompt) {
  return { ...returnValue, imagePath, images, imageFromMovie: true }; // no image prompt, only movie prompt
  }
- const prompt = imagePrompt(beat, imageParams.style);
- return { imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
+ const prompt = imagePrompt(beat, imageAgentInfo.imageParams.style);
+ return { imageAgentInfo, imagePath, referenceImage: imagePath, prompt, ...returnValue, images };
  };
  export const imagePluginAgent = async (namedInputs) => {
  const { context, beat, index } = namedInputs;
@@ -87,7 +88,6 @@ const beat_graph_data = {
  concurrency: 4,
  nodes: {
  context: {},
- imageAgentInfo: {},
  htmlImageAgentInfo: {},
  movieAgentInfo: {},
  imageRefs: {},
@@ -99,7 +99,6 @@ const beat_graph_data = {
  context: ":context",
  beat: ":beat",
  index: ":__mapIndex",
- imageAgentInfo: ":imageAgentInfo",
  imageRefs: ":imageRefs",
  },
  },
@@ -142,7 +141,7 @@ const beat_graph_data = {
  },
  imageGenerator: {
  if: ":preprocessor.prompt",
- agent: ":imageAgentInfo.agent",
+ agent: ":preprocessor.imageAgentInfo.agent",
  retry: 2,
  inputs: {
  prompt: ":preprocessor.prompt",
@@ -213,7 +212,6 @@ const graph_data = {
  concurrency: 4,
  nodes: {
  context: {},
- imageAgentInfo: {},
  htmlImageAgentInfo: {},
  movieAgentInfo: {},
  outputStudioFilePath: {},
@@ -223,7 +221,6 @@ const graph_data = {
  inputs: {
  rows: ":context.studio.script.beats",
  context: ":context",
- imageAgentInfo: ":imageAgentInfo",
  htmlImageAgentInfo: ":htmlImageAgentInfo",
  movieAgentInfo: ":movieAgentInfo",
  imageRefs: ":imageRefs",
@@ -306,10 +303,10 @@ const graphOption = async (context, settings) => {
  agentFilters,
  taskManager,
  };
- const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
  const config = settings2GraphAIConfig(settings);
  // We need to get google's auth token only if the google is the text2image provider.
- if (imageAgentInfo.provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
+ if (provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
  userAssert(!!process.env.GOOGLE_PROJECT_ID, "GOOGLE_PROJECT_ID is not set");
  GraphAILogger.log("google was specified as text2image engine");
  const token = await googleAuth();
@@ -372,7 +369,7 @@ const prepareGenerateImages = async (context) => {
  const imageProjectDirPath = MulmoStudioContextMethods.getImageProjectDirPath(context);
  const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
  mkdir(imageProjectDirPath);
- const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(context.presentationStyle.imageParams?.provider);
  const htmlImageAgentInfo = MulmoPresentationStyleMethods.getHtmlImageAgentInfo(context.presentationStyle);
  const imageRefs = await getImageRefs(context);
  // Determine movie agent based on provider
@@ -386,10 +383,9 @@ const prepareGenerateImages = async (context) => {
  return "movieGoogleAgent";
  }
  };
- GraphAILogger.info(`text2image: provider=${imageAgentInfo.provider} model=${imageAgentInfo.imageParams.model}`);
+ GraphAILogger.info(`text2image: provider=${provider} model=${context.presentationStyle.imageParams?.model}`);
  const injections = {
  context,
- imageAgentInfo,
  htmlImageAgentInfo,
  movieAgentInfo: {
  agent: getMovieAgent(),
@@ -404,7 +400,7 @@ const getConcurrency = (context) => {
  return 4;
  }
  const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
- if (imageAgentInfo.provider === "openai") {
+ if (imageAgentInfo.imageParams.provider === "openai") {
  // NOTE: Here are the rate limits of OpenAI's text2image API (1token = 32x32 patch).
  // dall-e-3: 7,500 RPM、15 images per minute (4 images for max resolution)
  // gpt-image-1:3,000,000 TPM、150 images per minute
@@ -1,5 +1,5 @@
  import { MulmoStudioContext, MulmoCanvasDimension, BeatMediaType, MulmoFillOption } from "../types/index.js";
- export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption) => {
+ export declare const getVideoPart: (inputIndex: number, mediaType: BeatMediaType, duration: number, canvasInfo: MulmoCanvasDimension, fillOption: MulmoFillOption, speed: number) => {
  videoId: string;
  videoPart: string;
  };
@@ -6,20 +6,28 @@ import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAud
  import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  // const isMac = process.platform === "darwin";
  const videoCodec = "libx264"; // "h264_videotoolbox" (macOS only) is too noisy
- export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption) => {
+ export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo, fillOption, speed) => {
  const videoId = `v${inputIndex}`;
  const videoFilters = [];
  // Handle different media types
+ const originalDuration = duration * speed;
  if (mediaType === "image") {
  videoFilters.push("loop=loop=-1:size=1:start=0");
  }
  else if (mediaType === "movie") {
  // For videos, extend with last frame if shorter than required duration
  // tpad will extend the video by cloning the last frame, then trim will ensure exact duration
- videoFilters.push(`tpad=stop_mode=clone:stop_duration=${duration * 2}`); // Use 2x duration to ensure coverage
+ videoFilters.push(`tpad=stop_mode=clone:stop_duration=${originalDuration * 2}`); // Use 2x duration to ensure coverage
  }
  // Common filters for all media types
- videoFilters.push(`trim=duration=${duration}`, "fps=30", "setpts=PTS-STARTPTS");
+ videoFilters.push(`trim=duration=${originalDuration}`, "fps=30");
+ // Apply speed if specified
+ if (speed !== 1.0) {
+ videoFilters.push(`setpts=${1 / speed}*PTS`);
+ }
+ else {
+ videoFilters.push("setpts=PTS-STARTPTS");
+ }
  // Apply scaling based on fill option
  if (fillOption.style === "aspectFill") {
  // For aspect fill: scale to fill the canvas completely, cropping if necessary
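The new `speed` argument above changes both how much source footage is consumed and how it is timestamped. A rough sketch of the filter arithmetic for a movie beat, with illustrative numbers that are not taken from the package:

```typescript
// Illustrative only: mirrors the filter math in getVideoPart for a movie beat.
// A 2x speed-up that has to fill a 5-second slot in the output video:
const speed = 2.0;  // beat.movieParams?.speed ?? 1.0
const duration = 5; // seconds this beat occupies in the final video
const originalDuration = duration * speed; // 10 seconds of source footage are needed

const videoFilters = [
  `tpad=stop_mode=clone:stop_duration=${originalDuration * 2}`, // pad with the last frame in case the source is short
  `trim=duration=${originalDuration}`,                          // keep 10s of (padded) source
  "fps=30",
  `setpts=${1 / speed}*PTS`,                                    // 0.5*PTS compresses those 10s into the 5s slot
];
```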
@@ -73,7 +81,13 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const caption = MulmoStudioContextMethods.getCaption(context);
  const start = performance.now();
  const ffmpegContext = FfmpegContextInit();
- const missingIndex = context.studio.beats.findIndex((beat) => !beat.imageFile && !beat.movieFile);
+ const missingIndex = context.studio.beats.findIndex((studioBeat, index) => {
+ const beat = context.studio.script.beats[index];
+ if (beat.image?.type === "voice_over") {
+ return false; // Voice-over does not have either imageFile or movieFile.
+ }
+ return !studioBeat.imageFile && !studioBeat.movieFile;
+ });
  if (missingIndex !== -1) {
  GraphAILogger.info(`ERROR: beat.imageFile or beat.movieFile is not set on beat ${missingIndex}.`);
  return false;
@@ -86,6 +100,11 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  const beatTimestamps = [];
  context.studio.beats.reduce((timestamp, studioBeat, index) => {
  const beat = context.studio.script.beats[index];
+ if (beat.image?.type === "voice_over") {
+ filterComplexVideoIds.push(undefined);
+ beatTimestamps.push(timestamp);
+ return timestamp; // Skip voice-over beats.
+ }
  const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
  if (!sourceFile) {
  throw new Error(`studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
@@ -105,23 +124,27 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  }
  return 0;
  })();
- const duration = studioBeat.duration + extraPadding;
+ // The movie duration is bigger in case of voice-over.
+ const duration = Math.max(studioBeat.duration + extraPadding, studioBeat.movieDuration ?? 0);
  // Get fillOption from merged imageParams (global + beat-specific)
  const globalFillOption = context.presentationStyle.movieParams?.fillOption;
  const beatFillOption = beat.movieParams?.fillOption;
  const defaultFillOption = mulmoFillOptionSchema.parse({}); // let the schema infer the default value
  const fillOption = { ...defaultFillOption, ...globalFillOption, ...beatFillOption };
- const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption);
+ const speed = beat.movieParams?.speed ?? 1.0;
+ const { videoId, videoPart } = getVideoPart(inputIndex, mediaType, duration, canvasInfo, fillOption, speed);
  ffmpegContext.filterComplex.push(videoPart);
+ /*
  if (caption && studioBeat.captionFile) {
- const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
- const compositeVideoId = `c${index}`;
- ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
- filterComplexVideoIds.push(compositeVideoId);
- }
- else {
- filterComplexVideoIds.push(videoId);
+ // NOTE: This works for normal beats, but not for voice-over beats.
+ const captionInputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.captionFile);
+ const compositeVideoId = `c${index}`;
+ ffmpegContext.filterComplex.push(`[${videoId}][${captionInputIndex}:v]overlay=format=auto[${compositeVideoId}]`);
+ filterComplexVideoIds.push(compositeVideoId);
+ } else {
  }
+ */
+ filterComplexVideoIds.push(videoId);
  if (context.presentationStyle.movieParams?.transition && index < context.studio.beats.length - 1) {
  const sourceId = filterComplexVideoIds.pop();
  ffmpegContext.filterComplex.push(`[${sourceId}]split=2[${sourceId}_0][${sourceId}_1]`);
@@ -135,7 +158,8 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  transitionVideoIds.push(`${sourceId}_1`);
  }
  }
- if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0) {
+ // NOTE: We don't support audio if the speed is not 1.0.
+ if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0 && speed === 1.0) {
  const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
  filterComplexAudioIds.push(audioId);
  ffmpegContext.filterComplex.push(audioPart);
@@ -148,7 +172,26 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  // console.log("*** images", images.audioIds);
  // Concatenate the trimmed images
  const concatVideoId = "concat_video";
- ffmpegContext.filterComplex.push(`${filterComplexVideoIds.map((id) => `[${id}]`).join("")}concat=n=${context.studio.beats.length}:v=1:a=0[${concatVideoId}]`);
+ const videoIds = filterComplexVideoIds.filter((id) => id !== undefined); // filter out voice-over beats
+ ffmpegContext.filterComplex.push(`${videoIds.map((id) => `[${id}]`).join("")}concat=n=${videoIds.length}:v=1:a=0[${concatVideoId}]`);
+ // Overlay voice-over captions
+ const captionedVideoId = (() => {
+ const beatsWithCaptions = context.studio.beats.filter(({ captionFile }) => captionFile);
+ if (caption && beatsWithCaptions.length > 0) {
+ const introPadding = context.presentationStyle.audioParams.introPadding;
+ return beatsWithCaptions.reduce((acc, beat, index) => {
+ const { startAt, duration, captionFile } = beat;
+ if (startAt !== undefined && duration !== undefined && captionFile !== undefined) {
+ const captionInputIndex = FfmpegContextAddInput(ffmpegContext, captionFile);
+ const compositeVideoId = `oc${index}`;
+ ffmpegContext.filterComplex.push(`[${acc}][${captionInputIndex}:v]overlay=format=auto:enable='between(t,${startAt + introPadding},${startAt + duration + introPadding})'[${compositeVideoId}]`);
+ return compositeVideoId;
+ }
+ return acc;
+ }, concatVideoId);
+ }
+ return concatVideoId;
+ })();
  // Add tranditions if needed
  const mixedVideoId = (() => {
  if (context.presentationStyle.movieParams?.transition && transitionVideoIds.length > 0) {
@@ -175,10 +218,11 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=x='-(t-${transitionStartTime})*W/${transition.duration}':y=0:enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
  }
  return outputId;
- }, concatVideoId);
+ }, captionedVideoId);
  }
- return concatVideoId;
+ return captionedVideoId;
  })();
+ GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
  const audioIndex = FfmpegContextAddInput(ffmpegContext, audioArtifactFilePath); // Add audio input
  const artifactAudioId = `${audioIndex}:a`;
  const ffmpegContextAudioId = (() => {
@@ -1,10 +1,12 @@
  import { assert, GraphAILogger } from "graphai";
  import { silent60secPath } from "../utils/file.js";
  import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
+ import { userAssert } from "../utils/utils.js";
  const getMovieDulation = async (beat) => {
  if (beat.image?.type === "movie" && (beat.image.source.kind === "url" || beat.image.source.kind === "path")) {
  const pathOrUrl = beat.image.source.kind === "url" ? beat.image.source.url : beat.image.source.path;
- return await ffmpegGetMediaDuration(pathOrUrl);
+ const speed = beat.movieParams?.speed ?? 1.0;
+ return (await ffmpegGetMediaDuration(pathOrUrl)) / speed;
  }
  return 0;
  };
@@ -65,7 +67,45 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  const mediaDurations = await getMediaDurations(context);
  const beatDurations = [];
  context.studio.script.beats.forEach((beat, index) => {
+ if (beatDurations.length > index) {
+ // The current beat has already been processed.
+ return;
+ }
+ assert(beatDurations.length === index, "beatDurations.length !== index");
  const { audioDuration, movieDuration } = mediaDurations[index];
+ // Check if we are processing a voice-over beat.
+ if (movieDuration > 0) {
+ const group = [index];
+ for (let i = index + 1; i < context.studio.beats.length && context.studio.script.beats[i].image?.type === "voice_over"; i++) {
+ group.push(i);
+ }
+ if (group.length > 1) {
+ group.reduce((remaining, idx, iGroup) => {
+ const subBeatDurations = mediaDurations[idx];
+ userAssert(subBeatDurations.audioDuration <= remaining, `subBeatDurations.audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
+ if (iGroup === group.length - 1) {
+ beatDurations.push(remaining);
+ subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
+ return 0;
+ }
+ const nextBeat = context.studio.script.beats[idx + 1];
+ assert(nextBeat.image?.type === "voice_over", "nextBeat.image.type !== voice_over");
+ const voiceStartAt = nextBeat.image?.startAt;
+ if (voiceStartAt) {
+ const remainingDuration = movieDuration - voiceStartAt;
+ const duration = remaining - remainingDuration;
+ userAssert(duration >= 0, `duration(${duration}) < 0`);
+ beatDurations.push(duration);
+ subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
+ userAssert(subBeatDurations.silenceDuration >= 0, `subBeatDurations.silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
+ return remainingDuration;
+ }
+ beatDurations.push(subBeatDurations.audioDuration);
+ return remaining - subBeatDurations.audioDuration;
+ }, movieDuration);
+ return;
+ }
+ }
  // Check if the current beat has media and the next beat does not have media.
  if (audioDuration > 0) {
  // Check if the current beat has spilled over audio.
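To make the voice-over grouping above concrete, here is a small worked example with hypothetical numbers that are not taken from the package: a 12-second movie beat followed by two `voice_over` beats whose `image.startAt` values are 4 s and 9 s.

```typescript
// Hypothetical values; the reduce above walks the group with `remaining` starting at movieDuration.
const movieDuration = 12;
// movie beat:   next voice-over starts at 4 -> duration = 12 - (12 - 4) = 4, remaining = 8
// voice-over 1: next voice-over starts at 9 -> duration = 8  - (12 - 9) = 5, remaining = 3
// voice-over 2: last beat in the group      -> duration = remaining     = 3
const beatDurations = [4, 5, 3]; // sums back to movieDuration, keeping the audio track aligned with the movie
// Each beat's silenceDuration is its allocated duration minus its own audioDuration.
```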
@@ -111,17 +151,15 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  }
  else if (movieDuration > 0) {
  // This beat has only a movie, not audio.
- assert(beatDurations.length === index, "beatDurations.length !== index");
  beatDurations.push(movieDuration);
  mediaDurations[index].silenceDuration = movieDuration;
  }
- else if (beatDurations.length === index) {
+ else {
  // The current beat has no audio, nor no spilled over audio
  const beatDuration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
  beatDurations.push(beatDuration);
  mediaDurations[index].silenceDuration = beatDuration;
  }
- // else { Skip this beat if the duration has been already added as a group }
  });
  assert(beatDurations.length === context.studio.beats.length, "beatDurations.length !== studio.beats.length");
  // We cannot reuse longSilentId. We need to explicitly split it for each beat.
@@ -152,9 +190,19 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  const result = {
  studio: {
  ...context.studio,
- beats: context.studio.beats.map((studioBeat, index) => ({ ...studioBeat, duration: beatDurations[index] })),
+ beats: context.studio.beats.map((studioBeat, index) => ({
+ ...studioBeat,
+ duration: beatDurations[index],
+ audioDuration: mediaDurations[index].audioDuration,
+ movieDuration: mediaDurations[index].movieDuration,
+ silenceDuration: mediaDurations[index].silenceDuration,
+ })),
  },
  };
+ result.studio.beats.reduce((acc, beat) => {
+ beat.startAt = acc;
+ return acc + beat.duration;
+ }, 0);
  // context.studio = result.studio; // TODO: removing this breaks test/test_movie.ts
  return {
  ...context,
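The `startAt` values written just above are a running sum of the final beat durations; a minimal sketch with hypothetical durations:

```typescript
// Hypothetical durations; each beat starts where the previous one ended.
const beats = [{ duration: 4, startAt: 0 }, { duration: 5, startAt: 0 }, { duration: 3, startAt: 0 }];
beats.reduce((acc, beat) => {
  beat.startAt = acc;         // 0, 4, 9
  return acc + beat.duration; // 4, 9, 12
}, 0);
// These startAt values (offset by audioParams.introPadding) drive the
// enable='between(t, ...)' window used to overlay caption images in the movie step above.
```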
@@ -29,12 +29,13 @@ export const ttsOpenaiAgent = async ({ namedInputs, params, config }) => {
  if (e && typeof e === "object" && "error" in e) {
  GraphAILogger.info("tts_openai_agent: ");
  GraphAILogger.info(e.error);
+ throw new Error("TTS OpenAI Error: " + JSON.stringify(e.error, null, 2));
  }
  else if (e instanceof Error) {
  GraphAILogger.info("tts_openai_agent: ");
  GraphAILogger.info(e.message);
+ throw new Error("TTS OpenAI Error: " + e.message);
  }
- throw new Error("TTS OpenAI Error");
  }
  };
  const ttsOpenaiAgentInfo = {
@@ -34,6 +34,7 @@ export const handler = async (argv) => {
  const context = { outDirPath, templateName: template, urls, filename: filename, cacheDirPath, llm_model, llm, verbose };
  if (interactive) {
  await createMulmoScriptInteractively(context);
+ return;
  }
  if (inputFile) {
  await createMulmoScriptFromFile(inputFile, context);
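With the added `return`, the script-source options become mutually exclusive in the order documented in the README notes above (-i over --input-file over -u). A simplified sketch of the resulting control flow; the URL branch is not shown in this diff and is only indicated as a comment:

```typescript
// Simplified sketch of the handler's precedence after this change.
if (interactive) {
  await createMulmoScriptInteractively(context);
  return; // newly added: file and URL handling are skipped entirely
}
if (inputFile) {
  await createMulmoScriptFromFile(inputFile, context);
} else {
  // URL-based scripting (-u) would run here; per the README notes it is only
  // reached when neither -i nor --input-file is given (branch not shown in this diff).
}
```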
@@ -1,5 +1,5 @@
  import "dotenv/config";
- import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData } from "../types/index.js";
+ import { MulmoCanvasDimension, MulmoBeat, SpeechOptions, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData, Text2ImageProvider } from "../types/index.js";
  export declare const MulmoPresentationStyleMethods: {
  getCanvasSize(presentationStyle: MulmoPresentationStyle): MulmoCanvasDimension;
  getSpeechProvider(presentationStyle: MulmoPresentationStyle): Text2SpeechProvider;
@@ -9,7 +9,8 @@ export declare const MulmoPresentationStyleMethods: {
  getSpeaker(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeakerData;
  getProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
  getVoiceId(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
- getImageAgentInfo(presentationStyle: MulmoPresentationStyle): Text2ImageAgentInfo;
+ getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
+ getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
  getHtmlImageAgentInfo(presentationStyle: MulmoPresentationStyle): Text2HtmlAgentInfo;
  getImageType(_: MulmoPresentationStyle, beat: MulmoBeat): BeatMediaType;
  };
@@ -57,17 +57,21 @@ export const MulmoPresentationStyleMethods = {
  const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
  return speaker.voiceId;
  },
- getImageAgentInfo(presentationStyle) {
+ getText2ImageProvider(provider) {
+ return text2ImageProviderSchema.parse(provider);
+ },
+ getImageAgentInfo(presentationStyle, beat) {
  // Notice that we copy imageParams from presentationStyle and update
  // provider and model appropriately.
- const provider = text2ImageProviderSchema.parse(presentationStyle.imageParams?.provider);
+ const imageParams = { ...presentationStyle.imageParams, ...beat?.imageParams };
+ const provider = MulmoPresentationStyleMethods.getText2ImageProvider(imageParams?.provider);
  const defaultImageParams = {
+ provider,
  model: provider === "openai" ? (process.env.DEFAULT_OPENAI_IMAGE_MODEL ?? defaultOpenAIImageModel) : undefined,
  };
  return {
- provider,
  agent: provider === "google" ? "imageGoogleAgent" : "imageOpenaiAgent",
- imageParams: { ...defaultImageParams, ...presentationStyle.imageParams },
+ imageParams: { ...defaultImageParams, ...imageParams },
  };
  },
  getHtmlImageAgentInfo(presentationStyle) {
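With `getImageAgentInfo` now accepting an optional beat, beat-level `imageParams` override the presentation-style defaults before the provider and agent are resolved. A minimal usage sketch; the style and model values are hypothetical:

```typescript
// Hypothetical inputs illustrating the merge order in getImageAgentInfo.
const presentationStyle = { imageParams: { provider: "openai", style: "watercolor" } };
const beat = { imageParams: { provider: "google", model: "some-imagen-model" } };

const info = MulmoPresentationStyleMethods.getImageAgentInfo(presentationStyle, beat);
// info.agent                -> "imageGoogleAgent"   (beat-level provider wins)
// info.imageParams.provider -> "google"
// info.imageParams.style    -> "watercolor"         (inherited from the presentation style)
// info.imageParams.model    -> "some-imagen-model"  (beat-level value)
```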