mulmocast 1.2.12 → 1.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
 import "dotenv/config";
 import { MulmoStudioContext, MulmoBeat, PublicAPIArgs } from "../types/index.js";
 export declare const getBeatAudioPath: (text: string, context: MulmoStudioContext, beat: MulmoBeat, lang?: string) => string | undefined;
+export declare const listLocalizedAudioPaths: (context: MulmoStudioContext) => (string | undefined)[];
 export declare const generateBeatAudio: (index: number, context: MulmoStudioContext, args?: PublicAPIArgs & {
     langs: string[];
 }) => Promise<void>;
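
A hedged usage sketch of the new `listLocalizedAudioPaths` export: it returns one resolved audio path per beat (or `undefined` where a beat has no audio), using the localized text for the context's language. The `loadContext` helper and the package-root re-export are illustrative assumptions, not part of this diff.

import { listLocalizedAudioPaths } from "mulmocast"; // root re-export is an assumption
import type { MulmoStudioContext } from "mulmocast";

// Hypothetical helper standing in for however the application builds its context.
declare function loadContext(scriptPath: string): Promise<MulmoStudioContext>;

const context = await loadContext("scripts/music_video.json");
const audioPaths = listLocalizedAudioPaths(context); // one entry per beat, in script order
audioPaths.forEach((path, i) => console.log(`beat ${i}: ${path ?? "(no audio)"}`));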
@@ -1,5 +1,5 @@
 import "dotenv/config";
-import { GraphAI, TaskManager } from "graphai";
+import { GraphAI, TaskManager, GraphAILogger } from "graphai";
 import * as agents from "@graphai/vanilla";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
 import { ttsNijivoiceAgent, ttsOpenaiAgent, ttsGoogleAgent, ttsElevenlabsAgent, addBGMAgent, combineAudioFilesAgent, mediaMockAgent } from "../agents/index.js";
@@ -26,25 +26,34 @@ const getAudioPath = (context, beat, audioFile) => {
   }
   return audioFile;
 };
-const getAudioParam = (context, beat) => {
-  const speaker = MulmoPresentationStyleMethods.getSpeaker(context, beat);
+const getAudioParam = (context, beat, lang) => {
+  const speaker = MulmoPresentationStyleMethods.getSpeaker(context, beat, lang);
   const speechOptions = { ...speaker.speechOptions, ...beat.speechOptions };
   const provider = text2SpeechProviderSchema.parse(speaker.provider);
   return { voiceId: speaker.voiceId, provider, speechOptions, model: speaker.model };
 };
 export const getBeatAudioPath = (text, context, beat, lang) => {
   const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
-  const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat);
+  const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat, lang);
   const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider, model ?? ""].join(":");
+  GraphAILogger.log(`getBeatAudioPath [${hash_string}]`);
   const audioFileName = `${context.studio.filename}_${text2hash(hash_string)}`;
   const audioFile = getAudioFilePath(audioDirPath, context.studio.filename, audioFileName, lang);
   return getAudioPath(context, beat, audioFile);
 };
+export const listLocalizedAudioPaths = (context) => {
+  const lang = context.lang ?? context.studio.script.lang;
+  return context.studio.script.beats.map((beat, index) => {
+    const multiLingual = context.multiLingual[index];
+    const text = localizedText(beat, multiLingual, lang);
+    return getBeatAudioPath(text, context, beat, lang);
+  });
+};
 const preprocessorAgent = (namedInputs) => {
   const { beat, studioBeat, multiLingual, context, lang } = namedInputs;
   // const { lang } = context;
   const text = localizedText(beat, multiLingual, lang);
-  const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat);
+  const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat, lang);
   const audioPath = getBeatAudioPath(text, context, beat, lang);
   studioBeat.audioFile = audioPath; // TODO: Passing by reference is difficult to maintain, so pass it using graphai inputs
   const needsTTS = !beat.audio && audioPath !== undefined;
@@ -25,6 +25,10 @@ export declare const imagePreprocessAgent: (namedInputs: {
     lipSyncFile?: string;
     lipSyncModel?: string;
     lipSyncAgentName?: string;
+    lipSyncTrimAudio?: boolean;
+    bgmFile?: string | null;
+    startAt?: number;
+    duration?: number;
     audioFile?: string;
     beatDuration?: number;
     htmlPrompt?: undefined;
@@ -61,6 +65,10 @@ export declare const imagePreprocessAgent: (namedInputs: {
     lipSyncFile?: string;
     lipSyncModel?: string;
     lipSyncAgentName?: string;
+    lipSyncTrimAudio?: boolean;
+    bgmFile?: string | null;
+    startAt?: number;
+    duration?: number;
     audioFile?: string;
     beatDuration?: number;
     htmlPrompt?: undefined;
@@ -100,6 +108,10 @@ export declare const imagePreprocessAgent: (namedInputs: {
     lipSyncFile?: string;
     lipSyncModel?: string;
     lipSyncAgentName?: string;
+    lipSyncTrimAudio?: boolean;
+    bgmFile?: string | null;
+    startAt?: number;
+    duration?: number;
     audioFile?: string;
     beatDuration?: number;
     htmlPrompt?: undefined;
@@ -1,8 +1,9 @@
-import { MulmoPresentationStyleMethods, MulmoStudioContextMethods, MulmoBeatMethods } from "../methods/index.js";
-import { getBeatPngImagePath, getBeatMoviePaths } from "../utils/file.js";
+import { MulmoPresentationStyleMethods, MulmoStudioContextMethods, MulmoBeatMethods, MulmoMediaSourceMethods } from "../methods/index.js";
+import { getBeatPngImagePath, getBeatMoviePaths, getAudioFilePath } from "../utils/file.js";
 import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
 import { renderHTMLToImage } from "../utils/markdown.js";
 import { GraphAILogger } from "graphai";
+import { beatId } from "../utils/utils.js";
 const htmlStyle = (context, beat) => {
   return {
     canvasSize: MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle),
@@ -40,8 +41,20 @@ export const imagePreprocessAgent = async (namedInputs) => {
     returnValue.lipSyncAgentName = lipSyncAgentInfo.agentName;
     returnValue.lipSyncModel = beat.lipSyncParams?.model ?? context.presentationStyle.lipSyncParams?.model ?? lipSyncAgentInfo.defaultModel;
     returnValue.lipSyncFile = moviePaths.lipSyncFile;
-    // Audio file will be set from the beat's audio file when available
-    returnValue.audioFile = studioBeat?.audioFile;
+    if (context.studio.script.audioParams?.suppressSpeech) {
+      returnValue.startAt = studioBeat?.startAt ?? 0;
+      returnValue.duration = studioBeat?.duration ?? 0;
+      returnValue.lipSyncTrimAudio = true;
+      returnValue.bgmFile = MulmoMediaSourceMethods.resolve(context.studio.script.audioParams.bgm, context);
+      const folderName = MulmoStudioContextMethods.getFileName(context);
+      const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
+      const fileName = `${beatId(beat.id, index)}_trimmed.mp3`;
+      returnValue.audioFile = getAudioFilePath(audioDirPath, folderName, fileName);
+    }
+    else {
+      // Audio file will be set from the beat's audio file when available
+      returnValue.audioFile = studioBeat?.audioFile;
+    }
   }
   if (beat.image) {
     const plugin = MulmoBeatMethods.getPlugin(beat);
@@ -10,7 +10,7 @@ import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../met
 import { getOutputStudioFilePath, mkdir } from "../utils/file.js";
 import { fileCacheAgentFilter } from "../utils/filters.js";
 import { settings2GraphAIConfig } from "../utils/utils.js";
-import { extractImageFromMovie, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
+import { extractImageFromMovie, ffmpegGetMediaDuration, trimMusic } from "../utils/ffmpeg_utils.js";
 import { getImageRefs } from "./image_references.js";
 import { imagePreprocessAgent, imagePluginAgent, htmlImageGeneratorAgent } from "./image_agents.js";
 const vanillaAgents = vanilla.default ?? vanilla;
@@ -224,11 +224,33 @@ const beat_graph_data = {
     },
     defaultValue: {},
   },
+  AudioTrimmer: {
+    if: ":preprocessor.lipSyncTrimAudio",
+    agent: async (namedInputs) => {
+      const buffer = await trimMusic(namedInputs.bgmFile, namedInputs.startAt, namedInputs.duration);
+      return { buffer };
+    },
+    inputs: {
+      audioFile: ":preprocessor.audioFile",
+      bgmFile: ":preprocessor.bgmFile",
+      startAt: ":preprocessor.startAt",
+      duration: ":preprocessor.duration",
+      cache: {
+        force: [":context.force"],
+        file: ":preprocessor.audioFile",
+        index: ":__mapIndex",
+        id: ":beat.id",
+        sessionType: "audioTrimmer",
+        mulmoContext: ":context",
+      },
+    },
+    defaultValue: {},
+  },
   lipSyncGenerator: {
     if: ":beat.enableLipSync",
     agent: ":preprocessor.lipSyncAgentName",
     inputs: {
-      onComplete: [":soundEffectGenerator"], // to wait for soundEffectGenerator to finish
+      onComplete: [":soundEffectGenerator", ":AudioTrimmer"], // to wait for soundEffectGenerator and AudioTrimmer to finish
       movieFile: ":preprocessor.movieFile",
       imageFile: ":preprocessor.referenceImageForMovie",
       audioFile: ":preprocessor.audioFile",
@@ -344,7 +366,7 @@ export const graphOption = async (context, settings) => {
     {
       name: "fileCacheAgentFilter",
       agent: fileCacheAgentFilter,
-      nodeIds: ["imageGenerator", "movieGenerator", "htmlImageAgent", "soundEffectGenerator", "lipSyncGenerator"],
+      nodeIds: ["imageGenerator", "movieGenerator", "htmlImageAgent", "soundEffectGenerator", "lipSyncGenerator", "AudioTrimmer"],
     },
   ],
   taskManager: new TaskManager(MulmoPresentationStyleMethods.getConcurrency(context.presentationStyle)),
@@ -50,4 +50,6 @@ export declare const getOutputMultilingualFilePathAndMkdir: (context: MulmoStudi
     outDirPath: string;
 };
 export declare const translateBeat: (index: number, context: MulmoStudioContext, targetLangs: string[], args?: PublicAPIArgs) => Promise<void>;
-export declare const translate: (context: MulmoStudioContext, args?: PublicAPIArgs) => Promise<MulmoStudioContext>;
+export declare const translate: (context: MulmoStudioContext, args?: PublicAPIArgs & {
+    targetLangs?: string[];
+}) => Promise<MulmoStudioContext>;
@@ -270,7 +270,9 @@ export const translate = async (context, args) => {
   try {
     MulmoStudioContextMethods.setSessionState(context, "multiLingual", true);
     const { outputMultilingualFilePath, outDirPath } = getOutputMultilingualFilePathAndMkdir(context);
-    const targetLangs = [...new Set([context.lang, context.studio.script.captionParams?.lang].filter((x) => !isNull(x)))];
+    const targetLangs = args?.targetLangs
+      ? args?.targetLangs
+      : [...new Set([context.lang, context.studio.script.captionParams?.lang].filter((x) => !isNull(x)))];
     const config = settings2GraphAIConfig(settings, process.env);
     assert(!!config?.openAIAgent?.apiKey, "The OPENAI_API_KEY environment variable is missing or empty");
     const graph = new GraphAI(translateGraph, { ...vanillaAgents, fileWriteAgent, openAIAgent }, { agentFilters, config });
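
With the widened signature, API callers can now pin the translation targets explicitly instead of relying on the set derived from `context.lang` and `captionParams?.lang`. A minimal sketch, reusing the hypothetical `loadContext` helper from above:

import { translate } from "mulmocast"; // root re-export assumed, as above

const context = await loadContext("scripts/music_video.json");
// Explicit targets take precedence over the derived [context.lang, captionParams?.lang] set.
const translated = await translate(context, { targetLangs: ["ja", "fr"] });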
@@ -9,7 +9,7 @@ export declare const MulmoPresentationStyleMethods: {
     getAllSpeechProviders(presentationStyle: MulmoPresentationStyle): Set<Text2SpeechProvider>;
     getTextSlideStyle(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
     getDefaultSpeaker(presentationStyle: MulmoPresentationStyle): string;
-    getSpeaker(context: MulmoStudioContext, beat: MulmoBeat): SpeakerData;
+    getSpeaker(context: MulmoStudioContext, beat: MulmoBeat, targetLang: string | undefined): SpeakerData;
     getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
     getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
     getMovieAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): {
@@ -50,13 +50,14 @@ export const MulmoPresentationStyleMethods = {
     }
     return keys[0];
   },
-  getSpeaker(context, beat) {
+  getSpeaker(context, beat, targetLang) {
     userAssert(!!context.presentationStyle?.speechParams?.speakers, "presentationStyle.speechParams.speakers is not set!!");
     const speakerId = beat?.speaker ?? MulmoPresentationStyleMethods.getDefaultSpeaker(context.presentationStyle);
     const speaker = context.presentationStyle.speechParams.speakers[speakerId];
     userAssert(!!speaker, `speaker is not set: speaker "${speakerId}"`);
-    // Check if the speaker has a language-specific version
-    const lang = context.lang ?? context.studio.script.lang;
+    // Check if the speaker has a language-specific version.
+    // Normally, lang is determined by the context, but lang may be specified when using the API.
+    const lang = targetLang ?? context.lang ?? context.studio.script.lang;
     if (speaker.lang && lang && speaker.lang[lang]) {
       return speaker.lang[lang];
     }
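
The effect of the new `targetLang` parameter is easiest to see against a speaker that carries per-language overrides. The fragment below follows the `speaker.lang[lang]` lookup in the code above; the exact shape of the override entries is an assumption for illustration:

// Hypothetical speechParams fragment. With this in place,
// getSpeaker(context, beat, "ja") resolves to the "ja" entry
// even when context.lang is "en".
const speechParams = {
  speakers: {
    Presenter: {
      voiceId: "shimmer", // default voice
      lang: {
        ja: { voiceId: "alloy" }, // assumed shape of a language-specific SpeakerData
      },
    },
  },
};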
@@ -16,3 +16,4 @@ export declare const ffmpegGetMediaDuration: (filePath: string) => Promise<{
     hasAudio: boolean;
 }>;
 export declare const extractImageFromMovie: (movieFile: string, imagePath: string) => Promise<object>;
+export declare const trimMusic: (inputFile: string, startTime: number, duration: number) => Promise<Buffer>;
@@ -89,3 +89,36 @@ export const extractImageFromMovie = (movieFile, imagePath) => {
       .run();
   });
 };
+export const trimMusic = (inputFile, startTime, duration) => {
+  return new Promise((resolve, reject) => {
+    if (!inputFile.startsWith("http://") && !inputFile.startsWith("https://") && !fs.existsSync(inputFile)) {
+      reject(new Error(`File not found: ${inputFile}`));
+      return;
+    }
+    if (duration <= 0) {
+      reject(new Error(`Invalid duration: duration (${duration}) must be greater than 0`));
+      return;
+    }
+    const chunks = [];
+    ffmpeg(inputFile)
+      .seekInput(startTime)
+      .duration(duration)
+      .format("mp3")
+      .on("start", () => {
+        GraphAILogger.log(`Trimming audio from ${startTime}s for ${duration}s...`);
+      })
+      .on("error", (err) => {
+        GraphAILogger.error("Error occurred while trimming audio:", err);
+        reject(err);
+      })
+      .on("end", () => {
+        const buffer = Buffer.concat(chunks);
+        GraphAILogger.log(`Audio trimmed successfully, buffer size: ${buffer.length} bytes`);
+        resolve(buffer);
+      })
+      .pipe()
+      .on("data", (chunk) => {
+        chunks.push(chunk);
+      });
+  });
+};
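
`trimMusic` resolves with the trimmed MP3 as an in-memory `Buffer` rather than writing a file, so persistence is the caller's job. A minimal usage sketch; the import path and file names are illustrative assumptions:

import fs from "node:fs";
import { trimMusic } from "mulmocast/lib/utils/ffmpeg_utils.js"; // import path is an assumption

// Cut 8.5 seconds of BGM starting at the 12-second mark, then persist it.
const buffer = await trimMusic("assets/bgm.mp3", 12, 8.5);
await fs.promises.writeFile("output/audio/beat_2_trimmed.mp3", buffer);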
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "mulmocast",
-  "version": "1.2.12",
+  "version": "1.2.14",
   "description": "",
   "type": "module",
   "main": "lib/index.node.js",
@@ -0,0 +1,82 @@
+{
+  "$mulmocast": {
+    "version": "1.1",
+    "credit": "closing"
+  },
+  "canvasSize": {
+    "width": 1536,
+    "height": 1024
+  },
+  "speechParams": {
+    "speakers": {
+      "Presenter": {
+        "displayName": {
+          "en": "Presenter"
+        },
+        "voiceId": "shimmer"
+      }
+    }
+  },
+  "imageParams": {
+    "provider": "openai",
+    "style": "<style>Vibrant 3D animation style inspired by K-pop aesthetics, with glossy, stylized characters. The overall visual style combines elements of modern animation, game cinematics, and fashion-forward character design, with sleek outlines, glowing effects, and a polished, cinematic finish.</style>",
+    "images": {
+      "min": {
+        "type": "image",
+        "source": {
+          "kind": "url",
+          "url": "https://raw.githubusercontent.com/receptron/mulmocast-media/refs/heads/main/characters/min_anime.png"
+        }
+      }
+    }
+  },
+  "movieParams": {
+    "provider": "replicate"
+  },
+  "soundEffectParams": {
+    "provider": "replicate"
+  },
+  "captionParams": {
+    "lang": "en",
+    "styles": ["font-size: 64px", "width: 90%", "padding-left: 5%", "padding-right: 5%"]
+  },
+  "audioParams": {
+    "padding": 0,
+    "introPadding": 0,
+    "closingPadding": 0,
+    "outroPadding": 0,
+    "bgm": {
+      "kind": "url",
+      "url": "https://raw.githubusercontent.com/receptron/mulmocast-media/refs/heads/main/music/finetuning_with_you.mp3"
+    },
+    "bgmVolume": 1,
+    "audioVolume": 0,
+    "suppressSpeech": true
+  },
+  "title": "Music Video",
+  "lang": "en",
+  "beats": [
+    {
+      "text": "Finetuning with you",
+      "duration": 7.0,
+      "image": {
+        "type": "textSlide",
+        "slide": {
+          "title": "Finetuning with you"
+        }
+      }
+    },
+    {
+      "text": "Whispers hide in silver rain. Every shadow calls your name.",
+      "duration": 9.32,
+      "imagePrompt": "Singer walking alone at night in neon-lit rainy street, holding a clear umbrella, raindrops sparkling, wearing a black mini dress with thigh-high boots, reflective puddles surrounding her.",
+      "enableLipSync": true
+    },
+    {
+      "text": "I dissolve into the night. Just to echo what you liked.",
+      "duration": 8.28,
+      "imagePrompt": "Singer standing against a glowing city skyline at night, hair blowing in wind, long white trench coat fluttering, reaching out with one hand as if fading into the background lights.",
+      "enableLipSync": true
+    }
+  ]
+}