mulmocast 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/lib/actions/audio.d.ts +0 -1
  2. package/lib/actions/audio.js +18 -13
  3. package/lib/actions/image_agents.d.ts +3 -12
  4. package/lib/actions/image_agents.js +12 -8
  5. package/lib/actions/images.js +3 -1
  6. package/lib/actions/movie.js +1 -3
  7. package/lib/actions/translate.js +13 -31
  8. package/lib/agents/image_openai_agent.js +4 -1
  9. package/lib/agents/lipsync_replicate_agent.js +10 -3
  10. package/lib/cli/commands/audio/handler.js +1 -1
  11. package/lib/cli/commands/image/handler.js +1 -1
  12. package/lib/cli/commands/movie/handler.js +1 -1
  13. package/lib/cli/commands/pdf/handler.js +1 -1
  14. package/lib/cli/helpers.d.ts +1 -4
  15. package/lib/cli/helpers.js +3 -2
  16. package/lib/mcp/server.js +1 -1
  17. package/lib/methods/mulmo_presentation_style.d.ts +5 -5
  18. package/lib/methods/mulmo_presentation_style.js +14 -8
  19. package/lib/methods/mulmo_script.js +4 -1
  20. package/lib/methods/mulmo_studio_context.d.ts +1 -0
  21. package/lib/methods/mulmo_studio_context.js +8 -0
  22. package/lib/types/agent.d.ts +4 -0
  23. package/lib/types/schema.d.ts +712 -8
  24. package/lib/types/schema.js +6 -2
  25. package/lib/types/type.d.ts +1 -1
  26. package/lib/utils/const.js +1 -1
  27. package/lib/utils/context.d.ts +401 -34
  28. package/lib/utils/context.js +95 -56
  29. package/lib/utils/file.d.ts +1 -1
  30. package/lib/utils/file.js +5 -2
  31. package/lib/utils/filters.d.ts +1 -0
  32. package/lib/utils/filters.js +8 -0
  33. package/lib/utils/preprocess.d.ts +15 -2
  34. package/lib/utils/preprocess.js +3 -3
  35. package/lib/utils/provider2agent.d.ts +3 -2
  36. package/lib/utils/provider2agent.js +20 -2
  37. package/lib/utils/string.d.ts +1 -1
  38. package/lib/utils/string.js +11 -8
  39. package/package.json +2 -1
  40. package/scripts/templates/image_refs.json +1 -0
  41. package/scripts/templates/voice_over.json +1 -0
  42. package/scripts/test/gpt.json +33 -0
  43. package/scripts/test/mulmo_story.json +11 -0
  44. package/scripts/test/test.json +64 -0
  45. package/scripts/test/test1.json +41 -0
  46. package/scripts/test/test2.json +66 -0
  47. package/scripts/test/test_audio.json +152 -0
  48. package/scripts/test/test_audio_instructions.json +70 -0
  49. package/scripts/test/test_beats.json +59 -0
  50. package/scripts/test/test_captions.json +53 -0
  51. package/scripts/test/test_elevenlabs_models.json +194 -0
  52. package/scripts/test/test_en.json +29 -0
  53. package/scripts/test/test_hello.json +18 -0
  54. package/scripts/test/test_hello_google.json +26 -0
  55. package/scripts/test/test_html.json +67 -0
  56. package/scripts/test/test_image_refs.json +50 -0
  57. package/scripts/test/test_images.json +49 -0
  58. package/scripts/test/test_lang.json +87 -0
  59. package/scripts/test/test_layout.json +153 -0
  60. package/scripts/test/test_lipsync.json +62 -0
  61. package/scripts/test/test_loop.json +35 -0
  62. package/scripts/test/test_media.json +245 -0
  63. package/scripts/test/test_mixed_providers.json +92 -0
  64. package/scripts/test/test_movie.json +40 -0
  65. package/scripts/test/test_no_audio.json +253 -0
  66. package/scripts/test/test_no_audio_with_credit.json +254 -0
  67. package/scripts/test/test_order.json +69 -0
  68. package/scripts/test/test_order_portrait.json +73 -0
  69. package/scripts/test/test_replicate.json +145 -0
  70. package/scripts/test/test_slideout_left_no_audio.json +46 -0
  71. package/scripts/test/test_sound_effect.json +41 -0
  72. package/scripts/test/test_spillover.json +117 -0
  73. package/scripts/test/test_transition.json +56 -0
  74. package/scripts/test/test_transition_no_audio.json +46 -0
  75. package/scripts/test/test_video_speed.json +81 -0
  76. package/scripts/test/test_voice_over.json +105 -0
  77. package/scripts/test/test_voices.json +55 -0
package/lib/actions/audio.d.ts CHANGED
@@ -2,6 +2,5 @@ import "dotenv/config";
  import type { CallbackFunction } from "graphai";
  import { MulmoStudioContext, MulmoBeat } from "../types/index.js";
  export declare const getBeatAudioPath: (text: string, context: MulmoStudioContext, beat: MulmoBeat, lang?: string) => string | undefined;
- export declare const audioFilePath: (context: MulmoStudioContext) => string;
  export declare const generateBeatAudio: (index: number, context: MulmoStudioContext, settings?: Record<string, string>, callbacks?: CallbackFunction[]) => Promise<void>;
  export declare const audio: (context: MulmoStudioContext, settings?: Record<string, string>, callbacks?: CallbackFunction[]) => Promise<MulmoStudioContext>;
package/lib/actions/audio.js CHANGED
@@ -9,8 +9,8 @@ import ttsGoogleAgent from "../agents/tts_google_agent.js";
  import ttsElevenlabsAgent from "../agents/tts_elevenlabs_agent.js";
  import { fileWriteAgent } from "@graphai/vanilla_node_agents";
  import { MulmoPresentationStyleMethods } from "../methods/index.js";
- import { text2SpeechProviderSchema, } from "../types/index.js";
- import { fileCacheAgentFilter } from "../utils/filters.js";
+ import { text2SpeechProviderSchema } from "../types/index.js";
+ import { fileCacheAgentFilter, nijovoiceTextAgentFilter } from "../utils/filters.js";
  import { getAudioArtifactFilePath, getAudioFilePath, getOutputStudioFilePath, resolveDirPath, defaultBGMPath, mkdir, writingMessage } from "../utils/file.js";
  import { text2hash, localizedText, settings2GraphAIConfig } from "../utils/utils.js";
  import { provider2TTSAgent } from "../utils/provider2agent.js";
@@ -30,15 +30,15 @@ const getAudioPath = (context, beat, audioFile) => {
  }
  return audioFile;
  };
- const getAudioParam = (presentationStyle, beat) => {
- const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
+ const getAudioParam = (context, beat) => {
+ const speaker = MulmoPresentationStyleMethods.getSpeaker(context, beat);
  const speechOptions = { ...speaker.speechOptions, ...beat.speechOptions };
  const provider = text2SpeechProviderSchema.parse(speaker.provider);
  return { voiceId: speaker.voiceId, provider, speechOptions, model: speaker.model };
  };
  export const getBeatAudioPath = (text, context, beat, lang) => {
  const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
- const { voiceId, provider, speechOptions, model } = getAudioParam(context.presentationStyle, beat);
+ const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat);
  const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider, model ?? ""].join(":");
  const audioFileName = `${context.studio.filename}_${text2hash(hash_string)}`;
  const audioFile = getAudioFilePath(audioDirPath, context.studio.filename, audioFileName, lang);
@@ -46,9 +46,9 @@ export const getBeatAudioPath = (text, context, beat, lang) => {
  };
  const preprocessor = (namedInputs) => {
  const { beat, studioBeat, multiLingual, context } = namedInputs;
- const { lang, presentationStyle } = context;
+ const { lang } = context;
  const text = localizedText(beat, multiLingual, lang);
- const { voiceId, provider, speechOptions, model } = getAudioParam(presentationStyle, beat);
+ const { voiceId, provider, speechOptions, model } = getAudioParam(context, beat);
  const audioPath = getBeatAudioPath(text, context, beat, lang);
  studioBeat.audioFile = audioPath; // TODO: Passing by reference is difficult to maintain, so pass it using graphai inputs
  const needsTTS = !beat.audio && audioPath !== undefined;
@@ -58,6 +58,8 @@ const preprocessor = (namedInputs) => {
  voiceId,
  speechOptions,
  model,
+ provider,
+ lang,
  audioPath,
  studioBeat,
  needsTTS,
@@ -84,6 +86,8 @@ const graph_tts = {
  agent: ":preprocessor.ttsAgent",
  inputs: {
  text: ":preprocessor.text",
+ provider: ":preprocessor.provider",
+ lang: ":preprocessor.lang",
  cache: {
  force: [":context.force"],
  file: ":preprocessor.audioPath",
@@ -173,12 +177,12 @@ const agentFilters = [
  agent: fileCacheAgentFilter,
  nodeIds: ["tts"],
  },
+ {
+ name: "nijovoiceTextAgentFilter",
+ agent: nijovoiceTextAgentFilter,
+ nodeIds: ["tts"],
+ },
  ];
- export const audioFilePath = (context) => {
- const fileName = MulmoStudioContextMethods.getFileName(context);
- const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
- return getAudioArtifactFilePath(outDirPath, fileName);
- };
  const getConcurrency = (context) => {
  // Check if any speaker uses nijivoice or elevenlabs (providers that require concurrency = 1)
  const hasLimitedConcurrencyProvider = Object.values(context.presentationStyle.speechParams.speakers).some((speaker) => {
@@ -231,7 +235,7 @@ export const audio = async (context, settings, callbacks) => {
  const fileName = MulmoStudioContextMethods.getFileName(context);
  const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
  const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
- const audioArtifactFilePath = audioFilePath(context);
+ const audioArtifactFilePath = getAudioArtifactFilePath(context);
  const audioSegmentDirPath = resolveDirPath(audioDirPath, fileName);
  const audioCombinedFilePath = getAudioFilePath(audioDirPath, fileName, fileName, context.lang);
  const outputStudioFilePath = getOutputStudioFilePath(outDirPath, fileName);
@@ -253,6 +257,7 @@ export const audio = async (context, settings, callbacks) => {
  const result = await graph.run();
  writingMessage(audioCombinedFilePath);
  MulmoStudioContextMethods.setSessionState(context, "audio", false);
+ writingMessage(audioArtifactFilePath);
  return result.combineFiles;
  }
  catch (__error) {
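
Note on getBeatAudioPath: the cache file name is a hash over the text plus every TTS-relevant setting, which is why getAudioParam gaining language-aware speaker resolution (it now receives the whole context) automatically invalidates cached audio whenever the resolved voice changes. A minimal TypeScript sketch of the same keying idea, with text2hash assumed to be a stable content hash:

    import { createHash } from "node:crypto";

    // Stand-in for the package's text2hash helper (assumed to be a stable hash).
    const text2hash = (s: string): string => createHash("sha256").update(s).digest("hex").slice(0, 16);

    // Mirrors hash_string above: any change to voice, instruction, speed,
    // provider, or model yields a new cache file name, so stale audio is
    // never reused after a settings change.
    const beatAudioCacheKey = (text: string, voiceId: string, provider: string, speechOptions?: { instruction?: string; speed?: number }, model?: string): string =>
      text2hash([text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider, model ?? ""].join(":"));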
package/lib/actions/image_agents.d.ts CHANGED
@@ -23,10 +23,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  };
  lipSyncFile?: string;
  lipSyncModel?: string;
- lipSyncAgentInfo?: {
- agentName: string;
- defaultModel: string;
- };
+ lipSyncAgentName?: string;
  audioFile?: string;
  beatDuration?: number;
  htmlPrompt?: undefined;
@@ -61,10 +58,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  };
  lipSyncFile?: string;
  lipSyncModel?: string;
- lipSyncAgentInfo?: {
- agentName: string;
- defaultModel: string;
- };
+ lipSyncAgentName?: string;
  audioFile?: string;
  beatDuration?: number;
  htmlPrompt?: undefined;
@@ -102,10 +96,7 @@ export declare const imagePreprocessAgent: (namedInputs: {
  };
  lipSyncFile?: string;
  lipSyncModel?: string;
- lipSyncAgentInfo?: {
- agentName: string;
- defaultModel: string;
- };
+ lipSyncAgentName?: string;
  audioFile?: string;
  beatDuration?: number;
  htmlPrompt?: undefined;
package/lib/actions/image_agents.js CHANGED
@@ -25,16 +25,20 @@ export const imagePreprocessAgent = async (namedInputs) => {
  movieFile: beat.moviePrompt ? moviePaths.movieFile : undefined,
  beatDuration: beat.duration ?? studioBeat?.duration,
  };
- if (beat.soundEffectPrompt) {
- returnValue.soundEffectAgentInfo = MulmoPresentationStyleMethods.getSoundEffectAgentInfo(context.presentationStyle, beat);
- returnValue.soundEffectModel =
- beat.soundEffectParams?.model ?? context.presentationStyle.soundEffectParams?.model ?? returnValue.soundEffectAgentInfo.defaultModel;
- returnValue.soundEffectFile = moviePaths.soundEffectFile;
- returnValue.soundEffectPrompt = beat.soundEffectPrompt;
+ const isMovie = Boolean(beat.moviePrompt || beat?.image?.type === "movie");
+ if (isMovie) {
+ if (beat.soundEffectPrompt) {
+ returnValue.soundEffectAgentInfo = MulmoPresentationStyleMethods.getSoundEffectAgentInfo(context.presentationStyle, beat);
+ returnValue.soundEffectModel =
+ beat.soundEffectParams?.model ?? context.presentationStyle.soundEffectParams?.model ?? returnValue.soundEffectAgentInfo.defaultModel;
+ returnValue.soundEffectFile = moviePaths.soundEffectFile;
+ returnValue.soundEffectPrompt = beat.soundEffectPrompt;
+ }
  }
  if (beat.enableLipSync) {
- returnValue.lipSyncAgentInfo = MulmoPresentationStyleMethods.getLipSyncAgentInfo(context.presentationStyle, beat);
- returnValue.lipSyncModel = beat.lipSyncParams?.model ?? context.presentationStyle.lipSyncParams?.model ?? returnValue.lipSyncAgentInfo.defaultModel;
+ const lipSyncAgentInfo = MulmoPresentationStyleMethods.getLipSyncAgentInfo(context.presentationStyle, beat);
+ returnValue.lipSyncAgentName = lipSyncAgentInfo.agentName;
+ returnValue.lipSyncModel = beat.lipSyncParams?.model ?? context.presentationStyle.lipSyncParams?.model ?? lipSyncAgentInfo.defaultModel;
  returnValue.lipSyncFile = moviePaths.lipSyncFile;
  // Audio file will be set from the beat's audio file when available
  returnValue.audioFile = studioBeat?.audioFile;
package/lib/actions/images.js CHANGED
@@ -135,6 +135,7 @@ const beat_graph_data = {
  model: ":preprocessor.imageParams.model",
  moderation: ":preprocessor.imageParams.moderation",
  canvasSize: ":context.presentationStyle.canvasSize",
+ quality: ":preprocessor.imageParams.quality",
  },
  },
  defaultValue: {},
@@ -217,10 +218,11 @@ const beat_graph_data = {
  },
  lipSyncGenerator: {
  if: ":beat.enableLipSync",
- agent: ":preprocessor.lipSyncAgentInfo.agentName",
+ agent: ":preprocessor.lipSyncAgentName",
  inputs: {
  onComplete: [":soundEffectGenerator"], // to wait for soundEffectGenerator to finish
  movieFile: ":preprocessor.movieFile",
+ imageFile: ":preprocessor.referenceImageForMovie",
  audioFile: ":preprocessor.audioFile",
  lipSyncFile: ":preprocessor.lipSyncFile",
  params: {
package/lib/actions/movie.js CHANGED
@@ -246,9 +246,7 @@ export const movieFilePath = (context) => {
  export const movie = async (context) => {
  MulmoStudioContextMethods.setSessionState(context, "video", true);
  try {
- const fileName = MulmoStudioContextMethods.getFileName(context);
- const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
- const audioArtifactFilePath = getAudioArtifactFilePath(outDirPath, fileName);
+ const audioArtifactFilePath = getAudioArtifactFilePath(context);
  const outputVideoPath = movieFilePath(context);
  if (await createVideo(audioArtifactFilePath, outputVideoPath, context)) {
  writingMessage(outputVideoPath);
package/lib/actions/translate.js CHANGED
@@ -1,9 +1,9 @@
  import "dotenv/config";
- import { GraphAI, assert } from "graphai";
+ import { GraphAI, assert, isNull } from "graphai";
  import * as agents from "@graphai/vanilla";
  import { openAIAgent } from "@graphai/openai_agent";
  import { fileWriteAgent } from "@graphai/vanilla_node_agents";
- import { recursiveSplitJa, replacementsJa, replacePairsJa } from "../utils/string.js";
+ import { recursiveSplitJa } from "../utils/string.js";
  import { settings2GraphAIConfig } from "../utils/utils.js";
  import { getOutputMultilingualFilePath, mkdir, writingMessage } from "../utils/file.js";
  import { translateSystemPrompt, translatePrompts } from "../utils/prompt.js";
@@ -13,17 +13,9 @@ const translateGraph = {
  version: 0.5,
  nodes: {
  context: {},
- defaultLang: {},
  outDirPath: {},
  outputMultilingualFilePath: {},
- lang: {
- agent: "stringUpdateTextAgent",
- inputs: {
- newText: ":context.studio.script.lang",
- oldText: ":defaultLang",
- },
- },
- targetLangs: {}, // TODO
+ targetLangs: {},
  mergeStudioResult: {
  isResult: true,
  agent: "mergeObjectAgent",
@@ -37,7 +29,6 @@ const translateGraph = {
  targetLangs: ":targetLangs",
  context: ":context",
  rows: ":context.studio.script.beats",
- lang: ":lang",
  },
  params: {
  rowKey: "beat",
@@ -62,7 +53,7 @@ const translateGraph = {
  beat: ":beat",
  multiLingual: ":multiLingual",
  rows: ":targetLangs",
- lang: ":lang.text",
+ lang: ":context.studio.script.lang",
  context: ":context",
  beatIndex: ":__mapIndex",
  },
@@ -120,17 +111,11 @@ const translateGraph = {
  },
  ttsTexts: {
  agent: (namedInputs) => {
- const { localizedText, targetLang } = namedInputs;
+ const { localizedText } = namedInputs;
  // cache
  if (localizedText.ttsTexts) {
  return localizedText;
  }
- if (targetLang === "ja") {
- return {
- ...localizedText,
- ttsTexts: localizedText?.texts?.map((text) => replacePairsJa(text, replacementsJa)),
- };
- }
  return {
  ...localizedText,
  ttsTexts: localizedText.texts,
@@ -180,18 +165,14 @@ const localizedTextCacheAgentFilter = async (context, next) => {
  if (!beat.text) {
  return { text: "" };
  }
- // The original text is unchanged and the target language text is present
- if (multiLingual.multiLingualTexts &&
- multiLingual.multiLingualTexts[lang] &&
- multiLingual.multiLingualTexts[lang].text === beat.text &&
- multiLingual.multiLingualTexts[targetLang] &&
- multiLingual.multiLingualTexts[targetLang].text) {
- return { text: multiLingual.multiLingualTexts[targetLang].text };
- }
  // same language
  if (targetLang === lang) {
  return { text: beat.text };
  }
+ // The original text is unchanged and the target language text is present
+ if (multiLingual.multiLingualTexts?.[lang]?.text === beat.text && multiLingual.multiLingualTexts[targetLang]?.text) {
+ return { text: multiLingual.multiLingualTexts[targetLang].text };
+ }
  try {
  MulmoStudioContextMethods.setBeatSessionState(mulmoContext, "multiLingual", beatIndex, true);
  return await next(context);
@@ -207,8 +188,6 @@ const agentFilters = [
  nodeIds: ["localizedTexts"],
  },
  ];
- const defaultLang = "en";
- const targetLangs = ["ja", "en"];
  export const translate = async (context, args) => {
  const { settings, callbacks } = args ?? {};
  try {
@@ -217,11 +196,14 @@ export const translate = async (context, args) => {
  const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
  const outputMultilingualFilePath = getOutputMultilingualFilePath(outDirPath, fileName);
  mkdir(outDirPath);
+ const langs = (context.multiLingual ?? []).map((x) => Object.keys(x.multiLingualTexts)).flat(); // existing langs in multiLingual
+ const targetLangs = [
+ ...new Set([context.studio.script.lang, langs, context.lang, context.studio.script.captionParams?.lang].flat().filter((x) => !isNull(x))),
+ ];
  const config = settings2GraphAIConfig(settings, process.env);
  assert(!!config?.openAIAgent?.apiKey, "The OPENAI_API_KEY environment variable is missing or empty");
  const graph = new GraphAI(translateGraph, { ...vanillaAgents, fileWriteAgent, openAIAgent }, { agentFilters, config });
  graph.injectValue("context", context);
- graph.injectValue("defaultLang", defaultLang);
  graph.injectValue("targetLangs", targetLangs);
  graph.injectValue("outDirPath", outDirPath);
  graph.injectValue("outputMultilingualFilePath", outputMultilingualFilePath);
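
The fixed defaultLang/targetLangs pair is gone: target languages are now computed per run from the script's language, any languages already present in the multilingual data, the session language, and the caption language. A standalone TypeScript sketch of that derivation, with the context shape simplified to plain parameters:

    type MultiLingualEntry = { multiLingualTexts: Record<string, { text?: string }> };

    const deriveTargetLangs = (scriptLang: string, existing: MultiLingualEntry[], sessionLang?: string, captionLang?: string): string[] => {
      // Languages that already have translations stay in the target set so they are kept fresh.
      const existingLangs = existing.flatMap((x) => Object.keys(x.multiLingualTexts));
      // Set-based dedup; the filter drops undefined (unset) languages.
      return [...new Set([scriptLang, ...existingLangs, sessionLang, captionLang].filter((x): x is string => x != null))];
    };

    // deriveTargetLangs("en", [{ multiLingualTexts: { ja: { text: "…" } } }], "fr")
    // => ["en", "ja", "fr"]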
package/lib/agents/image_openai_agent.js CHANGED
@@ -6,7 +6,7 @@ import { provider2ImageAgent } from "../utils/provider2agent.js";
  // https://platform.openai.com/docs/guides/image-generation
  export const imageOpenaiAgent = async ({ namedInputs, params, config, }) => {
  const { prompt, referenceImages } = namedInputs;
- const { moderation, canvasSize } = params;
+ const { moderation, canvasSize, quality } = params;
  const { apiKey, baseURL } = { ...config };
  const model = params.model ?? provider2ImageAgent["openai"].defaultModel;
  const openai = new OpenAI({ apiKey, baseURL });
@@ -42,6 +42,9 @@ export const imageOpenaiAgent = async ({ namedInputs, params, config, }) => {
  };
  if (model === "gpt-image-1") {
  imageOptions.moderation = moderation || "auto";
+ if (quality) {
+ imageOptions.quality = quality;
+ }
  }
  const response = await (async () => {
  try {
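
Per the new OpenAIImageQuality type added in agent.d.ts (last section below), quality is forwarded only when the model is gpt-image-1. Judging from the ":preprocessor.imageParams.quality" wiring in images.js above, a script's imageParams would plausibly carry it like this; the exact schema placement is an assumption, not confirmed by this diff:

    // Hypothetical imageParams block; "quality" reaches imageOpenaiAgent as
    // params.quality and is applied only for gpt-image-1.
    const imageParams = {
      provider: "openai",
      model: "gpt-image-1",
      quality: "high", // "low" | "medium" | "high" | "auto"
      moderation: "low",
    };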
package/lib/agents/lipsync_replicate_agent.js CHANGED
@@ -3,7 +3,7 @@ import { GraphAILogger } from "graphai";
  import Replicate from "replicate";
  import { provider2LipSyncAgent } from "../utils/provider2agent.js";
  export const lipSyncReplicateAgent = async ({ namedInputs, params, config, }) => {
- const { movieFile, audioFile } = namedInputs;
+ const { movieFile, audioFile, imageFile } = namedInputs;
  const apiKey = config?.apiKey;
  const model = params.model ?? provider2LipSyncAgent.replicate.defaultModel;
  if (!apiKey) {
@@ -12,10 +12,12 @@ export const lipSyncReplicateAgent = async ({ namedInputs, params, config, }) =>
  const replicate = new Replicate({
  auth: apiKey,
  });
- const videoBuffer = readFileSync(movieFile);
+ const videoBuffer = movieFile ? readFileSync(movieFile) : undefined;
  const audioBuffer = readFileSync(audioFile);
- const videoUri = `data:video/quicktime;base64,${videoBuffer.toString("base64")}`;
+ const imageBuffer = imageFile ? readFileSync(imageFile) : undefined;
+ const videoUri = videoBuffer ? `data:video/quicktime;base64,${videoBuffer.toString("base64")}` : undefined;
  const audioUri = `data:audio/wav;base64,${audioBuffer.toString("base64")}`;
+ const imageUri = imageBuffer ? `data:image/png;base64,${imageBuffer.toString("base64")}` : undefined;
  const input = {
  video: undefined,
  video_input: undefined,
@@ -23,6 +25,7 @@ export const lipSyncReplicateAgent = async ({ namedInputs, params, config, }) =>
  audio: undefined,
  audio_input: undefined,
  audio_file: undefined,
+ image: undefined,
  };
  const modelParams = provider2LipSyncAgent.replicate.modelParams[model];
  if (!modelParams) {
@@ -30,12 +33,16 @@ export const lipSyncReplicateAgent = async ({ namedInputs, params, config, }) =>
  }
  const videoParam = modelParams.video;
  const audioParam = modelParams.audio;
+ const imageParam = modelParams.image;
  if (videoParam === "video" || videoParam === "video_input" || videoParam === "video_url") {
  input[videoParam] = videoUri;
  }
  if (audioParam === "audio" || audioParam === "audio_input" || audioParam === "audio_file") {
  input[audioParam] = audioUri;
  }
+ if (imageParam === "image") {
+ input[imageParam] = imageUri;
+ }
  const model_identifier = provider2LipSyncAgent.replicate.modelParams[model]?.identifier ?? model;
  try {
  const output = await replicate.run(model_identifier, {
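
The agent now treats the source video as optional and accepts a still image, so image-driven lip-sync models on Replicate can animate a single frame. Each model declares which input field names it expects via modelParams; a sketch of that dispatch idea in TypeScript (the table shape follows the .d.ts hunk below, the helper itself is illustrative):

    // Each Replicate lip-sync model names its inputs differently; the table maps
    // our canonical inputs onto the model's expected field names.
    type LipSyncModelParams = { video?: string; audio: string; image?: string };

    const buildReplicateInput = (p: LipSyncModelParams, uris: { video?: string; audio: string; image?: string }) => {
      const input: Record<string, string | undefined> = {};
      if (p.video && uris.video) input[p.video] = uris.video; // e.g. "video" or "video_input"
      input[p.audio] = uris.audio;                            // e.g. "audio" or "audio_file"
      if (p.image && uris.image) input[p.image] = uris.image; // image-driven models animate a still
      return input;
    };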
package/lib/cli/commands/audio/handler.js CHANGED
@@ -5,6 +5,6 @@ export const handler = async (argv) => {
  if (!context) {
  process.exit(1);
  }
- await runTranslateIfNeeded(context, argv);
+ await runTranslateIfNeeded(context);
  await audio(context);
  };
package/lib/cli/commands/image/handler.js CHANGED
@@ -5,6 +5,6 @@ export const handler = async (argv) => {
  if (!context) {
  process.exit(1);
  }
- await runTranslateIfNeeded(context, argv);
+ await runTranslateIfNeeded(context);
  await images(context);
  };
package/lib/cli/commands/movie/handler.js CHANGED
@@ -5,6 +5,6 @@ export const handler = async (argv) => {
  if (!context) {
  process.exit(1);
  }
- await runTranslateIfNeeded(context, argv);
+ await runTranslateIfNeeded(context, true);
  await audio(context).then(images).then(captions).then(movie);
  };
package/lib/cli/commands/pdf/handler.js CHANGED
@@ -5,7 +5,7 @@ export const handler = async (argv) => {
  if (!context) {
  process.exit(1);
  }
- await runTranslateIfNeeded(context, argv);
+ await runTranslateIfNeeded(context);
  await images(context);
  await pdf(context, argv.pdf_mode, argv.pdf_size);
  };
package/lib/cli/helpers.d.ts CHANGED
@@ -1,9 +1,6 @@
  import type { CliArgs } from "../types/cli_types.js";
  import { FileObject, InitOptions, MulmoStudioContext } from "../types/index.js";
- export declare const runTranslateIfNeeded: (context: MulmoStudioContext, argv: {
- l?: string;
- c?: string;
- }) => Promise<void>;
+ export declare const runTranslateIfNeeded: (context: MulmoStudioContext, includeCaption?: boolean) => Promise<void>;
  export declare const setGraphAILogger: (verbose: boolean | undefined, logValues?: Record<string, unknown>) => void;
  export declare const getFileObject: (args: {
  basedir?: string;
package/lib/cli/helpers.js CHANGED
@@ -5,10 +5,11 @@ import clipboardy from "clipboardy";
  import { getBaseDirPath, getFullPath, getOutputStudioFilePath, resolveDirPath, mkdir, getOutputMultilingualFilePath, generateTimestampedFileName, } from "../utils/file.js";
  import { isHttp } from "../utils/utils.js";
  import { outDirName, imageDirName, audioDirName } from "../utils/const.js";
+ import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  import { translate } from "../actions/translate.js";
  import { initializeContextFromFiles } from "../utils/context.js";
- export const runTranslateIfNeeded = async (context, argv) => {
- if (argv.l || context.studio.script.captionParams?.lang) {
+ export const runTranslateIfNeeded = async (context, includeCaption = false) => {
+ if (MulmoStudioContextMethods.needTranslate(context, includeCaption)) {
  GraphAILogger.log("run translate");
  await translate(context);
  }
package/lib/mcp/server.js CHANGED
@@ -104,7 +104,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
  throw new Error("Failed to initialize context from MulmoScript");
  }
  // Run translation if needed
- await runTranslateIfNeeded(context, argv);
+ await runTranslateIfNeeded(context);
  // Execute the requested command
  switch (cmd) {
  case "movie":
package/lib/methods/mulmo_presentation_style.d.ts CHANGED
@@ -1,12 +1,11 @@
  import "dotenv/config";
- import { MulmoCanvasDimension, MulmoBeat, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData, Text2ImageProvider } from "../types/index.js";
+ import { MulmoCanvasDimension, MulmoBeat, Text2SpeechProvider, Text2ImageAgentInfo, Text2HtmlAgentInfo, BeatMediaType, MulmoPresentationStyle, SpeakerData, Text2ImageProvider, MulmoStudioContext } from "../types/index.js";
  export declare const MulmoPresentationStyleMethods: {
  getCanvasSize(presentationStyle: MulmoPresentationStyle): MulmoCanvasDimension;
  getAllSpeechProviders(presentationStyle: MulmoPresentationStyle): Set<Text2SpeechProvider>;
  getTextSlideStyle(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
  getDefaultSpeaker(presentationStyle: MulmoPresentationStyle): string;
- getSpeaker(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeakerData;
- getTTSModel(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string | undefined;
+ getSpeaker(context: MulmoStudioContext, beat: MulmoBeat): SpeakerData;
  getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
  getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
  getMovieAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): {
@@ -37,9 +36,10 @@ export declare const MulmoPresentationStyleMethods: {
  defaultModel: import("../utils/provider2agent.js").ReplicateModel;
  models: import("../utils/provider2agent.js").ReplicateModel[];
  modelParams: Record<import("../utils/provider2agent.js").ReplicateModel, {
- identifier?: `${string}/${string}:${string}`;
- video: string;
+ identifier?: `${string}/${string}:${string}` | `${string}/${string}`;
+ video?: string;
  audio: string;
+ image?: string;
  }>;
  };
  getConcurrency(presentationStyle: MulmoPresentationStyle): 4 | 16;
package/lib/methods/mulmo_presentation_style.js CHANGED
@@ -46,18 +46,24 @@ export const MulmoPresentationStyleMethods = {
  }
  return keys[0];
  },
- getSpeaker(presentationStyle, beat) {
- userAssert(!!presentationStyle?.speechParams?.speakers, "presentationStyle.speechParams.speakers is not set!!");
- const speakerId = beat?.speaker ?? MulmoPresentationStyleMethods.getDefaultSpeaker(presentationStyle);
- userAssert(!!speakerId, "beat.speaker and default speaker is not set");
- const speaker = presentationStyle.speechParams.speakers[speakerId];
+ getSpeaker(context, beat) {
+ userAssert(!!context.presentationStyle?.speechParams?.speakers, "presentationStyle.speechParams.speakers is not set!!");
+ const speakerId = beat?.speaker ?? MulmoPresentationStyleMethods.getDefaultSpeaker(context.presentationStyle);
+ const speaker = context.presentationStyle.speechParams.speakers[speakerId];
  userAssert(!!speaker, `speaker is not set: speaker "${speakerId}"`);
+ // Check if the speaker has a language-specific version
+ const lang = context.lang ?? context.studio.script.lang;
+ if (speaker.lang && lang && speaker.lang[lang]) {
+ return speaker.lang[lang];
+ }
  return speaker;
  },
- getTTSModel(presentationStyle, beat) {
- const speaker = MulmoPresentationStyleMethods.getSpeaker(presentationStyle, beat);
- return speaker.model;
+ /* NOTE: This method is not used.
+ getTTSModel(context: MulmoStudioContext, beat: MulmoBeat): string | undefined {
+ const speaker = MulmoPresentationStyleMethods.getSpeaker(context, beat);
+ return speaker.model;
  },
+ */
  getText2ImageProvider(provider) {
  return text2ImageProviderSchema.parse(provider);
  },
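
getSpeaker is now context-aware: after resolving the speaker, it checks for a language-specific variant keyed by the session language (falling back to the script language). A hypothetical speakers config showing the shape this lookup assumes; the fields of a variant beyond provider/voiceId are not confirmed by this diff:

    // Hypothetical speechParams.speakers entry. With context.lang === "ja",
    // getSpeaker returns the nested ja variant instead of the base speaker.
    const speakers = {
      Presenter: {
        provider: "openai",
        voiceId: "shimmer",
        lang: {
          ja: { provider: "nijivoice", voiceId: "hypothetical-ja-voice-id" },
        },
      },
    };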
package/lib/methods/mulmo_script.js CHANGED
@@ -18,6 +18,9 @@ const validators = [{ from: "1.0", to: "1.1", validator: validate_1_0 }];
  export const MulmoScriptMethods = {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  validate(script) {
+ const version = script.$mulmocast.version;
+ // lang was optional in 1.0 and 1.1
+ const defaultLang = version === "1.0" || version === "1.1" ? { lang: "en" } : {};
  const validatedScript = validators.reduce((acc, validator) => {
  if (acc.$mulmocast.version === validator.from) {
  const validated = validator.validator(acc);
@@ -25,7 +28,7 @@ export const MulmoScriptMethods = {
  return validated;
  }
  return acc;
- }, script);
+ }, { ...defaultLang, ...script });
  return mulmoScriptSchema.parse(validatedScript);
  },
  };
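
Since the script is spread after defaultLang, an explicit lang in a 1.0/1.1 script still wins; only scripts that omit the field are backfilled with "en" before validation. The merge semantics in isolation:

    const defaultLang = { lang: "en" };
    // The script's own value overrides the default…
    const a = { ...defaultLang, ...{ lang: "ja" } }; // a.lang === "ja"
    // …and the default only fills a missing field.
    const b = { ...defaultLang, ...{} };             // b.lang === "en"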
package/lib/methods/mulmo_studio_context.d.ts CHANGED
@@ -11,4 +11,5 @@ export declare const MulmoStudioContextMethods: {
  getCaption(context: MulmoStudioContext): string | undefined;
  setSessionState(context: MulmoStudioContext, sessionType: SessionType, value: boolean): void;
  setBeatSessionState(context: MulmoStudioContext, sessionType: BeatSessionType, index: number, value: boolean): void;
+ needTranslate(context: MulmoStudioContext, includeCaption?: boolean): boolean | "" | undefined;
  };
package/lib/methods/mulmo_studio_context.js CHANGED
@@ -63,4 +63,12 @@ export const MulmoStudioContextMethods = {
  }
  notifyBeatStateChange(context, sessionType, index);
  },
+ needTranslate(context, includeCaption = false) {
+ // context.studio.script.lang = defaultLang, context.lang = targetLanguage.
+ if (includeCaption) {
+ return (context.studio.script.lang !== context.lang ||
+ (context.studio.script.captionParams?.lang && context.studio.script.lang !== context.studio.script.captionParams?.lang));
+ }
+ return context.studio.script.lang !== context.lang;
+ },
  };
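
This is the predicate behind the new runTranslateIfNeeded(context, includeCaption) signature in the CLI helpers: translation runs only when the session language differs from the script language, or, in the movie pipeline (includeCaption = true), when captions are requested in a language other than the script's. A behavioral sketch with the context fields flattened to parameters:

    const needTranslate = (scriptLang: string, sessionLang?: string, captionLang?: string, includeCaption = false): boolean =>
      includeCaption
        ? scriptLang !== sessionLang || (!!captionLang && scriptLang !== captionLang)
        : scriptLang !== sessionLang;

    // needTranslate("en", "en")             => false  (nothing to translate)
    // needTranslate("en", "ja")             => true   (narration needs ja)
    // needTranslate("en", "en", "ja", true) => true   (movie run: ja captions)
    // needTranslate("en", "en", "ja")       => false  (audio/image runs ignore captions)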
package/lib/types/agent.d.ts CHANGED
@@ -1,11 +1,13 @@
  export type OpenAIImageSize = "1792x1024" | "1024x1792" | "1024x1024" | "1536x1024" | "1024x1536";
  export type OpenAIImageModeration = "low" | "auto";
+ export type OpenAIImageQuality = "low" | "medium" | "high" | "auto";
  export type OpenAIImageOptions = {
  model: string;
  prompt: string;
  n: number;
  size: OpenAIImageSize;
  moderation?: OpenAIImageModeration;
+ quality?: OpenAIImageQuality;
  };
  export type AgentBufferResult = {
  buffer: Buffer;
@@ -35,6 +37,7 @@ export type ImageAgentParams = {
  };
  export type OpenAIImageAgentParams = ImageAgentParams & {
  moderation: OpenAIImageModeration | null | undefined;
+ quality?: OpenAIImageQuality;
  };
  export type OpenAIImageAgentConfig = {
  baseURL?: string;
@@ -74,6 +77,7 @@ export type LipSyncAgentInputs = {
  lipSyncFile: string;
  movieFile: string;
  audioFile: string;
+ imageFile: string;
  };
  export type GoogleMovieAgentConfig = GoogleImageAgentConfig;
  export type ReplicateMovieAgentConfig = AgentConfig;