mulmocast 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/lib/actions/audio.js +8 -5
  2. package/lib/actions/image_agents.d.ts +3 -3
  3. package/lib/actions/image_references.js +2 -1
  4. package/lib/actions/images.js +19 -11
  5. package/lib/actions/movie.js +3 -2
  6. package/lib/actions/translate.d.ts +4 -1
  7. package/lib/actions/translate.js +6 -3
  8. package/lib/agents/add_bgm_agent.js +1 -1
  9. package/lib/agents/combine_audio_files_agent.js +116 -93
  10. package/lib/agents/movie_replicate_agent.js +1 -1
  11. package/lib/agents/tts_elevenlabs_agent.d.ts +2 -1
  12. package/lib/agents/tts_elevenlabs_agent.js +5 -4
  13. package/lib/agents/tts_google_agent.d.ts +2 -9
  14. package/lib/agents/tts_nijivoice_agent.d.ts +2 -1
  15. package/lib/agents/tts_nijivoice_agent.js +4 -5
  16. package/lib/agents/tts_openai_agent.d.ts +2 -13
  17. package/lib/agents/tts_openai_agent.js +4 -3
  18. package/lib/index.browser.d.ts +1 -2
  19. package/lib/index.browser.js +2 -3
  20. package/lib/index.common.d.ts +2 -0
  21. package/lib/index.common.js +3 -0
  22. package/lib/index.js +1 -0
  23. package/lib/index.node.d.ts +7 -0
  24. package/lib/index.node.js +8 -0
  25. package/lib/methods/mulmo_presentation_style.d.ts +1 -0
  26. package/lib/methods/mulmo_presentation_style.js +4 -0
  27. package/lib/types/agent.d.ts +29 -2
  28. package/lib/types/agent.js +0 -1
  29. package/lib/types/schema.d.ts +344 -254
  30. package/lib/types/schema.js +5 -3
  31. package/lib/utils/context.d.ts +24 -19
  32. package/lib/utils/ffmpeg_utils.d.ts +4 -1
  33. package/lib/utils/ffmpeg_utils.js +2 -1
  34. package/lib/utils/image_plugins/image.d.ts +2 -2
  35. package/lib/utils/image_plugins/movie.d.ts +2 -2
  36. package/lib/utils/preprocess.d.ts +21 -18
  37. package/lib/utils/provider2agent.d.ts +4 -0
  38. package/lib/utils/provider2agent.js +6 -0
  39. package/lib/utils/utils.js +6 -0
  40. package/package.json +6 -6

package/lib/actions/audio.js
@@ -34,12 +34,13 @@ const getAudioParam = (presentationStyle, beat) => {
  // Use speaker-specific provider if available, otherwise fall back to script-level provider
  const provider = MulmoPresentationStyleMethods.getTTSProvider(presentationStyle, beat);
  const speechOptions = MulmoPresentationStyleMethods.getSpeechOptions(presentationStyle, beat);
- return { voiceId, provider, speechOptions };
+ const model = MulmoPresentationStyleMethods.getTTSModel(presentationStyle, beat);
+ return { voiceId, provider, speechOptions, model };
  };
  export const getBeatAudioPath = (text, context, beat, lang) => {
  const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
- const { voiceId, provider, speechOptions } = getAudioParam(context.presentationStyle, beat);
- const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider].join(":");
+ const { voiceId, provider, speechOptions, model } = getAudioParam(context.presentationStyle, beat);
+ const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider, model ?? ""].join(":");
  const audioFileName = `${context.studio.filename}_${text2hash(hash_string)}`;
  const audioFile = getAudioFilePath(audioDirPath, context.studio.filename, audioFileName, lang);
  return getAudioPath(context, beat, audioFile);
@@ -48,15 +49,16 @@ const preprocessor = (namedInputs) => {
  const { beat, studioBeat, multiLingual, context } = namedInputs;
  const { lang, presentationStyle } = context;
  const text = localizedText(beat, multiLingual, lang);
- const { voiceId, provider, speechOptions } = getAudioParam(presentationStyle, beat);
+ const { voiceId, provider, speechOptions, model } = getAudioParam(presentationStyle, beat);
  const audioPath = getBeatAudioPath(text, context, beat, lang);
- studioBeat.audioFile = audioPath; // TODO
+ studioBeat.audioFile = audioPath; // TODO: Passing by reference is difficult to maintain, so pass it using graphai inputs
  const needsTTS = !beat.audio && audioPath !== undefined;
  return {
  ttsAgent: provider2TTSAgent[provider].agentName,
  text,
  voiceId,
  speechOptions,
+ model,
  audioPath,
  studioBeat,
  needsTTS,
@@ -94,6 +96,7 @@ const graph_tts = {
  voice: ":preprocessor.voiceId",
  speed: ":preprocessor.speechOptions.speed",
  instructions: ":preprocessor.speechOptions.instruction",
+ model: ":preprocessor.model",
  },
  },
  },

package/lib/actions/image_agents.d.ts
@@ -14,8 +14,8 @@ export declare const imagePreprocessAgent: (namedInputs: {
  referenceImageForMovie: string | undefined;
  imageParams: {
  provider: string;
- style?: string | undefined;
  model?: string | undefined;
+ style?: string | undefined;
  moderation?: string | undefined;
  images?: Record<string, {
  type: "image";
@@ -58,8 +58,8 @@ export declare const imagePreprocessAgent: (namedInputs: {
  };
  imageParams: {
  provider: string;
- style?: string | undefined;
  model?: string | undefined;
+ style?: string | undefined;
  moderation?: string | undefined;
  images?: Record<string, {
  type: "image";
@@ -105,8 +105,8 @@ export declare const imagePreprocessAgent: (namedInputs: {
  };
  imageParams: {
  provider: string;
- style?: string | undefined;
  model?: string | undefined;
+ style?: string | undefined;
  moderation?: string | undefined;
  images?: Record<string, {
  type: "image";

package/lib/actions/image_references.js
@@ -1,5 +1,5 @@
  import fs from "fs";
- import { GraphAI } from "graphai";
+ import { GraphAI, GraphAILogger } from "graphai";
  import { getReferenceImagePath } from "../utils/file.js";
  import { getExtention } from "../utils/utils.js";
  import { graphOption } from "./images.js";
@@ -13,6 +13,7 @@ export const generateReferenceImage = async (inputs) => {
  // generate image
  const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
  const prompt = `${image.prompt}\n${imageAgentInfo.imageParams.style || ""}`;
+ GraphAILogger.info(`Generating reference image for ${key}: ${prompt}`);
  const image_graph_data = {
  version: 0.5,
  nodes: {

package/lib/actions/images.js
@@ -11,7 +11,7 @@ import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../met
  import { getOutputStudioFilePath, mkdir } from "../utils/file.js";
  import { fileCacheAgentFilter } from "../utils/filters.js";
  import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
- import { extractImageFromMovie } from "../utils/ffmpeg_utils.js";
+ import { extractImageFromMovie, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
  import { getImageRefs } from "./image_references.js";
  import { imagePreprocessAgent, imagePluginAgent, htmlImageGeneratorAgent } from "./image_agents.js";
  const vanillaAgents = vanilla.default ?? vanilla;
@@ -166,16 +166,30 @@ const beat_graph_data = {
  },
  defaultValue: {},
  },
+ audioChecker: {
+ if: ":preprocessor.movieFile",
+ agent: async (namedInputs) => {
+ const { hasAudio } = await ffmpegGetMediaDuration(namedInputs.movieFile);
+ return { hasMovieAudio: hasAudio };
+ },
+ inputs: {
+ onComplete: [":movieGenerator"], // to wait for movieGenerator to finish
+ movieFile: ":preprocessor.movieFile",
+ },
+ defaultValue: {},
+ },
  output: {
  agent: "copyAgent",
  inputs: {
- onComplete: [":imageFromMovie", ":htmlImageGenerator"], // to wait for imageFromMovie to finish
+ onComplete: [":imageFromMovie", ":htmlImageGenerator", ":audioChecker"], // to wait for imageFromMovie to finish
  imageFile: ":preprocessor.imagePath",
  movieFile: ":preprocessor.movieFile",
+ hasMovieAudio: ":audioChecker.hasMovieAudio",
  },
  output: {
  imageFile: ".imageFile",
  movieFile: ".movieFile",
+ hasMovieAudio: ".hasMovieAudio",
  },
  isResult: true,
  },
@@ -279,17 +293,11 @@ export const graphOption = async (context, settings) => {
  const config = settings2GraphAIConfig(settings, process.env);
  // We need to get google's auth token only if the google is the text2image provider.
  if (provider === "google" || context.presentationStyle.movieParams?.provider === "google") {
- userAssert(!!process.env.GOOGLE_PROJECT_ID, "GOOGLE_PROJECT_ID is not set");
+ userAssert(!!config.movieGoogleAgent || !!config.imageGoogleAgent, "GOOGLE_PROJECT_ID is not set");
  GraphAILogger.log("google was specified as text2image engine");
  const token = await googleAuth();
- config["imageGoogleAgent"] = {
- projectId: process.env.GOOGLE_PROJECT_ID,
- token,
- };
- config["movieGoogleAgent"] = {
- projectId: process.env.GOOGLE_PROJECT_ID,
- token,
- };
+ config["imageGoogleAgent"].token = token;
+ config["movieGoogleAgent"].token = token;
  }
  options.config = config;
  return options;

package/lib/actions/movie.js
@@ -204,8 +204,9 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
  videoIdsForBeats.push(videoId);
  }
  // NOTE: We don't support audio if the speed is not 1.0.
- if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0 && speed === 1.0) {
- const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
+ const movieVolume = beat.audioParams?.movieVolume ?? 1.0;
+ if (studioBeat.hasMovieAudio && movieVolume > 0.0 && speed === 1.0) {
+ const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, movieVolume);
  audioIdsFromMovieBeats.push(audioId);
  ffmpegContext.filterComplex.push(audioPart);
  }

package/lib/actions/translate.d.ts
@@ -1,4 +1,7 @@
  import "dotenv/config";
  import type { CallbackFunction } from "graphai";
  import { MulmoStudioContext } from "../types/index.js";
- export declare const translate: (context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<void>;
+ export declare const translate: (context: MulmoStudioContext, args?: {
+ callbacks?: CallbackFunction[];
+ settings?: Record<string, string>;
+ }) => Promise<void>;

package/lib/actions/translate.js
@@ -4,6 +4,7 @@ import * as agents from "@graphai/vanilla";
  import { openAIAgent } from "@graphai/openai_agent";
  import { fileWriteAgent } from "@graphai/vanilla_node_agents";
  import { recursiveSplitJa, replacementsJa, replacePairsJa } from "../utils/string.js";
+ import { settings2GraphAIConfig } from "../utils/utils.js";
  import { getOutputMultilingualFilePath, mkdir, writingMessage } from "../utils/file.js";
  import { translateSystemPrompt, translatePrompts } from "../utils/prompt.js";
  import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
@@ -208,15 +209,17 @@ const agentFilters = [
  ];
  const defaultLang = "en";
  const targetLangs = ["ja", "en"];
- export const translate = async (context, callbacks) => {
+ export const translate = async (context, args) => {
+ const { settings, callbacks } = args ?? {};
  try {
  MulmoStudioContextMethods.setSessionState(context, "multiLingual", true);
  const fileName = MulmoStudioContextMethods.getFileName(context);
  const outDirPath = MulmoStudioContextMethods.getOutDirPath(context);
  const outputMultilingualFilePath = getOutputMultilingualFilePath(outDirPath, fileName);
  mkdir(outDirPath);
- assert(!!process.env.OPENAI_API_KEY, "The OPENAI_API_KEY environment variable is missing or empty");
- const graph = new GraphAI(translateGraph, { ...vanillaAgents, fileWriteAgent, openAIAgent }, { agentFilters });
+ const config = settings2GraphAIConfig(settings, process.env);
+ assert(!!config?.openAIAgent?.apiKey, "The OPENAI_API_KEY environment variable is missing or empty");
+ const graph = new GraphAI(translateGraph, { ...vanillaAgents, fileWriteAgent, openAIAgent }, { agentFilters, config });
  graph.injectValue("context", context);
  graph.injectValue("defaultLang", defaultLang);
  graph.injectValue("targetLangs", targetLangs);

package/lib/agents/add_bgm_agent.js
@@ -10,7 +10,7 @@ const addBGMAgent = async ({ namedInputs, params, }) => {
  if (!musicFile.match(/^http/) && !fs.existsSync(musicFile)) {
  throw new Error(`AddBGMAgent musicFile not exist: ${musicFile}`);
  }
- const speechDuration = await ffmpegGetMediaDuration(voiceFile);
+ const { duration: speechDuration } = await ffmpegGetMediaDuration(voiceFile);
  const introPadding = context.presentationStyle.audioParams.introPadding;
  const outroPadding = context.presentationStyle.audioParams.outroPadding;
  const totalDuration = speechDuration + introPadding + outroPadding;
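
Note: ffmpegGetMediaDuration now resolves to an object rather than a plain number, which is what the callers above and below adapt to. A minimal consumer sketch; the deep import path is an assumption for illustration.

    // Sketch: 0.1.4 returned a number; 0.1.6 returns { duration, hasAudio }.
    import { ffmpegGetMediaDuration } from "mulmocast/lib/utils/ffmpeg_utils.js"; // assumed path
    const probe = async (file: string) => {
      const { duration, hasAudio } = await ffmpegGetMediaDuration(file);
      console.log(`${file}: ${duration}s, has audio track: ${hasAudio}`);
    };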

package/lib/agents/combine_audio_files_agent.js
@@ -1,14 +1,15 @@
  import { assert, GraphAILogger } from "graphai";
  import { silent60secPath } from "../utils/file.js";
- import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
+ import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration, } from "../utils/ffmpeg_utils.js";
  import { userAssert } from "../utils/utils.js";
- const getMovieDulation = async (beat) => {
+ const getMovieDuration = async (beat) => {
  if (beat.image?.type === "movie" && (beat.image.source.kind === "url" || beat.image.source.kind === "path")) {
  const pathOrUrl = beat.image.source.kind === "url" ? beat.image.source.url : beat.image.source.path;
  const speed = beat.movieParams?.speed ?? 1.0;
- return (await ffmpegGetMediaDuration(pathOrUrl)) / speed;
+ const { duration, hasAudio } = await ffmpegGetMediaDuration(pathOrUrl);
+ return { duration: duration / speed, hasAudio };
  }
- return 0;
+ return { duration: 0, hasAudio: false };
  };
  const getPadding = (context, beat, index) => {
  if (beat.audioParams?.padding !== undefined) {
@@ -29,16 +30,17 @@ const getTotalPadding = (padding, movieDuration, audioDuration, duration) => {
  }
  return padding;
  };
- const getMediaDurations = (context) => {
+ const getMediaDurationsOfAllBeats = (context) => {
  return Promise.all(context.studio.beats.map(async (studioBeat, index) => {
  const beat = context.studio.script.beats[index];
- const movieDuration = await getMovieDulation(beat);
- const audioDuration = studioBeat.audioFile ? await ffmpegGetMediaDuration(studioBeat.audioFile) : 0;
+ const { duration: movieDuration, hasAudio: hasMovieAudio } = await getMovieDuration(beat);
+ const audioDuration = studioBeat.audioFile ? (await ffmpegGetMediaDuration(studioBeat.audioFile)).duration : 0;
  return {
  movieDuration,
  audioDuration,
  hasMedia: movieDuration + audioDuration > 0,
  silenceDuration: 0,
+ hasMovieAudio,
  };
  }));
  };
@@ -60,11 +62,98 @@ const getGroupBeatDurations = (context, group, audioDuration) => {
  });
  return durations;
  };
+ const getInputIds = (context, mediaDurations, ffmpegContext, silentIds) => {
+ const inputIds = [];
+ context.studio.beats.forEach((studioBeat, index) => {
+ const { silenceDuration } = mediaDurations[index];
+ const paddingId = `[padding_${index}]`;
+ if (studioBeat.audioFile) {
+ const audioId = FfmpegContextInputFormattedAudio(ffmpegContext, studioBeat.audioFile);
+ inputIds.push(audioId);
+ }
+ if (silenceDuration > 0) {
+ const silentId = silentIds.pop();
+ ffmpegContext.filterComplex.push(`${silentId}atrim=start=0:end=${silenceDuration}${paddingId}`);
+ inputIds.push(paddingId);
+ }
+ });
+ return inputIds;
+ };
+ const voiceOverProcess = (context, mediaDurations, movieDuration, beatDurations, groupLength) => {
+ return (remaining, idx, iGroup) => {
+ const subBeatDurations = mediaDurations[idx];
+ userAssert(subBeatDurations.audioDuration <= remaining, `Duration Overflow: At index(${idx}) audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
+ if (iGroup === groupLength - 1) {
+ beatDurations.push(remaining);
+ subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
+ return 0;
+ }
+ const nextBeat = context.studio.script.beats[idx + 1];
+ assert(nextBeat.image?.type === "voice_over", "nextBeat.image.type !== voice_over");
+ const voiceStartAt = nextBeat.image?.startAt;
+ if (voiceStartAt) {
+ const remainingDuration = movieDuration - voiceStartAt;
+ const duration = remaining - remainingDuration;
+ userAssert(duration >= 0, `Invalid startAt: At index(${idx}), avaiable duration(${duration}) < 0`);
+ beatDurations.push(duration);
+ subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
+ userAssert(subBeatDurations.silenceDuration >= 0, `Duration Overwrap: At index(${idx}), silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
+ return remainingDuration;
+ }
+ beatDurations.push(subBeatDurations.audioDuration);
+ return remaining - subBeatDurations.audioDuration;
+ };
+ };
+ const getVoiceOverGroup = (context, index) => {
+ const group = [index];
+ for (let i = index + 1; i < context.studio.beats.length && context.studio.script.beats[i].image?.type === "voice_over"; i++) {
+ group.push(i);
+ }
+ return group;
+ };
+ const getSpillOverGroup = (context, mediaDurations, index) => {
+ const group = [index];
+ for (let i = index + 1; i < context.studio.beats.length && !mediaDurations[i].hasMedia; i++) {
+ group.push(i);
+ }
+ return group;
+ };
+ const spilledOverAudio = (context, group, audioDuration, beatDurations, mediaDurations) => {
+ const groupBeatsDurations = getGroupBeatDurations(context, group, audioDuration);
+ // Yes, the current beat has spilled over audio.
+ const beatsTotalDuration = groupBeatsDurations.reduce((a, b) => a + b, 0);
+ if (beatsTotalDuration > audioDuration + 0.01) {
+ // 0.01 is a tolerance to avoid floating point precision issues
+ group.reduce((remaining, idx, iGroup) => {
+ if (remaining >= groupBeatsDurations[iGroup]) {
+ return remaining - groupBeatsDurations[iGroup];
+ }
+ mediaDurations[idx].silenceDuration = groupBeatsDurations[iGroup] - remaining;
+ return 0;
+ }, audioDuration);
+ }
+ else if (audioDuration > beatsTotalDuration) {
+ // Last beat gets the rest of the audio.
+ groupBeatsDurations[groupBeatsDurations.length - 1] += audioDuration - beatsTotalDuration;
+ }
+ beatDurations.push(...groupBeatsDurations);
+ };
+ const noSpilledOverAudio = (context, beat, index, movieDuration, audioDuration, beatDurations, mediaDurations) => {
+ // padding is the amount of audio padding specified in the script.
+ const padding = getPadding(context, beat, index);
+ // totalPadding is the amount of audio padding to be added to the audio file.
+ const totalPadding = Math.round(getTotalPadding(padding, movieDuration, audioDuration, beat.duration) * 100) / 100;
+ const beatDuration = audioDuration + totalPadding;
+ beatDurations.push(beatDuration);
+ if (totalPadding > 0) {
+ mediaDurations[index].silenceDuration = totalPadding;
+ }
+ };
  const combineAudioFilesAgent = async ({ namedInputs, }) => {
  const { context, combinedFileName } = namedInputs;
  const ffmpegContext = FfmpegContextInit();
  // First, get the audio durations of all beats, taking advantage of multi-threading capability of ffmpeg.
- const mediaDurations = await getMediaDurations(context);
+ const mediaDurations = await getMediaDurationsOfAllBeats(context);
  const beatDurations = [];
  context.studio.script.beats.forEach((beat, index) => {
  if (beatDurations.length > index) {
@@ -75,91 +164,37 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  const { audioDuration, movieDuration } = mediaDurations[index];
  // Check if we are processing a voice-over beat.
  if (movieDuration > 0) {
- const group = [index];
- for (let i = index + 1; i < context.studio.beats.length && context.studio.script.beats[i].image?.type === "voice_over"; i++) {
- group.push(i);
- }
+ const group = getVoiceOverGroup(context, index);
  if (group.length > 1) {
- group.reduce((remaining, idx, iGroup) => {
- const subBeatDurations = mediaDurations[idx];
- userAssert(subBeatDurations.audioDuration <= remaining, `Duration Overflow: At index(${idx}) audioDuration(${subBeatDurations.audioDuration}) > remaining(${remaining})`);
- if (iGroup === group.length - 1) {
- beatDurations.push(remaining);
- subBeatDurations.silenceDuration = remaining - subBeatDurations.audioDuration;
- return 0;
- }
- const nextBeat = context.studio.script.beats[idx + 1];
- assert(nextBeat.image?.type === "voice_over", "nextBeat.image.type !== voice_over");
- const voiceStartAt = nextBeat.image?.startAt;
- if (voiceStartAt) {
- const remainingDuration = movieDuration - voiceStartAt;
- const duration = remaining - remainingDuration;
- userAssert(duration >= 0, `Invalid startAt: At index(${idx}), avaiable duration(${duration}) < 0`);
- beatDurations.push(duration);
- subBeatDurations.silenceDuration = duration - subBeatDurations.audioDuration;
- userAssert(subBeatDurations.silenceDuration >= 0, `Duration Overwrap: At index(${idx}), silenceDuration(${subBeatDurations.silenceDuration}) < 0`);
- return remainingDuration;
- }
- beatDurations.push(subBeatDurations.audioDuration);
- return remaining - subBeatDurations.audioDuration;
- }, movieDuration);
+ GraphAILogger.log(`Voice over group: ${group.length}`);
+ group.reduce(voiceOverProcess(context, mediaDurations, movieDuration, beatDurations, group.length), movieDuration);
  return;
  }
  }
  // Check if the current beat has media and the next beat does not have media.
  if (audioDuration > 0) {
  // Check if the current beat has spilled over audio.
- const group = [index];
- for (let i = index + 1; i < context.studio.beats.length && !mediaDurations[i].hasMedia; i++) {
- group.push(i);
- }
+ const group = getSpillOverGroup(context, mediaDurations, index);
  if (group.length > 1) {
- const groupBeatsDurations = getGroupBeatDurations(context, group, audioDuration);
- // Yes, the current beat has spilled over audio.
- const beatsTotalDuration = groupBeatsDurations.reduce((a, b) => a + b, 0);
- if (beatsTotalDuration > audioDuration + 0.01) {
- // 0.01 is a tolerance to avoid floating point precision issues
- group.reduce((remaining, idx, iGroup) => {
- if (remaining >= groupBeatsDurations[iGroup]) {
- return remaining - groupBeatsDurations[iGroup];
- }
- mediaDurations[idx].silenceDuration = groupBeatsDurations[iGroup] - remaining;
- return 0;
- }, audioDuration);
- }
- else {
- // Last beat gets the rest of the audio.
- if (audioDuration > beatsTotalDuration) {
- groupBeatsDurations[groupBeatsDurations.length - 1] += audioDuration - beatsTotalDuration;
- }
- }
- beatDurations.push(...groupBeatsDurations);
- }
- else {
- // No spilled over audio.
- assert(beatDurations.length === index, "beatDurations.length !== index");
- // padding is the amount of audio padding specified in the script.
- const padding = getPadding(context, beat, index);
- // totalPadding is the amount of audio padding to be added to the audio file.
- const totalPadding = Math.round(getTotalPadding(padding, movieDuration, audioDuration, beat.duration) * 100) / 100;
- const beatDuration = audioDuration + totalPadding;
- beatDurations.push(beatDuration);
- if (totalPadding > 0) {
- mediaDurations[index].silenceDuration = totalPadding;
- }
+ GraphAILogger.log(`Spill over group: ${group.length}`);
+ spilledOverAudio(context, group, audioDuration, beatDurations, mediaDurations);
+ return;
  }
+ // No spilled over audio.
+ assert(beatDurations.length === index, "beatDurations.length !== index");
+ noSpilledOverAudio(context, beat, index, movieDuration, audioDuration, beatDurations, mediaDurations);
+ return;
  }
- else if (movieDuration > 0) {
+ if (movieDuration > 0) {
  // This beat has only a movie, not audio.
  beatDurations.push(movieDuration);
  mediaDurations[index].silenceDuration = movieDuration;
+ return;
  }
- else {
- // The current beat has no audio, nor no spilled over audio
- const beatDuration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
- beatDurations.push(beatDuration);
- mediaDurations[index].silenceDuration = beatDuration;
- }
+ // The current beat has no audio, nor no spilled over audio
+ const beatDuration = beat.duration ?? (movieDuration > 0 ? movieDuration : 1.0);
+ beatDurations.push(beatDuration);
+ mediaDurations[index].silenceDuration = beatDuration;
  });
  assert(beatDurations.length === context.studio.beats.length, "beatDurations.length !== studio.beats.length");
  // We cannot reuse longSilentId. We need to explicitly split it for each beat.
@@ -168,20 +203,7 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  const longSilentId = FfmpegContextInputFormattedAudio(ffmpegContext, silent60secPath(), undefined, ["-stream_loop", "-1"]);
  ffmpegContext.filterComplex.push(`${longSilentId}asplit=${silentIds.length}${silentIds.join("")}`);
  }
- const inputIds = [];
- context.studio.beats.forEach((studioBeat, index) => {
- const { silenceDuration } = mediaDurations[index];
- const paddingId = `[padding_${index}]`;
- if (studioBeat.audioFile) {
- const audioId = FfmpegContextInputFormattedAudio(ffmpegContext, studioBeat.audioFile);
- inputIds.push(audioId);
- }
- if (silenceDuration > 0) {
- const silentId = silentIds.pop();
- ffmpegContext.filterComplex.push(`${silentId}atrim=start=0:end=${silenceDuration}${paddingId}`);
- inputIds.push(paddingId);
- }
- });
+ const inputIds = getInputIds(context, mediaDurations, ffmpegContext, silentIds);
  assert(silentIds.length === 0, "silentIds.length !== 0");
  GraphAILogger.log("filterComplex:", ffmpegContext.filterComplex.join("\n"));
  // Finally, combine all audio files.
@@ -196,6 +218,7 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  audioDuration: mediaDurations[index].audioDuration,
  movieDuration: mediaDurations[index].movieDuration,
  silenceDuration: mediaDurations[index].silenceDuration,
+ hasMovieAudio: mediaDurations[index].hasMovieAudio,
  })),
  },
  };

package/lib/agents/movie_replicate_agent.js
@@ -62,7 +62,7 @@ export const movieReplicateAgent = async ({ namedInputs, params, config, }) => {
  const { prompt, imagePath } = namedInputs;
  const aspectRatio = getAspectRatio(params.canvasSize);
  const duration = params.duration ?? 5;
- const apiKey = config?.apiKey ?? process.env.REPLICATE_API_TOKEN;
+ const apiKey = config?.apiKey;
  if (!apiKey) {
  throw new Error("REPLICATE_API_TOKEN environment variable is required");
  }

package/lib/agents/tts_elevenlabs_agent.d.ts
@@ -1,4 +1,5 @@
  import type { AgentFunction, AgentFunctionInfo } from "graphai";
- export declare const ttsElevenlabsAgent: AgentFunction;
+ import type { ElevenlabsTTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult, AgentConfig } from "../types/agent.js";
+ export declare const ttsElevenlabsAgent: AgentFunction<ElevenlabsTTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs, AgentConfig>;
  declare const ttsElevenlabsAgentInfo: AgentFunctionInfo;
  export default ttsElevenlabsAgentInfo;

package/lib/agents/tts_elevenlabs_agent.js
@@ -1,18 +1,19 @@
  import { GraphAILogger } from "graphai";
- export const ttsElevenlabsAgent = async ({ namedInputs, params, config }) => {
+ import { provider2TTSAgent } from "../utils/provider2agent.js";
+ export const ttsElevenlabsAgent = async ({ namedInputs, params, config, }) => {
  const { text } = namedInputs;
  const { voice, model, stability, similarityBoost, suppressError } = params;
- const apiKey = config?.apiKey ?? process.env.ELEVENLABS_API_KEY;
+ const apiKey = config?.apiKey;
  if (!apiKey) {
  throw new Error("ELEVENLABS_API_KEY environment variable is required");
  }
  if (!voice) {
- throw new Error("Voice ID is required");
+ throw new Error("ELEVENLABS Voice ID is required");
  }
  try {
  const requestBody = {
  text,
- model_id: model ?? "eleven_monolingual_v1",
+ model_id: model ?? provider2TTSAgent.elevenlabs.defaultModel,
  voice_settings: {
  stability: stability ?? 0.5,
  similarity_boost: similarityBoost ?? 0.75,

package/lib/agents/tts_google_agent.d.ts
@@ -1,12 +1,5 @@
  import type { AgentFunction, AgentFunctionInfo } from "graphai";
- export declare const ttsGoogleAgent: AgentFunction<{
- voice: string;
- speed: number;
- suppressError: boolean;
- }, {
- buffer?: Buffer | null;
- }, {
- text: string;
- }>;
+ import type { GoogleTTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult } from "../types/agent.js";
+ export declare const ttsGoogleAgent: AgentFunction<GoogleTTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs>;
  declare const ttsGoogleAgentInfo: AgentFunctionInfo;
  export default ttsGoogleAgentInfo;

package/lib/agents/tts_nijivoice_agent.d.ts
@@ -1,4 +1,5 @@
  import type { AgentFunction, AgentFunctionInfo } from "graphai";
- export declare const ttsNijivoiceAgent: AgentFunction;
+ import type { NijivoiceTTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult, AgentConfig } from "../types/agent.js";
+ export declare const ttsNijivoiceAgent: AgentFunction<NijivoiceTTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs, AgentConfig>;
  declare const ttsNijivoiceAgentInfo: AgentFunctionInfo;
  export default ttsNijivoiceAgentInfo;

package/lib/agents/tts_nijivoice_agent.js
@@ -1,21 +1,20 @@
  import { GraphAILogger, assert } from "graphai";
- const nijovoiceApiKey = process.env.NIJIVOICE_API_KEY ?? "";
  const errorMessage = [
  "TTS NijiVoice: No API key. ",
  "You have the following options:",
  "1. Obtain an API key from Niji Voice (https://platform.nijivoice.com/) and set it as the NIJIVOICE_API_KEY environment variable.",
  '2. Use OpenAI\'s TTS instead of Niji Voice by changing speechParams.provider from "nijivoice" to "openai".',
  ].join("\n");
- export const ttsNijivoiceAgent = async ({ params, namedInputs, config }) => {
+ export const ttsNijivoiceAgent = async ({ params, namedInputs, config, }) => {
  const { suppressError, voice, speed, speed_global } = params;
  const { apiKey } = config ?? {};
  const { text } = namedInputs;
- assert(apiKey ?? nijovoiceApiKey, errorMessage);
+ assert(!!apiKey, errorMessage);
  const url = `https://api.nijivoice.com/api/platform/v1/voice-actors/${voice}/generate-voice`;
  const options = {
  method: "POST",
  headers: {
- "x-api-key": apiKey ?? nijovoiceApiKey,
+ "x-api-key": apiKey,
  accept: "application/json",
  "content-type": "application/json",
  },
@@ -31,7 +30,7 @@ export const ttsNijivoiceAgent = async ({ params, namedInputs, config }) => {
  if (voiceJson && voiceJson.generatedVoice && voiceJson.generatedVoice.audioFileDownloadUrl) {
  const audioRes = await fetch(voiceJson.generatedVoice.audioFileDownloadUrl);
  const buffer = Buffer.from(await audioRes.arrayBuffer());
- return { buffer, generatedVoice: voiceJson.generatedVoice };
+ return { buffer };
  }
  if (suppressError) {
  return {

package/lib/agents/tts_openai_agent.d.ts
@@ -1,16 +1,5 @@
  import type { AgentFunction, AgentFunctionInfo } from "graphai";
- export declare const ttsOpenaiAgent: AgentFunction<{
- model: string;
- voice: string;
- instructions: string;
- suppressError: boolean;
- }, {
- buffer?: Buffer;
- }, {
- text: string;
- }, {
- baseURL?: string;
- apiKey?: string;
- }>;
+ import type { OpenAITTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult, OpenAIImageAgentConfig } from "../types/agent.js";
+ export declare const ttsOpenaiAgent: AgentFunction<OpenAITTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs, OpenAIImageAgentConfig>;
  declare const ttsOpenaiAgentInfo: AgentFunctionInfo;
  export default ttsOpenaiAgentInfo;

package/lib/agents/tts_openai_agent.js
@@ -1,14 +1,15 @@
  import { GraphAILogger } from "graphai";
  import OpenAI from "openai";
- export const ttsOpenaiAgent = async ({ namedInputs, params, config }) => {
+ import { provider2TTSAgent } from "../utils/provider2agent.js";
+ export const ttsOpenaiAgent = async ({ namedInputs, params, config, }) => {
  const { text } = namedInputs;
  const { model, voice, suppressError, instructions } = params;
  const { apiKey, baseURL } = config ?? {};
  const openai = new OpenAI({ apiKey, baseURL });
  try {
  const tts_options = {
- model: model ?? "gpt-4o-mini-tts", // "tts-1",
- voice: voice ?? "shimmer",
+ model: model ?? provider2TTSAgent.openai.defaultModel,
+ voice: voice ?? provider2TTSAgent.openai.defaultVoice,
  input: text,
  };
  if (instructions) {

package/lib/index.browser.d.ts
@@ -1,3 +1,2 @@
- export * from "./types/type.js";
- export * from "./types/schema.js";
+ export * from "./index.common.js";
  export * from "./agents/validate_schema_agent.js";