mulmocast 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/actions/audio.js +13 -18
- package/lib/actions/image_agents.d.ts +30 -6
- package/lib/actions/image_agents.js +5 -2
- package/lib/actions/image_references.js +2 -1
- package/lib/actions/images.d.ts +9 -1
- package/lib/actions/images.js +38 -13
- package/lib/actions/movie.js +3 -2
- package/lib/agents/add_bgm_agent.js +1 -1
- package/lib/agents/combine_audio_files_agent.js +10 -7
- package/lib/agents/image_google_agent.js +2 -2
- package/lib/agents/image_openai_agent.js +2 -2
- package/lib/agents/movie_replicate_agent.js +1 -1
- package/lib/agents/tts_elevenlabs_agent.d.ts +2 -1
- package/lib/agents/tts_elevenlabs_agent.js +4 -3
- package/lib/agents/tts_google_agent.d.ts +2 -9
- package/lib/agents/tts_nijivoice_agent.d.ts +2 -1
- package/lib/agents/tts_nijivoice_agent.js +3 -3
- package/lib/agents/tts_openai_agent.d.ts +2 -13
- package/lib/agents/tts_openai_agent.js +4 -3
- package/lib/index.browser.d.ts +1 -0
- package/lib/index.browser.js +1 -0
- package/lib/index.d.ts +1 -0
- package/lib/index.js +2 -0
- package/lib/methods/mulmo_presentation_style.d.ts +2 -1
- package/lib/methods/mulmo_presentation_style.js +21 -17
- package/lib/types/agent.d.ts +29 -2
- package/lib/types/agent.js +0 -1
- package/lib/types/schema.d.ts +596 -485
- package/lib/types/schema.js +15 -11
- package/lib/utils/const.d.ts +0 -1
- package/lib/utils/const.js +0 -1
- package/lib/utils/context.d.ts +36 -30
- package/lib/utils/ffmpeg_utils.d.ts +4 -1
- package/lib/utils/ffmpeg_utils.js +2 -1
- package/lib/utils/preprocess.d.ts +28 -24
- package/lib/utils/provider2agent.d.ts +76 -0
- package/lib/utils/provider2agent.js +87 -0
- package/lib/utils/utils.d.ts +6 -11
- package/lib/utils/utils.js +5 -26
- package/package.json +2 -2
package/lib/actions/audio.js
CHANGED
```diff
@@ -12,18 +12,10 @@ import { MulmoPresentationStyleMethods } from "../methods/index.js";
 import { fileCacheAgentFilter } from "../utils/filters.js";
 import { getAudioArtifactFilePath, getAudioFilePath, getOutputStudioFilePath, resolveDirPath, defaultBGMPath, mkdir, writingMessage } from "../utils/file.js";
 import { text2hash, localizedText, settings2GraphAIConfig } from "../utils/utils.js";
+import { provider2TTSAgent } from "../utils/provider2agent.js";
 import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
 import { MulmoMediaSourceMethods } from "../methods/mulmo_media_source.js";
 const vanillaAgents = agents.default ?? agents;
-// const rion_takanashi_voice = "b9277ce3-ba1c-4f6f-9a65-c05ca102ded0"; // たかなし りおん
-// const ben_carter_voice = "bc06c63f-fef6-43b6-92f7-67f919bd5dae"; // ベン・カーター
-const provider_to_agent = {
-    nijivoice: "ttsNijivoiceAgent",
-    openai: "ttsOpenaiAgent",
-    google: "ttsGoogleAgent",
-    elevenlabs: "ttsElevenlabsAgent",
-    mock: "mediaMockAgent",
-};
 const getAudioPath = (context, beat, audioFile) => {
     if (beat.audio?.type === "audio") {
         const path = MulmoMediaSourceMethods.resolve(beat.audio.source, context);
@@ -40,14 +32,15 @@ const getAudioPath = (context, beat, audioFile) => {
 const getAudioParam = (presentationStyle, beat) => {
     const voiceId = MulmoPresentationStyleMethods.getVoiceId(presentationStyle, beat);
     // Use speaker-specific provider if available, otherwise fall back to script-level provider
-    const provider = MulmoPresentationStyleMethods.
+    const provider = MulmoPresentationStyleMethods.getTTSProvider(presentationStyle, beat);
     const speechOptions = MulmoPresentationStyleMethods.getSpeechOptions(presentationStyle, beat);
-
+    const model = MulmoPresentationStyleMethods.getTTSModel(presentationStyle, beat);
+    return { voiceId, provider, speechOptions, model };
 };
 export const getBeatAudioPath = (text, context, beat, lang) => {
     const audioDirPath = MulmoStudioContextMethods.getAudioDirPath(context);
-    const { voiceId, provider, speechOptions } = getAudioParam(context.presentationStyle, beat);
-    const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider].join(":");
+    const { voiceId, provider, speechOptions, model } = getAudioParam(context.presentationStyle, beat);
+    const hash_string = [text, voiceId, speechOptions?.instruction ?? "", speechOptions?.speed ?? 1.0, provider, model ?? ""].join(":");
     const audioFileName = `${context.studio.filename}_${text2hash(hash_string)}`;
     const audioFile = getAudioFilePath(audioDirPath, context.studio.filename, audioFileName, lang);
     return getAudioPath(context, beat, audioFile);
@@ -56,15 +49,16 @@ const preprocessor = (namedInputs) => {
     const { beat, studioBeat, multiLingual, context } = namedInputs;
     const { lang, presentationStyle } = context;
     const text = localizedText(beat, multiLingual, lang);
-    const { voiceId, provider, speechOptions } = getAudioParam(presentationStyle, beat);
+    const { voiceId, provider, speechOptions, model } = getAudioParam(presentationStyle, beat);
     const audioPath = getBeatAudioPath(text, context, beat, lang);
-    studioBeat.audioFile = audioPath; // TODO
+    studioBeat.audioFile = audioPath; // TODO: Passing by reference is difficult to maintain, so pass it using graphai inputs
     const needsTTS = !beat.audio && audioPath !== undefined;
     return {
-        ttsAgent:
+        ttsAgent: provider2TTSAgent[provider].agentName,
         text,
         voiceId,
         speechOptions,
+        model,
         audioPath,
         studioBeat,
         needsTTS,
@@ -102,6 +96,7 @@ const graph_tts = {
                 voice: ":preprocessor.voiceId",
                 speed: ":preprocessor.speechOptions.speed",
                 instructions: ":preprocessor.speechOptions.instruction",
+                model: ":preprocessor.model",
             },
         },
     },
@@ -186,8 +181,8 @@ export const audioFilePath = (context) => {
 const getConcurrency = (context) => {
     // Check if any speaker uses nijivoice or elevenlabs (providers that require concurrency = 1)
     const hasLimitedConcurrencyProvider = Object.values(context.presentationStyle.speechParams.speakers).some((speaker) => {
-        const provider = speaker.provider ?? context.presentationStyle.speechParams.provider;
-        return provider
+        const provider = (speaker.provider ?? context.presentationStyle.speechParams.provider);
+        return provider2TTSAgent[provider].hasLimitedConcurrency;
     });
     return hasLimitedConcurrencyProvider ? 1 : 8;
 };
```
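The new `package/lib/utils/provider2agent.js` module (+87 lines) is not itself rendered in this view, but the call sites above pin down part of its shape: each TTS provider entry exposes at least `agentName` and `hasLimitedConcurrency`, and some entries carry `defaultModel` / `defaultVoice`. A minimal sketch in TypeScript, with agent names taken from the removed `provider_to_agent` table and the limited-concurrency flags from the `getConcurrency` comment; the real module may have more fields, and the default model/voice strings are deliberately elided rather than guessed:

```typescript
// Sketch only: shape inferred from call sites in this diff, not the shipped source.
type TTSAgentInfo = {
  agentName: string;
  hasLimitedConcurrency: boolean;
  defaultModel?: string;
  defaultVoice?: string;
};

export const provider2TTSAgent: Record<string, TTSAgentInfo> = {
  nijivoice: { agentName: "ttsNijivoiceAgent", hasLimitedConcurrency: true },
  openai: { agentName: "ttsOpenaiAgent", hasLimitedConcurrency: false, defaultModel: "...", defaultVoice: "..." },
  google: { agentName: "ttsGoogleAgent", hasLimitedConcurrency: false },
  elevenlabs: { agentName: "ttsElevenlabsAgent", hasLimitedConcurrency: true, defaultModel: "..." },
  mock: { agentName: "mediaMockAgent", hasLimitedConcurrency: false },
};
```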
package/lib/actions/image_agents.d.ts
CHANGED
```diff
@@ -13,9 +13,9 @@ export declare const imagePreprocessAgent: (namedInputs: {
     imagePath: string | undefined;
     referenceImageForMovie: string | undefined;
     imageParams: {
-        provider:
-        style?: string | undefined;
+        provider: string;
         model?: string | undefined;
+        style?: string | undefined;
         moderation?: string | undefined;
         images?: Record<string, {
             type: "image";
@@ -44,10 +44,22 @@ export declare const imagePreprocessAgent: (namedInputs: {
 } | {
     imagePath: string;
     imageFromMovie: boolean;
+    movieParams: {
+        speed?: number | undefined;
+        model?: string | undefined;
+        fillOption?: {
+            style: "aspectFit" | "aspectFill";
+        } | undefined;
+        provider?: string | undefined;
+        transition?: {
+            type: "fade" | "slideout_left";
+            duration: number;
+        } | undefined;
+    };
     imageParams: {
-        provider:
-        style?: string | undefined;
+        provider: string;
         model?: string | undefined;
+        style?: string | undefined;
         moderation?: string | undefined;
         images?: Record<string, {
             type: "image";
@@ -79,10 +91,22 @@ export declare const imagePreprocessAgent: (namedInputs: {
     imageAgentInfo: import("../types/type.js").Text2ImageAgentInfo;
     prompt: string;
     referenceImages: string[];
+    movieParams: {
+        speed?: number | undefined;
+        model?: string | undefined;
+        fillOption?: {
+            style: "aspectFit" | "aspectFill";
+        } | undefined;
+        provider?: string | undefined;
+        transition?: {
+            type: "fade" | "slideout_left";
+            duration: number;
+        } | undefined;
+    };
     imageParams: {
-        provider:
-        style?: string | undefined;
+        provider: string;
         model?: string | undefined;
+        style?: string | undefined;
         moderation?: string | undefined;
         images?: Record<string, {
             type: "image";
```
package/lib/actions/image_agents.js
CHANGED
```diff
@@ -2,6 +2,7 @@ import { MulmoPresentationStyleMethods, MulmoStudioContextMethods, MulmoBeatMeth
 import { getBeatPngImagePath, getBeatMoviePath } from "../utils/file.js";
 import { imagePrompt, htmlImageSystemPrompt } from "../utils/prompt.js";
 import { renderHTMLToImage } from "../utils/markdown.js";
+import { GraphAILogger } from "graphai";
 const htmlStyle = (context, beat) => {
     return {
         canvasSize: MulmoPresentationStyleMethods.getCanvasSize(context.presentationStyle),
@@ -27,13 +28,15 @@ export const imagePreprocessAgent = async (namedInputs) => {
         // undefined prompt indicates that image generation is not needed
         return { ...returnValue, imagePath: pluginPath, referenceImageForMovie: pluginPath };
     }
+    const movieParams = { ...context.presentationStyle.movieParams, ...beat.movieParams };
+    GraphAILogger.log(`movieParams: ${index}`, movieParams, beat.moviePrompt);
     if (beat.moviePrompt && !beat.imagePrompt) {
-        return { ...returnValue, imagePath, imageFromMovie: true }; // no image prompt, only movie prompt
+        return { ...returnValue, imagePath, imageFromMovie: true, movieParams }; // no image prompt, only movie prompt
     }
     // referenceImages for "edit_image", openai agent.
     const referenceImages = MulmoBeatMethods.getImageReferenceForImageGenerator(beat, imageRefs);
     const prompt = imagePrompt(beat, imageAgentInfo.imageParams.style);
-    return { ...returnValue, imagePath, referenceImageForMovie: imagePath, imageAgentInfo, prompt, referenceImages };
+    return { ...returnValue, imagePath, referenceImageForMovie: imagePath, imageAgentInfo, prompt, referenceImages, movieParams };
 };
 export const imagePluginAgent = async (namedInputs) => {
     const { context, beat, index } = namedInputs;
```
package/lib/actions/image_references.js
CHANGED
```diff
@@ -1,5 +1,5 @@
 import fs from "fs";
-import { GraphAI } from "graphai";
+import { GraphAI, GraphAILogger } from "graphai";
 import { getReferenceImagePath } from "../utils/file.js";
 import { getExtention } from "../utils/utils.js";
 import { graphOption } from "./images.js";
@@ -13,6 +13,7 @@ export const generateReferenceImage = async (inputs) => {
     // generate image
     const imageAgentInfo = MulmoPresentationStyleMethods.getImageAgentInfo(context.presentationStyle);
     const prompt = `${image.prompt}\n${imageAgentInfo.imageParams.style || ""}`;
+    GraphAILogger.info(`Generating reference image for ${key}: ${prompt}`);
     const image_graph_data = {
         version: 0.5,
         nodes: {
```
package/lib/actions/images.d.ts
CHANGED
```diff
@@ -1,7 +1,14 @@
 import type { GraphOptions, CallbackFunction } from "graphai";
 import { MulmoStudioContext } from "../types/index.js";
 export declare const graphOption: (context: MulmoStudioContext, settings?: Record<string, string>) => Promise<GraphOptions>;
-
+type ImageOptions = {
+    imageAgents: Record<string, unknown>;
+};
+export declare const images: (context: MulmoStudioContext, args?: {
+    settings?: Record<string, string>;
+    callbacks?: CallbackFunction[];
+    options?: ImageOptions;
+}) => Promise<MulmoStudioContext>;
 export declare const generateBeatImage: (inputs: {
     index: number;
     context: MulmoStudioContext;
@@ -10,3 +17,4 @@ export declare const generateBeatImage: (inputs: {
     forceMovie?: boolean;
     forceImage?: boolean;
 }) => Promise<void>;
+export {};
```
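The `images` entry point thus moves from positional arguments to a single optional `args` object, and custom image agents can now be injected through `options.imageAgents`. A hedged usage sketch based only on the declaration above; the `context` value and the custom agent are placeholders, not working values:

```typescript
import { images } from "mulmocast";
import type { MulmoStudioContext } from "mulmocast";

// `context` comes from the usual studio-context setup; elided here.
declare const context: MulmoStudioContext;
// Hypothetical GraphAI agent descriptor overriding one of the default image agents.
declare const myImageAgent: Record<string, unknown>;

const newContext = await images(context, {
  settings: { OPENAI_API_KEY: process.env.OPENAI_API_KEY ?? "" }, // forwarded to GraphAI config
  options: { imageAgents: { imageOpenaiAgent: myImageAgent } }, // merged over the built-in agents
});
```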
package/lib/actions/images.js
CHANGED
```diff
@@ -2,7 +2,7 @@ import dotenv from "dotenv";
 import fs from "fs";
 import { GraphAI, GraphAILogger, TaskManager } from "graphai";
 import { GoogleAuth } from "google-auth-library";
-import * as
+import * as vanilla from "@graphai/vanilla";
 import { openAIAgent } from "@graphai/openai_agent";
 import { anthropicAgent } from "@graphai/anthropic_agent";
 import { fileWriteAgent } from "@graphai/vanilla_node_agents";
@@ -11,16 +11,22 @@ import { MulmoPresentationStyleMethods, MulmoStudioContextMethods } from "../met
 import { getOutputStudioFilePath, mkdir } from "../utils/file.js";
 import { fileCacheAgentFilter } from "../utils/filters.js";
 import { userAssert, settings2GraphAIConfig } from "../utils/utils.js";
-import { extractImageFromMovie } from "../utils/ffmpeg_utils.js";
+import { extractImageFromMovie, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
 import { getImageRefs } from "./image_references.js";
 import { imagePreprocessAgent, imagePluginAgent, htmlImageGeneratorAgent } from "./image_agents.js";
-const vanillaAgents =
+const vanillaAgents = vanilla.default ?? vanilla;
 const imageAgents = {
-    ...vanillaAgents,
     imageGoogleAgent,
+    imageOpenaiAgent,
+};
+const movieAgents = {
     movieGoogleAgent,
     movieReplicateAgent,
-
+};
+const defaultAgents = {
+    ...vanillaAgents,
+    ...imageAgents,
+    ...movieAgents,
     mediaMockAgent,
     fileWriteAgent,
     openAIAgent,
@@ -141,7 +147,7 @@ const beat_graph_data = {
                 mulmoContext: ":context",
             },
             params: {
-                model: ":
+                model: ":preprocessor.movieParams.model",
                 duration: ":beat.duration",
                 canvasSize: ":context.presentationStyle.canvasSize",
             },
@@ -160,16 +166,30 @@
                 },
                 defaultValue: {},
             },
+            audioChecker: {
+                if: ":preprocessor.movieFile",
+                agent: async (namedInputs) => {
+                    const { hasAudio } = await ffmpegGetMediaDuration(namedInputs.movieFile);
+                    return { hasMovieAudio: hasAudio };
+                },
+                inputs: {
+                    onComplete: [":movieGenerator"], // to wait for movieGenerator to finish
+                    movieFile: ":preprocessor.movieFile",
+                },
+                defaultValue: {},
+            },
             output: {
                 agent: "copyAgent",
                 inputs: {
-                    onComplete: [":imageFromMovie", ":htmlImageGenerator"], // to wait for imageFromMovie to finish
+                    onComplete: [":imageFromMovie", ":htmlImageGenerator", ":audioChecker"], // to wait for imageFromMovie to finish
                     imageFile: ":preprocessor.imagePath",
                     movieFile: ":preprocessor.movieFile",
+                    hasMovieAudio: ":audioChecker.hasMovieAudio",
                 },
                 output: {
                     imageFile: ".imageFile",
                     movieFile: ".movieFile",
+                    hasMovieAudio: ".hasMovieAudio",
                 },
                 isResult: true,
             },
@@ -308,10 +328,14 @@ const prepareGenerateImages = async (context) => {
     };
     return injections;
 };
-const generateImages = async (context, settings, callbacks) => {
-    const
+const generateImages = async (context, settings, callbacks, options) => {
+    const optionImageAgents = options?.imageAgents ?? {};
     const injections = await prepareGenerateImages(context);
-    const
+    const graphaiAgent = {
+        ...defaultAgents,
+        ...optionImageAgents,
+    };
+    const graph = new GraphAI(graph_data, graphaiAgent, await graphOption(context, settings));
     Object.keys(injections).forEach((key) => {
         graph.injectValue(key, injections[key]);
     });
@@ -324,10 +348,11 @@ const generateImages = async (context, settings, callbacks) => {
     return res.mergeResult;
 };
 // public api
-export const images = async (context,
+export const images = async (context, args) => {
+    const { settings, callbacks, options } = args ?? {};
     try {
         MulmoStudioContextMethods.setSessionState(context, "image", true);
-        const newContext = await generateImages(context, settings, callbacks);
+        const newContext = await generateImages(context, settings, callbacks, options);
         MulmoStudioContextMethods.setSessionState(context, "image", false);
         return newContext;
     }
@@ -341,7 +366,7 @@ export const generateBeatImage = async (inputs) => {
     const { index, context, settings, callbacks, forceMovie, forceImage } = inputs;
     const options = await graphOption(context, settings);
     const injections = await prepareGenerateImages(context);
-    const graph = new GraphAI(beat_graph_data,
+    const graph = new GraphAI(beat_graph_data, defaultAgents, options);
     Object.keys(injections).forEach((key) => {
         if ("outputStudioFilePath" !== key) {
             graph.injectValue(key, injections[key]);
```
package/lib/actions/movie.js
CHANGED
```diff
@@ -204,8 +204,9 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, context) => {
         videoIdsForBeats.push(videoId);
     }
     // NOTE: We don't support audio if the speed is not 1.0.
-
-
+    const movieVolume = beat.audioParams?.movieVolume ?? 1.0;
+    if (studioBeat.hasMovieAudio && movieVolume > 0.0 && speed === 1.0) {
+        const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, movieVolume);
         audioIdsFromMovieBeats.push(audioId);
         ffmpegContext.filterComplex.push(audioPart);
     }
```
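The new gate reads `beat.audioParams.movieVolume` (default 1.0) and mixes in the source movie's own soundtrack only when the clip actually has an audio track (the `hasMovieAudio` flag computed during audio combination) and plays at normal speed. A hypothetical beat fragment showing the knob; the field names come from this diff, the path is a placeholder:

```typescript
// Hypothetical beat: a movie clip whose own soundtrack is kept at half volume.
// Setting movieVolume to 0, or playing at movieParams.speed !== 1.0, mutes it.
const beat = {
  image: { type: "movie", source: { kind: "path", path: "./clips/intro.mp4" } },
  audioParams: { movieVolume: 0.5 },
};
```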
package/lib/agents/add_bgm_agent.js
CHANGED
```diff
@@ -10,7 +10,7 @@ const addBGMAgent = async ({ namedInputs, params, }) => {
     if (!musicFile.match(/^http/) && !fs.existsSync(musicFile)) {
         throw new Error(`AddBGMAgent musicFile not exist: ${musicFile}`);
     }
-    const speechDuration = await ffmpegGetMediaDuration(voiceFile);
+    const { duration: speechDuration } = await ffmpegGetMediaDuration(voiceFile);
     const introPadding = context.presentationStyle.audioParams.introPadding;
     const outroPadding = context.presentationStyle.audioParams.outroPadding;
     const totalDuration = speechDuration + introPadding + outroPadding;
```
package/lib/agents/combine_audio_files_agent.js
CHANGED
```diff
@@ -2,13 +2,14 @@ import { assert, GraphAILogger } from "graphai";
 import { silent60secPath } from "../utils/file.js";
 import { FfmpegContextInit, FfmpegContextGenerateOutput, FfmpegContextInputFormattedAudio, ffmpegGetMediaDuration } from "../utils/ffmpeg_utils.js";
 import { userAssert } from "../utils/utils.js";
-const
+const getMovieDuration = async (beat) => {
     if (beat.image?.type === "movie" && (beat.image.source.kind === "url" || beat.image.source.kind === "path")) {
         const pathOrUrl = beat.image.source.kind === "url" ? beat.image.source.url : beat.image.source.path;
         const speed = beat.movieParams?.speed ?? 1.0;
-
+        const { duration, hasAudio } = await ffmpegGetMediaDuration(pathOrUrl);
+        return { duration: duration / speed, hasAudio };
     }
-    return 0;
+    return { duration: 0, hasAudio: false };
 };
 const getPadding = (context, beat, index) => {
     if (beat.audioParams?.padding !== undefined) {
@@ -29,16 +30,17 @@ const getTotalPadding = (padding, movieDuration, audioDuration, duration) => {
     }
     return padding;
 };
-const
+const getMediaDurationsOfAllBeats = (context) => {
     return Promise.all(context.studio.beats.map(async (studioBeat, index) => {
         const beat = context.studio.script.beats[index];
-        const movieDuration = await
-        const audioDuration = studioBeat.audioFile ? await ffmpegGetMediaDuration(studioBeat.audioFile) : 0;
+        const { duration: movieDuration, hasAudio: hasMovieAudio } = await getMovieDuration(beat);
+        const audioDuration = studioBeat.audioFile ? (await ffmpegGetMediaDuration(studioBeat.audioFile)).duration : 0;
         return {
             movieDuration,
             audioDuration,
             hasMedia: movieDuration + audioDuration > 0,
             silenceDuration: 0,
+            hasMovieAudio,
         };
     }));
 };
@@ -64,7 +66,7 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
     const { context, combinedFileName } = namedInputs;
     const ffmpegContext = FfmpegContextInit();
     // First, get the audio durations of all beats, taking advantage of multi-threading capability of ffmpeg.
-    const mediaDurations = await
+    const mediaDurations = await getMediaDurationsOfAllBeats(context);
     const beatDurations = [];
     context.studio.script.beats.forEach((beat, index) => {
         if (beatDurations.length > index) {
@@ -196,6 +198,7 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
             audioDuration: mediaDurations[index].audioDuration,
             movieDuration: mediaDurations[index].movieDuration,
             silenceDuration: mediaDurations[index].silenceDuration,
+            hasMovieAudio: mediaDurations[index].hasMovieAudio,
         })),
     },
 };
```
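Every call site in this release destructures `{ duration, hasAudio }` from `ffmpegGetMediaDuration`, where 0.1.3 treated its result as a plain number. The accompanying `package/lib/utils/ffmpeg_utils.d.ts` change (+4 -1) is not rendered above, but the new declaration is presumably along these lines, inferred from the call sites rather than copied from the file:

```typescript
// Inferred shape only: the real declaration lives in package/lib/utils/ffmpeg_utils.d.ts.
export declare const ffmpegGetMediaDuration: (pathOrUrl: string) => Promise<{
    duration: number;
    hasAudio: boolean;
}>;
```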
package/lib/agents/image_google_agent.js
CHANGED
```diff
@@ -1,5 +1,6 @@
 import { GraphAILogger } from "graphai";
 import { getAspectRatio } from "./movie_google_agent.js";
+import { provider2ImageAgent } from "../utils/provider2agent.js";
 async function generateImage(projectId, model, token, prompt, aspectRatio) {
     const GOOGLE_IMAGEN_ENDPOINT = `https://us-central1-aiplatform.googleapis.com/v1/projects/${projectId}/locations/us-central1/publishers/google/models/${model}:predict`;
     try {
@@ -54,8 +55,7 @@ async function generateImage(projectId, model, token, prompt, aspectRatio) {
 export const imageGoogleAgent = async ({ namedInputs, params, config, }) => {
     const { prompt } = namedInputs;
     const aspectRatio = getAspectRatio(params.canvasSize);
-    const model = params.model ?? "
-    //const projectId = process.env.GOOGLE_PROJECT_ID; // Your Google Cloud Project ID
+    const model = params.model ?? provider2ImageAgent["google"].defaultModel;
     const projectId = config?.projectId;
     const token = config?.token;
     try {
```
package/lib/agents/image_openai_agent.js
CHANGED
```diff
@@ -2,13 +2,13 @@ import fs from "fs";
 import path from "path";
 import { GraphAILogger } from "graphai";
 import OpenAI, { toFile } from "openai";
-import {
+import { provider2ImageAgent } from "../utils/provider2agent.js";
 // https://platform.openai.com/docs/guides/image-generation
 export const imageOpenaiAgent = async ({ namedInputs, params, config, }) => {
     const { prompt, referenceImages } = namedInputs;
     const { moderation, canvasSize } = params;
     const { apiKey, baseURL } = { ...config };
-    const model = params.model ??
+    const model = params.model ?? provider2ImageAgent["openai"].defaultModel;
     const openai = new OpenAI({ apiKey, baseURL });
     const size = (() => {
         if (model === "gpt-image-1") {
```
package/lib/agents/movie_replicate_agent.js
CHANGED
```diff
@@ -21,7 +21,7 @@ async function generateMovie(model, apiKey, prompt, imagePath, aspectRatio, dura
     if (imagePath) {
         const buffer = readFileSync(imagePath);
         const base64Image = `data:image/png;base64,${buffer.toString("base64")}`;
-        if (model === "kwaivgi/kling-v2.1") {
+        if (model === "kwaivgi/kling-v2.1" || model === "kwaivgi/kling-v1.6-pro") {
             input.start_image = base64Image;
         }
         else {
```
package/lib/agents/tts_elevenlabs_agent.d.ts
CHANGED
```diff
@@ -1,4 +1,5 @@
 import type { AgentFunction, AgentFunctionInfo } from "graphai";
-
+import type { ElevenlabsTTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult, AgentConfig } from "../types/agent.js";
+export declare const ttsElevenlabsAgent: AgentFunction<ElevenlabsTTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs, AgentConfig>;
 declare const ttsElevenlabsAgentInfo: AgentFunctionInfo;
 export default ttsElevenlabsAgentInfo;
```
package/lib/agents/tts_elevenlabs_agent.js
CHANGED
```diff
@@ -1,5 +1,6 @@
 import { GraphAILogger } from "graphai";
-
+import { provider2TTSAgent } from "../utils/provider2agent.js";
+export const ttsElevenlabsAgent = async ({ namedInputs, params, config, }) => {
     const { text } = namedInputs;
     const { voice, model, stability, similarityBoost, suppressError } = params;
     const apiKey = config?.apiKey ?? process.env.ELEVENLABS_API_KEY;
@@ -7,12 +8,12 @@ export const ttsElevenlabsAgent = async ({ namedInputs, params, config }) => {
         throw new Error("ELEVENLABS_API_KEY environment variable is required");
     }
     if (!voice) {
-        throw new Error("Voice ID is required");
+        throw new Error("ELEVENLABS Voice ID is required");
     }
     try {
         const requestBody = {
             text,
-            model_id: model ??
+            model_id: model ?? provider2TTSAgent.elevenlabs.defaultModel,
             voice_settings: {
                 stability: stability ?? 0.5,
                 similarity_boost: similarityBoost ?? 0.75,
```
package/lib/agents/tts_google_agent.d.ts
CHANGED
```diff
@@ -1,12 +1,5 @@
 import type { AgentFunction, AgentFunctionInfo } from "graphai";
-
-
-    speed: number;
-    suppressError: boolean;
-}, {
-    buffer?: Buffer | null;
-}, {
-    text: string;
-}>;
+import type { GoogleTTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult } from "../types/agent.js";
+export declare const ttsGoogleAgent: AgentFunction<GoogleTTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs>;
 declare const ttsGoogleAgentInfo: AgentFunctionInfo;
 export default ttsGoogleAgentInfo;
```
package/lib/agents/tts_nijivoice_agent.d.ts
CHANGED
```diff
@@ -1,4 +1,5 @@
 import type { AgentFunction, AgentFunctionInfo } from "graphai";
-
+import type { NijivoiceTTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult, AgentConfig } from "../types/agent.js";
+export declare const ttsNijivoiceAgent: AgentFunction<NijivoiceTTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs, AgentConfig>;
 declare const ttsNijivoiceAgentInfo: AgentFunctionInfo;
 export default ttsNijivoiceAgentInfo;
```
package/lib/agents/tts_nijivoice_agent.js
CHANGED
```diff
@@ -6,11 +6,11 @@ const errorMessage = [
     "1. Obtain an API key from Niji Voice (https://platform.nijivoice.com/) and set it as the NIJIVOICE_API_KEY environment variable.",
     '2. Use OpenAI\'s TTS instead of Niji Voice by changing speechParams.provider from "nijivoice" to "openai".',
 ].join("\n");
-export const ttsNijivoiceAgent = async ({ params, namedInputs, config }) => {
+export const ttsNijivoiceAgent = async ({ params, namedInputs, config, }) => {
     const { suppressError, voice, speed, speed_global } = params;
     const { apiKey } = config ?? {};
     const { text } = namedInputs;
-    assert(apiKey ?? nijovoiceApiKey, errorMessage);
+    assert(!!(apiKey ?? nijovoiceApiKey), errorMessage);
     const url = `https://api.nijivoice.com/api/platform/v1/voice-actors/${voice}/generate-voice`;
     const options = {
         method: "POST",
@@ -31,7 +31,7 @@ export const ttsNijivoiceAgent = async ({ params, namedInputs, config }) => {
     if (voiceJson && voiceJson.generatedVoice && voiceJson.generatedVoice.audioFileDownloadUrl) {
         const audioRes = await fetch(voiceJson.generatedVoice.audioFileDownloadUrl);
         const buffer = Buffer.from(await audioRes.arrayBuffer());
-        return { buffer
+        return { buffer };
     }
     if (suppressError) {
         return {
```
package/lib/agents/tts_openai_agent.d.ts
CHANGED
```diff
@@ -1,16 +1,5 @@
 import type { AgentFunction, AgentFunctionInfo } from "graphai";
-
-
-    voice: string;
-    instructions: string;
-    suppressError: boolean;
-}, {
-    buffer?: Buffer;
-}, {
-    text: string;
-}, {
-    baseURL?: string;
-    apiKey?: string;
-}>;
+import type { OpenAITTSAgentParams, AgentBufferResult, AgentTextInputs, AgentErrorResult, OpenAIImageAgentConfig } from "../types/agent.js";
+export declare const ttsOpenaiAgent: AgentFunction<OpenAITTSAgentParams, AgentBufferResult | AgentErrorResult, AgentTextInputs, OpenAIImageAgentConfig>;
 declare const ttsOpenaiAgentInfo: AgentFunctionInfo;
 export default ttsOpenaiAgentInfo;
```
package/lib/agents/tts_openai_agent.js
CHANGED
```diff
@@ -1,14 +1,15 @@
 import { GraphAILogger } from "graphai";
 import OpenAI from "openai";
-
+import { provider2TTSAgent } from "../utils/provider2agent.js";
+export const ttsOpenaiAgent = async ({ namedInputs, params, config, }) => {
     const { text } = namedInputs;
     const { model, voice, suppressError, instructions } = params;
     const { apiKey, baseURL } = config ?? {};
     const openai = new OpenAI({ apiKey, baseURL });
     try {
         const tts_options = {
-            model: model ??
-            voice: voice ??
+            model: model ?? provider2TTSAgent.openai.defaultModel,
+            voice: voice ?? provider2TTSAgent.openai.defaultVoice,
             input: text,
         };
         if (instructions) {
```
package/lib/index.browser.d.ts
CHANGED
package/lib/index.browser.js
CHANGED
package/lib/index.d.ts
CHANGED
```diff
@@ -2,6 +2,7 @@ export * from "./actions/index.js";
 export * from "./cli/helpers.js";
 export * from "./utils/file.js";
 export * from "./utils/ffmpeg_utils.js";
+export * from "./utils/provider2agent.js";
 export * from "./methods/index.js";
 export * from "./agents/index.js";
 export * from "./types/index.js";
```
package/lib/index.js
CHANGED
```diff
@@ -1,7 +1,9 @@
+// NOTE: If you want to support usage in the browser codebase, also add to src/index.browser.ts
 export * from "./actions/index.js";
 export * from "./cli/helpers.js";
 export * from "./utils/file.js";
 export * from "./utils/ffmpeg_utils.js";
+export * from "./utils/provider2agent.js";
 export * from "./methods/index.js";
 export * from "./agents/index.js";
 export * from "./types/index.js";
```
package/lib/methods/mulmo_presentation_style.d.ts
CHANGED
```diff
@@ -7,7 +7,8 @@ export declare const MulmoPresentationStyleMethods: {
     getTextSlideStyle(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
     getSpeechOptions(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeechOptions | undefined;
     getSpeaker(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): SpeakerData;
-
+    getTTSProvider(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): Text2SpeechProvider;
+    getTTSModel(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string | undefined;
     getVoiceId(presentationStyle: MulmoPresentationStyle, beat: MulmoBeat): string;
     getText2ImageProvider(provider: Text2ImageProvider | undefined): Text2ImageProvider;
     getImageAgentInfo(presentationStyle: MulmoPresentationStyle, beat?: MulmoBeat): Text2ImageAgentInfo;
```