mulmocast 0.0.10 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/README.md +18 -3
  2. package/assets/templates/ghibli_shorts.json +34 -0
  3. package/assets/templates/shorts.json +18 -0
  4. package/assets/templates/trailer.json +25 -0
  5. package/lib/actions/audio.d.ts +2 -1
  6. package/lib/actions/audio.js +35 -17
  7. package/lib/actions/captions.js +5 -5
  8. package/lib/actions/images.d.ts +2 -1
  9. package/lib/actions/images.js +90 -58
  10. package/lib/actions/movie.js +53 -16
  11. package/lib/actions/pdf.js +3 -3
  12. package/lib/actions/translate.d.ts +2 -1
  13. package/lib/actions/translate.js +21 -16
  14. package/lib/agents/combine_audio_files_agent.js +4 -0
  15. package/lib/agents/image_google_agent.d.ts +4 -1
  16. package/lib/agents/image_google_agent.js +3 -2
  17. package/lib/agents/image_openai_agent.d.ts +5 -3
  18. package/lib/agents/image_openai_agent.js +35 -7
  19. package/lib/agents/index.d.ts +2 -1
  20. package/lib/agents/index.js +2 -1
  21. package/lib/agents/movie_google_agent.d.ts +9 -2
  22. package/lib/agents/movie_google_agent.js +24 -16
  23. package/lib/agents/tts_elevenlabs_agent.d.ts +4 -0
  24. package/lib/agents/tts_elevenlabs_agent.js +60 -0
  25. package/lib/agents/tts_google_agent.js +1 -1
  26. package/lib/agents/tts_nijivoice_agent.js +3 -2
  27. package/lib/agents/tts_openai_agent.js +1 -1
  28. package/lib/cli/commands/audio/handler.js +4 -1
  29. package/lib/cli/commands/image/handler.js +4 -1
  30. package/lib/cli/commands/movie/handler.js +4 -1
  31. package/lib/cli/commands/pdf/handler.js +4 -1
  32. package/lib/cli/commands/translate/handler.js +4 -1
  33. package/lib/cli/helpers.d.ts +3 -3
  34. package/lib/cli/helpers.js +38 -20
  35. package/lib/index.d.ts +5 -0
  36. package/lib/index.js +5 -0
  37. package/lib/methods/mulmo_media_source.d.ts +1 -0
  38. package/lib/methods/mulmo_media_source.js +12 -0
  39. package/lib/methods/mulmo_script.d.ts +1 -1
  40. package/lib/methods/mulmo_script.js +9 -5
  41. package/lib/methods/mulmo_studio_context.d.ts +5 -0
  42. package/lib/methods/mulmo_studio_context.js +23 -0
  43. package/lib/types/index.d.ts +1 -0
  44. package/lib/types/index.js +1 -0
  45. package/lib/types/schema.d.ts +1513 -290
  46. package/lib/types/schema.js +26 -35
  47. package/lib/types/type.d.ts +4 -1
  48. package/lib/utils/file.d.ts +5 -15
  49. package/lib/utils/file.js +14 -21
  50. package/lib/utils/filters.js +4 -4
  51. package/lib/utils/image_plugins/beat.d.ts +4 -0
  52. package/lib/utils/image_plugins/beat.js +7 -0
  53. package/lib/utils/image_plugins/image.d.ts +1 -1
  54. package/lib/utils/image_plugins/index.d.ts +2 -1
  55. package/lib/utils/image_plugins/index.js +2 -1
  56. package/lib/utils/image_plugins/movie.d.ts +1 -1
  57. package/lib/utils/image_plugins/source.js +2 -2
  58. package/lib/utils/preprocess.d.ts +26 -23
  59. package/lib/utils/preprocess.js +4 -0
  60. package/package.json +8 -8
  61. package/scripts/templates/movie_prompts_no_text_template.json +50 -0
  62. package/scripts/templates/shorts_template.json +52 -0

package/lib/actions/movie.js
@@ -1,8 +1,9 @@
- import { GraphAILogger } from "graphai";
+ import { GraphAILogger, assert } from "graphai";
+ import { mulmoTransitionSchema } from "../types/index.js";
  import { MulmoScriptMethods } from "../methods/index.js";
  import { getAudioArtifactFilePath, getOutputVideoFilePath, writingMessage } from "../utils/file.js";
  import { FfmpegContextAddInput, FfmpegContextInit, FfmpegContextPushFormattedAudio, FfmpegContextGenerateOutput } from "../utils/ffmpeg_utils.js";
- import { MulmoStudioMethods } from "../methods/mulmo_studio.js";
+ import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  // const isMac = process.platform === "darwin";
  const videoCodec = "libx264"; // "h264_videotoolbox" (macOS only) is too noisy
  export const getVideoPart = (inputIndex, mediaType, duration, canvasInfo) => {
@@ -38,10 +39,10 @@ export const getAudioPart = (inputIndex, duration, delay, mixAudio) => {
  `[${audioId}]`,
  };
  };
- const getOutputOption = (audioId) => {
+ const getOutputOption = (audioId, videoId) => {
  return [
  "-preset medium", // Changed from veryfast to medium for better compression
- "-map [v]", // Map the video stream
+ `-map [${videoId}]`, // Map the video stream
  `-map ${audioId}`, // Map the audio stream
  `-c:v ${videoCodec}`, // Set video codec
  ...(videoCodec === "libx264" ? ["-crf", "26"] : []), // Add CRF for libx264
@@ -61,20 +62,27 @@ const getOutputOption = (audioId) => {
  const createVideo = async (audioArtifactFilePath, outputVideoPath, studio, caption) => {
  const start = performance.now();
  const ffmpegContext = FfmpegContextInit();
- if (studio.beats.some((beat) => !beat.imageFile)) {
- GraphAILogger.info("beat.imageFile is not set. Please run `yarn run images ${file}` ");
- return;
+ const missingIndex = studio.beats.findIndex((beat) => !beat.imageFile && !beat.movieFile);
+ if (missingIndex !== -1) {
+ GraphAILogger.info(`ERROR: beat.imageFile or beat.movieFile is not set on beat ${missingIndex}.`);
+ return false;
  }
  const canvasInfo = MulmoScriptMethods.getCanvasSize(studio.script);
  // Add each image input
  const filterComplexVideoIds = [];
  const filterComplexAudioIds = [];
+ const transitionVideoIds = [];
+ const beatTimestamps = [];
  studio.beats.reduce((timestamp, studioBeat, index) => {
  const beat = studio.script.beats[index];
- if (!studioBeat.imageFile || !studioBeat.duration) {
- throw new Error(`studioBeat.imageFile or studioBeat.duration is not set: index=${index}`);
+ const sourceFile = studioBeat.movieFile ?? studioBeat.imageFile;
+ if (!sourceFile) {
+ throw new Error(`studioBeat.imageFile or studioBeat.movieFile is not set: index=${index}`);
  }
- const inputIndex = FfmpegContextAddInput(ffmpegContext, studioBeat.movieFile ?? studioBeat.imageFile);
+ if (!studioBeat.duration) {
+ throw new Error(`studioBeat.duration is not set: index=${index}`);
+ }
+ const inputIndex = FfmpegContextAddInput(ffmpegContext, sourceFile);
  const mediaType = studioBeat.movieFile ? "movie" : MulmoScriptMethods.getImageType(studio.script, beat);
  const extraPadding = (() => {
  // We need to consider only intro and outro padding because the other paddings were already added to the beat.duration
@@ -98,16 +106,43 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, studio, capti
  else {
  filterComplexVideoIds.push(videoId);
  }
+ if (studio.script.movieParams?.transition && index < studio.beats.length - 1) {
+ const sourceId = filterComplexVideoIds.pop();
+ ffmpegContext.filterComplex.push(`[${sourceId}]split=2[${sourceId}_0][${sourceId}_1]`);
+ filterComplexVideoIds.push(`${sourceId}_0`);
+ transitionVideoIds.push(`${sourceId}_1`);
+ }
  if (beat.image?.type == "movie" && beat.image.mixAudio > 0.0) {
  const { audioId, audioPart } = getAudioPart(inputIndex, duration, timestamp, beat.image.mixAudio);
  filterComplexAudioIds.push(audioId);
  ffmpegContext.filterComplex.push(audioPart);
  }
+ beatTimestamps.push(timestamp);
  return timestamp + duration;
  }, 0);
+ assert(filterComplexVideoIds.length === studio.beats.length, "videoIds.length !== studio.beats.length");
+ assert(beatTimestamps.length === studio.beats.length, "beatTimestamps.length !== studio.beats.length");
  // console.log("*** images", images.audioIds);
  // Concatenate the trimmed images
- ffmpegContext.filterComplex.push(`${filterComplexVideoIds.map((id) => `[${id}]`).join("")}concat=n=${studio.beats.length}:v=1:a=0[v]`);
+ const concatVideoId = "concat_video";
+ ffmpegContext.filterComplex.push(`${filterComplexVideoIds.map((id) => `[${id}]`).join("")}concat=n=${studio.beats.length}:v=1:a=0[${concatVideoId}]`);
+ // Add tranditions if needed
+ const mixedVideoId = (() => {
+ if (studio.script.movieParams?.transition && transitionVideoIds.length > 1) {
+ const transition = mulmoTransitionSchema.parse(studio.script.movieParams.transition);
+ return transitionVideoIds.reduce((acc, transitionVideoId, index) => {
+ const transitionStartTime = beatTimestamps[index + 1] - 0.05; // 0.05 is to avoid flickering
+ const processedVideoId = `${transitionVideoId}_f`;
+ // TODO: This mechanism does not work for video beats yet. It works only with image beats.
+ // If we can to add other transition types than fade, we need to add them here.
+ ffmpegContext.filterComplex.push(`[${transitionVideoId}]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[${processedVideoId}]`);
+ const outputId = `${transitionVideoId}_o`;
+ ffmpegContext.filterComplex.push(`[${acc}][${processedVideoId}]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[${outputId}]`);
+ return outputId;
+ }, concatVideoId);
+ }
+ return concatVideoId;
+ })();
  const audioIndex = FfmpegContextAddInput(ffmpegContext, audioArtifactFilePath); // Add audio input
  const artifactAudioId = `${audioIndex}:a`;
  const ffmpegContextAudioId = (() => {
@@ -121,23 +156,25 @@ const createVideo = async (audioArtifactFilePath, outputVideoPath, studio, capti
  }
  return artifactAudioId;
  })();
- await FfmpegContextGenerateOutput(ffmpegContext, outputVideoPath, getOutputOption(ffmpegContextAudioId));
+ await FfmpegContextGenerateOutput(ffmpegContext, outputVideoPath, getOutputOption(ffmpegContextAudioId, mixedVideoId));
  const end = performance.now();
  GraphAILogger.info(`Video created successfully! ${Math.round(end - start) / 1000} sec`);
  GraphAILogger.info(studio.script.title);
  GraphAILogger.info((studio.script.references ?? []).map((reference) => `${reference.title} (${reference.url})`).join("\n"));
+ return true;
  };
  export const movie = async (context) => {
- MulmoStudioMethods.setSessionState(context.studio, "video", true);
+ MulmoStudioContextMethods.setSessionState(context, "video", true);
  try {
  const { studio, fileDirs, caption } = context;
  const { outDirPath } = fileDirs;
  const audioArtifactFilePath = getAudioArtifactFilePath(outDirPath, studio.filename);
  const outputVideoPath = getOutputVideoFilePath(outDirPath, studio.filename, context.lang, caption);
- await createVideo(audioArtifactFilePath, outputVideoPath, studio, caption);
- writingMessage(outputVideoPath);
+ if (await createVideo(audioArtifactFilePath, outputVideoPath, studio, caption)) {
+ writingMessage(outputVideoPath);
+ }
  }
  finally {
- MulmoStudioMethods.setSessionState(context.studio, "video", false);
+ MulmoStudioContextMethods.setSessionState(context, "video", false);
  }
  };
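
The transition mechanism introduced here works by splitting each beat's video stream in ffmpeg: one copy feeds the concat chain, the other is faded out (with an alpha channel) and overlaid on top of the concatenated video around the next beat's start time. A minimal sketch of the filter_complex strings this path appears to emit, assuming two image beats, a fade duration of 0.3 s, and the second beat starting at t = 5.0 s; the stream IDs and timings are illustrative, not taken from the package:

```typescript
// Illustrative only: mirrors the string construction in the hunk above for a 2-beat script.
const transition = { type: "fade", duration: 0.3 }; // assumed shape accepted by mulmoTransitionSchema
const transitionStartTime = 5.0 - 0.05; // next beat start minus 0.05 s, as in the diff, to avoid flickering
const filterComplex = [
  "[v0]split=2[v0_0][v0_1]", // keep one copy for concat, one for the overlay
  "[v0_0][v1]concat=n=2:v=1:a=0[concat_video]", // concatenate the per-beat streams
  `[v0_1]format=yuva420p,fade=t=out:d=${transition.duration}:alpha=1,setpts=PTS-STARTPTS+${transitionStartTime}/TB[v0_1_f]`, // fade the copy out
  `[concat_video][v0_1_f]overlay=enable='between(t,${transitionStartTime},${transitionStartTime + transition.duration})'[v0_1_o]`, // blend it over the concat output
];
console.log(filterComplex.join(";\n")); // "-map [v0_1_o]" then becomes the videoId passed to getOutputOption()
```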

package/lib/actions/pdf.js
@@ -6,7 +6,7 @@ import { chunkArray, isHttp, localizedText } from "../utils/utils.js";
  import { getOutputPdfFilePath, writingMessage } from "../utils/file.js";
  import { MulmoScriptMethods } from "../methods/index.js";
  import { fontSize, textMargin, drawSize, wrapText } from "../utils/pdf.js";
- import { MulmoStudioMethods } from "../methods/mulmo_studio.js";
+ import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  const imagesPerPage = 4;
  const offset = 10;
  const handoutImageRatio = 0.5;
@@ -224,10 +224,10 @@ const generatePdf = async (context, pdfMode, pdfSize) => {
  };
  export const pdf = async (context, pdfMode, pdfSize) => {
  try {
- MulmoStudioMethods.setSessionState(context.studio, "pdf", true);
+ MulmoStudioContextMethods.setSessionState(context, "pdf", true);
  await generatePdf(context, pdfMode, pdfSize);
  }
  finally {
- MulmoStudioMethods.setSessionState(context.studio, "pdf", false);
+ MulmoStudioContextMethods.setSessionState(context, "pdf", false);
  }
  };

package/lib/actions/translate.d.ts
@@ -1,3 +1,4 @@
  import "dotenv/config";
+ import type { CallbackFunction } from "graphai";
  import { MulmoStudioContext } from "../types/index.js";
- export declare const translate: (context: MulmoStudioContext) => Promise<void>;
+ export declare const translate: (context: MulmoStudioContext, callbacks?: CallbackFunction[]) => Promise<void>;

package/lib/actions/translate.js
@@ -6,19 +6,19 @@ import { fileWriteAgent } from "@graphai/vanilla_node_agents";
  import { recursiveSplitJa, replacementsJa, replacePairsJa } from "../utils/string.js";
  import { getOutputStudioFilePath, mkdir, writingMessage } from "../utils/file.js";
  import { translateSystemPrompt, translatePrompts } from "../utils/prompt.js";
- import { MulmoStudioMethods } from "../methods/mulmo_studio.js";
+ import { MulmoStudioContextMethods } from "../methods/mulmo_studio_context.js";
  const vanillaAgents = agents.default ?? agents;
  const translateGraph = {
  version: 0.5,
  nodes: {
- studio: {},
+ context: {},
  defaultLang: {},
  outDirPath: {},
  outputStudioFilePath: {},
  lang: {
  agent: "stringUpdateTextAgent",
  inputs: {
- newText: ":studio.script.lang",
+ newText: ":context.studio.script.lang",
  oldText: ":defaultLang",
  },
  },
@@ -27,15 +27,15 @@ const translateGraph = {
  isResult: true,
  agent: "mergeObjectAgent",
  inputs: {
- items: [":studio", { multiLingual: ":beatsMap.mergeMultiLingualData" }],
+ items: [":context.studio", { multiLingual: ":beatsMap.mergeMultiLingualData" }],
  },
  },
  beatsMap: {
  agent: "mapAgent",
  inputs: {
  targetLangs: ":targetLangs",
- studio: ":studio",
- rows: ":studio.script.beats",
+ context: ":context",
+ rows: ":context.studio.script.beats",
  lang: ":lang",
  },
  params: {
@@ -52,7 +52,7 @@ const translateGraph = {
  },
  inputs: {
  index: ":__mapIndex",
- rows: ":studio.multiLingual",
+ rows: ":context.studio.multiLingual",
  },
  },
  preprocessMultiLingual: {
@@ -62,7 +62,7 @@ const translateGraph = {
  multiLingual: ":multiLingual",
  rows: ":targetLangs",
  lang: ":lang.text",
- studio: ":studio",
+ context: ":context",
  beatIndex: ":__mapIndex",
  },
  params: {
@@ -79,7 +79,7 @@ const translateGraph = {
  multiLingual: ":multiLingual", // for cache
  lang: ":lang", // for cache
  beatIndex: ":beatIndex", // for cache
- studio: ":studio", // for cache
+ mulmoContext: ":context", // for cache
  system: translateSystemPrompt,
  prompt: translatePrompts,
  },
@@ -175,7 +175,7 @@ const translateGraph = {
  };
  const localizedTextCacheAgentFilter = async (context, next) => {
  const { namedInputs } = context;
- const { studio, targetLang, beat, beatIndex, lang, multiLingual } = namedInputs;
+ const { mulmoContext, targetLang, beat, beatIndex, lang, multiLingual } = namedInputs;
  if (!beat.text) {
  return { text: "" };
  }
@@ -192,11 +192,11 @@ const localizedTextCacheAgentFilter = async (context, next) => {
  return { text: beat.text };
  }
  try {
- MulmoStudioMethods.setBeatSessionState(studio, "multiLingual", beatIndex, true);
+ MulmoStudioContextMethods.setBeatSessionState(mulmoContext, "multiLingual", beatIndex, true);
  return await next(context);
  }
  finally {
- MulmoStudioMethods.setBeatSessionState(studio, "multiLingual", beatIndex, false);
+ MulmoStudioContextMethods.setBeatSessionState(mulmoContext, "multiLingual", beatIndex, false);
  }
  };
  const agentFilters = [
@@ -208,20 +208,25 @@ const agentFilters = [
  ];
  const defaultLang = "en";
  const targetLangs = ["ja", "en"];
- export const translate = async (context) => {
+ export const translate = async (context, callbacks) => {
  try {
- MulmoStudioMethods.setSessionState(context.studio, "multiLingual", true);
+ MulmoStudioContextMethods.setSessionState(context, "multiLingual", true);
  const { studio, fileDirs } = context;
  const { outDirPath } = fileDirs;
  const outputStudioFilePath = getOutputStudioFilePath(outDirPath, studio.filename);
  mkdir(outDirPath);
  assert(!!process.env.OPENAI_API_KEY, "The OPENAI_API_KEY environment variable is missing or empty");
  const graph = new GraphAI(translateGraph, { ...vanillaAgents, fileWriteAgent, openAIAgent }, { agentFilters });
- graph.injectValue("studio", studio);
+ graph.injectValue("context", context);
  graph.injectValue("defaultLang", defaultLang);
  graph.injectValue("targetLangs", targetLangs);
  graph.injectValue("outDirPath", outDirPath);
  graph.injectValue("outputStudioFilePath", outputStudioFilePath);
+ if (callbacks) {
+ callbacks.forEach((callback) => {
+ graph.registerCallback(callback);
+ });
+ }
  const results = await graph.run();
  writingMessage(outputStudioFilePath);
  if (results.mergeStudioResult) {
@@ -229,6 +234,6 @@ export const translate = async (context) => {
  }
  }
  finally {
- MulmoStudioMethods.setSessionState(context.studio, "multiLingual", false);
+ MulmoStudioContextMethods.setSessionState(context, "multiLingual", false);
  }
  };
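
With the optional callbacks parameter added here, callers can observe the translation graph while it runs; each callback is passed to graph.registerCallback before graph.run. A hypothetical invocation, assuming the deep import paths shown in this diff resolve in your setup (the callback body and the context preparation are illustrative, not from the package):

```typescript
import type { CallbackFunction } from "graphai";
import { translate } from "mulmocast/lib/actions/translate.js"; // path per this diff; adjust to your resolver
import type { MulmoStudioContext } from "mulmocast/lib/types/index.js";

declare const context: MulmoStudioContext; // prepared elsewhere (studio, fileDirs, lang, ...)

// Hypothetical progress observer; the payload is whatever GraphAI hands to registered callbacks.
const onUpdate: CallbackFunction = (log) => {
  console.log("translate progress:", log);
};

await translate(context, [onUpdate]);
```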

package/lib/agents/combine_audio_files_agent.js
@@ -26,11 +26,15 @@ const combineAudioFilesAgent = async ({ namedInputs, }) => {
  const totalPadding = await (async () => {
  if (beat.image?.type === "movie" && (beat.image.source.kind === "url" || beat.image.source.kind === "path")) {
  const pathOrUrl = beat.image.source.kind === "url" ? beat.image.source.url : beat.image.source.path;
+ // NOTE: We respect the duration of the movie, only if the movie is specified as a madia source, NOT generated.
  const movieDuration = await ffmpegGetMediaDuration(pathOrUrl);
  if (movieDuration > audioDuration) {
  return padding + (movieDuration - audioDuration);
  }
  }
+ else if (beat.duration && beat.duration > audioDuration) {
+ return padding + (beat.duration - audioDuration);
+ }
  return padding;
  })();
  studioBeat.duration = audioDuration + totalPadding;
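
The rule this hunk adds can be read as: a beat's final duration is the narration length plus padding, and the padding grows when either a referenced movie file (url/path source) or an explicit beat.duration outlasts the audio. A small worked example with illustrative numbers:

```typescript
// Illustrative numbers only: 4.0 s of narration, 0.3 s base padding.
const audioDuration = 4.0;
const padding = 0.3;

// Case 1: the beat references a 6.5 s movie file, so the movie length wins.
const movieDuration = 6.5;
const withMovie = audioDuration + padding + (movieDuration - audioDuration); // studioBeat.duration = 6.8

// Case 2: no movie source, but the script sets beat.duration = 5.0, so that wins.
const beatDuration = 5.0;
const withBeatDuration = audioDuration + padding + (beatDuration - audioDuration); // studioBeat.duration = 5.3

console.log(withMovie, withBeatDuration); // 6.8 5.3
```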

package/lib/agents/image_google_agent.d.ts
@@ -5,7 +5,10 @@ export type ImageGoogleConfig = {
  };
  export declare const imageGoogleAgent: AgentFunction<{
  model: string;
- aspectRatio: string;
+ canvasSize: {
+ width: number;
+ height: number;
+ };
  }, {
  buffer: Buffer;
  }, {

package/lib/agents/image_google_agent.js
@@ -1,4 +1,5 @@
  import { GraphAILogger } from "graphai";
+ import { getAspectRatio } from "./movie_google_agent.js";
  async function generateImage(projectId, model, token, prompt, aspectRatio) {
  const GOOGLE_IMAGEN_ENDPOINT = `https://us-central1-aiplatform.googleapis.com/v1/projects/${projectId}/locations/us-central1/publishers/google/models/${model}:predict`;
  try {
@@ -50,9 +51,9 @@ async function generateImage(projectId, model, token, prompt, aspectRatio) {
  throw error;
  }
  }
- export const imageGoogleAgent = async ({ namedInputs, params, config, }) => {
+ export const imageGoogleAgent = async ({ namedInputs, params, config }) => {
  const { prompt } = namedInputs;
- const aspectRatio = params.aspectRatio ?? "16:9";
+ const aspectRatio = getAspectRatio(params.canvasSize);
  const model = params.model ?? "imagen-3.0-fast-generate-001";
  //const projectId = process.env.GOOGLE_PROJECT_ID; // Your Google Cloud Project ID
  const projectId = config?.projectId;

package/lib/agents/image_openai_agent.d.ts
@@ -1,16 +1,18 @@
  import { AgentFunction, AgentFunctionInfo } from "graphai";
- type OpenAIImageSize = "1792x1024" | "auto" | "1024x1024" | "1536x1024" | "1024x1536" | "256x256";
  type OpenAIModeration = "low" | "auto";
  export declare const imageOpenaiAgent: AgentFunction<{
  apiKey: string;
  model: string;
- size: OpenAIImageSize | null | undefined;
  moderation: OpenAIModeration | null | undefined;
- images: string[] | null | undefined;
+ canvasSize: {
+ width: number;
+ height: number;
+ };
  }, {
  buffer: Buffer;
  }, {
  prompt: string;
+ images: string[] | null | undefined;
  }>;
  declare const imageOpenaiAgentInfo: AgentFunctionInfo;
  export default imageOpenaiAgentInfo;

package/lib/agents/image_openai_agent.js
@@ -1,15 +1,41 @@
  import fs from "fs";
+ import path from "path";
  import OpenAI, { toFile } from "openai";
  // https://platform.openai.com/docs/guides/image-generation
  export const imageOpenaiAgent = async ({ namedInputs, params }) => {
- const { prompt } = namedInputs;
- const { apiKey, model, size, moderation, images } = params;
+ const { prompt, images } = namedInputs;
+ const { apiKey, moderation, canvasSize } = params;
+ const model = params.model ?? "dall-e-3";
  const openai = new OpenAI({ apiKey });
+ const size = (() => {
+ if (model === "gpt-image-1") {
+ if (canvasSize.width > canvasSize.height) {
+ return "1536x1024";
+ }
+ else if (canvasSize.width < canvasSize.height) {
+ return "1024x1536";
+ }
+ else {
+ return "1024x1024";
+ }
+ }
+ else {
+ if (canvasSize.width > canvasSize.height) {
+ return "1792x1024";
+ }
+ else if (canvasSize.width < canvasSize.height) {
+ return "1024x1792";
+ }
+ else {
+ return "1024x1024";
+ }
+ }
+ })();
  const imageOptions = {
- model: model ?? "dall-e-3",
+ model,
  prompt,
  n: 1,
- size: size ?? (model === "gpt-image-1" ? "1536x1024" : "1792x1024"),
+ size,
  };
  if (model === "gpt-image-1") {
  imageOptions.moderation = moderation || "auto";
@@ -17,9 +43,11 @@ export const imageOpenaiAgent = async ({ namedInputs, params }) => {
  const response = await (async () => {
  const targetSize = imageOptions.size;
  if ((images ?? []).length > 0 && (targetSize === "1536x1024" || targetSize === "1024x1536" || targetSize === "1024x1024")) {
- const imagelist = await Promise.all((images ?? []).map(async (file) => await toFile(fs.createReadStream(file), null, {
- type: "image/png", // TODO: Support JPEG as well
- })));
+ const imagelist = await Promise.all((images ?? []).map(async (file) => {
+ const ext = path.extname(file).toLowerCase();
+ const type = ext === ".jpg" || ext === ".jpeg" ? "image/jpeg" : "image/png";
+ return await toFile(fs.createReadStream(file), null, { type });
+ }));
  return await openai.images.edit({ ...imageOptions, size: targetSize, image: imagelist });
  }
  else {
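
The image size is now derived from the script's canvas dimensions instead of being passed in: landscape, portrait, and square canvases map to each model's supported resolutions (1536x1024 / 1024x1536 / 1024x1024 for gpt-image-1, 1792x1024 / 1024x1792 / 1024x1024 otherwise). The same mapping expressed as a standalone helper, for illustration only (not part of the package API):

```typescript
type CanvasSize = { width: number; height: number };

// Mirrors the size-selection logic in the hunk above.
const pickOpenAIImageSize = (model: string, canvasSize: CanvasSize): string => {
  const landscape = canvasSize.width > canvasSize.height;
  const portrait = canvasSize.width < canvasSize.height;
  if (model === "gpt-image-1") {
    return landscape ? "1536x1024" : portrait ? "1024x1536" : "1024x1024";
  }
  return landscape ? "1792x1024" : portrait ? "1024x1792" : "1024x1024";
};

console.log(pickOpenAIImageSize("gpt-image-1", { width: 1080, height: 1920 })); // "1024x1536"
console.log(pickOpenAIImageSize("dall-e-3", { width: 1280, height: 720 })); // "1792x1024"
```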

package/lib/agents/index.d.ts
@@ -2,6 +2,7 @@ import addBGMAgent from "./add_bgm_agent.js";
  import combineAudioFilesAgent from "./combine_audio_files_agent.js";
  import imageGoogleAgent from "./image_google_agent.js";
  import imageOpenaiAgent from "./image_openai_agent.js";
+ import ttsElevenlabsAgent from "./tts_elevenlabs_agent.js";
  import ttsNijivoiceAgent from "./tts_nijivoice_agent.js";
  import ttsOpenaiAgent from "./tts_openai_agent.js";
  import validateSchemaAgent from "./validate_schema_agent.js";
@@ -9,4 +10,4 @@ import { browserlessAgent } from "@graphai/browserless_agent";
  import { textInputAgent } from "@graphai/input_agents";
  import { openAIAgent } from "@graphai/openai_agent";
  import { fileWriteAgent } from "@graphai/vanilla_node_agents";
- export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGoogleAgent, imageOpenaiAgent, ttsNijivoiceAgent, ttsOpenaiAgent, validateSchemaAgent, };
+ export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGoogleAgent, imageOpenaiAgent, ttsElevenlabsAgent, ttsNijivoiceAgent, ttsOpenaiAgent, validateSchemaAgent, };

package/lib/agents/index.js
@@ -2,6 +2,7 @@ import addBGMAgent from "./add_bgm_agent.js";
  import combineAudioFilesAgent from "./combine_audio_files_agent.js";
  import imageGoogleAgent from "./image_google_agent.js";
  import imageOpenaiAgent from "./image_openai_agent.js";
+ import ttsElevenlabsAgent from "./tts_elevenlabs_agent.js";
  import ttsNijivoiceAgent from "./tts_nijivoice_agent.js";
  import ttsOpenaiAgent from "./tts_openai_agent.js";
  import validateSchemaAgent from "./validate_schema_agent.js";
@@ -10,4 +11,4 @@ import { textInputAgent } from "@graphai/input_agents";
  import { openAIAgent } from "@graphai/openai_agent";
  // import * as vanilla from "@graphai/vanilla";
  import { fileWriteAgent } from "@graphai/vanilla_node_agents";
- export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGoogleAgent, imageOpenaiAgent, ttsNijivoiceAgent, ttsOpenaiAgent, validateSchemaAgent, };
+ export { openAIAgent, fileWriteAgent, browserlessAgent, textInputAgent, addBGMAgent, combineAudioFilesAgent, imageGoogleAgent, imageOpenaiAgent, ttsElevenlabsAgent, ttsNijivoiceAgent, ttsOpenaiAgent, validateSchemaAgent, };

package/lib/agents/movie_google_agent.d.ts
@@ -3,15 +3,22 @@ export type MovieGoogleConfig = {
  projectId?: string;
  token?: string;
  };
+ export declare const getAspectRatio: (canvasSize: {
+ width: number;
+ height: number;
+ }) => string;
  export declare const movieGoogleAgent: AgentFunction<{
  model: string;
- aspectRatio: string;
+ canvasSize: {
+ width: number;
+ height: number;
+ };
  duration?: number;
  }, {
  buffer: Buffer;
  }, {
  prompt: string;
- imagePath: string;
+ imagePath?: string;
  }, MovieGoogleConfig>;
  declare const movieGoogleAgentInfo: AgentFunctionInfo;
  export default movieGoogleAgentInfo;

package/lib/agents/movie_google_agent.js
@@ -2,26 +2,29 @@ import { readFileSync } from "fs";
  import { GraphAILogger, sleep } from "graphai";
  async function generateMovie(projectId, model, token, prompt, imagePath, aspectRatio, duration) {
  const GOOGLE_IMAGEN_ENDPOINT = `https://us-central1-aiplatform.googleapis.com/v1/projects/${projectId}/locations/us-central1/publishers/google/models/${model}`;
- // Prepare the payload for the API request
- const buffer = readFileSync(imagePath);
- const bytesBase64Encoded = buffer.toString("base64");
  const payload = {
  instances: [
  {
  prompt: prompt,
- image: {
- bytesBase64Encoded,
- mimeType: "image/png",
- },
+ image: undefined,
  },
  ],
  parameters: {
  sampleCount: 1,
  aspectRatio: aspectRatio,
- //safetySetting: "block_only_high",
+ safetySetting: "block_only_high",
+ personGeneration: "allow_all",
  durationSeconds: duration,
  },
  };
+ if (imagePath) {
+ const buffer = readFileSync(imagePath);
+ const bytesBase64Encoded = buffer.toString("base64");
+ payload.instances[0].image = {
+ bytesBase64Encoded,
+ mimeType: "image/png",
+ };
+ }
  // Make the API call using fetch
  const response = await fetch(`${GOOGLE_IMAGEN_ENDPOINT}:predictLongRunning`, {
  method: "POST",
@@ -32,6 +35,7 @@ async function generateMovie(projectId, model, token, prompt, imagePath, aspectR
  body: JSON.stringify(payload),
  });
  if (!response.ok) {
+ GraphAILogger.info("create project on google cloud console and setup the project. More details see readme.");
  throw new Error(`Error: ${response.status} - ${response.statusText}`);
  }
  const initialResponse = await response.json();
@@ -72,18 +76,22 @@ async function generateMovie(projectId, model, token, prompt, imagePath, aspectR
  }
  return undefined;
  }
+ export const getAspectRatio = (canvasSize) => {
+ if (canvasSize.width > canvasSize.height) {
+ return "16:9";
+ }
+ else if (canvasSize.width < canvasSize.height) {
+ return "9:16";
+ }
+ else {
+ return "1:1";
+ }
+ };
  export const movieGoogleAgent = async ({ namedInputs, params, config }) => {
  const { prompt, imagePath } = namedInputs;
- /*
- if (prompt) {
- const buffer = Buffer.from(prompt);
- return { buffer };
- }
- */
- const aspectRatio = params.aspectRatio ?? "16:9";
+ const aspectRatio = getAspectRatio(params.canvasSize);
  const model = params.model ?? "veo-2.0-generate-001"; // "veo-3.0-generate-preview";
  const duration = params.duration ?? 8;
- //const projectId = process.env.GOOGLE_PROJECT_ID; // Your Google Cloud Project ID
  const projectId = config?.projectId;
  const token = config?.token;
  try {
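
Both Google agents now derive the aspect ratio from the script's canvas size through the newly exported getAspectRatio helper, and imagePath is optional so the Veo agent can run text-to-video (no source frame) as well as image-to-video. A short usage sketch of the helper, assuming the deep import path in this diff resolves in your setup:

```typescript
// Path taken from this diff; adjust to how your project resolves the package.
import { getAspectRatio } from "mulmocast/lib/agents/movie_google_agent.js";

console.log(getAspectRatio({ width: 1280, height: 720 })); // "16:9" (landscape canvas)
console.log(getAspectRatio({ width: 1080, height: 1920 })); // "9:16" (portrait canvas)
console.log(getAspectRatio({ width: 1024, height: 1024 })); // "1:1" (square canvas)
```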

package/lib/agents/tts_elevenlabs_agent.d.ts
@@ -0,0 +1,4 @@
+ import type { AgentFunction, AgentFunctionInfo } from "graphai";
+ export declare const ttsElevenlabsAgent: AgentFunction;
+ declare const ttsElevenlabsAgentInfo: AgentFunctionInfo;
+ export default ttsElevenlabsAgentInfo;

package/lib/agents/tts_elevenlabs_agent.js
@@ -0,0 +1,60 @@
+ import { GraphAILogger } from "graphai";
+ export const ttsElevenlabsAgent = async ({ namedInputs, params }) => {
+ const { text } = namedInputs;
+ const { voice, model, stability, similarityBoost, suppressError } = params;
+ const apiKey = process.env.ELEVENLABS_API_KEY;
+ if (!apiKey) {
+ throw new Error("ELEVENLABS_API_KEY environment variable is required");
+ }
+ if (!voice) {
+ throw new Error("Voice ID is required");
+ }
+ try {
+ const requestBody = {
+ text,
+ model_id: model ?? "eleven_monolingual_v1",
+ voice_settings: {
+ stability: stability ?? 0.5,
+ similarity_boost: similarityBoost ?? 0.75,
+ },
+ };
+ GraphAILogger.log("ElevenLabs TTS options", requestBody);
+ const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voice}`, {
+ method: "POST",
+ headers: {
+ Accept: "audio/mpeg",
+ "Content-Type": "application/json",
+ "xi-api-key": apiKey,
+ },
+ body: JSON.stringify(requestBody),
+ });
+ if (!response.ok) {
+ throw new Error(`Eleven Labs API error: ${response.status} ${response.statusText}`);
+ }
+ const arrayBuffer = await response.arrayBuffer();
+ const buffer = Buffer.from(arrayBuffer);
+ return { buffer };
+ }
+ catch (e) {
+ if (suppressError) {
+ return {
+ error: e,
+ };
+ }
+ GraphAILogger.info(e);
+ throw new Error("TTS Eleven Labs Error");
+ }
+ };
+ const ttsElevenlabsAgentInfo = {
+ name: "ttsElevenlabsAgent",
+ agent: ttsElevenlabsAgent,
+ mock: ttsElevenlabsAgent,
+ samples: [],
+ description: "Eleven Labs TTS agent",
+ category: ["tts"],
+ author: "Receptron Team",
+ repository: "https://github.com/receptron/mulmocast-cli/",
+ license: "MIT",
+ environmentVariables: ["ELEVENLABS_API_KEY"],
+ };
+ export default ttsElevenlabsAgentInfo;
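
The new agent follows the same contract as the existing TTS agents: namedInputs.text in, { buffer } (MP3 audio) out, with voice, model, stability, and similarityBoost supplied via params. A hypothetical direct call outside a GraphAI graph, assuming the deep import path resolves; "your-voice-id" is a placeholder ElevenLabs voice ID and ELEVENLABS_API_KEY must be set:

```typescript
import fs from "node:fs";
// Path taken from this diff; adjust to how your project resolves the package.
import { ttsElevenlabsAgent } from "mulmocast/lib/agents/tts_elevenlabs_agent.js";

// The cast skips the rest of GraphAI's AgentFunctionContext plumbing, which the agent body
// above does not use; inside the CLI this agent is normally invoked by the audio graph.
const result = (await ttsElevenlabsAgent({
  namedInputs: { text: "Hello from mulmocast." },
  params: { voice: "your-voice-id", stability: 0.5, similarityBoost: 0.75 },
} as never)) as { buffer: Buffer };

fs.writeFileSync("hello.mp3", result.buffer); // the endpoint returns MP3 (Accept: audio/mpeg)
```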

package/lib/agents/tts_google_agent.js
@@ -44,7 +44,7 @@ const ttsGoogleAgentInfo = {
  description: "Google TTS agent",
  category: ["tts"],
  author: "Receptron Team",
- repository: "https://github.com/receptron/graphai-agents/tree/main/tts/tts-openai-agent",
+ repository: "https://github.com/receptron/mulmocast-cli/",
  license: "MIT",
  environmentVariables: ["OPENAI_API_KEY"],
  };

package/lib/agents/tts_nijivoice_agent.js
@@ -57,8 +57,9 @@ const ttsNijivoiceAgentInfo = {
  samples: [],
  description: "TTS nijivoice agent",
  category: ["tts"],
- author: "isamu arimoto",
- repository: "https://github.com/receptron/graphai/",
+ author: "Receptron Team",
+ repository: "https://github.com/receptron/mulmocast-cli/",
  license: "MIT",
+ environmentVariables: ["NIJIVOICE_API_KEY"],
  };
  export default ttsNijivoiceAgentInfo;