@mixio-pro/kalaasetu-mcp 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/storage/index.ts +27 -0
- package/src/storage/interface.ts +7 -0
- package/src/storage/local.ts +55 -0
- package/src/storage/payload.ts +46 -0
- package/src/tools/gemini.ts +239 -89
- package/src/tools/image-to-video.ts +273 -209
package/package.json
CHANGED

package/src/storage/index.ts
ADDED
@@ -0,0 +1,27 @@
+import type { StorageProvider } from "./interface";
+import { LocalStorageProvider } from "./local";
+import { PayloadStorageProvider } from "./payload";
+
+let storageInstance: StorageProvider | null = null;
+
+export function getStorage(): StorageProvider {
+  if (!storageInstance) {
+    const type = process.env.STORAGE_PROVIDER || "local";
+    console.error(`Initializing storage provider: ${type}`); // Log to stderr for debug
+
+    if (type === "payload") {
+      storageInstance = new PayloadStorageProvider(
+        process.env.PAYLOAD_API_URL || "http://localhost:3000",
+        process.env.PAYLOAD_API_KEY || ""
+      );
+    } else {
+      storageInstance = new LocalStorageProvider(process.cwd());
+    }
+
+    // Initialize async (fire and forget or handle properly in app startup)
+    storageInstance
+      .init()
+      .catch((err) => console.error("Failed to init storage:", err));
+  }
+  return storageInstance;
+}
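For reference, a minimal consumer sketch (illustrative, not from the package): a tool module imports this factory and writes through whichever provider is configured; the output path below is a placeholder.

// Illustrative consumer; the path is a placeholder.
import { getStorage } from "../storage";

async function saveGeneratedImage(pngBase64: string): Promise<string> {
  const storage = getStorage(); // LocalStorageProvider unless STORAGE_PROVIDER=payload
  // writeFile resolves relative paths against the provider's base and returns the final path or key.
  return storage.writeFile("outputs/result.png", Buffer.from(pngBase64, "base64"));
}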
package/src/storage/interface.ts
ADDED
@@ -0,0 +1,7 @@
+export interface StorageProvider {
+  init(): Promise<void>;
+  readFile(path: string): Promise<Buffer>;
+  writeFile(path: string, data: Buffer | string): Promise<string>; // Returns path or URL
+  exists(path: string): Promise<boolean>;
+  getPublicUrl(path: string): Promise<string>;
+}
package/src/storage/local.ts
ADDED
@@ -0,0 +1,55 @@
+import * as fs from "fs";
+import * as path from "path";
+import type { StorageProvider } from "./interface";
+
+export class LocalStorageProvider implements StorageProvider {
+  private basePath: string;
+
+  constructor(basePath: string = process.cwd()) {
+    this.basePath = basePath;
+  }
+
+  async init(): Promise<void> {
+    // No-op for local
+  }
+
+  async readFile(filePath: string): Promise<Buffer> {
+    // Handle absolute paths by checking if it starts with basePath or just use it if it exists
+    let fullPath = filePath;
+    if (!path.isAbsolute(filePath)) {
+      fullPath = path.resolve(this.basePath, filePath);
+    }
+    return fs.promises.readFile(fullPath);
+  }
+
+  async writeFile(filePath: string, data: Buffer | string): Promise<string> {
+    let fullPath = filePath;
+    if (!path.isAbsolute(filePath)) {
+      fullPath = path.resolve(this.basePath, filePath);
+    }
+
+    const dir = path.dirname(fullPath);
+    if (!fs.existsSync(dir)) {
+      await fs.promises.mkdir(dir, { recursive: true });
+    }
+    await fs.promises.writeFile(fullPath, data);
+    return fullPath;
+  }
+
+  async exists(filePath: string): Promise<boolean> {
+    let fullPath = filePath;
+    if (!path.isAbsolute(filePath)) {
+      fullPath = path.resolve(this.basePath, filePath);
+    }
+    return fs.existsSync(fullPath);
+  }
+
+  async getPublicUrl(filePath: string): Promise<string> {
+    // For local, we just return the absolute path
+    let fullPath = filePath;
+    if (!path.isAbsolute(filePath)) {
+      fullPath = path.resolve(this.basePath, filePath);
+    }
+    return fullPath;
+  }
+}
package/src/storage/payload.ts
ADDED
@@ -0,0 +1,46 @@
+import type { StorageProvider } from "./interface";
+
+export class PayloadStorageProvider implements StorageProvider {
+  private apiUrl: string;
+  private apiKey: string;
+  private collection: string;
+
+  constructor(apiUrl: string, apiKey: string, collection: string = "media") {
+    this.apiUrl = apiUrl;
+    this.apiKey = apiKey;
+    this.collection = collection;
+  }
+
+  async init(): Promise<void> {
+    console.log("Initializing Payload Storage Provider...");
+    // TODO: Verify connection to Payload CMS
+  }
+
+  async readFile(filePath: string): Promise<Buffer> {
+    // TODO: Implement fetching file from Payload CMS
+    // 1. Search for file by filename or ID
+    // 2. Download the file buffer
+    console.log(`[Payload] Reading file: ${filePath}`);
+    throw new Error("PayloadStorageProvider.readFile not implemented yet.");
+  }
+
+  async writeFile(filePath: string, data: Buffer | string): Promise<string> {
+    // TODO: Implement uploading file to Payload CMS
+    // 1. Create FormData
+    // 2. POST to /api/{collection}
+    console.log(`[Payload] Writing file: ${filePath}`);
+    throw new Error("PayloadStorageProvider.writeFile not implemented yet.");
+  }
+
+  async exists(filePath: string): Promise<boolean> {
+    // TODO: Check if file exists in Payload
+    console.log(`[Payload] Checking existence: ${filePath}`);
+    return false;
+  }
+
+  async getPublicUrl(filePath: string): Promise<string> {
+    // TODO: Return the public URL of the file in Payload
+    console.log(`[Payload] Getting public URL: ${filePath}`);
+    return `${this.apiUrl}/${this.collection}/${filePath}`;
+  }
+}
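The TODOs above describe the intended upload flow (create a FormData, POST to /api/{collection}). A hedged sketch of what that could look like; the endpoint shape, the auth-header format (here assuming a "users" auth collection), and the response field names are assumptions, not confirmed by this package.

// Sketch only: one possible shape for the writeFile TODO, under the assumptions above.
async function uploadToPayload(
  apiUrl: string,
  apiKey: string,
  collection: string,
  filePath: string,
  data: Buffer | string
): Promise<string> {
  const bytes = typeof data === "string" ? Buffer.from(data) : data;
  const form = new FormData(); // global in Node 18+
  form.append("file", new Blob([bytes]), filePath);

  const res = await fetch(`${apiUrl}/api/${collection}`, {
    method: "POST",
    // Payload-style API key header; the "users" auth collection slug is an assumption.
    headers: { Authorization: `users API-Key ${apiKey}` },
    body: form,
  });
  if (!res.ok) throw new Error(`Payload upload failed: ${res.status}`);
  const json: any = await res.json();
  // Response shape (doc.url) is an assumption; fall back to the original path.
  return json?.doc?.url ?? filePath;
}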
package/src/tools/gemini.ts
CHANGED
@@ -1,17 +1,26 @@
 import { z } from "zod";
-import {
+import {
+  GoogleGenAI,
+  createPartFromUri,
+  createUserContent,
+} from "@google/genai";
 import * as fs from "fs";
+import * as path from "path";
+import * as os from "os";
 import * as wav from "wav";
+import { PassThrough } from "stream";
+import { getStorage } from "../storage";

 const ai = new GoogleGenAI({
   apiKey: process.env.GEMINI_API_KEY || "",
 });

-function fileToGenerativePart(filePath: string) {
-
+async function fileToGenerativePart(filePath: string) {
+  const storage = getStorage();
+  if (!(await storage.exists(filePath))) {
     throw new Error(`File not found: ${filePath}`);
   }
-  const imageBytes =
+  const imageBytes = await storage.readFile(filePath);
   return {
     inlineData: {
       data: Buffer.from(imageBytes).toString("base64"),
@@ -21,22 +30,37 @@ function fileToGenerativePart(filePath: string) {
 }

 // Helper function to save WAV file
-function
+// Helper function to save WAV file
+async function saveWaveFile(
   filename: string,
   pcmData: Buffer,
   channels = 1,
   rate = 24000,
-  sampleWidth = 2
+  sampleWidth = 2
 ): Promise<void> {
   return new Promise((resolve, reject) => {
-    const writer = new wav.
+    const writer = new wav.Writer({
       channels,
       sampleRate: rate,
       bitDepth: sampleWidth * 8,
     });

-
-
+    const stream = new PassThrough();
+    const chunks: Buffer[] = [];
+
+    writer.pipe(stream);
+    stream.on("data", (chunk) => chunks.push(chunk));
+    stream.on("end", async () => {
+      try {
+        const wavBuffer = Buffer.concat(chunks);
+        const storage = getStorage();
+        await storage.writeFile(filename, wavBuffer);
+        resolve();
+      } catch (err) {
+        reject(err);
+      }
+    });
+    writer.on("error", reject);

     writer.write(pcmData);
     writer.end();
@@ -45,31 +69,59 @@ function saveWaveFile(

 // Helper function to check if URL is YouTube URL
 function isYouTubeUrl(url: string): boolean {
-  return url.includes(
+  return url.includes("youtube.com/watch") || url.includes("youtu.be");
 }

 // Helper function to get file size in bytes
-function getFileSize(filePath: string): number {
-  const
-
+async function getFileSize(filePath: string): Promise<number> {
+  const storage = getStorage();
+  const buffer = await storage.readFile(filePath);
+  return buffer.length;
 }

+// Helper function to upload file to Gemini API
 // Helper function to upload file to Gemini API
 async function uploadFileToGemini(filePath: string): Promise<any> {
   try {
+    const storage = getStorage();
+    // For Gemini API, we need a local file path.
+    // If storage is local, we can use the path directly (if we can resolve it).
+    // If storage is remote, we must download to a temp file.
+
+    let localPath = filePath;
+    let isTemp = false;
+
+    // Check if we can get a local path from storage (hacky check for LocalStorageProvider)
+    // A better way is to always download to temp if not sure, or ask storage for a local path.
+    // For now, let's assume we need to download if it's not a local file system path that exists.
+
+    if (!fs.existsSync(filePath)) {
+      // Try to read from storage and write to temp
+      const buffer = await storage.readFile(filePath);
+      const tempDir = os.tmpdir();
+      const tempFilePath = path.join(tempDir, path.basename(filePath));
+      fs.writeFileSync(tempFilePath, buffer);
+      localPath = tempFilePath;
+      isTemp = true;
+    }
+
     const uploadedFile = await ai.files.upload({
-      file:
+      file: localPath,
     });
-
+
+    if (isTemp) {
+      fs.unlinkSync(localPath);
+    }
+
     // Wait for file processing to complete
     let getFile = await ai.files.get({ name: uploadedFile.name! });
-    while (getFile.state ===
-      await new Promise(resolve => setTimeout(resolve, 3000));
+    while (getFile.state === "PROCESSING") {
+      await new Promise((resolve) => setTimeout(resolve, 3000));
       getFile = await ai.files.get({ name: uploadedFile.name! });
     }

-    if (getFile.state ===
-      throw new Error(
+    if (getFile.state === "FAILED") {
+      throw new Error("File processing failed");
     }

     return getFile;
@@ -79,41 +131,58 @@ async function uploadFileToGemini(filePath: string): Promise<any> {
 }

 // Helper function to process video input intelligently
-async function processVideoInput(
+async function processVideoInput(
+  input: string,
+  config?: { fps?: number; startOffset?: string; endOffset?: string }
+): Promise<any> {
   if (isYouTubeUrl(input)) {
     return {
       fileData: {
         fileUri: input,
-        mimeType:
-        videoMetadata: config
-
-
-
-
-
+        mimeType: "video/*",
+        videoMetadata: config
+          ? {
+              fps: config.fps,
+              startOffset: config.startOffset,
+              endOffset: config.endOffset,
+            }
+          : undefined,
+      },
     };
   } else {
     // Local file processing - use File Upload API
-
+    const storage = getStorage();
+    if (!(await storage.exists(input))) {
       throw new Error(`Video file not found: ${input}`);
     }
-
+
     // Upload file to Gemini API
     const uploadedFile = await uploadFileToGemini(input);
-
+
     return uploadedFile;
   }
 }

 export const geminiTextToImage = {
   name: "geminiTextToImage",
-  description:
+  description:
+    "Generate images from text prompts using Gemini 2.5 Flash Image model",
   parameters: z.object({
     prompt: z.string().describe("Text description of the image to generate"),
-    aspect_ratio: z
-
+    aspect_ratio: z
+      .string()
+      .optional()
+      .describe("Aspect ratio: 1:1, 3:4, 4:3, 9:16, or 16:9"),
+    output_path: z
+      .string()
+      .optional()
+      .describe("File path to save the generated image"),
   }),
-  execute: async (args: {
+  execute: async (args: {
+    prompt: string;
+    aspect_ratio?: string;
+    output_path?: string;
+  }) => {
     try {
       const response = await ai.models.generateContent({
         model: "gemini-2.5-flash-image",
@@ -133,10 +202,17 @@ export const geminiTextToImage = {
         } else if (part.inlineData?.data) {
           const imageData = part.inlineData.data;
           if (args.output_path) {
-
+            const storage = getStorage();
+            await storage.writeFile(
+              args.output_path,
+              Buffer.from(imageData, "base64")
+            );
             result += `\nImage saved to: ${args.output_path}`;
           } else {
-            result += `\nGenerated image (base64): ${imageData.substring(
+            result += `\nGenerated image (base64): ${imageData.substring(
+              0,
+              100
+            )}...`;
           }
         }
       }
@@ -150,20 +226,33 @@ export const geminiTextToImage = {

 export const geminiEditImage = {
   name: "geminiEditImage",
-  description:
+  description:
+    "Edit existing images with text instructions using Gemini 2.5 Flash Image Preview",
   parameters: z.object({
     image_path: z.string().describe("Path to the source image file"),
     prompt: z.string().describe("Text instructions for editing the image"),
-    output_path: z
-
+    output_path: z
+      .string()
+      .optional()
+      .describe("File path to save the edited image"),
+    reference_images: z
+      .array(z.string())
+      .optional()
+      .describe("Additional image paths for reference"),
   }),
-  execute: async (args: {
+  execute: async (args: {
+    image_path: string;
+    prompt: string;
+    output_path?: string;
+    reference_images?: string[];
+  }) => {
     try {
-      const
-
+      const imagePart = await fileToGenerativePart(args.image_path);
+      const contents: any[] = [args.prompt, imagePart];
+
       if (args.reference_images) {
         for (const refPath of args.reference_images) {
-          contents.push(fileToGenerativePart(refPath));
+          contents.push(await fileToGenerativePart(refPath));
         }
       }

@@ -180,10 +269,17 @@ export const geminiEditImage = {
         } else if (part.inlineData?.data) {
           const imageData = part.inlineData.data;
           if (args.output_path) {
-
+            const storage = getStorage();
+            await storage.writeFile(
+              args.output_path,
+              Buffer.from(imageData, "base64")
+            );
             result += `\nEdited image saved to: ${args.output_path}`;
           } else {
-            result += `\nEdited image (base64): ${imageData.substring(
+            result += `\nEdited image (base64): ${imageData.substring(
+              0,
+              100
+            )}...`;
           }
         }
       }
@@ -197,9 +293,12 @@ export const geminiEditImage = {

 export const geminiAnalyzeImages = {
   name: "geminiAnalyzeImages",
-  description:
+  description:
+    "Analyze and describe images using Gemini 2.5 Pro with advanced multimodal understanding",
   parameters: z.object({
-    image_paths: z
+    image_paths: z
+      .array(z.string())
+      .describe("Array of image file paths to analyze"),
     prompt: z.string().describe("Text prompt or question about the images"),
   }),
   execute: async (args: { image_paths: string[]; prompt: string }) => {
@@ -208,12 +307,12 @@ export const geminiAnalyzeImages = {
       if (!args.image_paths) {
         throw new Error("Image paths not provided");
       }
-
+
       // Convert to array if passed as string
       let imagePaths: string[];
-      if (typeof args.image_paths ===
+      if (typeof args.image_paths === "string") {
         const strValue = args.image_paths as string;
-        if (strValue.startsWith(
+        if (strValue.startsWith("[") && strValue.endsWith("]")) {
           try {
             imagePaths = JSON.parse(strValue);
           } catch {
@@ -227,15 +326,15 @@ export const geminiAnalyzeImages = {
       } else {
         throw new Error("Invalid image_paths: must be array or string");
       }
-
+
       if (imagePaths.length === 0) {
         throw new Error("At least one image path must be provided");
       }
-
+
       const contents: any[] = [args.prompt];
-
+
       for (const imagePath of imagePaths) {
-        contents.push(fileToGenerativePart(imagePath));
+        contents.push(await fileToGenerativePart(imagePath));
       }

       const response = await ai.models.generateContent({
@@ -260,41 +359,56 @@ export const geminiAnalyzeImages = {

 export const geminiSingleSpeakerTts = {
   name: "geminiSingleSpeakerTts",
-  description:
+  description:
+    "Generate single speaker voice audio from text using Gemini 2.5 Pro Preview TTS model",
   parameters: z.object({
     text: z.string().describe("Text to convert to speech"),
-    voice_name: z
-
+    voice_name: z
+      .string()
+      .describe(
+        "Voice name from supported options. Use Kore, Erinome or Despina for the female voices and Enceladus for male."
+      ),
+    output_path: z
+      .string()
+      .optional()
+      .describe(
+        "Output WAV file path (optional, defaults to timestamp-based filename)"
+      ),
   }),
-  execute: async (args: {
+  execute: async (args: {
+    text: string;
+    voice_name: string;
+    output_path?: string;
+  }) => {
     try {
       const response = await ai.models.generateContent({
         model: "gemini-2.5-pro-preview-tts",
         contents: [{ parts: [{ text: args.text }] }],
         config: {
-          responseModalities: [
+          responseModalities: ["AUDIO"],
           speechConfig: {
             voiceConfig: {
-              prebuiltVoiceConfig: {
-                voiceName: args.voice_name ||
+              prebuiltVoiceConfig: {
+                voiceName: args.voice_name || "Despina",
               },
             },
           },
         },
       });

-      const data =
+      const data =
+        response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
       if (!data) {
         throw new Error("No audio data received from Gemini API");
       }

-      const audioBuffer = Buffer.from(data,
-
+      const audioBuffer = Buffer.from(data, "base64");
+
       // Generate output filename if not provided
       const outputPath = args.output_path || `voice_output_${Date.now()}.wav`;
-
+
       await saveWaveFile(outputPath, audioBuffer);
-
+
       return `Audio generated successfully: ${outputPath}`;
     } catch (error: any) {
       throw new Error(`Voice generation failed: ${error.message}`);
@@ -304,27 +418,59 @@ export const geminiSingleSpeakerTts = {

 export const geminiAnalyzeVideos = {
   name: "geminiAnalyzeVideos",
-  description:
+  description:
+    "Analyze and understand video content using Gemini 2.5 Flash model. Intelligently handles YouTube URLs and local videos (files <20MB processed inline, ≥20MB uploaded via File API). Supports timestamp queries, clipping, and custom frame rates with default 5 FPS for local videos to optimize processing.",
   parameters: z.object({
-    video_inputs: z
-
-
-
-
-
+    video_inputs: z
+      .array(z.string())
+      .describe(
+        "Array of video inputs - mix of local file paths and YouTube URLs (max 10 videos). Local files <20MB processed inline, larger files uploaded via File API automatically."
+      ),
+    prompt: z
+      .string()
+      .describe(
+        "Text prompt or question about the videos. Use MM:SS format for timestamp references (e.g., 'What happens at 01:30?')."
+      ),
+    fps: z
+      .number()
+      .optional()
+      .describe(
+        "Frame rate for video processing (default: 5 FPS for local videos to reduce file size, 1 FPS for YouTube URLs)"
+      ),
+    start_offset: z
+      .string()
+      .optional()
+      .describe("Clip start time in seconds with 's' suffix (e.g., '40s')"),
+    end_offset: z
+      .string()
+      .optional()
+      .describe("Clip end time in seconds with 's' suffix (e.g., '80s')"),
+    media_resolution: z
+      .string()
+      .optional()
+      .describe(
+        "Media resolution: 'default' or 'low' (low resolution uses ~100 tokens/sec vs 300 tokens/sec)"
+      ),
   }),
-  execute: async (args: {
+  execute: async (args: {
+    video_inputs: string[];
+    prompt: string;
+    fps?: number;
+    start_offset?: string;
+    end_offset?: string;
+    media_resolution?: string;
+  }) => {
     try {
       // Handle array parsing
       if (!args.video_inputs) {
         throw new Error("Video inputs not provided");
       }
-
+
       // Convert to array if passed as string
       let videoInputs: string[];
-      if (typeof args.video_inputs ===
+      if (typeof args.video_inputs === "string") {
         const strValue = args.video_inputs as string;
-        if (strValue.startsWith(
+        if (strValue.startsWith("[") && strValue.endsWith("]")) {
           try {
             videoInputs = JSON.parse(strValue);
           } catch {
@@ -338,43 +484,47 @@ export const geminiAnalyzeVideos = {
       } else {
         throw new Error("Invalid video_inputs: must be array or string");
       }
-
+
       if (videoInputs.length === 0) {
         throw new Error("At least one video input must be provided");
       }
-
+
       if (videoInputs.length > 10) {
-        throw new Error(
+        throw new Error(
+          "Maximum 10 videos per request allowed for Gemini 2.5+ models"
+        );
       }

       // Prepare video parts for content
       const videoParts: any[] = [];
-
+
       // Process each video input
       for (const videoInput of videoInputs) {
         const videoConfig = {
           fps: args.fps || (isYouTubeUrl(videoInput) ? 1 : 5), // Default 5 FPS for local, 1 FPS for YouTube
           startOffset: args.start_offset,
-          endOffset: args.end_offset
+          endOffset: args.end_offset,
         };
-
+
         const videoPart = await processVideoInput(videoInput, videoConfig);
         videoParts.push(videoPart);
       }

       // Build content using createUserContent and createPartFromUri for uploaded files
       const contentParts: any[] = [args.prompt];
-
+
       for (const videoPart of videoParts) {
         if (videoPart.uri && videoPart.mimeType) {
-          contentParts.push(
+          contentParts.push(
+            createPartFromUri(videoPart.uri, videoPart.mimeType)
+          );
         }
       }
-
+
       const finalContents = createUserContent(contentParts);
-
+
       const response = await ai.models.generateContent({
-        model:
+        model: "gemini-2.5-pro",
         contents: finalContents,
       });

@@ -386,7 +536,7 @@ export const geminiAnalyzeVideos = {
           }
         }
       }
-
+
       return result || "Video analysis completed but no text response received";
     } catch (error: any) {
       throw new Error(`Video analysis failed: ${error.message}`);
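For reference, an illustrative call into the updated tools (import path and file names are placeholders); output_path now goes through getStorage() instead of fs.

// Illustrative only; not part of the package.
import { geminiTextToImage, geminiSingleSpeakerTts } from "./src/tools/gemini";

async function demo(): Promise<void> {
  // Both tools persist their output through the active StorageProvider.
  console.log(
    await geminiTextToImage.execute({
      prompt: "a watercolor hummingbird",
      output_path: "outputs/hummingbird.png",
    })
  );
  console.log(
    await geminiSingleSpeakerTts.execute({
      text: "Hello from Kalaasetu",
      voice_name: "Despina",
      output_path: "outputs/hello.wav",
    })
  );
}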
package/src/tools/image-to-video.ts
CHANGED
@@ -1,263 +1,327 @@
-import { z } from "zod";
 import * as fs from "fs";
+import { GoogleAuth } from "google-auth-library";
+import { exec } from "child_process";
 import * as path from "path";
+import { z } from "zod";
+import { getStorage } from "../storage";

 async function wait(ms: number): Promise<void> {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }

-function
-
+async function fetchAccessToken(): Promise<string> {
+  try {
+    const auth = new GoogleAuth({
+      scopes: ["https://www.googleapis.com/auth/cloud-platform"],
+    });
+    const client = await auth.getClient();
+    const token = await client.getAccessToken();
+    if (!token || typeof token !== "string") {
+      throw new Error("No token from GoogleAuth");
+    }
+    return token;
+  } catch (e) {
+    // Fallback to gcloud
+    return await new Promise((resolve, reject) => {
+      exec("gcloud auth print-access-token", (err, stdout, stderr) => {
+        if (err) {
+          reject(
+            new Error(
+              `Failed to fetch an access token (ADC and gcloud): ${
+                stderr || err.message
+              }`
+            )
+          );
+          return;
+        }
+        const t = (stdout || "").trim();
+        if (!t) {
+          reject(
+            new Error(
+              "Failed to fetch an access token: empty token from gcloud"
+            )
+          );
+          return;
+        }
+        resolve(t);
+      });
+    });
+  }
+}
+
+async function fileToBase64(
+  filePath: string
+): Promise<{ data: string; mimeType: string }> {
+  const storage = getStorage();
+  if (!(await storage.exists(filePath))) {
     throw new Error(`File not found: ${filePath}`);
   }
-  const buf =
+  const buf = await storage.readFile(filePath);
   const data = Buffer.from(buf).toString("base64");
-  //
-  const
-  const mimeType = ext === '.jpg' || ext === '.jpeg' ? 'image/jpeg' :
-    ext === '.png' ? 'image/png' :
-    ext === '.webp' ? 'image/webp' : 'image/png';
+  // Default to PNG if not sure, similar to existing code
+  const mimeType = "image/png";
   return { data, mimeType };
 }

 export const imageToVideo = {
-  name: "
-  description:
+  name: "image_to_video",
+  description:
+    "Generate videos from an image as starting first frame using Vertex Veo models (predictLongRunning + fetchPredictOperation).",
   parameters: z.object({
     prompt: z.string().describe("Text description for the video"),
-    image_path: z
-
-
-
-
-
-
-
-
-
+    image_path: z
+      .string()
+      .optional()
+      .describe("Path to source image for image-to-video generation"),
+    last_frame_path: z
+      .string()
+      .optional()
+      .describe("Path to last frame image to guide ending frame (optional)"),
+    aspect_ratio: z
+      .string()
+      .optional()
+      .describe("Video aspect ratio: '16:9' or '9:16' (default: '9:16')"),
+    duration_seconds: z
+      .string()
+      .optional()
+      .describe("Video duration in seconds: '4', '6', or '8' (default: '6')"),
+    resolution: z
+      .string()
+      .optional()
+      .describe("Video resolution: '720p' or '1080p' (default: '720p')"),
+    negative_prompt: z
+      .string()
+      .optional()
+      .describe("Text describing what not to include in the video"),
+    person_generation: z
+      .string()
+      .optional()
+      .describe(
+        "Controls generation of people: 'allow_adult' (default for image-to-video) or 'allow_all'"
+      ),
+    reference_images: z
+      .array(z.string())
+      .optional()
+      .describe("Additional image paths for reference (max 3)"),
+    output_path: z
+      .string()
+      .optional()
+      .describe(
+        "Output MP4 file path (if multiple predictions, index suffix is added)"
+      ),
+    project_id: z
+      .string()
+      .optional()
+      .describe("GCP Project ID (default: mixio-pro)"),
+    location_id: z
+      .string()
+      .optional()
+      .describe("Vertex region (default: us-central1)"),
+    model_id: z
+      .string()
+      .optional()
+      .describe("Model ID (default: veo-3.1-fast-generate-preview)"),
+    generate_audio: z
+      .boolean()
+      .optional()
+      .describe(
+        "Boolean flag to enable generation of audio along with the video"
+      )
+      .default(false),
   }),
-
+  async execute(args: {
     prompt: string;
     image_path?: string;
     last_frame_path?: string;
     aspect_ratio?: string;
-    duration_seconds?: string
+    duration_seconds?: string;
     resolution?: string;
     negative_prompt?: string;
     person_generation?: string;
-    reference_images?: string[];
+    reference_images?: string[] | string;
     output_path?: string;
-
+    project_id?: string;
+    location_id?: string;
     model_id?: string;
-
-
-
-
-
+    generate_audio?: boolean;
+  }) {
+    const projectId = args.project_id || "mixio-pro";
+    const location = args.location_id || "us-central1";
+    const modelId = args.model_id || "veo-3.1-fast-generate-preview";

-    const
-    const baseUrl = "https://generativelanguage.googleapis.com/v1beta";
+    const token = await fetchAccessToken();

-
-    const durationSeconds = args.duration_seconds
-      ? (typeof args.duration_seconds === 'string' ? parseInt(args.duration_seconds) : args.duration_seconds)
-      : 6; // default
-
-    try {
-      // Build the request body for predictLongRunning
-      const instances: any[] = [
-        {
-          prompt: args.prompt,
-        },
-      ];
+    const url = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:predictLongRunning`;

-
-
-
-
+    let imagePart: any = undefined;
+    if (args.image_path) {
+      const { data, mimeType } = await fileToBase64(args.image_path);
+      imagePart = {
+        image: {
           bytesBase64Encoded: data,
           mimeType,
-        }
-      }
+        },
+      };
+    }

-
-
-
-
+    let lastFramePart: any = undefined;
+    if (args.last_frame_path) {
+      const { data, mimeType } = await fileToBase64(args.last_frame_path);
+      lastFramePart = {
+        lastFrame: {
           bytesBase64Encoded: data,
           mimeType,
-        }
-      }
-
-      // Add reference images if provided
-      if (args.reference_images && args.reference_images.length > 0) {
-        const refImages = args.reference_images.slice(0, 3).map((imgPath) => {
-          const { data, mimeType } = fileToBase64(imgPath);
-          return {
-            image: {
-              bytesBase64Encoded: data,
-              mimeType,
-            },
-            referenceType: "asset",
-          };
-        });
-        instances[0].referenceImages = refImages;
-      }
-
-      // Build parameters - NOTE: Parameters go in "parameters" object, not in instances
-      const parameters: any = {};
-
-      if (args.aspect_ratio) {
-        parameters.aspectRatio = args.aspect_ratio;
-      }
-
-      if (durationSeconds) {
-        parameters.durationSeconds = durationSeconds;
-      }
-
-      if (args.resolution) {
-        parameters.resolution = args.resolution;
-      }
+        },
+      };
+    }

-
-
+    let referenceImages: any[] | undefined = undefined;
+    if (args.reference_images) {
+      let refImages: string[];
+      if (typeof args.reference_images === "string") {
+        if (
+          args.reference_images.startsWith("[") &&
+          args.reference_images.endsWith("]")
+        ) {
+          try {
+            refImages = JSON.parse(args.reference_images);
+          } catch {
+            throw new Error("Invalid reference_images format");
+          }
+        } else {
+          refImages = [args.reference_images];
+        }
+      } else if (Array.isArray(args.reference_images)) {
+        refImages = args.reference_images;
+      } else {
+        throw new Error("Invalid reference_images: must be array or string");
       }

-      if (
-
+      if (refImages.length > 0) {
+        referenceImages = await Promise.all(
+          refImages.slice(0, 3).map(async (p) => {
+            const { data, mimeType } = await fileToBase64(p);
+            return {
+              image: {
+                bytesBase64Encoded: data,
+                mimeType,
+              },
+              referenceType: "asset",
+            };
+          })
+        );
       }
+    }

-
-
-
+    const personGeneration =
+      args.person_generation || (args.image_path ? "allow_adult" : "allow_all");
+
+    const instances: any[] = [
+      {
+        prompt: args.prompt,
+        ...(imagePart || {}),
+        ...(lastFramePart || {}),
+        ...(referenceImages ? { referenceImages } : {}),
+      },
+    ];
+
+    const parameters: any = {
+      aspectRatio: args.aspect_ratio || "9:16",
+      durationSeconds: parseInt(args.duration_seconds || "6") || 6,
+      resolution: args.resolution || "720p",
+      negativePrompt: args.negative_prompt,
+      generateAudio: args.generate_audio || false,
+      personGeneration,
+    };
+
+    const res = await fetch(url, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${token}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({ instances, parameters }),
+    });
+
+    if (!res.ok) {
+      const text = await res.text();
+      throw new Error(`Vertex request failed: ${res.status} ${text}`);
+    }

-
-
-
-
+    const op = (await res.json()) as any;
+    const name: string = op.name || op.operation || "";
+    if (!name) {
+      throw new Error(
+        "Vertex did not return an operation name for long-running request"
+      );
+    }

-
+    let current = op;
+    let done = !!op.done;
+    let tries = 0;

-
-
-
+    // Poll using fetchPredictOperation as per Vertex recommendation
+    const fetchUrl = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:fetchPredictOperation`;
+    while (!done && tries < 60) {
+      await wait(10000);
+      const poll = await fetch(fetchUrl, {
         method: "POST",
         headers: {
-
+          Authorization: `Bearer ${token}`,
           "Content-Type": "application/json",
         },
-        body: JSON.stringify(
+        body: JSON.stringify({ operationName: name }),
       });
-
-
-
-        throw new Error(`Video generation request failed: ${response.status} ${errorText}`);
-      }
-
-      const operation = await response.json() as any;
-      const operationName: string = operation.name || operation.operation || "";
-
-      if (!operationName) {
-        throw new Error("No operation name returned from API");
-      }
-
-      console.log(`Operation started: ${operationName}`);
-
-      // Step 2: Poll the operation status by getting the operation directly
-      let currentOp: any = operation;
-      let done = !!operation.done;
-      let tries = 0;
-      const maxTries = 60; // 10 minutes with 10s intervals
-
-      while (!done && tries < maxTries) {
-        await wait(10000); // Wait 10 seconds
-        tries++;
-        console.log(`Polling attempt ${tries}/${maxTries}...`);
-
-        // Poll by getting the operation status directly
-        const pollResponse = await fetch(`${baseUrl}/${operationName}`, {
-          method: "GET",
-          headers: {
-            "x-goog-api-key": apiKey,
-          },
-        });
-
-        if (!pollResponse.ok) {
-          const errorText = await pollResponse.text();
-          throw new Error(`Operation polling failed: ${pollResponse.status} ${errorText}`);
-        }
-
-        currentOp = await pollResponse.json() as any;
-        done = !!currentOp.done || !!currentOp.response;
-      }
-
-      if (!done) {
-        throw new Error("Video generation timed out after 10 minutes");
+      if (!poll.ok) {
+        const text = await poll.text();
+        throw new Error(`Vertex operation poll failed: ${poll.status} ${text}`);
       }
+      current = (await poll.json()) as any;
+      done = !!current.done || !!current.response;
+      tries++;
+    }

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      const
-
-
-
-
-
-
-
-
-
-      }
-
-      console.log(`Downloading video ${i + 1}/${generatedSamples.length} from ${videoUri}...`);
-
-      // Download video from URI with API key
-      const videoResponse = await fetch(videoUri, {
-        method: "GET",
-        headers: {
-          "x-goog-api-key": apiKey,
-        },
-      });
-
-      if (!videoResponse.ok) {
-        throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
+    const resp = current.response || current;
+    // Decode from response.videos[].bytesBase64Encoded only
+    const outputs: string[] = [];
+    const saveVideo = async (base64: string, index: number) => {
+      if (!base64) return;
+      const filePath = args.output_path
+        ? index === 0
+          ? args.output_path
+          : args.output_path.replace(/\.mp4$/i, `_${index}.mp4`)
+        : `video_output_${Date.now()}${index === 0 ? "" : "_" + index}.mp4`;
+      // For storage provider, we use the path as is (relative or absolute)
+      // If using LocalStorage, it handles resolving.
+      // If using Payload, it handles the key.
+
+      const buf = Buffer.from(base64, "base64");
+      const storage = getStorage();
+      await storage.writeFile(filePath, buf);
+      outputs.push(filePath);
+    };
+
+    if (Array.isArray(resp?.videos) && resp.videos.length > 0) {
+      for (let i = 0; i < resp.videos.length; i++) {
+        const v = resp.videos[i] || {};
+        if (typeof v.bytesBase64Encoded === "string") {
+          await saveVideo(v.bytesBase64Encoded, i);
         }
-
-      const videoBuffer = await videoResponse.arrayBuffer();
-
-      // Save video to file
-      const filePath = args.output_path
-        ? (i === 0 ? args.output_path : args.output_path.replace(/\.mp4$/i, `_${i}.mp4`))
-        : `video_output_${Date.now()}${i === 0 ? '' : '_' + i}.mp4`;
-      const absPath = path.resolve(filePath);
-
-      fs.writeFileSync(absPath, Buffer.from(videoBuffer));
-      outputs.push(absPath);
-      console.log(`Saved video to: ${absPath}`);
-    }
-
-    if (outputs.length > 0) {
-      return `Video(s) saved successfully:\n${outputs.map((p, i) => `${i + 1}. ${p}`).join('\n')}`;
       }
-
-      return "Video generation completed but no videos were saved.";
-
-    } catch (error: any) {
-      throw new Error(`Video generation failed: ${error.message || JSON.stringify(error)}`);
     }
+    if (outputs.length > 0) {
+      return `Video(s) saved: ${outputs.join(", ")}`;
+    }
+
+    // If nothing saved, return a concise summary plus head/tail snippets of JSON
+    let jsonStr = "";
+    try {
+      jsonStr = JSON.stringify(resp);
+    } catch {}
+    const head150 = jsonStr ? jsonStr.slice(0, 150) : "";
+    const tail50 = jsonStr
+      ? jsonStr.slice(Math.max(0, jsonStr.length - 50))
+      : "";
+    return `Vertex operation done but no videos array present. operationName=${name}. json_head150=${head150} json_tail50=${tail50}`;
   },
 };