@mixio-pro/kalaasetu-mcp 1.0.1 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +192 -13
- package/package.json +1 -1
- package/src/tools/gemini.ts +2 -2
- package/src/tools/image-to-video.sdk-backup.ts +161 -0
- package/src/tools/image-to-video.ts +190 -158
package/README.md
CHANGED
@@ -2,6 +2,12 @@
 
 Kalaasetu MCP Server - A powerful Model Context Protocol server providing various AI tools for content generation and analysis.
 
+## Quick Start
+
+```bash
+npx @mixio-pro/kalaasetu-mcp@latest
+```
+
 ## Features
 
 ### 🎨 Gemini Tools
@@ -24,12 +30,163 @@ Kalaasetu MCP Server - A powerful Model Context Protocol server providing variou
 
 ## Installation
 
-
+### Using npx (Recommended)
+
+Run directly without installation:
+
+```bash
+npx @mixio-pro/kalaasetu-mcp@latest
+```
+
+### Global Installation
+
+```bash
+npm install -g @mixio-pro/kalaasetu-mcp
+kalaasetu-mcp
+```
+
+### Local Development
+
+To install dependencies for local development:
 
 ```bash
 bun install
 ```
 
+## MCP Client Configuration
+
+### Cursor IDE
+
+Add to your Cursor settings (`~/.cursor/config.json` or via Settings → MCP):
+
+```json
+{
+  "mcpServers": {
+    "kalaasetu": {
+      "command": "npx",
+      "args": ["@mixio-pro/kalaasetu-mcp@latest"],
+      "env": {
+        "GEMINI_API_KEY": "your-gemini-api-key",
+        "FAL_KEY": "your-fal-api-key",
+        "PERPLEXITY_API_KEY": "your-perplexity-api-key"
+      }
+    }
+  }
+}
+```
+
+### OpenCode IDE
+
+Add to your OpenCode MCP configuration:
+
+```json
+{
+  "mcpServers": {
+    "kalaasetu": {
+      "command": "npx",
+      "args": ["@mixio-pro/kalaasetu-mcp@latest"],
+      "environment": {
+        "GEMINI_API_KEY": "your-gemini-api-key",
+        "FAL_KEY": "your-fal-api-key",
+        "PERPLEXITY_API_KEY": "your-perplexity-api-key"
+      }
+    }
+  }
+}
+```
+
+### Claude Desktop
+
+Add to your Claude Desktop configuration (`~/Library/Application Support/Claude/claude_desktop_config.json` on macOS):
+
+```json
+{
+  "mcpServers": {
+    "kalaasetu": {
+      "command": "npx",
+      "args": ["@mixio-pro/kalaasetu-mcp@latest"],
+      "env": {
+        "GEMINI_API_KEY": "your-gemini-api-key",
+        "FAL_KEY": "your-fal-api-key",
+        "PERPLEXITY_API_KEY": "your-perplexity-api-key"
+      }
+    }
+  }
+}
+```
+
+### OpenCode IDE
+
+Add to your OpenCode MCP configuration:
+
+```json
+{
+  "mcpServers": {
+    "kalaasetu": {
+      "type": "local",
+      "command": ["npx", "@mixio-pro/kalaasetu-mcp@latest"],
+      "env": {
+        "GEMINI_API_KEY": "your-gemini-api-key",
+        "FAL_KEY": "your-fal-api-key",
+        "PERPLEXITY_API_KEY": "your-perplexity-api-key",
+        "GOOGLE_APPLICATION_CREDENTIALS": "/path/to/your/gcp-credentials.json"
+      }
+    }
+  }
+}
+```
+
+### Claude Desktop
+
+Add to your Claude Desktop configuration (`~/Library/Application Support/Claude/claude_desktop_config.json` on macOS):
+
+```json
+{
+  "mcpServers": {
+    "kalaasetu": {
+      "command": "npx",
+      "args": ["@mixio-pro/kalaasetu-mcp@latest"],
+      "env": {
+        "GEMINI_API_KEY": "your-gemini-api-key",
+        "FAL_KEY": "your-fal-api-key",
+        "PERPLEXITY_API_KEY": "your-perplexity-api-key",
+        "GOOGLE_APPLICATION_CREDENTIALS": "/path/to/your/gcp-credentials.json"
+      }
+    }
+  }
+}
+```
+
+## Environment Variables
+
+### Required API Keys
+
+| Variable | Description | Get API Key |
+|----------|-------------|-------------|
+| `GEMINI_API_KEY` | For Gemini image generation, TTS, video analysis, and Veo video generation | [Google AI Studio](https://aistudio.google.com/app/apikey) |
+| `FAL_KEY` | For Infinitalk and Hunyuan Avatar tools | [FAL AI](https://fal.ai/dashboard/keys) |
+| `PERPLEXITY_API_KEY` | For image and video search | [Perplexity API](https://www.perplexity.ai/settings/api) |
+
+### Setting Environment Variables
+
+#### For Command Line Usage
+
+```bash
+# macOS/Linux
+export GEMINI_API_KEY="your-gemini-api-key"
+export FAL_KEY="your-fal-api-key"
+export PERPLEXITY_API_KEY="your-perplexity-api-key"
+
+# Windows (PowerShell)
+$env:GEMINI_API_KEY="your-gemini-api-key"
+$env:FAL_KEY="your-fal-api-key"
+$env:PERPLEXITY_API_KEY="your-perplexity-api-key"
+```
+
+#### For MCP Clients
+
+Set the environment variables in your MCP client configuration as shown in the examples above.
+
 ## Configuration
 
 Set up the required API keys as environment variables:
@@ -48,17 +205,27 @@ export PERPLEXITY_API_KEY="your-perplexity-api-key"
 
 ## Running the Server
 
-
+### Local Development
+
+To run the server locally:
 
 ```bash
-bun run index.ts
+bun run src/index.ts
 ```
 
-
+Or with auto-reload:
+
+```bash
+bun run dev
+```
+
+## Available Tools
+
+### Infinitalk
 
 Generate talking avatar videos from images and audio using FAL AI's Infinitalk model.
 
-
+#### Parameters
 
 - `image_url` (required): URL of the input image
 - `audio_url` (required): URL of the audio file for lip-sync
@@ -67,9 +234,8 @@ Generate talking avatar videos from images and audio using FAL AI's Infinitalk m
 - `resolution` (optional): Video resolution - "480p" or "720p" (default: "480p")
 - `seed` (optional): Random seed for reproducibility (default: 42)
 - `acceleration` (optional): Generation speed - "none", "regular", or "high" (default: "regular")
-- `fal_key` (optional): FAL API key (uses FAL_KEY env var if not provided)
 
-
+#### Example Usage
 
 ```json
 {
@@ -81,11 +247,11 @@ Generate talking avatar videos from images and audio using FAL AI's Infinitalk m
 }
 ```
 
-
+### Perplexity Images
 
 Search for images using Perplexity AI with advanced filtering options.
 
-
+#### Parameters
 
 - `query` (required): The search query for images
 - `image_domain_filter` (optional): Array of domains to include or exclude (prefix with '-' to exclude)
@@ -93,7 +259,7 @@ Search for images using Perplexity AI with advanced filtering options.
 - `image_format_filter` (optional): Array of allowed image formats
   - Example: `["jpg", "png", "gif"]`
 
-
+#### Example Usage
 
 ```json
 {
@@ -103,17 +269,17 @@ Search for images using Perplexity AI with advanced filtering options.
 }
 ```
 
-
+### Perplexity Videos
 
 Search for videos using Perplexity AI with domain filtering.
 
-
+#### Parameters
 
 - `query` (required): The search query for videos
 - `search_domain_filter` (optional): Array of domains to limit search (use '-' prefix to exclude)
   - Example: `["youtube.com"]` or `["-tiktok.com"]`
 
-
+#### Example Usage
 
 ```json
 {
@@ -122,6 +288,19 @@ Search for videos using Perplexity AI with domain filtering.
 }
 ```
 
+## Testing
+
+You can test the MCP server using the MCP Inspector:
+
+```bash
+npx @modelcontextprotocol/inspector npx @mixio-pro/kalaasetu-mcp@latest
+```
+
+## Requirements
+
+- **Bun**: This package requires Bun runtime. Install from [bun.sh](https://bun.sh)
+- **API Keys**: Obtain the necessary API keys from the providers listed above
+
 ## Project Info
 
 This project was created using `bun init` in bun v1.3.1. [Bun](https://bun.com) is a fast all-in-one JavaScript runtime.
package/package.json
CHANGED
package/src/tools/gemini.ts
CHANGED
@@ -263,7 +263,7 @@ export const geminiSingleSpeakerTts = {
   description: "Generate single speaker voice audio from text using Gemini 2.5 Pro Preview TTS model",
   parameters: z.object({
     text: z.string().describe("Text to convert to speech"),
-    voice_name: z.string().describe("Voice name from supported options
+    voice_name: z.string().describe("Voice name from supported options. Use Kore, Erinome or Despina for the female voices and Enceladus for male."),
     output_path: z.string().optional().describe("Output WAV file path (optional, defaults to timestamp-based filename)"),
   }),
   execute: async (args: { text: string; voice_name: string; output_path?: string }) => {
@@ -276,7 +276,7 @@ export const geminiSingleSpeakerTts = {
       speechConfig: {
         voiceConfig: {
           prebuiltVoiceConfig: {
-            voiceName: args.voice_name
+            voiceName: args.voice_name || 'Despina'
           },
         },
       },
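The two hunks above name the supported voices in the tool description and add a 'Despina' fallback when voice_name is empty. A minimal usage sketch of the updated tool, assuming it is imported from src/tools/gemini.ts and GEMINI_API_KEY is set in the environment (the import path and call site are illustrative, not part of the diff):

```ts
// Illustrative only: exercises the changed voice_name handling.
// Assumes geminiSingleSpeakerTts is exported from src/tools/gemini.ts as shown above.
import { geminiSingleSpeakerTts } from "./src/tools/gemini";

// Voices named in the new description: Kore, Erinome, Despina (female), Enceladus (male).
const saved = await geminiSingleSpeakerTts.execute({
  text: "Welcome to Kalaasetu.",
  voice_name: "Kore",
  output_path: "welcome.wav", // optional; defaults to a timestamp-based filename
});

// With the `|| 'Despina'` fallback, an empty voice_name now resolves to Despina
// instead of sending an empty prebuilt voice name to the API.
const fallback = await geminiSingleSpeakerTts.execute({ text: "Hello", voice_name: "" });

console.log(saved, fallback);
```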
package/src/tools/image-to-video.sdk-backup.ts
ADDED
@@ -0,0 +1,161 @@
+import { z } from "zod";
+import * as fs from "fs";
+import * as path from "path";
+import { GoogleGenAI } from "@google/genai";
+
+async function wait(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+function fileToBase64(filePath: string): { data: string; mimeType: string } {
+  if (!fs.existsSync(filePath)) {
+    throw new Error(`File not found: ${filePath}`);
+  }
+  const buf = fs.readFileSync(filePath);
+  const data = Buffer.from(buf).toString("base64");
+  // Detect mime type from extension
+  const ext = path.extname(filePath).toLowerCase();
+  const mimeType = ext === '.jpg' || ext === '.jpeg' ? 'image/jpeg' :
+    ext === '.png' ? 'image/png' :
+    ext === '.webp' ? 'image/webp' : 'image/png';
+  return { data, mimeType };
+}
+
+export const imageToVideo = {
+  name: "imageToVideo",
+  description: "Generate videos from an image as starting first frame using Vertex Veo models (predictLongRunning + fetchPredictOperation).",
+  parameters: z.object({
+    prompt: z.string().describe("Text description for the video"),
+    image_path: z.string().optional().describe("Path to source image for image-to-video generation"),
+    aspect_ratio: z.string().optional().describe("Video aspect ratio: '16:9' or '9:16' (default: '9:16')"),
+    duration_seconds: z.number().optional().describe("Video duration in seconds: 4, 6, or 8 (default: 6)"),
+    resolution: z.string().optional().describe("Video resolution: '720p' or '1080p' (default: '720p')"),
+    negative_prompt: z.string().optional().describe("Text describing what not to include in the video"),
+    person_generation: z.string().optional().describe("Controls generation of people: 'allow_adult' (default for image-to-video) or 'allow_all'"),
+    reference_images: z.array(z.string()).optional().describe("Additional image paths for reference (max 3)"),
+    output_path: z.string().optional().describe("Output MP4 file path (if multiple predictions, index suffix is added)"),
+    gemini_api_key: z.string().optional().describe("Gemini API key (uses GEMINI_API_KEY env var if not provided)"),
+    model_id: z.string().optional().describe("Model ID (default: veo-2.0-generate-001)"),
+  }),
+  execute: async (args: {
+    prompt: string;
+    image_path?: string;
+    aspect_ratio?: string;
+    duration_seconds?: number;
+    resolution?: string;
+    negative_prompt?: string;
+    person_generation?: string;
+    reference_images?: string[];
+    output_path?: string;
+    gemini_api_key?: string;
+    model_id?: string;
+  }) => {
+    const apiKey = args.gemini_api_key || process.env.GEMINI_API_KEY;
+    if (!apiKey) {
+      throw new Error("Gemini API key is required. Set GEMINI_API_KEY environment variable or pass gemini_api_key parameter. Get one at https://aistudio.google.com/app/apikey");
+    }
+
+    const model = args.model_id || "veo-2.0-generate-001";
+
+    // Initialize Google GenAI client
+    const genai = new GoogleGenAI({ apiKey });
+
+    // Build config for video generation
+    const config: any = {};
+
+    if (args.duration_seconds !== undefined) {
+      config.duration_seconds = args.duration_seconds;
+    } else {
+      config.duration_seconds = 6; // default
+    }
+
+    if (args.aspect_ratio) {
+      config.aspect_ratio = args.aspect_ratio;
+    }
+
+    try {
+      // Start video generation operation
+      console.log(`Starting video generation with model: ${model}`);
+      let operation = await genai.models.generateVideos({
+        model,
+        prompt: args.prompt,
+        config,
+      });
+
+      console.log("Operation started, waiting for completion...");
+
+      // Poll until operation is complete (max 10 minutes)
+      let tries = 0;
+      const maxTries = 60; // 10 minutes with 10s intervals
+
+      while (!operation.done && tries < maxTries) {
+        await wait(10000); // Wait 10 seconds
+        tries++;
+        console.log(`Polling attempt ${tries}/${maxTries}...`);
+
+        operation = await genai.operations.getVideosOperation({
+          operation: operation,
+        });
+      }
+
+      if (!operation.done) {
+        throw new Error("Video generation timed out after 10 minutes");
+      }
+
+      console.log("Operation completed!");
+      console.log("Full Response:", JSON.stringify(operation.response, null, 2));
+
+      // Extract generated videos from response
+      const generatedVideos = operation.response?.generatedVideos || [];
+
+      if (!generatedVideos || generatedVideos.length === 0) {
+        const respStr = JSON.stringify(operation.response, null, 2);
+        return `Video generation completed but no videos found in response.\n\nFull Response:\n${respStr.slice(0, 2000)}${respStr.length > 2000 ? '\n...(truncated)' : ''}`;
+      }
+
+      // Download and save videos
+      const outputs: string[] = [];
+
+      for (let i = 0; i < generatedVideos.length; i++) {
+        const generatedVideo = generatedVideos[i];
+        const videoUri = generatedVideo?.video?.uri;
+
+        if (!videoUri) {
+          console.warn(`Video ${i} has no URI`);
+          continue;
+        }
+
+        console.log(`Downloading video ${i + 1}/${generatedVideos.length}...`);
+
+        // Download video from URI
+        const videoUrl = `${videoUri}&key=${apiKey}`;
+        const response = await fetch(videoUrl);
+
+        if (!response.ok) {
+          throw new Error(`Failed to download video: ${response.status} ${response.statusText}`);
+        }
+
+        const buffer = await response.arrayBuffer();
+
+        // Save video to file
+        const filePath = args.output_path
+          ? (i === 0 ? args.output_path : args.output_path.replace(/\.mp4$/i, `_${i}.mp4`))
+          : `video_output_${Date.now()}${i === 0 ? '' : '_' + i}.mp4`;
+        const absPath = path.resolve(filePath);
+
+        fs.writeFileSync(absPath, Buffer.from(buffer));
+        outputs.push(absPath);
+        console.log(`Saved video to: ${absPath}`);
+      }
+
+      if (outputs.length > 0) {
+        return `Video(s) saved successfully:\n${outputs.map((p, i) => `${i + 1}. ${p}`).join('\n')}`;
+      }
+
+      return "Video generation completed but no videos were saved.";
+
+    } catch (error: any) {
+      throw new Error(`Video generation failed: ${error.message || JSON.stringify(error)}`);
+    }
+  },
+};
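The backup file above preserves the SDK-based flow (@google/genai `generateVideos` plus `getVideosOperation` polling) alongside the HTTP rewrite that follows. A minimal call sketch, assuming the file is imported directly and GEMINI_API_KEY is set (paths and prompt are illustrative):

```ts
// Illustrative only: drives the SDK-backed tool defined in image-to-video.sdk-backup.ts.
import { imageToVideo } from "./src/tools/image-to-video.sdk-backup";

const result = await imageToVideo.execute({
  prompt: "A paper boat drifting down a monsoon street, cinematic",
  // image_path is accepted by the schema, but this backup variant does not
  // forward it to generateVideos; only prompt, duration, and aspect ratio are used.
  duration_seconds: 6,   // typed as a number in this variant
  aspect_ratio: "9:16",
  output_path: "./boat.mp4",
});

console.log(result); // "Video(s) saved successfully: ..." on success
```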
package/src/tools/image-to-video.ts
CHANGED
@@ -1,125 +1,99 @@
 import { z } from "zod";
 import * as fs from "fs";
-import { GoogleAuth } from "google-auth-library";
-import { exec } from "child_process";
 import * as path from "path";
 
 async function wait(ms: number): Promise<void> {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
 
-async function fetchAccessToken(): Promise<string> {
-  try {
-    const auth = new GoogleAuth({ scopes: ["https://www.googleapis.com/auth/cloud-platform"] });
-    const client = await auth.getClient();
-    const token = await client.getAccessToken();
-    if (!token || !token.token || typeof token.token !== "string") {
-      throw new Error("No token from GoogleAuth");
-    }
-    return token.token;
-  } catch (e: any) {
-    // Fallback to gcloud
-    return await new Promise((resolve, reject) => {
-      exec("gcloud auth print-access-token", (err, stdout, stderr) => {
-        if (err) {
-          reject(new Error(`Failed to fetch an access token (ADC and gcloud): ${stderr || err.message}`));
-          return;
-        }
-        const t = (stdout || "").trim();
-        if (!t) {
-          reject(new Error("Failed to fetch an access token: empty token from gcloud"));
-          return;
-        }
-        resolve(t);
-      });
-    });
-  }
-}
-
 function fileToBase64(filePath: string): { data: string; mimeType: string } {
   if (!fs.existsSync(filePath)) {
     throw new Error(`File not found: ${filePath}`);
   }
   const buf = fs.readFileSync(filePath);
   const data = Buffer.from(buf).toString("base64");
-  //
-  const
+  // Detect mime type from extension
+  const ext = path.extname(filePath).toLowerCase();
+  const mimeType = ext === '.jpg' || ext === '.jpeg' ? 'image/jpeg' :
+    ext === '.png' ? 'image/png' :
+    ext === '.webp' ? 'image/webp' : 'image/png';
   return { data, mimeType };
 }
 
 export const imageToVideo = {
   name: "imageToVideo",
-  description: "Generate videos from an image as starting first frame using
+  description: "Generate videos from an image as starting first frame using Gemini Veo models via HTTP API with Gemini API key.",
   parameters: z.object({
     prompt: z.string().describe("Text description for the video"),
     image_path: z.string().optional().describe("Path to source image for image-to-video generation"),
+    last_frame_path: z.string().optional().describe("Path to last frame image to guide ending frame (optional)"),
     aspect_ratio: z.string().optional().describe("Video aspect ratio: '16:9' or '9:16' (default: '9:16')"),
-    duration_seconds: z.string().optional().describe("Video duration in seconds:
+    duration_seconds: z.union([z.string(), z.number()]).optional().describe("Video duration in seconds: 4, 6, or 8 (default: 6)"),
     resolution: z.string().optional().describe("Video resolution: '720p' or '1080p' (default: '720p')"),
     negative_prompt: z.string().optional().describe("Text describing what not to include in the video"),
     person_generation: z.string().optional().describe("Controls generation of people: 'allow_adult' (default for image-to-video) or 'allow_all'"),
     reference_images: z.array(z.string()).optional().describe("Additional image paths for reference (max 3)"),
     output_path: z.string().optional().describe("Output MP4 file path (if multiple predictions, index suffix is added)"),
-    model_id: z.string().optional().describe("Model ID (default: veo-3.1-fast-generate-preview)"),
+    gemini_api_key: z.string().optional().describe("Gemini API key (uses GEMINI_API_KEY env var if not provided)"),
+    model_id: z.string().optional().describe("Model ID (default: veo-3.1-generate-preview)"),
   }),
   execute: async (args: {
     prompt: string;
     image_path?: string;
+    last_frame_path?: string;
     aspect_ratio?: string;
-    duration_seconds?: string;
+    duration_seconds?: string | number;
     resolution?: string;
    negative_prompt?: string;
     person_generation?: string;
     reference_images?: string[];
     output_path?: string;
-    location_id?: string;
+    gemini_api_key?: string;
     model_id?: string;
   }) => {
-    const
+    const apiKey = args.gemini_api_key || process.env.GEMINI_API_KEY;
+    if (!apiKey) {
+      throw new Error("Gemini API key is required. Set GEMINI_API_KEY environment variable or pass gemini_api_key parameter. Get one at https://aistudio.google.com/app/apikey");
+    }
 
-    const
+    const modelId = args.model_id || "veo-3.1-generate-preview";
+    const baseUrl = "https://generativelanguage.googleapis.com/v1beta";
 
+    // Convert duration_seconds to number, handling both string and number inputs
+    const durationSeconds = args.duration_seconds
+      ? (typeof args.duration_seconds === 'string' ? parseInt(args.duration_seconds) : args.duration_seconds)
+      : 6; // default
 
-    const
+    try {
+      // Build the request body for predictLongRunning
+      const instances: any[] = [
+        {
+          prompt: args.prompt,
+        },
+      ];
+
+      // Add image if provided (first frame)
+      if (args.image_path) {
+        const { data, mimeType } = fileToBase64(args.image_path);
+        instances[0].image = {
           bytesBase64Encoded: data,
           mimeType,
-        }
-      }
-    }
+        };
+      }
 
-        refImages = JSON.parse(strValue);
-      } catch {
-        throw new Error("Invalid reference_images format");
-      }
-    } else {
-      refImages = [strValue];
-    }
-  } else if (Array.isArray(args.reference_images)) {
-    refImages = args.reference_images;
-  } else {
-    throw new Error("Invalid reference_images: must be array or string");
+      // Add last frame if provided (for interpolation)
+      if (args.last_frame_path) {
+        const { data, mimeType } = fileToBase64(args.last_frame_path);
+        instances[0].lastFrame = {
+          bytesBase64Encoded: data,
+          mimeType,
+        };
       }
 
-
+      // Add reference images if provided
+      if (args.reference_images && args.reference_images.length > 0) {
+        const refImages = args.reference_images.slice(0, 3).map((imgPath) => {
+          const { data, mimeType } = fileToBase64(imgPath);
          return {
            image: {
              bytesBase64Encoded: data,
@@ -128,104 +102,162 @@ export const imageToVideo = {
            referenceType: "asset",
          };
        });
+        instances[0].referenceImages = refImages;
      }
-  }
 
-    const instances: any[] = [
-      {
-        prompt: args.prompt,
-        ...(imagePart || {}),
-        ...(referenceImages ? { referenceImages } : {}),
-      },
-    ];
-
-    const parameters: any = {
-      aspectRatio: args.aspect_ratio || "9:16",
-      durationSeconds: parseInt(args.duration_seconds || "6"),
-      resolution: args.resolution || "720p",
-      negativePrompt: args.negative_prompt,
-      generateAudio: false,
-      personGeneration,
-    };
-
-    const res = await fetch(url, {
-      method: "POST",
-      headers: {
-        Authorization: `Bearer ${token}`,
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({ instances, parameters }),
-    });
-
-    if (!res.ok) {
-      const text = await res.text();
-      throw new Error(`Vertex request failed: ${res.status} ${text}`);
-    }
+      // Build parameters - NOTE: Parameters go in "parameters" object, not in instances
+      const parameters: any = {};
 
+      if (args.aspect_ratio) {
+        parameters.aspectRatio = args.aspect_ratio;
+      }
+
+      if (durationSeconds) {
+        parameters.durationSeconds = durationSeconds;
+      }
+
+      if (args.resolution) {
+        parameters.resolution = args.resolution;
+      }
+
+      if (args.negative_prompt) {
+        parameters.negativePrompt = args.negative_prompt;
+      }
+
+      if (args.person_generation) {
+        parameters.personGeneration = args.person_generation;
+      }
+
+      const requestBody: any = {
+        instances,
+      };
+
+      // Only add parameters if we have any
+      if (Object.keys(parameters).length > 0) {
+        requestBody.parameters = parameters;
+      }
 
-    let done = !!(op as any).done;
-    let tries = 0;
+      console.log(`Starting video generation with model: ${modelId}`);
 
-      await wait(10000);
-      const poll = await fetch(fetchUrl, {
+      // Step 1: Start the long-running operation
+      const url = `${baseUrl}/models/${modelId}:predictLongRunning`;
+      const response = await fetch(url, {
        method: "POST",
        headers: {
+          "x-goog-api-key": apiKey,
          "Content-Type": "application/json",
        },
-        body: JSON.stringify(
+        body: JSON.stringify(requestBody),
      });
+
+      if (!response.ok) {
+        const errorText = await response.text();
+        throw new Error(`Video generation request failed: ${response.status} ${errorText}`);
      }
-      current = await poll.json();
-      done = !!(current as any).done || !!(current as any).response;
-      tries++;
-    }
 
+      const operation = await response.json() as any;
+      const operationName: string = operation.name || operation.operation || "";
+
+      if (!operationName) {
+        throw new Error("No operation name returned from API");
+      }
+
+      console.log(`Operation started: ${operationName}`);
+
+      // Step 2: Poll the operation status by getting the operation directly
+      let currentOp: any = operation;
+      let done = !!operation.done;
+      let tries = 0;
+      const maxTries = 60; // 10 minutes with 10s intervals
+
+      while (!done && tries < maxTries) {
+        await wait(10000); // Wait 10 seconds
+        tries++;
+        console.log(`Polling attempt ${tries}/${maxTries}...`);
+
+        // Poll by getting the operation status directly
+        const pollResponse = await fetch(`${baseUrl}/${operationName}`, {
+          method: "GET",
+          headers: {
+            "x-goog-api-key": apiKey,
+          },
+        });
+
+        if (!pollResponse.ok) {
+          const errorText = await pollResponse.text();
+          throw new Error(`Operation polling failed: ${pollResponse.status} ${errorText}`);
        }
+
+        currentOp = await pollResponse.json() as any;
+        done = !!currentOp.done || !!currentOp.response;
+      }
+
+      if (!done) {
+        throw new Error("Video generation timed out after 10 minutes");
      }
-    }
-    if (outputs.length > 0) {
-      return `Video(s) saved: ${outputs.join(', ')}`;
-    }
 
+      console.log("Operation completed!");
+
+      // Step 3: Extract and download videos
+      const resp = currentOp.response || currentOp;
+
+      // The response structure is: response.generateVideoResponse.generatedSamples[].video.uri
+      const generateVideoResponse = resp?.generateVideoResponse;
+      const generatedSamples = generateVideoResponse?.generatedSamples || [];
+
+      if (!generatedSamples || generatedSamples.length === 0) {
+        let jsonStr = "";
+        try { jsonStr = JSON.stringify(resp, null, 2); } catch {}
+        return `Video generation completed but no generatedSamples found.\n\nFull Response:\n${jsonStr.slice(0, 1000)}${jsonStr.length > 1000 ? '\n...(truncated)' : ''}`;
+      }
+
+      const outputs: string[] = [];
+
+      // Download videos from URIs
+      for (let i = 0; i < generatedSamples.length; i++) {
+        const sample = generatedSamples[i];
+        const videoUri = sample?.video?.uri;
+
+        if (!videoUri) {
+          console.warn(`Sample ${i} has no video URI`);
+          continue;
+        }
+
+        console.log(`Downloading video ${i + 1}/${generatedSamples.length} from ${videoUri}...`);
+
+        // Download video from URI with API key
+        const videoResponse = await fetch(videoUri, {
+          method: "GET",
+          headers: {
+            "x-goog-api-key": apiKey,
+          },
+        });
+
+        if (!videoResponse.ok) {
+          throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
+        }
+
+        const videoBuffer = await videoResponse.arrayBuffer();
+
+        // Save video to file
+        const filePath = args.output_path
+          ? (i === 0 ? args.output_path : args.output_path.replace(/\.mp4$/i, `_${i}.mp4`))
+          : `video_output_${Date.now()}${i === 0 ? '' : '_' + i}.mp4`;
+        const absPath = path.resolve(filePath);
+
+        fs.writeFileSync(absPath, Buffer.from(videoBuffer));
+        outputs.push(absPath);
+        console.log(`Saved video to: ${absPath}`);
+      }
+
+      if (outputs.length > 0) {
+        return `Video(s) saved successfully:\n${outputs.map((p, i) => `${i + 1}. ${p}`).join('\n')}`;
+      }
+
+      return "Video generation completed but no videos were saved.";
+
+    } catch (error: any) {
+      throw new Error(`Video generation failed: ${error.message || JSON.stringify(error)}`);
+    }
  },
};
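Net effect of this rewrite: authentication moves from Vertex AI service-account/gcloud access tokens to a plain Gemini API key sent as `x-goog-api-key`, the request goes to `models/{model}:predictLongRunning` on generativelanguage.googleapis.com, the operation is polled until done, and results are read from `response.generateVideoResponse.generatedSamples[].video.uri`. A minimal call sketch against the rewritten tool, assuming GEMINI_API_KEY is set (import path and argument values are illustrative):

```ts
// Illustrative only: exercises the rewritten HTTP-based tool in image-to-video.ts.
import { imageToVideo } from "./src/tools/image-to-video";

const result = await imageToVideo.execute({
  prompt: "Slow dolly-in on a clay lamp flickering at dusk",
  image_path: "./lamp_first_frame.png",     // starting frame
  last_frame_path: "./lamp_last_frame.png", // new optional ending-frame guide
  duration_seconds: "8",                    // string or number are both accepted now
  aspect_ratio: "16:9",
  resolution: "720p",
  output_path: "./lamp.mp4",
  // model_id defaults to "veo-3.1-generate-preview"; GEMINI_API_KEY is read from the env.
});

console.log(result);
```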