@mixio-pro/kalaasetu-mcp 1.0.1 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,6 +2,12 @@
2
2
 
3
3
  Kalaasetu MCP Server - A powerful Model Context Protocol server providing various AI tools for content generation and analysis.
4
4
 
5
+ ## Quick Start
6
+
7
+ ```bash
8
+ npx @mixio-pro/kalaasetu-mcp@latest
9
+ ```
10
+
5
11
  ## Features
6
12
 
7
13
  ### 🎨 Gemini Tools
@@ -24,12 +30,163 @@ Kalaasetu MCP Server - A powerful Model Context Protocol server providing variou
24
30
 
25
31
  ## Installation
26
32
 
27
- To install dependencies:
33
+ ### Using npx (Recommended)
34
+
35
+ Run directly without installation:
36
+
37
+ ```bash
38
+ npx @mixio-pro/kalaasetu-mcp@latest
39
+ ```
40
+
41
+ ### Global Installation
42
+
43
+ ```bash
44
+ npm install -g @mixio-pro/kalaasetu-mcp
45
+ kalaasetu-mcp
46
+ ```
47
+
48
+ ### Local Development
49
+
50
+ To install dependencies for local development:
28
51
 
29
52
  ```bash
30
53
  bun install
31
54
  ```
32
55
 
56
+ ## MCP Client Configuration
57
+
58
+ ### Cursor IDE
59
+
60
+ Add to your Cursor MCP configuration (`~/.cursor/mcp.json`, or via Settings → MCP):
61
+
62
+ ```json
63
+ {
64
+ "mcpServers": {
65
+ "kalaasetu": {
66
+ "command": "npx",
67
+ "args": ["@mixio-pro/kalaasetu-mcp@latest"],
68
+ "env": {
69
+ "GEMINI_API_KEY": "your-gemini-api-key",
70
+ "FAL_KEY": "your-fal-api-key",
71
+ "PERPLEXITY_API_KEY": "your-perplexity-api-key"
72
+ }
73
+ }
74
+ }
75
+ }
76
+ ```
77
+
78
+ ### OpenCode IDE
79
+
80
+ Add to your OpenCode MCP configuration:
81
+
82
+ ```json
83
+ {
84
+ "mcpServers": {
85
+ "kalaasetu": {
86
+ "command": "npx",
87
+ "args": ["@mixio-pro/kalaasetu-mcp@latest"],
88
+ "environment": {
89
+ "GEMINI_API_KEY": "your-gemini-api-key",
90
+ "FAL_KEY": "your-fal-api-key",
91
+ "PERPLEXITY_API_KEY": "your-perplexity-api-key"
92
+ }
93
+ }
94
+ }
95
+ }
96
+ ```
97
+
98
+ ### Claude Desktop
99
+
100
+ Add to your Claude Desktop configuration (`~/Library/Application Support/Claude/claude_desktop_config.json` on macOS):
101
+
102
+ ```json
103
+ {
104
+ "mcpServers": {
105
+ "kalaasetu": {
106
+ "command": "npx",
107
+ "args": ["@mixio-pro/kalaasetu-mcp@latest"],
108
+ "env": {
109
+ "GEMINI_API_KEY": "your-gemini-api-key",
110
+ "FAL_KEY": "your-fal-api-key",
111
+ "PERPLEXITY_API_KEY": "your-perplexity-api-key"
112
+ }
113
+ }
114
+ }
115
+ }
116
+ ```
117
+
118
+ ### OpenCode IDE (with Google Cloud credentials)
119
+
120
+ Add to your OpenCode MCP configuration:
121
+
122
+ ```json
123
+ {
124
+ "mcpServers": {
125
+ "kalaasetu": {
126
+ "type": "local",
127
+ "command": ["npx", "@mixio-pro/kalaasetu-mcp@latest"],
128
+ "env": {
129
+ "GEMINI_API_KEY": "your-gemini-api-key",
130
+ "FAL_KEY": "your-fal-api-key",
131
+ "PERPLEXITY_API_KEY": "your-perplexity-api-key",
132
+ "GOOGLE_APPLICATION_CREDENTIALS": "/path/to/your/gcp-credentials.json"
133
+ }
134
+ }
135
+ }
136
+ }
137
+ ```
138
+
139
+ ### Claude Desktop (with Google Cloud credentials)
140
+
141
+ Add to your Claude Desktop configuration (`~/Library/Application Support/Claude/claude_desktop_config.json` on macOS):
142
+
143
+ ```json
144
+ {
145
+ "mcpServers": {
146
+ "kalaasetu": {
147
+ "command": "npx",
148
+ "args": ["@mixio-pro/kalaasetu-mcp@latest"],
149
+ "env": {
150
+ "GEMINI_API_KEY": "your-gemini-api-key",
151
+ "FAL_KEY": "your-fal-api-key",
152
+ "PERPLEXITY_API_KEY": "your-perplexity-api-key",
153
+ "GOOGLE_APPLICATION_CREDENTIALS": "/path/to/your/gcp-credentials.json"
154
+ }
155
+ }
156
+ }
157
+ }
158
+ ```
159
+
160
+ ## Environment Variables
161
+
162
+ ### Required API Keys
163
+
164
+ | Variable | Description | Get API Key |
165
+ |----------|-------------|-------------|
166
+ | `GEMINI_API_KEY` | For Gemini image generation, TTS, video analysis, and Veo video generation | [Google AI Studio](https://aistudio.google.com/app/apikey) |
167
+ | `FAL_KEY` | For Infinitalk and Hunyuan Avatar tools | [FAL AI](https://fal.ai/dashboard/keys) |
168
+ | `PERPLEXITY_API_KEY` | For image and video search | [Perplexity API](https://www.perplexity.ai/settings/api) |
169
+
170
+ ### Setting Environment Variables
171
+
172
+ #### For Command Line Usage
173
+
174
+ ```bash
175
+ # macOS/Linux
176
+ export GEMINI_API_KEY="your-gemini-api-key"
177
+ export FAL_KEY="your-fal-api-key"
178
+ export PERPLEXITY_API_KEY="your-perplexity-api-key"
179
+
180
+ # Windows (PowerShell)
181
+ $env:GEMINI_API_KEY="your-gemini-api-key"
182
+ $env:FAL_KEY="your-fal-api-key"
183
+ $env:PERPLEXITY_API_KEY="your-perplexity-api-key"
184
+ ```
185
+
186
+ #### For MCP Clients
187
+
188
+ Set the environment variables in your MCP client configuration as shown in the examples above.
189
+
33
190
  ## Configuration
34
191
 
35
192
  Set up the required API keys as environment variables:
@@ -48,17 +205,27 @@ export PERPLEXITY_API_KEY="your-perplexity-api-key"
48
205
 
49
206
  ## Running the Server
50
207
 
51
- To run:
208
+ ### Local Development
209
+
210
+ To run the server locally:
52
211
 
53
212
  ```bash
54
- bun run index.ts
213
+ bun run src/index.ts
55
214
  ```
56
215
 
57
- ## Tool: Infinitalk
216
+ Or with auto-reload:
217
+
218
+ ```bash
219
+ bun run dev
220
+ ```
221
+
222
+ ## Available Tools
223
+
224
+ ### Infinitalk
58
225
 
59
226
  Generate talking avatar videos from images and audio using FAL AI's Infinitalk model.
60
227
 
61
- ### Parameters
228
+ #### Parameters
62
229
 
63
230
  - `image_url` (required): URL of the input image
64
231
  - `audio_url` (required): URL of the audio file for lip-sync
@@ -67,9 +234,8 @@ Generate talking avatar videos from images and audio using FAL AI's Infinitalk m
67
234
  - `resolution` (optional): Video resolution - "480p" or "720p" (default: "480p")
68
235
  - `seed` (optional): Random seed for reproducibility (default: 42)
69
236
  - `acceleration` (optional): Generation speed - "none", "regular", or "high" (default: "regular")
70
- - `fal_key` (optional): FAL API key (uses FAL_KEY env var if not provided)
71
237
 
72
- ### Example Usage
238
+ #### Example Usage
73
239
 
74
240
  ```json
75
241
  {
@@ -81,11 +247,11 @@ Generate talking avatar videos from images and audio using FAL AI's Infinitalk m
81
247
  }
82
248
  ```
83
249
 
84
- ## Tool: Perplexity Images
250
+ ### Perplexity Images
85
251
 
86
252
  Search for images using Perplexity AI with advanced filtering options.
87
253
 
88
- ### Parameters
254
+ #### Parameters
89
255
 
90
256
  - `query` (required): The search query for images
91
257
  - `image_domain_filter` (optional): Array of domains to include or exclude (prefix with '-' to exclude)
@@ -93,7 +259,7 @@ Search for images using Perplexity AI with advanced filtering options.
93
259
  - `image_format_filter` (optional): Array of allowed image formats
94
260
  - Example: `["jpg", "png", "gif"]`
95
261
 
96
- ### Example Usage
262
+ #### Example Usage
97
263
 
98
264
  ```json
99
265
  {
@@ -103,17 +269,17 @@ Search for images using Perplexity AI with advanced filtering options.
103
269
  }
104
270
  ```
105
271
 
106
- ## Tool: Perplexity Videos
272
+ ### Perplexity Videos
107
273
 
108
274
  Search for videos using Perplexity AI with domain filtering.
109
275
 
110
- ### Parameters
276
+ #### Parameters
111
277
 
112
278
  - `query` (required): The search query for videos
113
279
  - `search_domain_filter` (optional): Array of domains to limit search (use '-' prefix to exclude)
114
280
  - Example: `["youtube.com"]` or `["-tiktok.com"]`
115
281
 
116
- ### Example Usage
282
+ #### Example Usage
117
283
 
118
284
  ```json
119
285
  {
@@ -122,6 +288,19 @@ Search for videos using Perplexity AI with domain filtering.
122
288
  }
123
289
  ```
124
290
 
291
+ ## Testing
292
+
293
+ You can test the MCP server using the MCP Inspector:
294
+
295
+ ```bash
296
+ npx @modelcontextprotocol/inspector npx @mixio-pro/kalaasetu-mcp@latest
297
+ ```
298
+
299
+ ## Requirements
300
+
301
+ - **Bun**: This package requires the Bun runtime. Install it from [bun.sh](https://bun.sh).
302
+ - **API Keys**: Obtain the necessary API keys from the providers listed above
303
+
125
304
  ## Project Info
126
305
 
127
306
  This project was created using `bun init` in bun v1.3.1. [Bun](https://bun.com) is a fast all-in-one JavaScript runtime.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mixio-pro/kalaasetu-mcp",
3
- "version": "1.0.1",
3
+ "version": "1.0.4",
4
4
  "description": "A powerful Model Context Protocol server providing AI tools for content generation and analysis",
5
5
  "type": "module",
6
6
  "module": "src/index.ts",
@@ -263,7 +263,7 @@ export const geminiSingleSpeakerTts = {
263
263
  description: "Generate single speaker voice audio from text using Gemini 2.5 Pro Preview TTS model",
264
264
  parameters: z.object({
265
265
  text: z.string().describe("Text to convert to speech"),
266
- voice_name: z.string().describe("Voice name from supported options (e.g., 'Kore', 'Zephyr', 'Puck', etc.)"),
266
+ voice_name: z.string().describe("Voice name from supported options. Use Kore, Erinome or Despina for the female voices and Enceladus for male."),
267
267
  output_path: z.string().optional().describe("Output WAV file path (optional, defaults to timestamp-based filename)"),
268
268
  }),
269
269
  execute: async (args: { text: string; voice_name: string; output_path?: string }) => {
@@ -276,7 +276,7 @@ export const geminiSingleSpeakerTts = {
276
276
  speechConfig: {
277
277
  voiceConfig: {
278
278
  prebuiltVoiceConfig: {
279
- voiceName: args.voice_name
279
+ voiceName: args.voice_name || 'Despina'
280
280
  },
281
281
  },
282
282
  },
@@ -0,0 +1,161 @@
1
+ import { z } from "zod";
2
+ import * as fs from "fs";
3
+ import * as path from "path";
4
+ import { GoogleGenAI } from "@google/genai";
5
+
6
+ async function wait(ms: number): Promise<void> {
7
+ return new Promise((resolve) => setTimeout(resolve, ms));
8
+ }
9
+
10
+ function fileToBase64(filePath: string): { data: string; mimeType: string } {
11
+ if (!fs.existsSync(filePath)) {
12
+ throw new Error(`File not found: ${filePath}`);
13
+ }
14
+ const buf = fs.readFileSync(filePath);
15
+ const data = Buffer.from(buf).toString("base64");
16
+ // Detect mime type from extension
17
+ const ext = path.extname(filePath).toLowerCase();
18
+ const mimeType = ext === '.jpg' || ext === '.jpeg' ? 'image/jpeg' :
19
+ ext === '.png' ? 'image/png' :
20
+ ext === '.webp' ? 'image/webp' : 'image/png';
21
+ return { data, mimeType };
22
+ }
23
+
24
+ export const imageToVideo = {
25
+ name: "imageToVideo",
26
+ description: "Generate videos from an image as starting first frame using Vertex Veo models (predictLongRunning + fetchPredictOperation).",
27
+ parameters: z.object({
28
+ prompt: z.string().describe("Text description for the video"),
29
+ image_path: z.string().optional().describe("Path to source image for image-to-video generation"),
30
+ aspect_ratio: z.string().optional().describe("Video aspect ratio: '16:9' or '9:16' (default: '9:16')"),
31
+ duration_seconds: z.number().optional().describe("Video duration in seconds: 4, 6, or 8 (default: 6)"),
32
+ resolution: z.string().optional().describe("Video resolution: '720p' or '1080p' (default: '720p')"),
33
+ negative_prompt: z.string().optional().describe("Text describing what not to include in the video"),
34
+ person_generation: z.string().optional().describe("Controls generation of people: 'allow_adult' (default for image-to-video) or 'allow_all'"),
35
+ reference_images: z.array(z.string()).optional().describe("Additional image paths for reference (max 3)"),
36
+ output_path: z.string().optional().describe("Output MP4 file path (if multiple predictions, index suffix is added)"),
37
+ gemini_api_key: z.string().optional().describe("Gemini API key (uses GEMINI_API_KEY env var if not provided)"),
38
+ model_id: z.string().optional().describe("Model ID (default: veo-2.0-generate-001)"),
39
+ }),
40
+ execute: async (args: {
41
+ prompt: string;
42
+ image_path?: string;
43
+ aspect_ratio?: string;
44
+ duration_seconds?: number;
45
+ resolution?: string;
46
+ negative_prompt?: string;
47
+ person_generation?: string;
48
+ reference_images?: string[];
49
+ output_path?: string;
50
+ gemini_api_key?: string;
51
+ model_id?: string;
52
+ }) => {
53
+ const apiKey = args.gemini_api_key || process.env.GEMINI_API_KEY;
54
+ if (!apiKey) {
55
+ throw new Error("Gemini API key is required. Set GEMINI_API_KEY environment variable or pass gemini_api_key parameter. Get one at https://aistudio.google.com/app/apikey");
56
+ }
57
+
58
+ const model = args.model_id || "veo-2.0-generate-001";
59
+
60
+ // Initialize Google GenAI client
61
+ const genai = new GoogleGenAI({ apiKey });
62
+
63
+ // Build config for video generation
64
+ const config: any = {};
65
+
66
+ if (args.duration_seconds !== undefined) {
67
+ config.duration_seconds = args.duration_seconds;
68
+ } else {
69
+ config.duration_seconds = 6; // default
70
+ }
71
+
72
+ if (args.aspect_ratio) {
73
+ config.aspect_ratio = args.aspect_ratio;
74
+ }
75
+
76
+ try {
77
+ // Start video generation operation
78
+ console.log(`Starting video generation with model: ${model}`);
79
+ let operation = await genai.models.generateVideos({
80
+ model,
81
+ prompt: args.prompt,
82
+ config,
83
+ });
84
+
85
+ console.log("Operation started, waiting for completion...");
86
+
87
+ // Poll until operation is complete (max 10 minutes)
88
+ let tries = 0;
89
+ const maxTries = 60; // 10 minutes with 10s intervals
90
+
91
+ while (!operation.done && tries < maxTries) {
92
+ await wait(10000); // Wait 10 seconds
93
+ tries++;
94
+ console.log(`Polling attempt ${tries}/${maxTries}...`);
95
+
96
+ operation = await genai.operations.getVideosOperation({
97
+ operation: operation,
98
+ });
99
+ }
100
+
101
+ if (!operation.done) {
102
+ throw new Error("Video generation timed out after 10 minutes");
103
+ }
104
+
105
+ console.log("Operation completed!");
106
+ console.log("Full Response:", JSON.stringify(operation.response, null, 2));
107
+
108
+ // Extract generated videos from response
109
+ const generatedVideos = operation.response?.generatedVideos || [];
110
+
111
+ if (!generatedVideos || generatedVideos.length === 0) {
112
+ const respStr = JSON.stringify(operation.response, null, 2);
113
+ return `Video generation completed but no videos found in response.\n\nFull Response:\n${respStr.slice(0, 2000)}${respStr.length > 2000 ? '\n...(truncated)' : ''}`;
114
+ }
115
+
116
+ // Download and save videos
117
+ const outputs: string[] = [];
118
+
119
+ for (let i = 0; i < generatedVideos.length; i++) {
120
+ const generatedVideo = generatedVideos[i];
121
+ const videoUri = generatedVideo?.video?.uri;
122
+
123
+ if (!videoUri) {
124
+ console.warn(`Video ${i} has no URI`);
125
+ continue;
126
+ }
127
+
128
+ console.log(`Downloading video ${i + 1}/${generatedVideos.length}...`);
129
+
130
+ // Download video from URI
131
+ const videoUrl = `${videoUri}&key=${apiKey}`;
132
+ const response = await fetch(videoUrl);
133
+
134
+ if (!response.ok) {
135
+ throw new Error(`Failed to download video: ${response.status} ${response.statusText}`);
136
+ }
137
+
138
+ const buffer = await response.arrayBuffer();
139
+
140
+ // Save video to file
141
+ const filePath = args.output_path
142
+ ? (i === 0 ? args.output_path : args.output_path.replace(/\.mp4$/i, `_${i}.mp4`))
143
+ : `video_output_${Date.now()}${i === 0 ? '' : '_' + i}.mp4`;
144
+ const absPath = path.resolve(filePath);
145
+
146
+ fs.writeFileSync(absPath, Buffer.from(buffer));
147
+ outputs.push(absPath);
148
+ console.log(`Saved video to: ${absPath}`);
149
+ }
150
+
151
+ if (outputs.length > 0) {
152
+ return `Video(s) saved successfully:\n${outputs.map((p, i) => `${i + 1}. ${p}`).join('\n')}`;
153
+ }
154
+
155
+ return "Video generation completed but no videos were saved.";
156
+
157
+ } catch (error: any) {
158
+ throw new Error(`Video generation failed: ${error.message || JSON.stringify(error)}`);
159
+ }
160
+ },
161
+ };
@@ -1,125 +1,99 @@
1
1
  import { z } from "zod";
2
2
  import * as fs from "fs";
3
- import { GoogleAuth } from "google-auth-library";
4
- import { exec } from "child_process";
5
3
  import * as path from "path";
6
4
 
7
5
  async function wait(ms: number): Promise<void> {
8
6
  return new Promise((resolve) => setTimeout(resolve, ms));
9
7
  }
10
8
 
11
- async function fetchAccessToken(): Promise<string> {
12
- try {
13
- const auth = new GoogleAuth({ scopes: ["https://www.googleapis.com/auth/cloud-platform"] });
14
- const client = await auth.getClient();
15
- const token = await client.getAccessToken();
16
- if (!token || !token.token || typeof token.token !== "string") {
17
- throw new Error("No token from GoogleAuth");
18
- }
19
- return token.token;
20
- } catch (e: any) {
21
- // Fallback to gcloud
22
- return await new Promise((resolve, reject) => {
23
- exec("gcloud auth print-access-token", (err, stdout, stderr) => {
24
- if (err) {
25
- reject(new Error(`Failed to fetch an access token (ADC and gcloud): ${stderr || err.message}`));
26
- return;
27
- }
28
- const t = (stdout || "").trim();
29
- if (!t) {
30
- reject(new Error("Failed to fetch an access token: empty token from gcloud"));
31
- return;
32
- }
33
- resolve(t);
34
- });
35
- });
36
- }
37
- }
38
-
39
9
  function fileToBase64(filePath: string): { data: string; mimeType: string } {
40
10
  if (!fs.existsSync(filePath)) {
41
11
  throw new Error(`File not found: ${filePath}`);
42
12
  }
43
13
  const buf = fs.readFileSync(filePath);
44
14
  const data = Buffer.from(buf).toString("base64");
45
- // Default to PNG if not sure, similar to existing code
46
- const mimeType = "image/png";
15
+ // Detect mime type from extension
16
+ const ext = path.extname(filePath).toLowerCase();
17
+ const mimeType = ext === '.jpg' || ext === '.jpeg' ? 'image/jpeg' :
18
+ ext === '.png' ? 'image/png' :
19
+ ext === '.webp' ? 'image/webp' : 'image/png';
47
20
  return { data, mimeType };
48
21
  }
49
22
 
50
23
  export const imageToVideo = {
51
24
  name: "imageToVideo",
52
- description: "Generate videos from an image as starting first frame using Vertex Veo models (predictLongRunning + fetchPredictOperation).",
25
+ description: "Generate videos from an image as starting first frame using Gemini Veo models via HTTP API with Gemini API key.",
53
26
  parameters: z.object({
54
27
  prompt: z.string().describe("Text description for the video"),
55
28
  image_path: z.string().optional().describe("Path to source image for image-to-video generation"),
29
+ last_frame_path: z.string().optional().describe("Path to last frame image to guide ending frame (optional)"),
56
30
  aspect_ratio: z.string().optional().describe("Video aspect ratio: '16:9' or '9:16' (default: '9:16')"),
57
- duration_seconds: z.string().optional().describe("Video duration in seconds: '4', '6', or '8' (default: '6')"),
31
+ duration_seconds: z.union([z.string(), z.number()]).optional().describe("Video duration in seconds: 4, 6, or 8 (default: 6)"),
58
32
  resolution: z.string().optional().describe("Video resolution: '720p' or '1080p' (default: '720p')"),
59
33
  negative_prompt: z.string().optional().describe("Text describing what not to include in the video"),
60
34
  person_generation: z.string().optional().describe("Controls generation of people: 'allow_adult' (default for image-to-video) or 'allow_all'"),
61
35
  reference_images: z.array(z.string()).optional().describe("Additional image paths for reference (max 3)"),
62
36
  output_path: z.string().optional().describe("Output MP4 file path (if multiple predictions, index suffix is added)"),
63
- project_id: z.string().optional().describe("GCP Project ID (default: mixio-pro)"),
64
- location_id: z.string().optional().describe("Vertex region (default: us-central1)"),
65
- model_id: z.string().optional().describe("Model ID (default: veo-3.1-fast-generate-preview)"),
37
+ gemini_api_key: z.string().optional().describe("Gemini API key (uses GEMINI_API_KEY env var if not provided)"),
38
+ model_id: z.string().optional().describe("Model ID (default: veo-3.1-generate-preview)"),
66
39
  }),
67
40
  execute: async (args: {
68
41
  prompt: string;
69
42
  image_path?: string;
43
+ last_frame_path?: string;
70
44
  aspect_ratio?: string;
71
- duration_seconds?: string;
45
+ duration_seconds?: string | number;
72
46
  resolution?: string;
73
47
  negative_prompt?: string;
74
48
  person_generation?: string;
75
49
  reference_images?: string[];
76
50
  output_path?: string;
77
- project_id?: string;
78
- location_id?: string;
51
+ gemini_api_key?: string;
79
52
  model_id?: string;
80
53
  }) => {
81
- const projectId = args.project_id || "mixio-pro";
82
- const location = args.location_id || "us-central1";
83
- const modelId = args.model_id || "veo-3.1-fast-generate-preview";
54
+ const apiKey = args.gemini_api_key || process.env.GEMINI_API_KEY;
55
+ if (!apiKey) {
56
+ throw new Error("Gemini API key is required. Set GEMINI_API_KEY environment variable or pass gemini_api_key parameter. Get one at https://aistudio.google.com/app/apikey");
57
+ }
84
58
 
85
- const token = await fetchAccessToken();
59
+ const modelId = args.model_id || "veo-3.1-generate-preview";
60
+ const baseUrl = "https://generativelanguage.googleapis.com/v1beta";
86
61
 
87
- const url = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:predictLongRunning`;
62
+ // Convert duration_seconds to number, handling both string and number inputs
63
+ const durationSeconds = args.duration_seconds
64
+ ? (typeof args.duration_seconds === 'string' ? parseInt(args.duration_seconds) : args.duration_seconds)
65
+ : 6; // default
88
66
 
89
- let imagePart: any = undefined;
90
- if (args.image_path) {
91
- const { data, mimeType } = fileToBase64(args.image_path);
92
- imagePart = {
93
- image: {
67
+ try {
68
+ // Build the request body for predictLongRunning
69
+ const instances: any[] = [
70
+ {
71
+ prompt: args.prompt,
72
+ },
73
+ ];
74
+
75
+ // Add image if provided (first frame)
76
+ if (args.image_path) {
77
+ const { data, mimeType } = fileToBase64(args.image_path);
78
+ instances[0].image = {
94
79
  bytesBase64Encoded: data,
95
80
  mimeType,
96
- },
97
- };
98
- }
81
+ };
82
+ }
99
83
 
100
- let referenceImages: any[] | undefined = undefined;
101
- if (args.reference_images) {
102
- let refImages: string[];
103
- if (typeof args.reference_images === 'string') {
104
- const strValue = args.reference_images as string;
105
- if (strValue.startsWith("[") && strValue.endsWith("]")) {
106
- try {
107
- refImages = JSON.parse(strValue);
108
- } catch {
109
- throw new Error("Invalid reference_images format");
110
- }
111
- } else {
112
- refImages = [strValue];
113
- }
114
- } else if (Array.isArray(args.reference_images)) {
115
- refImages = args.reference_images;
116
- } else {
117
- throw new Error("Invalid reference_images: must be array or string");
84
+ // Add last frame if provided (for interpolation)
85
+ if (args.last_frame_path) {
86
+ const { data, mimeType } = fileToBase64(args.last_frame_path);
87
+ instances[0].lastFrame = {
88
+ bytesBase64Encoded: data,
89
+ mimeType,
90
+ };
118
91
  }
119
92
 
120
- if (refImages.length > 0) {
121
- referenceImages = refImages.slice(0, 3).map((p) => {
122
- const { data, mimeType } = fileToBase64(p);
93
+ // Add reference images if provided
94
+ if (args.reference_images && args.reference_images.length > 0) {
95
+ const refImages = args.reference_images.slice(0, 3).map((imgPath) => {
96
+ const { data, mimeType } = fileToBase64(imgPath);
123
97
  return {
124
98
  image: {
125
99
  bytesBase64Encoded: data,
@@ -128,104 +102,162 @@ export const imageToVideo = {
128
102
  referenceType: "asset",
129
103
  };
130
104
  });
105
+ instances[0].referenceImages = refImages;
131
106
  }
132
- }
133
107
 
134
- const personGeneration = args.person_generation || (args.image_path ? "allow_adult" : "allow_all");
135
-
136
- const instances: any[] = [
137
- {
138
- prompt: args.prompt,
139
- ...(imagePart || {}),
140
- ...(referenceImages ? { referenceImages } : {}),
141
- },
142
- ];
143
-
144
- const parameters: any = {
145
- aspectRatio: args.aspect_ratio || "9:16",
146
- durationSeconds: parseInt(args.duration_seconds || "6"),
147
- resolution: args.resolution || "720p",
148
- negativePrompt: args.negative_prompt,
149
- generateAudio: false,
150
- personGeneration,
151
- };
152
-
153
- const res = await fetch(url, {
154
- method: "POST",
155
- headers: {
156
- Authorization: `Bearer ${token}`,
157
- "Content-Type": "application/json",
158
- },
159
- body: JSON.stringify({ instances, parameters }),
160
- });
161
-
162
- if (!res.ok) {
163
- const text = await res.text();
164
- throw new Error(`Vertex request failed: ${res.status} ${text}`);
165
- }
108
+ // Build parameters - NOTE: Parameters go in "parameters" object, not in instances
109
+ const parameters: any = {};
166
110
 
167
- const op: any = await res.json();
168
- const name: string = (op as any).name || (op as any).operation || "";
169
- if (!name) {
170
- throw new Error("Vertex did not return an operation name for long-running request");
171
- }
111
+ if (args.aspect_ratio) {
112
+ parameters.aspectRatio = args.aspect_ratio;
113
+ }
114
+
115
+ if (durationSeconds) {
116
+ parameters.durationSeconds = durationSeconds;
117
+ }
118
+
119
+ if (args.resolution) {
120
+ parameters.resolution = args.resolution;
121
+ }
122
+
123
+ if (args.negative_prompt) {
124
+ parameters.negativePrompt = args.negative_prompt;
125
+ }
126
+
127
+ if (args.person_generation) {
128
+ parameters.personGeneration = args.person_generation;
129
+ }
130
+
131
+ const requestBody: any = {
132
+ instances,
133
+ };
134
+
135
+ // Only add parameters if we have any
136
+ if (Object.keys(parameters).length > 0) {
137
+ requestBody.parameters = parameters;
138
+ }
172
139
 
173
- let current: any = op;
174
- let done = !!(op as any).done;
175
- let tries = 0;
140
+ console.log(`Starting video generation with model: ${modelId}`);
176
141
 
177
- // Poll using fetchPredictOperation as per Vertex recommendation
178
- const fetchUrl = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:fetchPredictOperation`;
179
- while (!done && tries < 60) {
180
- await wait(10000);
181
- const poll = await fetch(fetchUrl, {
142
+ // Step 1: Start the long-running operation
143
+ const url = `${baseUrl}/models/${modelId}:predictLongRunning`;
144
+ const response = await fetch(url, {
182
145
  method: "POST",
183
146
  headers: {
184
- Authorization: `Bearer ${token}`,
147
+ "x-goog-api-key": apiKey,
185
148
  "Content-Type": "application/json",
186
149
  },
187
- body: JSON.stringify({ operationName: name }),
150
+ body: JSON.stringify(requestBody),
188
151
  });
189
- if (!poll.ok) {
190
- const text = await poll.text();
191
- throw new Error(`Vertex operation poll failed: ${poll.status} ${text}`);
152
+
153
+ if (!response.ok) {
154
+ const errorText = await response.text();
155
+ throw new Error(`Video generation request failed: ${response.status} ${errorText}`);
192
156
  }
193
- current = await poll.json();
194
- done = !!(current as any).done || !!(current as any).response;
195
- tries++;
196
- }
197
157
 
198
- const resp = (current as any).response || current;
199
- // Decode from response.videos[].bytesBase64Encoded only
200
- const outputs: string[] = [];
201
- const saveVideo = (base64: string, index: number) => {
202
- if (!base64) return;
203
- const filePath = args.output_path
204
- ? (index === 0 ? args.output_path : args.output_path.replace(/\.mp4$/i, `_${index}.mp4`))
205
- : `video_output_${Date.now()}${index === 0 ? '' : '_' + index}.mp4`;
206
- const absPath = path.resolve(filePath);
207
- const buf = Buffer.from(base64, 'base64');
208
- fs.writeFileSync(absPath, buf);
209
- outputs.push(absPath);
210
- };
211
-
212
- if (Array.isArray(resp?.videos) && resp.videos.length > 0) {
213
- for (let i = 0; i < resp.videos.length; i++) {
214
- const v = resp.videos[i] || {};
215
- if (typeof v.bytesBase64Encoded === 'string') {
216
- saveVideo(v.bytesBase64Encoded, i);
158
+ const operation = await response.json() as any;
159
+ const operationName: string = operation.name || operation.operation || "";
160
+
161
+ if (!operationName) {
162
+ throw new Error("No operation name returned from API");
163
+ }
164
+
165
+ console.log(`Operation started: ${operationName}`);
166
+
167
+ // Step 2: Poll the operation status by getting the operation directly
168
+ let currentOp: any = operation;
169
+ let done = !!operation.done;
170
+ let tries = 0;
171
+ const maxTries = 60; // 10 minutes with 10s intervals
172
+
173
+ while (!done && tries < maxTries) {
174
+ await wait(10000); // Wait 10 seconds
175
+ tries++;
176
+ console.log(`Polling attempt ${tries}/${maxTries}...`);
177
+
178
+ // Poll by getting the operation status directly
179
+ const pollResponse = await fetch(`${baseUrl}/${operationName}`, {
180
+ method: "GET",
181
+ headers: {
182
+ "x-goog-api-key": apiKey,
183
+ },
184
+ });
185
+
186
+ if (!pollResponse.ok) {
187
+ const errorText = await pollResponse.text();
188
+ throw new Error(`Operation polling failed: ${pollResponse.status} ${errorText}`);
217
189
  }
190
+
191
+ currentOp = await pollResponse.json() as any;
192
+ done = !!currentOp.done || !!currentOp.response;
193
+ }
194
+
195
+ if (!done) {
196
+ throw new Error("Video generation timed out after 10 minutes");
218
197
  }
219
- }
220
- if (outputs.length > 0) {
221
- return `Video(s) saved: ${outputs.join(', ')}`;
222
- }
223
198
 
224
- // If nothing saved, return a concise summary plus head/tail snippets of JSON
225
- let jsonStr = "";
226
- try { jsonStr = JSON.stringify(resp); } catch {}
227
- const head150 = jsonStr ? jsonStr.slice(0, 150) : "";
228
- const tail50 = jsonStr ? jsonStr.slice(Math.max(0, jsonStr.length - 50)) : "";
229
- return `Vertex operation done but no videos array present. operationName=${name}. json_head150=${head150} json_tail50=${tail50}`;
199
+ console.log("Operation completed!");
200
+
201
+ // Step 3: Extract and download videos
202
+ const resp = currentOp.response || currentOp;
203
+
204
+ // The response structure is: response.generateVideoResponse.generatedSamples[].video.uri
205
+ const generateVideoResponse = resp?.generateVideoResponse;
206
+ const generatedSamples = generateVideoResponse?.generatedSamples || [];
207
+
208
+ if (!generatedSamples || generatedSamples.length === 0) {
209
+ let jsonStr = "";
210
+ try { jsonStr = JSON.stringify(resp, null, 2); } catch {}
211
+ return `Video generation completed but no generatedSamples found.\n\nFull Response:\n${jsonStr.slice(0, 1000)}${jsonStr.length > 1000 ? '\n...(truncated)' : ''}`;
212
+ }
213
+
214
+ const outputs: string[] = [];
215
+
216
+ // Download videos from URIs
217
+ for (let i = 0; i < generatedSamples.length; i++) {
218
+ const sample = generatedSamples[i];
219
+ const videoUri = sample?.video?.uri;
220
+
221
+ if (!videoUri) {
222
+ console.warn(`Sample ${i} has no video URI`);
223
+ continue;
224
+ }
225
+
226
+ console.log(`Downloading video ${i + 1}/${generatedSamples.length} from ${videoUri}...`);
227
+
228
+ // Download video from URI with API key
229
+ const videoResponse = await fetch(videoUri, {
230
+ method: "GET",
231
+ headers: {
232
+ "x-goog-api-key": apiKey,
233
+ },
234
+ });
235
+
236
+ if (!videoResponse.ok) {
237
+ throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
238
+ }
239
+
240
+ const videoBuffer = await videoResponse.arrayBuffer();
241
+
242
+ // Save video to file
243
+ const filePath = args.output_path
244
+ ? (i === 0 ? args.output_path : args.output_path.replace(/\.mp4$/i, `_${i}.mp4`))
245
+ : `video_output_${Date.now()}${i === 0 ? '' : '_' + i}.mp4`;
246
+ const absPath = path.resolve(filePath);
247
+
248
+ fs.writeFileSync(absPath, Buffer.from(videoBuffer));
249
+ outputs.push(absPath);
250
+ console.log(`Saved video to: ${absPath}`);
251
+ }
252
+
253
+ if (outputs.length > 0) {
254
+ return `Video(s) saved successfully:\n${outputs.map((p, i) => `${i + 1}. ${p}`).join('\n')}`;
255
+ }
256
+
257
+ return "Video generation completed but no videos were saved.";
258
+
259
+ } catch (error: any) {
260
+ throw new Error(`Video generation failed: ${error.message || JSON.stringify(error)}`);
261
+ }
230
262
  },
231
263
  };