@mixio-pro/kalaasetu-mcp 1.2.1 → 2.0.1-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,10 @@ import { z } from "zod";
  import { getStorage } from "../storage";
  import { generateTimestampedFilename } from "../utils/filename";
  import { safeToolExecute } from "../utils/tool-wrapper";
+ import {
+ resolveEnhancer,
+ listVideoEnhancerPresets,
+ } from "../utils/prompt-enhancer-presets";

  import { getGoogleAccessToken } from "../utils/google-auth";

@@ -41,93 +45,149 @@ async function fileToBase64(
  export const imageToVideo = {
  name: "generateVideoi2v",
  description:
- "Generate videos from an image as starting first frame using Vertex Veo models (predictLongRunning + fetchPredictOperation).",
+ "Generate professional-quality cinematic videos from a starting image and text prompt using Google's Vertex AI Veo models. " +
+ "This tool follows a 'Synchronous Facade' pattern: it handles polling internally but can be paused/resumed. " +
+ "If the generation takes too long, it returns a 'resume_id' that you MUST use to call this tool again to pick up progress. " +
+ "It produces state-of-the-art cinematic results. " +
+ "ONLY USE WHEN WORKING WITH GOOGLE VERTEX AI MODELS.",
  parameters: z.object({
- prompt: z.string().describe("Text description for the video"),
+ prompt: z
+ .string()
+ .optional()
+ .describe(
+ "Required for new requests. Descriptive text for the video action and style (e.g., 'A robot walking through a neon city at night')."
+ ),
  image_path: z
  .string()
  .optional()
- .describe("Path to source image for image-to-video generation"),
+ .describe("Absolute local path or URL to the STARTING image frame."),
  last_frame_path: z
  .string()
  .optional()
- .describe("Path to last frame image to guide ending frame (optional)"),
+ .describe(
+ "Optional: Absolute local path or URL to the ENDING image frame to guide the video's conclusion."
+ ),
  aspect_ratio: z
  .string()
  .optional()
  .default("16:9")
- .describe("Video aspect ratio: '16:9' or '9:16'"),
+ .describe(
+ "Target aspect ratio: '16:9' (landscape) or '9:16' (vertical)."
+ ),
  duration_seconds: z
  .string()
  .optional()
  .default("6")
  .describe(
- "Video duration in seconds. MUST be one of: '4', '6', or '8' (default: '6'). Other values will be rejected by Vertex AI."
+ "Target duration. Vertex AI ONLY supports exactly '4', '6', or '8' seconds. Other values will be rounded to the nearest supported step."
  ),
  resolution: z
  .string()
  .optional()
- .describe("Video resolution: '720p' or '1080p' (default: '720p')"),
+ .describe("Target resolution: '720p' or '1080p'. Default is '720p'."),
  negative_prompt: z
  .string()
  .optional()
- .describe("Text describing what not to include in the video"),
+ .describe(
+ "Visual elements or styles to EXCLUDE from the generated video."
+ ),
  person_generation: z
  .string()
  .optional()
  .describe(
- "Controls generation of people: 'allow_adult' (default for image-to-video) or 'allow_all'"
+ "Policy for generating people: 'allow_adult' (standard) or 'allow_all'. Note: Gemini 1.5+ safety filters apply."
  ),
  reference_images: z
  .array(z.string())
  .optional()
- .describe("Additional image paths for reference (max 3)"),
+ .describe(
+ "Optional: Additional images (up to 3) to guide style or character consistency."
+ ),
  output_path: z
  .string()
  .optional()
  .describe(
- "Output MP4 file path (if multiple predictions, index suffix is added)"
+ "Optional: Local path to save the resulting .mp4 file. Defaults to timestamped filename."
  ),
  project_id: z
  .string()
  .optional()
  .default("mixio-pro")
- .describe("GCP Project ID (default: mixio-pro)"),
+ .describe("GCP Project ID for Vertex billing."),
  location_id: z
  .string()
  .optional()
  .default("us-central1")
- .describe("Vertex region (default: us-central1)"),
+ .describe("GCP region for Vertex AI processing (e.g., 'us-central1')."),
  model_id: z
  .string()
  .optional()
  .default("veo-3.1-fast-generate-001")
- .describe("Model ID (default: veo-3.1-fast-generate-001)"),
+ .describe("Specific Vertex Veo model ID to use."),
  generate_audio: z
  .boolean()
  .optional()
  .describe(
- "Boolean flag to enable generation of audio along with the video"
+ "If true, Vertex will attempt to synthesize synchronized audio for the video."
  )
  .default(false),
+ resume_id: z
+ .string()
+ .optional()
+ .describe(
+ "If provided, the tool will check the status of an existing Vertex operation instead of starting a new one. " +
+ "Use the 'request_id' returned in an 'IN_PROGRESS' response."
+ ),
+ auto_enhance: z
+ .boolean()
+ .optional()
+ .describe(
+ "Whether to automatically enhance the prompt using Veo/LTX guidelines (default: true if enabled via preset or config). Set to false to disable enhancement."
+ ),
+ enhancer_preset: z
+ .string()
+ .optional()
+ .describe(
+ "Optional: Name of a video prompt enhancer preset (e.g., 'veo', 'ltx2', 'cinematic_video'). " +
+ "When using Veo, setting this to 'veo' (or setting auto_enhance=true) will trigger the LLM-based enhancer."
+ ),
  }),
  timeoutMs: 1200000, // 20 minutes
- async execute(args: {
- prompt: string;
- image_path?: string;
- last_frame_path?: string;
- aspect_ratio?: string;
- duration_seconds?: string;
- resolution?: string;
- negative_prompt?: string;
- person_generation?: string;
- reference_images?: string[] | string;
- output_path?: string;
- project_id?: string;
- location_id?: string;
- model_id?: string;
- generate_audio?: boolean;
- }) {
+ async execute(
+ args: {
+ prompt?: string;
+ image_path?: string;
+ last_frame_path?: string;
+ aspect_ratio?: string;
+ duration_seconds?: string;
+ resolution?: string;
+ negative_prompt?: string;
+ person_generation?: string;
+ reference_images?: string[] | string;
+ output_path?: string;
+ project_id?: string;
+ location_id?: string;
+ model_id?: string;
+ generate_audio?: boolean;
+ resume_id?: string;
+ enhancer_preset?: string;
+ auto_enhance?: boolean;
+ },
+ context?: {
+ reportProgress?: (progress: {
+ progress: number;
+ total: number;
+ }) => Promise<void>;
+ streamContent?: (content: {
+ type: "text";
+ text: string;
+ }) => Promise<void>;
+ log?: {
+ info: (msg: string, data?: any) => void;
+ debug: (msg: string, data?: any) => void;
+ };
+ }
+ ) {
  return safeToolExecute(async () => {
  const projectId = args.project_id || "mixio-pro";
  const location = args.location_id || "us-central1";
@@ -165,130 +225,270 @@ export const imageToVideo = {
  ) {
  durationSeconds = 8;
  }
+ // Stream diagnostic info about auth
+ let token: string;
+ try {
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] Authenticating with Google Cloud (project: ${projectId}, location: ${location})...`,
+ });
+ }
+ token = await getGoogleAccessToken();
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] ✓ Authentication successful. Token acquired.`,
+ });
+ }
+ } catch (authError: any) {
+ const errorMsg = authError?.message || String(authError);
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] ✗ Authentication FAILED: ${errorMsg}. Check GOOGLE_APPLICATION_CREDENTIALS or run 'gcloud auth application-default login'.`,
+ });
+ }
+ throw new Error(`Google Cloud authentication failed: ${errorMsg}`);
+ }

- const token = await getGoogleAccessToken();
-
- const url = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:predictLongRunning`;
+ const fetchUrl = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:fetchPredictOperation`;

- let imagePart: any = undefined;
- if (args.image_path) {
- const { data, mimeType } = await fileToBase64(args.image_path);
- imagePart = {
- image: {
- bytesBase64Encoded: data,
- mimeType,
- },
- };
+ // If resuming, reconstruct the full operation path from the UUID
+ let operationName: string | undefined;
+ if (args.resume_id) {
+ // Support both UUID-only and full path formats
+ if (args.resume_id.includes("/")) {
+ operationName = args.resume_id; // Already a full path
+ } else {
+ // Reconstruct full path from UUID
+ operationName = `projects/${projectId}/locations/${location}/publishers/google/models/${modelId}/operations/${args.resume_id}`;
+ }
  }
+ let current: any;

- let lastFramePart: any = undefined;
- if (args.last_frame_path) {
- const { data, mimeType } = await fileToBase64(args.last_frame_path);
- lastFramePart = {
- lastFrame: {
- bytesBase64Encoded: data,
- mimeType,
- },
- };
- }
+ if (!operationName) {
+ if (!args.prompt) {
+ throw new Error("prompt is required when starting a new generation.");
+ }
+
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] Submitting video generation request to Veo model: ${modelId}...`,
+ });
+ }
+
+ const url = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:predictLongRunning`;
+
+ let imagePart: any = undefined;
+ if (args.image_path) {
+ const { data, mimeType } = await fileToBase64(args.image_path);
+ imagePart = {
+ image: {
+ bytesBase64Encoded: data,
+ mimeType,
+ },
+ };
+ }
+
+ let lastFramePart: any = undefined;
+ if (args.last_frame_path) {
+ const { data, mimeType } = await fileToBase64(args.last_frame_path);
+ lastFramePart = {
+ lastFrame: {
+ bytesBase64Encoded: data,
+ mimeType,
+ },
+ };
+ }

- let referenceImages: any[] | undefined = undefined;
- if (args.reference_images) {
- let refImages: string[];
- if (typeof args.reference_images === "string") {
- if (
- args.reference_images.startsWith("[") &&
- args.reference_images.endsWith("]")
- ) {
- try {
- refImages = JSON.parse(args.reference_images);
- } catch {
- throw new Error("Invalid reference_images format");
+ let referenceImages: any[] | undefined = undefined;
+ if (args.reference_images) {
+ let refImages: string[];
+ if (typeof args.reference_images === "string") {
+ if (
+ args.reference_images.startsWith("[") &&
+ args.reference_images.endsWith("]")
+ ) {
+ try {
+ refImages = JSON.parse(args.reference_images);
+ } catch {
+ throw new Error("Invalid reference_images format");
+ }
+ } else {
+ refImages = [args.reference_images];
  }
+ } else if (Array.isArray(args.reference_images)) {
+ refImages = args.reference_images;
  } else {
- refImages = [args.reference_images];
+ throw new Error(
+ "Invalid reference_images: must be array or string"
+ );
+ }
+
+ if (refImages.length > 0) {
+ referenceImages = await Promise.all(
+ refImages.slice(0, 3).map(async (p) => {
+ const { data, mimeType } = await fileToBase64(p);
+ return {
+ image: {
+ bytesBase64Encoded: data,
+ mimeType,
+ },
+ referenceType: "asset",
+ };
+ })
+ );
  }
- } else if (Array.isArray(args.reference_images)) {
- refImages = args.reference_images;
- } else {
- throw new Error("Invalid reference_images: must be array or string");
  }

- if (refImages.length > 0) {
- referenceImages = await Promise.all(
- refImages.slice(0, 3).map(async (p) => {
- const { data, mimeType } = await fileToBase64(p);
- return {
- image: {
- bytesBase64Encoded: data,
- mimeType,
- },
- referenceType: "asset",
- };
- })
- );
+ const personGeneration =
+ args.person_generation ||
+ (args.image_path ? "allow_adult" : "allow_all");
+
+ // Apply prompt enhancement logic
+ let enhancedPrompt = args.prompt;
+ let enhancedNegativePrompt = args.negative_prompt;
+
+ // Determine which preset to use
+ let presetToUse = args.enhancer_preset;
+
+ // If auto_enhance is true and no preset specified, default to 'veo'
+ if (args.auto_enhance === true && !presetToUse) {
+ presetToUse = "veo";
  }
- }

- const personGeneration =
- args.person_generation ||
- (args.image_path ? "allow_adult" : "allow_all");
-
- const instances: any[] = [
- {
- prompt: args.prompt,
- ...(imagePart || {}),
- ...(lastFramePart || {}),
- ...(referenceImages ? { referenceImages } : {}),
- },
- ];
-
- const parameters: any = {
- aspectRatio: args.aspect_ratio || "9:16",
- durationSeconds: durationSeconds,
- resolution: args.resolution || "720p",
- negativePrompt: args.negative_prompt,
- generateAudio: args.generate_audio || false,
- personGeneration,
- };
+ // Disable enhancement if auto_enhance is explicitly false
+ if (args.auto_enhance === false) {
+ presetToUse = undefined;
+ }

- const res = await fetch(url, {
- method: "POST",
- headers: {
- Authorization: `Bearer ${token}`,
- "Content-Type": "application/json",
- },
- body: JSON.stringify({ instances, parameters }),
- });
+ if (presetToUse && args.prompt) {
+ // Use LLM-based enhancement for 'veo' preset
+ if (presetToUse === "veo") {
+ const { enhancePromptWithLLM, isLLMEnhancerAvailable } =
+ await import("../utils/llm-prompt-enhancer");
+
+ if (isLLMEnhancerAvailable()) {
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[VEO] Enhancing prompt with Gemini for optimal Veo 3.1 generation...`,
+ });
+ }

- if (!res.ok) {
- const text = await res.text();
- throw new Error(`Vertex request failed: ${res.status} ${text}`);
+ try {
+ enhancedPrompt = await enhancePromptWithLLM(args.prompt, "veo");
+ context?.log?.info(
+ `LLM-enhanced prompt for Veo: "${args.prompt}" → "${enhancedPrompt}"`
+ );
+
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[VEO] ✓ Prompt enhanced. Length: ${args.prompt.length} → ${enhancedPrompt.length} chars`,
+ });
+ }
+ } catch (err: any) {
+ context?.log?.info(
+ `LLM enhancement failed, using original: ${err.message}`
+ );
+ }
+ } else {
+ context?.log?.info(
+ "GEMINI_API_KEY not set, skipping Veo LLM enhancement"
+ );
+ }
+ } else {
+ // Fall back to static string-based enhancement for other presets
+ const enhancer = resolveEnhancer(presetToUse);
+ if (enhancer.hasTransformations()) {
+ enhancedPrompt = enhancer.enhance(args.prompt);
+ // Apply negative elements if not already set
+ const negatives = enhancer.getNegativeElements();
+ if (negatives && !enhancedNegativePrompt) {
+ enhancedNegativePrompt = negatives;
+ }
+ }
+ }
+ }
+
+ const instances: any[] = [
+ {
+ prompt: enhancedPrompt,
+ ...(imagePart || {}),
+ ...(lastFramePart || {}),
+ ...(referenceImages ? { referenceImages } : {}),
+ },
+ ];
+
+ const parameters: any = {
+ aspectRatio: args.aspect_ratio || "9:16",
+ durationSeconds: durationSeconds,
+ resolution: args.resolution || "720p",
+ negativePrompt: enhancedNegativePrompt,
+ generateAudio: args.generate_audio || false,
+ personGeneration,
+ };
+
+ const res = await fetch(url, {
+ method: "POST",
+ headers: {
+ Authorization: `Bearer ${token}`,
+ "Content-Type": "application/json",
+ },
+ body: JSON.stringify({ instances, parameters }),
+ });
+
+ if (!res.ok) {
+ const text = await res.text();
+ throw new Error(`Vertex request failed: ${res.status} ${text}`);
+ }
+
+ const op = (await res.json()) as any;
+ operationName = op.name || op.operation || "";
+ current = op;
  }

- const op = (await res.json()) as any;
- const name: string = op.name || op.operation || "";
- if (!name) {
+ if (!operationName) {
  throw new Error(
  "Vertex did not return an operation name for long-running request"
  );
  }

- let current = op;
- let done = !!op.done;
- let tries = 0;
+ // Extract just the operation UUID from the full path for a cleaner resume_id
+ // Full path: projects/.../operations/<uuid>
+ const operationUuid = operationName.split("/").pop() || operationName;
+
+ // Stream the resume_id to the LLM immediately (before polling starts)
+ // This way the LLM has it even if MCP client times out during polling
+ if (context?.streamContent) {
+ const isResume = !!args.resume_id;
+ await context.streamContent({
+ type: "text" as const,
+ text: isResume
+ ? `[Vertex] Resuming status check for job: ${operationUuid}`
+ : `[Vertex] Video generation started. resume_id: ${operationUuid} (use this to check status if needed)`,
+ });
+ }
+
+ // Poll for status - keep polling until done
+ // Resume_id was already streamed, so if MCP client times out the LLM still has it
+ let done = current ? !!current.done || !!current.response : false;
+ const startTime = Date.now();
+ const MAX_POLL_TIME = 600000; // 10 minutes - full tool timeout is 20 mins
+
+ while (!done && Date.now() - startTime < MAX_POLL_TIME) {
+ await wait(10000); // 10 second intervals

- // Poll using fetchPredictOperation as per Vertex recommendation
- const fetchUrl = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:fetchPredictOperation`;
- while (!done && tries < 60) {
- await wait(10000);
  const poll = await fetch(fetchUrl, {
  method: "POST",
  headers: {
  Authorization: `Bearer ${token}`,
  "Content-Type": "application/json",
  },
- body: JSON.stringify({ operationName: name }),
+ body: JSON.stringify({ operationName }),
  });
  if (!poll.ok) {
  const text = await poll.text();
@@ -298,7 +498,37 @@ export const imageToVideo = {
  }
  current = (await poll.json()) as any;
  done = !!current.done || !!current.response;
- tries++;
+
+ if (context?.reportProgress) {
+ const elapsed = Date.now() - startTime;
+ const progressPercent = Math.min(
+ Math.round((elapsed / MAX_POLL_TIME) * 100),
+ 99
+ );
+ await context.reportProgress({
+ progress: progressPercent,
+ total: 100,
+ });
+ }
+
+ if (context?.streamContent && !done) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] Still processing... (${Math.round(
+ (Date.now() - startTime) / 1000
+ )}s elapsed)`,
+ });
+ }
+ }
+
+ if (!done) {
+ return JSON.stringify({
+ status: "IN_PROGRESS",
+ request_id: operationName,
+ resume_id: operationName,
+ message:
+ "Still in progress. Call this tool again with resume_id to continue checking.",
+ });
  }

  const resp = current.response || current;
@@ -356,7 +586,7 @@ export const imageToVideo = {
  const tail50 = jsonStr
  ? jsonStr.slice(Math.max(0, jsonStr.length - 50))
  : "";
- return `Vertex operation done but no videos array present. operationName=${name}. json_head150=${head150} json_tail50=${tail50}`;
+ return `Vertex operation done but no videos array present. operationName=${operationName}. json_head150=${head150} json_tail50=${tail50}`;
  }, "imageToVideo");
  },
  };
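
Note: the new pause/resume contract above (an IN_PROGRESS payload carrying a resume_id, to be passed back on the next call) can be driven from the calling side roughly as follows. This is a minimal sketch, not part of the package: callTool stands in for whatever MCP client invocation API you use, and the argument values are illustrative.

// Hypothetical caller-side helper for the generateVideoi2v resume loop.
// `callTool(name, args)` is assumed to invoke the MCP tool and return its text result.
async function generateVideoWithResume(
  callTool: (name: string, args: Record<string, unknown>) => Promise<string>
): Promise<string> {
  // Start a new generation (prompt is required on the first call).
  let result = await callTool("generateVideoi2v", {
    prompt: "A robot walking through a neon city at night",
    image_path: "/tmp/start-frame.png", // illustrative path
    duration_seconds: "6",
    auto_enhance: true,
  });

  // If the tool could not finish within its polling window, it returns an
  // IN_PROGRESS JSON envelope with a resume_id; call again with that id.
  for (;;) {
    let parsed: { status?: string; resume_id?: string } = {};
    try {
      parsed = JSON.parse(result);
    } catch {
      return result; // terminal result that is not the IN_PROGRESS envelope
    }
    if (parsed.status !== "IN_PROGRESS" || !parsed.resume_id) {
      return result; // generation finished (or returned a different payload)
    }
    result = await callTool("generateVideoi2v", { resume_id: parsed.resume_id });
  }
}

Because the tool also streams the resume_id before polling begins, a caller that times out mid-poll can still recover the job by re-invoking the tool with only resume_id set.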