@mixio-pro/kalaasetu-mcp 1.2.2 → 2.0.2-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,10 @@ import { z } from "zod";
  import { getStorage } from "../storage";
  import { generateTimestampedFilename } from "../utils/filename";
  import { safeToolExecute } from "../utils/tool-wrapper";
+ import {
+ resolveEnhancer,
+ listVideoEnhancerPresets,
+ } from "../utils/prompt-enhancer-presets";

  import { getGoogleAccessToken } from "../utils/google-auth";

@@ -41,15 +45,17 @@ async function fileToBase64(
  export const imageToVideo = {
  name: "generateVideoi2v",
  description:
- "Generate professional-quality cinematic videos from a starting image and text prompt using Vertex AI's Veo models. " +
- "This is a high-latency tool (often takes 5-15 minutes) but produces state-of-the-art results. " +
- "It supports guided generation with start/end frames and specific durations. " +
+ "Generate professional-quality cinematic videos from a starting image and text prompt using Google's Vertex AI Veo models. " +
+ "This tool follows a 'Synchronous Facade' pattern: it handles polling internally but can be paused/resumed. " +
+ "If the generation takes too long, it returns a 'resume_id' that you MUST use to call this tool again to pick up progress. " +
+ "It produces state-of-the-art cinematic results. " +
  "ONLY USE WHEN WORKING WITH GOOGLE VERTEX AI MODELS.",
  parameters: z.object({
  prompt: z
  .string()
+ .optional()
  .describe(
- "Descriptive text for the video action and style (e.g., 'A robot walking through a neon city at night')."
+ "Required for new requests. Descriptive text for the video action and style (e.g., 'A robot walking through a neon city at night')."
  ),
  image_path: z
  .string()
@@ -125,24 +131,63 @@ export const imageToVideo = {
  "If true, Vertex will attempt to synthesize synchronized audio for the video."
  )
  .default(false),
+ resume_id: z
+ .string()
+ .optional()
+ .describe(
+ "If provided, the tool will check the status of an existing Vertex operation instead of starting a new one. " +
+ "Use the 'request_id' returned in an 'IN_PROGRESS' response."
+ ),
+ auto_enhance: z
+ .boolean()
+ .optional()
+ .describe(
+ "Whether to automatically enhance the prompt using Veo/LTX guidelines (default: true if enabled via preset or config). Set to false to disable enhancement."
+ ),
+ enhancer_preset: z
+ .string()
+ .optional()
+ .describe(
+ "Optional: Name of a video prompt enhancer preset (e.g., 'veo', 'ltx2', 'cinematic_video'). " +
+ "When using Veo, setting this to 'veo' (or setting auto_enhance=true) will trigger the LLM-based enhancer."
+ ),
  }),
- timeoutMs: 1200000, // 20 minutes
- async execute(args: {
- prompt: string;
- image_path?: string;
- last_frame_path?: string;
- aspect_ratio?: string;
- duration_seconds?: string;
- resolution?: string;
- negative_prompt?: string;
- person_generation?: string;
- reference_images?: string[] | string;
- output_path?: string;
- project_id?: string;
- location_id?: string;
- model_id?: string;
- generate_audio?: boolean;
- }) {
+ timeoutMs: 90000, // 90 seconds MCP timeout (internal timeout is 60s)
+ async execute(
+ args: {
+ prompt?: string;
+ image_path?: string;
+ last_frame_path?: string;
+ aspect_ratio?: string;
+ duration_seconds?: string;
+ resolution?: string;
+ negative_prompt?: string;
+ person_generation?: string;
+ reference_images?: string[] | string;
+ output_path?: string;
+ project_id?: string;
+ location_id?: string;
+ model_id?: string;
+ generate_audio?: boolean;
+ resume_id?: string;
+ enhancer_preset?: string;
+ auto_enhance?: boolean;
+ },
+ context?: {
+ reportProgress?: (progress: {
+ progress: number;
+ total: number;
+ }) => Promise<void>;
+ streamContent?: (content: {
+ type: "text";
+ text: string;
+ }) => Promise<void>;
+ log?: {
+ info: (msg: string, data?: any) => void;
+ debug: (msg: string, data?: any) => void;
+ };
+ }
+ ) {
  return safeToolExecute(async () => {
  const projectId = args.project_id || "mixio-pro";
  const location = args.location_id || "us-central1";
@@ -180,130 +225,270 @@ export const imageToVideo = {
  ) {
  durationSeconds = 8;
  }
+ // Stream diagnostic info about auth
+ let token: string;
+ try {
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] Authenticating with Google Cloud (project: ${projectId}, location: ${location})...`,
+ });
+ }
+ token = await getGoogleAccessToken();
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] ✓ Authentication successful. Token acquired.`,
+ });
+ }
+ } catch (authError: any) {
+ const errorMsg = authError?.message || String(authError);
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] ✗ Authentication FAILED: ${errorMsg}. Check GOOGLE_APPLICATION_CREDENTIALS or run 'gcloud auth application-default login'.`,
+ });
+ }
+ throw new Error(`Google Cloud authentication failed: ${errorMsg}`);
+ }

- const token = await getGoogleAccessToken();
-
- const url = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:predictLongRunning`;
+ const fetchUrl = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:fetchPredictOperation`;

- let imagePart: any = undefined;
- if (args.image_path) {
- const { data, mimeType } = await fileToBase64(args.image_path);
- imagePart = {
- image: {
- bytesBase64Encoded: data,
- mimeType,
- },
- };
+ // If resuming, reconstruct the full operation path from the UUID
+ let operationName: string | undefined;
+ if (args.resume_id) {
+ // Support both UUID-only and full path formats
+ if (args.resume_id.includes("/")) {
+ operationName = args.resume_id; // Already a full path
+ } else {
+ // Reconstruct full path from UUID
+ operationName = `projects/${projectId}/locations/${location}/publishers/google/models/${modelId}/operations/${args.resume_id}`;
+ }
  }
+ let current: any;

- let lastFramePart: any = undefined;
- if (args.last_frame_path) {
- const { data, mimeType } = await fileToBase64(args.last_frame_path);
- lastFramePart = {
- lastFrame: {
- bytesBase64Encoded: data,
- mimeType,
- },
- };
- }
+ if (!operationName) {
+ if (!args.prompt) {
+ throw new Error("prompt is required when starting a new generation.");
+ }
+
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] Submitting video generation request to Veo model: ${modelId}...`,
+ });
+ }

- let referenceImages: any[] | undefined = undefined;
- if (args.reference_images) {
- let refImages: string[];
- if (typeof args.reference_images === "string") {
- if (
- args.reference_images.startsWith("[") &&
- args.reference_images.endsWith("]")
- ) {
- try {
- refImages = JSON.parse(args.reference_images);
- } catch {
- throw new Error("Invalid reference_images format");
+ const url = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:predictLongRunning`;
+
+ let imagePart: any = undefined;
+ if (args.image_path) {
+ const { data, mimeType } = await fileToBase64(args.image_path);
+ imagePart = {
+ image: {
+ bytesBase64Encoded: data,
+ mimeType,
+ },
+ };
+ }
+
+ let lastFramePart: any = undefined;
+ if (args.last_frame_path) {
+ const { data, mimeType } = await fileToBase64(args.last_frame_path);
+ lastFramePart = {
+ lastFrame: {
+ bytesBase64Encoded: data,
+ mimeType,
+ },
+ };
+ }
+
+ let referenceImages: any[] | undefined = undefined;
+ if (args.reference_images) {
+ let refImages: string[];
+ if (typeof args.reference_images === "string") {
+ if (
+ args.reference_images.startsWith("[") &&
+ args.reference_images.endsWith("]")
+ ) {
+ try {
+ refImages = JSON.parse(args.reference_images);
+ } catch {
+ throw new Error("Invalid reference_images format");
+ }
+ } else {
+ refImages = [args.reference_images];
  }
+ } else if (Array.isArray(args.reference_images)) {
+ refImages = args.reference_images;
  } else {
- refImages = [args.reference_images];
+ throw new Error(
+ "Invalid reference_images: must be array or string"
+ );
+ }
+
+ if (refImages.length > 0) {
+ referenceImages = await Promise.all(
+ refImages.slice(0, 3).map(async (p) => {
+ const { data, mimeType } = await fileToBase64(p);
+ return {
+ image: {
+ bytesBase64Encoded: data,
+ mimeType,
+ },
+ referenceType: "asset",
+ };
+ })
+ );
  }
- } else if (Array.isArray(args.reference_images)) {
- refImages = args.reference_images;
- } else {
- throw new Error("Invalid reference_images: must be array or string");
  }

- if (refImages.length > 0) {
- referenceImages = await Promise.all(
- refImages.slice(0, 3).map(async (p) => {
- const { data, mimeType } = await fileToBase64(p);
- return {
- image: {
- bytesBase64Encoded: data,
- mimeType,
- },
- referenceType: "asset",
- };
- })
- );
+ const personGeneration =
+ args.person_generation ||
+ (args.image_path ? "allow_adult" : "allow_all");
+
+ // Apply prompt enhancement logic
+ let enhancedPrompt = args.prompt;
+ let enhancedNegativePrompt = args.negative_prompt;
+
+ // Determine which preset to use
+ let presetToUse = args.enhancer_preset;
+
+ // If auto_enhance is true and no preset specified, default to 'veo'
+ if (args.auto_enhance === true && !presetToUse) {
+ presetToUse = "veo";
  }
- }

- const personGeneration =
- args.person_generation ||
- (args.image_path ? "allow_adult" : "allow_all");
-
- const instances: any[] = [
- {
- prompt: args.prompt,
- ...(imagePart || {}),
- ...(lastFramePart || {}),
- ...(referenceImages ? { referenceImages } : {}),
- },
- ];
-
- const parameters: any = {
- aspectRatio: args.aspect_ratio || "9:16",
- durationSeconds: durationSeconds,
- resolution: args.resolution || "720p",
- negativePrompt: args.negative_prompt,
- generateAudio: args.generate_audio || false,
- personGeneration,
- };
+ // Disable enhancement if auto_enhance is explicitly false
+ if (args.auto_enhance === false) {
+ presetToUse = undefined;
+ }

- const res = await fetch(url, {
- method: "POST",
- headers: {
- Authorization: `Bearer ${token}`,
- "Content-Type": "application/json",
- },
- body: JSON.stringify({ instances, parameters }),
- });
+ if (presetToUse && args.prompt) {
+ // Use LLM-based enhancement for 'veo' preset
+ if (presetToUse === "veo") {
+ const { enhancePromptWithLLM, isLLMEnhancerAvailable } =
+ await import("../utils/llm-prompt-enhancer");
+
+ if (isLLMEnhancerAvailable()) {
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[VEO] Enhancing prompt with Gemini for optimal Veo 3.1 generation...`,
+ });
+ }
+
+ try {
+ enhancedPrompt = await enhancePromptWithLLM(args.prompt, "veo");
+ context?.log?.info(
+ `LLM-enhanced prompt for Veo: "${args.prompt}" → "${enhancedPrompt}"`
+ );
+
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[VEO] ✓ Prompt enhanced. Length: ${args.prompt.length} → ${enhancedPrompt.length} chars`,
+ });
+ }
+ } catch (err: any) {
+ context?.log?.info(
+ `LLM enhancement failed, using original: ${err.message}`
+ );
+ }
+ } else {
+ context?.log?.info(
+ "GEMINI_API_KEY not set, skipping Veo LLM enhancement"
+ );
+ }
+ } else {
+ // Fall back to static string-based enhancement for other presets
+ const enhancer = resolveEnhancer(presetToUse);
+ if (enhancer.hasTransformations()) {
+ enhancedPrompt = enhancer.enhance(args.prompt);
+ // Apply negative elements if not already set
+ const negatives = enhancer.getNegativeElements();
+ if (negatives && !enhancedNegativePrompt) {
+ enhancedNegativePrompt = negatives;
+ }
+ }
+ }
+ }
+
+ const instances: any[] = [
+ {
+ prompt: enhancedPrompt,
+ ...(imagePart || {}),
+ ...(lastFramePart || {}),
+ ...(referenceImages ? { referenceImages } : {}),
+ },
+ ];
+
+ const parameters: any = {
+ aspectRatio: args.aspect_ratio || "9:16",
+ durationSeconds: durationSeconds,
+ resolution: args.resolution || "720p",
+ negativePrompt: enhancedNegativePrompt,
+ generateAudio: args.generate_audio || false,
+ personGeneration,
+ };
+
+ const res = await fetch(url, {
+ method: "POST",
+ headers: {
+ Authorization: `Bearer ${token}`,
+ "Content-Type": "application/json",
+ },
+ body: JSON.stringify({ instances, parameters }),
+ });
+
+ if (!res.ok) {
+ const text = await res.text();
+ throw new Error(`Vertex request failed: ${res.status} ${text}`);
+ }

- if (!res.ok) {
- const text = await res.text();
- throw new Error(`Vertex request failed: ${res.status} ${text}`);
+ const op = (await res.json()) as any;
+ operationName = op.name || op.operation || "";
+ current = op;
  }

- const op = (await res.json()) as any;
- const name: string = op.name || op.operation || "";
- if (!name) {
+ if (!operationName) {
  throw new Error(
  "Vertex did not return an operation name for long-running request"
  );
  }

- let current = op;
- let done = !!op.done;
- let tries = 0;
+ // Extract just the operation UUID from the full path for a cleaner resume_id
+ // Full path: projects/.../operations/<uuid>
+ const operationUuid = operationName.split("/").pop() || operationName;
+
+ // Stream the resume_id to the LLM immediately (before polling starts)
+ // This way the LLM has it even if MCP client times out during polling
+ if (context?.streamContent) {
+ const isResume = !!args.resume_id;
+ await context.streamContent({
+ type: "text" as const,
+ text: isResume
+ ? `[Vertex] Resuming status check for job: ${operationUuid}`
+ : `[Vertex] Video generation started. resume_id: ${operationUuid} (use this to check status if needed)`,
+ });
+ }
+
+ // Poll for status - keep polling until done
+ // Resume_id was already streamed, so if MCP client times out the LLM still has it
+ let done = current ? !!current.done || !!current.response : false;
+ const startTime = Date.now();
+ const MAX_POLL_TIME = 60000; // 60 seconds internal timeout - then return resume_id
+
+ while (!done && Date.now() - startTime < MAX_POLL_TIME) {
+ await wait(10000); // 10 second intervals

- // Poll using fetchPredictOperation as per Vertex recommendation
- const fetchUrl = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:fetchPredictOperation`;
- while (!done && tries < 60) {
- await wait(10000);
  const poll = await fetch(fetchUrl, {
  method: "POST",
  headers: {
  Authorization: `Bearer ${token}`,
  "Content-Type": "application/json",
  },
- body: JSON.stringify({ operationName: name }),
+ body: JSON.stringify({ operationName }),
  });
  if (!poll.ok) {
  const text = await poll.text();
@@ -313,7 +498,37 @@ export const imageToVideo = {
  }
  current = (await poll.json()) as any;
  done = !!current.done || !!current.response;
- tries++;
+
+ if (context?.reportProgress) {
+ const elapsed = Date.now() - startTime;
+ const progressPercent = Math.min(
+ Math.round((elapsed / MAX_POLL_TIME) * 100),
+ 99
+ );
+ await context.reportProgress({
+ progress: progressPercent,
+ total: 100,
+ });
+ }
+
+ if (context?.streamContent && !done) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] Still processing... (${Math.round(
+ (Date.now() - startTime) / 1000
+ )}s elapsed)`,
+ });
+ }
+ }
+
+ if (!done) {
+ return JSON.stringify({
+ status: "IN_PROGRESS",
+ request_id: operationName,
+ resume_id: operationName,
+ message:
+ "Still in progress. Call this tool again with resume_id to continue checking.",
+ });
  }

  const resp = current.response || current;
@@ -371,7 +586,7 @@ export const imageToVideo = {
  const tail50 = jsonStr
  ? jsonStr.slice(Math.max(0, jsonStr.length - 50))
  : "";
- return `Vertex operation done but no videos array present. operationName=${name}. json_head150=${head150} json_tail50=${tail50}`;
+ return `Vertex operation done but no videos array present. operationName=${operationName}. json_head150=${head150} json_tail50=${tail50}`;
  }, "imageToVideo");
  },
  };
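
The new tool description calls this a "Synchronous Facade": the tool polls Vertex internally for up to ~60 seconds, and if the operation is still running it returns an `IN_PROGRESS` JSON payload with a `resume_id` the caller must pass back on the next invocation. Below is a minimal caller-side sketch of that loop in TypeScript. It is not part of the package: `callTool` is a hypothetical stand-in for however your MCP client invokes tools, and only the `IN_PROGRESS`/`resume_id` contract comes from the diff above.

```ts
// Hypothetical signature for invoking an MCP tool; adapt to your client.
type CallTool = (name: string, args: Record<string, unknown>) => Promise<string>;

// Drive generateVideoi2v until it finishes, re-calling with resume_id
// whenever the tool returns an IN_PROGRESS payload.
async function generateWithResume(
  callTool: CallTool,
  initialArgs: { prompt: string; image_path?: string }
): Promise<string> {
  let args: Record<string, unknown> = initialArgs;
  for (;;) {
    const raw = await callTool("generateVideoi2v", args);
    let parsed: { status?: string; resume_id?: string } | undefined;
    try {
      parsed = JSON.parse(raw);
    } catch {
      // Non-JSON results (e.g. a final success message) end the loop.
      return raw;
    }
    if (parsed?.status === "IN_PROGRESS" && parsed.resume_id) {
      // The tool already polled internally for up to ~60s; calling again
      // with resume_id resumes the same Vertex operation instead of
      // starting a new generation.
      args = { resume_id: parsed.resume_id };
      continue;
    }
    return raw;
  }
}
```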