@mixio-pro/kalaasetu-mcp 1.2.1 → 2.0.1-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,10 @@ import { z } from "zod";
  import { getStorage } from "../storage";
  import { generateTimestampedFilename } from "../utils/filename";
  import { safeToolExecute } from "../utils/tool-wrapper";
+ import {
+ resolveEnhancer,
+ listVideoEnhancerPresets,
+ } from "../utils/prompt-enhancer-presets";

  import { getGoogleAccessToken } from "../utils/google-auth";

@@ -41,93 +45,149 @@ async function fileToBase64(
  export const imageToVideo = {
  name: "generateVideoi2v",
  description:
- "Generate videos from an image as starting first frame using Vertex Veo models (predictLongRunning + fetchPredictOperation).",
+ "Generate professional-quality cinematic videos from a starting image and text prompt using Google's Vertex AI Veo models. " +
+ "This tool follows a 'Synchronous Facade' pattern: it handles polling internally but can be paused/resumed. " +
+ "If the generation takes too long, it returns a 'resume_id' that you MUST use to call this tool again to pick up progress. " +
+ "It produces state-of-the-art cinematic results. " +
+ "ONLY USE WHEN WORKING WITH GOOGLE VERTEX AI MODELS.",
  parameters: z.object({
- prompt: z.string().describe("Text description for the video"),
+ prompt: z
+ .string()
+ .optional()
+ .describe(
+ "Required for new requests. Descriptive text for the video action and style (e.g., 'A robot walking through a neon city at night')."
+ ),
  image_path: z
  .string()
  .optional()
- .describe("Path to source image for image-to-video generation"),
+ .describe("Absolute local path or URL to the STARTING image frame."),
  last_frame_path: z
  .string()
  .optional()
- .describe("Path to last frame image to guide ending frame (optional)"),
+ .describe(
+ "Optional: Absolute local path or URL to the ENDING image frame to guide the video's conclusion."
+ ),
  aspect_ratio: z
  .string()
  .optional()
  .default("16:9")
- .describe("Video aspect ratio: '16:9' or '9:16'"),
+ .describe(
+ "Target aspect ratio: '16:9' (landscape) or '9:16' (vertical)."
+ ),
  duration_seconds: z
  .string()
  .optional()
  .default("6")
  .describe(
- "Video duration in seconds. MUST be one of: '4', '6', or '8' (default: '6'). Other values will be rejected by Vertex AI."
+ "Target duration. Vertex AI ONLY supports exactly '4', '6', or '8' seconds. Other values will be rounded to the nearest supported step."
  ),
  resolution: z
  .string()
  .optional()
- .describe("Video resolution: '720p' or '1080p' (default: '720p')"),
+ .describe("Target resolution: '720p' or '1080p'. Default is '720p'."),
  negative_prompt: z
  .string()
  .optional()
- .describe("Text describing what not to include in the video"),
+ .describe(
+ "Visual elements or styles to EXCLUDE from the generated video."
+ ),
  person_generation: z
  .string()
  .optional()
  .describe(
- "Controls generation of people: 'allow_adult' (default for image-to-video) or 'allow_all'"
+ "Policy for generating people: 'allow_adult' (standard) or 'allow_all'. Note: Gemini 1.5+ safety filters apply."
  ),
  reference_images: z
  .array(z.string())
  .optional()
- .describe("Additional image paths for reference (max 3)"),
+ .describe(
+ "Optional: Additional images (up to 3) to guide style or character consistency."
+ ),
  output_path: z
  .string()
  .optional()
  .describe(
- "Output MP4 file path (if multiple predictions, index suffix is added)"
+ "Optional: Local path to save the resulting .mp4 file. Defaults to timestamped filename."
  ),
  project_id: z
  .string()
  .optional()
  .default("mixio-pro")
- .describe("GCP Project ID (default: mixio-pro)"),
+ .describe("GCP Project ID for Vertex billing."),
  location_id: z
  .string()
  .optional()
  .default("us-central1")
- .describe("Vertex region (default: us-central1)"),
+ .describe("GCP region for Vertex AI processing (e.g., 'us-central1')."),
  model_id: z
  .string()
  .optional()
  .default("veo-3.1-fast-generate-001")
- .describe("Model ID (default: veo-3.1-fast-generate-001)"),
+ .describe("Specific Vertex Veo model ID to use."),
  generate_audio: z
  .boolean()
  .optional()
  .describe(
- "Boolean flag to enable generation of audio along with the video"
+ "If true, Vertex will attempt to synthesize synchronized audio for the video."
  )
  .default(false),
+ resume_id: z
+ .string()
+ .optional()
+ .describe(
+ "If provided, the tool will check the status of an existing Vertex operation instead of starting a new one. " +
+ "Use the 'request_id' returned in an 'IN_PROGRESS' response."
+ ),
+ auto_enhance: z
+ .boolean()
+ .optional()
+ .describe(
+ "Whether to automatically enhance the prompt using Veo/LTX guidelines (default: true if enabled via preset or config). Set to false to disable enhancement."
+ ),
+ enhancer_preset: z
+ .string()
+ .optional()
+ .describe(
+ "Optional: Name of a video prompt enhancer preset (e.g., 'veo', 'ltx2', 'cinematic_video'). " +
+ "When using Veo, setting this to 'veo' (or setting auto_enhance=true) will trigger the LLM-based enhancer."
+ ),
  }),
  timeoutMs: 1200000, // 20 minutes
- async execute(args: {
- prompt: string;
- image_path?: string;
- last_frame_path?: string;
- aspect_ratio?: string;
- duration_seconds?: string;
- resolution?: string;
- negative_prompt?: string;
- person_generation?: string;
- reference_images?: string[] | string;
- output_path?: string;
- project_id?: string;
- location_id?: string;
- model_id?: string;
- generate_audio?: boolean;
- }) {
+ async execute(
+ args: {
+ prompt?: string;
+ image_path?: string;
+ last_frame_path?: string;
+ aspect_ratio?: string;
+ duration_seconds?: string;
+ resolution?: string;
+ negative_prompt?: string;
+ person_generation?: string;
+ reference_images?: string[] | string;
+ output_path?: string;
+ project_id?: string;
+ location_id?: string;
+ model_id?: string;
+ generate_audio?: boolean;
+ resume_id?: string;
+ enhancer_preset?: string;
+ auto_enhance?: boolean;
+ },
+ context?: {
+ reportProgress?: (progress: {
+ progress: number;
+ total: number;
+ }) => Promise<void>;
+ streamContent?: (content: {
+ type: "text";
+ text: string;
+ }) => Promise<void>;
+ log?: {
+ info: (msg: string, data?: any) => void;
+ debug: (msg: string, data?: any) => void;
+ };
+ }
+ ) {
  return safeToolExecute(async () => {
  const projectId = args.project_id || "mixio-pro";
  const location = args.location_id || "us-central1";
@@ -165,130 +225,270 @@ export const imageToVideo = {
  ) {
  durationSeconds = 8;
  }
+ // Stream diagnostic info about auth
+ let token: string;
+ try {
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] Authenticating with Google Cloud (project: ${projectId}, location: ${location})...`,
+ });
+ }
+ token = await getGoogleAccessToken();
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] ✓ Authentication successful. Token acquired.`,
+ });
+ }
+ } catch (authError: any) {
+ const errorMsg = authError?.message || String(authError);
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] ✗ Authentication FAILED: ${errorMsg}. Check GOOGLE_APPLICATION_CREDENTIALS or run 'gcloud auth application-default login'.`,
+ });
+ }
+ throw new Error(`Google Cloud authentication failed: ${errorMsg}`);
+ }

- const token = await getGoogleAccessToken();
-
- const url = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:predictLongRunning`;
+ const fetchUrl = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:fetchPredictOperation`;

- let imagePart: any = undefined;
- if (args.image_path) {
- const { data, mimeType } = await fileToBase64(args.image_path);
- imagePart = {
- image: {
- bytesBase64Encoded: data,
- mimeType,
- },
- };
+ // If resuming, reconstruct the full operation path from the UUID
+ let operationName: string | undefined;
+ if (args.resume_id) {
+ // Support both UUID-only and full path formats
+ if (args.resume_id.includes("/")) {
+ operationName = args.resume_id; // Already a full path
+ } else {
+ // Reconstruct full path from UUID
+ operationName = `projects/${projectId}/locations/${location}/publishers/google/models/${modelId}/operations/${args.resume_id}`;
+ }
  }
+ let current: any;

- let lastFramePart: any = undefined;
- if (args.last_frame_path) {
- const { data, mimeType } = await fileToBase64(args.last_frame_path);
- lastFramePart = {
- lastFrame: {
- bytesBase64Encoded: data,
- mimeType,
- },
- };
- }
+ if (!operationName) {
+ if (!args.prompt) {
+ throw new Error("prompt is required when starting a new generation.");
+ }
+
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] Submitting video generation request to Veo model: ${modelId}...`,
+ });
+ }
+
+ const url = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:predictLongRunning`;
+
+ let imagePart: any = undefined;
+ if (args.image_path) {
+ const { data, mimeType } = await fileToBase64(args.image_path);
+ imagePart = {
+ image: {
+ bytesBase64Encoded: data,
+ mimeType,
+ },
+ };
+ }
+
+ let lastFramePart: any = undefined;
+ if (args.last_frame_path) {
+ const { data, mimeType } = await fileToBase64(args.last_frame_path);
+ lastFramePart = {
+ lastFrame: {
+ bytesBase64Encoded: data,
+ mimeType,
+ },
+ };
+ }

- let referenceImages: any[] | undefined = undefined;
- if (args.reference_images) {
- let refImages: string[];
- if (typeof args.reference_images === "string") {
- if (
- args.reference_images.startsWith("[") &&
- args.reference_images.endsWith("]")
- ) {
- try {
- refImages = JSON.parse(args.reference_images);
- } catch {
- throw new Error("Invalid reference_images format");
+ let referenceImages: any[] | undefined = undefined;
+ if (args.reference_images) {
+ let refImages: string[];
+ if (typeof args.reference_images === "string") {
+ if (
+ args.reference_images.startsWith("[") &&
+ args.reference_images.endsWith("]")
+ ) {
+ try {
+ refImages = JSON.parse(args.reference_images);
+ } catch {
+ throw new Error("Invalid reference_images format");
+ }
+ } else {
+ refImages = [args.reference_images];
  }
+ } else if (Array.isArray(args.reference_images)) {
+ refImages = args.reference_images;
  } else {
- refImages = [args.reference_images];
+ throw new Error(
+ "Invalid reference_images: must be array or string"
+ );
+ }
+
+ if (refImages.length > 0) {
+ referenceImages = await Promise.all(
+ refImages.slice(0, 3).map(async (p) => {
+ const { data, mimeType } = await fileToBase64(p);
+ return {
+ image: {
+ bytesBase64Encoded: data,
+ mimeType,
+ },
+ referenceType: "asset",
+ };
+ })
+ );
  }
- } else if (Array.isArray(args.reference_images)) {
- refImages = args.reference_images;
- } else {
- throw new Error("Invalid reference_images: must be array or string");
  }

- if (refImages.length > 0) {
- referenceImages = await Promise.all(
- refImages.slice(0, 3).map(async (p) => {
- const { data, mimeType } = await fileToBase64(p);
- return {
- image: {
- bytesBase64Encoded: data,
- mimeType,
- },
- referenceType: "asset",
- };
- })
- );
+ const personGeneration =
+ args.person_generation ||
+ (args.image_path ? "allow_adult" : "allow_all");
+
+ // Apply prompt enhancement logic
+ let enhancedPrompt = args.prompt;
+ let enhancedNegativePrompt = args.negative_prompt;
+
+ // Determine which preset to use
+ let presetToUse = args.enhancer_preset;
+
+ // If auto_enhance is true and no preset specified, default to 'veo'
+ if (args.auto_enhance === true && !presetToUse) {
+ presetToUse = "veo";
  }
- }

- const personGeneration =
- args.person_generation ||
- (args.image_path ? "allow_adult" : "allow_all");
-
- const instances: any[] = [
- {
- prompt: args.prompt,
- ...(imagePart || {}),
- ...(lastFramePart || {}),
- ...(referenceImages ? { referenceImages } : {}),
- },
- ];
-
- const parameters: any = {
- aspectRatio: args.aspect_ratio || "9:16",
- durationSeconds: durationSeconds,
- resolution: args.resolution || "720p",
- negativePrompt: args.negative_prompt,
- generateAudio: args.generate_audio || false,
- personGeneration,
- };
+ // Disable enhancement if auto_enhance is explicitly false
+ if (args.auto_enhance === false) {
+ presetToUse = undefined;
+ }

- const res = await fetch(url, {
- method: "POST",
- headers: {
- Authorization: `Bearer ${token}`,
- "Content-Type": "application/json",
- },
- body: JSON.stringify({ instances, parameters }),
- });
+ if (presetToUse && args.prompt) {
+ // Use LLM-based enhancement for 'veo' preset
+ if (presetToUse === "veo") {
+ const { enhancePromptWithLLM, isLLMEnhancerAvailable } =
+ await import("../utils/llm-prompt-enhancer");
+
+ if (isLLMEnhancerAvailable()) {
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[VEO] Enhancing prompt with Gemini for optimal Veo 3.1 generation...`,
+ });
+ }

- if (!res.ok) {
- const text = await res.text();
- throw new Error(`Vertex request failed: ${res.status} ${text}`);
+ try {
+ enhancedPrompt = await enhancePromptWithLLM(args.prompt, "veo");
+ context?.log?.info(
+ `LLM-enhanced prompt for Veo: "${args.prompt}" → "${enhancedPrompt}"`
+ );
+
+ if (context?.streamContent) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[VEO] ✓ Prompt enhanced. Length: ${args.prompt.length} → ${enhancedPrompt.length} chars`,
+ });
+ }
+ } catch (err: any) {
+ context?.log?.info(
+ `LLM enhancement failed, using original: ${err.message}`
+ );
+ }
+ } else {
+ context?.log?.info(
+ "GEMINI_API_KEY not set, skipping Veo LLM enhancement"
+ );
+ }
+ } else {
+ // Fall back to static string-based enhancement for other presets
+ const enhancer = resolveEnhancer(presetToUse);
+ if (enhancer.hasTransformations()) {
+ enhancedPrompt = enhancer.enhance(args.prompt);
+ // Apply negative elements if not already set
+ const negatives = enhancer.getNegativeElements();
+ if (negatives && !enhancedNegativePrompt) {
+ enhancedNegativePrompt = negatives;
+ }
+ }
+ }
+ }
+
+ const instances: any[] = [
+ {
+ prompt: enhancedPrompt,
+ ...(imagePart || {}),
+ ...(lastFramePart || {}),
+ ...(referenceImages ? { referenceImages } : {}),
+ },
+ ];
+
+ const parameters: any = {
+ aspectRatio: args.aspect_ratio || "9:16",
+ durationSeconds: durationSeconds,
+ resolution: args.resolution || "720p",
+ negativePrompt: enhancedNegativePrompt,
+ generateAudio: args.generate_audio || false,
+ personGeneration,
+ };
+
+ const res = await fetch(url, {
+ method: "POST",
+ headers: {
+ Authorization: `Bearer ${token}`,
+ "Content-Type": "application/json",
+ },
+ body: JSON.stringify({ instances, parameters }),
+ });
+
+ if (!res.ok) {
+ const text = await res.text();
+ throw new Error(`Vertex request failed: ${res.status} ${text}`);
+ }
+
+ const op = (await res.json()) as any;
+ operationName = op.name || op.operation || "";
+ current = op;
  }

- const op = (await res.json()) as any;
- const name: string = op.name || op.operation || "";
- if (!name) {
+ if (!operationName) {
  throw new Error(
  "Vertex did not return an operation name for long-running request"
  );
  }

- let current = op;
- let done = !!op.done;
- let tries = 0;
+ // Extract just the operation UUID from the full path for a cleaner resume_id
+ // Full path: projects/.../operations/<uuid>
+ const operationUuid = operationName.split("/").pop() || operationName;
+
+ // Stream the resume_id to the LLM immediately (before polling starts)
+ // This way the LLM has it even if MCP client times out during polling
+ if (context?.streamContent) {
+ const isResume = !!args.resume_id;
+ await context.streamContent({
+ type: "text" as const,
+ text: isResume
+ ? `[Vertex] Resuming status check for job: ${operationUuid}`
+ : `[Vertex] Video generation started. resume_id: ${operationUuid} (use this to check status if needed)`,
+ });
+ }
+
+ // Poll for status - keep polling until done
+ // Resume_id was already streamed, so if MCP client times out the LLM still has it
+ let done = current ? !!current.done || !!current.response : false;
+ const startTime = Date.now();
+ const MAX_POLL_TIME = 600000; // 10 minutes - full tool timeout is 20 mins
+
+ while (!done && Date.now() - startTime < MAX_POLL_TIME) {
+ await wait(10000); // 10 second intervals

- // Poll using fetchPredictOperation as per Vertex recommendation
- const fetchUrl = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:fetchPredictOperation`;
- while (!done && tries < 60) {
- await wait(10000);
  const poll = await fetch(fetchUrl, {
  method: "POST",
  headers: {
  Authorization: `Bearer ${token}`,
  "Content-Type": "application/json",
  },
- body: JSON.stringify({ operationName: name }),
+ body: JSON.stringify({ operationName }),
  });
  if (!poll.ok) {
  const text = await poll.text();
@@ -298,7 +498,37 @@ export const imageToVideo = {
  }
  current = (await poll.json()) as any;
  done = !!current.done || !!current.response;
- tries++;
+
+ if (context?.reportProgress) {
+ const elapsed = Date.now() - startTime;
+ const progressPercent = Math.min(
+ Math.round((elapsed / MAX_POLL_TIME) * 100),
+ 99
+ );
+ await context.reportProgress({
+ progress: progressPercent,
+ total: 100,
+ });
+ }
+
+ if (context?.streamContent && !done) {
+ await context.streamContent({
+ type: "text" as const,
+ text: `[Vertex] Still processing... (${Math.round(
+ (Date.now() - startTime) / 1000
+ )}s elapsed)`,
+ });
+ }
+ }
+
+ if (!done) {
+ return JSON.stringify({
+ status: "IN_PROGRESS",
+ request_id: operationName,
+ resume_id: operationName,
+ message:
+ "Still in progress. Call this tool again with resume_id to continue checking.",
+ });
  }

  const resp = current.response || current;
@@ -356,7 +586,7 @@ export const imageToVideo = {
  const tail50 = jsonStr
  ? jsonStr.slice(Math.max(0, jsonStr.length - 50))
  : "";
- return `Vertex operation done but no videos array present. operationName=${name}. json_head150=${head150} json_tail50=${tail50}`;
+ return `Vertex operation done but no videos array present. operationName=${operationName}. json_head150=${head150} json_tail50=${tail50}`;
  }, "imageToVideo");
  },
  };
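
Note: the new pause/resume contract above (an IN_PROGRESS payload carrying a resume_id, to be passed back on the next call) can be driven from the calling side roughly as follows. This is a minimal sketch, not part of the package: callTool stands in for whatever MCP client invocation API you use, and the argument values are illustrative.

// Hypothetical caller-side helper for the generateVideoi2v resume loop.
// `callTool(name, args)` is assumed to invoke the MCP tool and return its text result.
async function generateVideoWithResume(
  callTool: (name: string, args: Record<string, unknown>) => Promise<string>
): Promise<string> {
  // Start a new generation (prompt is required on the first call).
  let result = await callTool("generateVideoi2v", {
    prompt: "A robot walking through a neon city at night",
    image_path: "/tmp/start-frame.png", // illustrative path
    duration_seconds: "6",
    auto_enhance: true,
  });

  // If the tool could not finish within its polling window, it returns an
  // IN_PROGRESS JSON envelope with a resume_id; call again with that id.
  for (;;) {
    let parsed: { status?: string; resume_id?: string } = {};
    try {
      parsed = JSON.parse(result);
    } catch {
      return result; // terminal result that is not the IN_PROGRESS envelope
    }
    if (parsed.status !== "IN_PROGRESS" || !parsed.resume_id) {
      return result; // generation finished (or returned a different payload)
    }
    result = await callTool("generateVideoi2v", { resume_id: parsed.resume_id });
  }
}

Because the tool also streams the resume_id before polling begins, a caller that times out mid-poll can still recover the job by re-invoking the tool with only resume_id set.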