npm - @mixio-pro/kalaasetu-mcp - Versions diffs - 1.2.2 → 2.0.2-beta - Mend

@mixio-pro/kalaasetu-mcp 1.2.2 → 2.0.2-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/fal-config.json +106 -0
package/package.json +2 -1
package/src/index.ts +5 -9
package/src/tools/fal/config.ts +120 -23
package/src/tools/fal/generate.ts +361 -103
package/src/tools/fal/index.ts +2 -7
package/src/tools/fal/models.ts +157 -32
package/src/tools/gemini.ts +40 -2
package/src/tools/get-status.ts +174 -0
package/src/tools/image-to-video.ts +334 -119
package/src/utils/llm-prompt-enhancer.ts +302 -0
package/src/utils/prompt-enhancer-presets.ts +303 -0
package/src/utils/prompt-enhancer.ts +186 -0

package/src/utils/llm-prompt-enhancer.ts ADDED Viewed

@@ -0,0 +1,302 @@
+/**
+ * LLM-Powered Prompt Enhancer
+ *
+ * Uses a Gemini Flash model (default: gemini-2.0-flash) to intelligently rewrite
+ * prompts based on prompting guides for specific models (e.g., LTX-2, Veo 3.1).
+ */
+import { GoogleGenAI } from "@google/genai";
+// Module-wide Gemini client. The key is read once at import time and defaults
+// to an empty string when GEMINI_API_KEY is unset; use isLLMEnhancerAvailable()
+// to find out whether a key was actually configured.
+const geminiApiKey = process.env.GEMINI_API_KEY ?? "";
+const ai = new GoogleGenAI({ apiKey: geminiApiKey });
+/**
+ * LTX-2 Prompting Guide embedded as system context.
+ * Based on: https://ltx.io/model/model-blog/prompting-guide-for-ltx-2
+ * Complete guide with examples for optimal prompt rewriting.
+ *
+ * Wired in as the `systemPrompt` of the "ltx2" entry in LLM_ENHANCER_CONFIGS,
+ * and used by enhancePromptWithLLM as the fallback whenever a config carries
+ * no `systemPrompt`. The closing instruction asks the model to output only the
+ * rewritten prompt; enhancePromptWithLLM returns the model's text trimmed but
+ * otherwise unprocessed, so changing that instruction changes what callers get.
+ */
+const LTX2_PROMPTING_GUIDE = `
+You are an expert prompt engineer for LTX-2, a state-of-the-art video generation model.
+Your task is to rewrite the user's prompt following the official LTX-2 prompting guide.
+The key is painting a complete picture of the story you're telling that flows naturally from beginning to end, covering all the elements the model needs to bring your vision to life.
+## EXAMPLE PROMPTS (study these patterns)
+### Example 1 - Action Scene:
+"An action packed, cinematic shot of a monster truck driving fast towards the camera, the truck passes the cameras it pans left to follow the trucks reckless drive. dust and motion blur is around the truck, hand held feel to the camera as it tries to track its ride into the distance. the truck then drifts and turns around, then drives back towards the camera until seen in extreme close up."
+### Example 2 - Dialogue Scene:
+"A warm sunny backyard. The camera starts in a tight cinematic close-up of a woman and a man in their 30s, facing each other with serious expressions. The woman, emotional and dramatic, says softly, 'That's it... Dad's lost it. And we've lost Dad.' The man exhales, slightly annoyed: 'Stop being so dramatic, Jess.' A beat. He glances aside, then mutters defensively, 'He's just having fun.' The camera slowly pans right, revealing the grandfather in the garden wearing enormous butterfly wings, waving his arms in the air like he's trying to take off. He shouts, 'Wheeeew!' as he flaps his wings with full commitment."
+### Example 3 - Interior Scene:
+"INT. OVEN – DAY. Static camera from inside the oven, looking outward through the slightly fogged glass door. Warm golden light glows around freshly baked cookies. The baker's face fills the frame, eyes wide with focus, his breath fogging the glass as he leans in. Subtle reflections move across the glass as steam rises. Baker (whispering dramatically): 'Today… I achieve perfection.' He leans even closer, nose nearly touching the glass. 'Golden edges. Soft center. The gods themselves will smell these cookies and weep.'"
+### Example 4 - Performance Scene:
+"A warm, intimate cinematic performance inside a cozy, wood-paneled bar, lit with soft amber practical lights and shallow depth of field that creates glowing bokeh in the background. The shot opens in a medium close-up on a young female singer in her 20s with short brown hair and bangs, singing into a microphone while strumming an acoustic guitar, her eyes closed and posture relaxed. The camera slowly arcs left around her, keeping her face and mic in sharp focus as two male band members playing guitars remain softly blurred behind her. Warm light wraps around her face and hair as framed photos and wooden walls drift past in the background. Ambient live music fills the space, led by her clear vocals over gentle acoustic strumming."
+## KEY ASPECTS TO INCLUDE
+1. **Establish the shot**: Use cinematography terms (wide shot, medium close-up, over-the-shoulder, static frame, handheld). Include scale or category characteristics.
+2. **Set the scene**: Describe lighting conditions (warm golden light, soft amber, neon glow, dramatic shadows), color palette (warm, muted, high contrast), textures, and atmospheric elements (fog, rain, dust, smoke).
+3. **Describe the action as a NARRATIVE SEQUENCE**: Write the core action flowing naturally from BEGINNING to END. Include what happens first, then next, then after that. Actions should progress temporally.
+4. **Define characters**: Include age, hairstyle, clothing, and distinguishing details. Express emotions through PHYSICAL CUES (posture, gesture, facial expression) - NOT internal states.
+5. **Camera movements**: Specify clearly using terms like: pans left/right, dollys back, slow push in, handheld tracking, arcs around, tilts upward, pulls back. Describe how subjects appear AFTER camera motion.
+6. **Audio and dialogue**: Describe ambient sounds, music quality. For speech, use quotation marks. Mention dialogue style (whispers, shouts, mutters).
+## FOR BEST RESULTS
+- Keep prompt as a SINGLE FLOWING PARAGRAPH
+- Use PRESENT TENSE verbs (speeds, roars, pans, reveals)
+- Match detail to shot scale (closeups need more detail than wide shots)
+- Write 4-8 descriptive sentences
+- Focus on camera's relationship to subject
+- Create a temporal narrative arc (beginning → middle → end)
+## TECHNICAL TERMS TO USE
+Camera language: follows, tracks, pans across, circles around, tilts upward, pushes in, pulls back, overhead view, handheld movement, over-the-shoulder, wide establishing shot, static frame
+Film characteristics: lens flares, film grain, shallow depth of field, bokeh
+Pacing: slow motion, lingering shot, continuous shot, seamless transition, dynamic movement
+## WHAT WORKS WELL WITH LTX-2
+- Cinematic compositions with thoughtful lighting and shallow depth of field
+- Emotive human moments, subtle gestures, facial nuance
+- Weather effects: fog, mist, golden hour light, soft shadows, rain, reflections
+- Clean camera language: "slow dolly in", "handheld tracking", "over-the-shoulder"
+- Stylized aesthetics: painterly, noir, analog film, fashion editorial
+- Lighting/mood control: backlighting, color palettes, soft rim light
+## WHAT TO AVOID
+- Internal emotional states without visual cues (don't say "sad", show the expression)
+- Text, logos, signage, brand names (LTX-2 can't render text)
+- Complex physics: jumping, juggling, chaotic motion
+- Too many characters or layered actions
+- Conflicting light sources
+- Over-complicated prompts
+## YOUR TASK
+Transform the user's simple prompt into a rich, cinematic LTX-2 prompt that:
+1. Establishes a clear shot type and scene
+2. Describes a NARRATIVE SEQUENCE of action from beginning to end
+3. Includes specific camera movements
+4. Sets atmosphere through lighting and color
+5. Uses present tense throughout
+6. Flows as a single cohesive paragraph
+Output ONLY the enhanced prompt. No explanations, no markdown, no labels - just the enhanced cinematic prompt.
+`;
+/**
+ * Veo 3.1 Prompting Guide for image-to-video generation.
+ * Based on Shorts / Veo Shot Planning Guidelines.
+ *
+ * Wired in as the `systemPrompt` of the "veo" entry in LLM_ENHANCER_CONFIGS.
+ * The guide tells the model that the START (and optional END) image is supplied
+ * separately via API parameters, so the rewritten prompt is expected to describe
+ * only what happens visually and aurally. Like the LTX-2 guide, it ends by
+ * asking for the enhanced prompt alone, with no explanations or markdown.
+ */
+const VEO_PROMPTING_GUIDE = `
+You are an expert prompt engineer for Google Veo 3.1, a state-of-the-art image-to-video generation model.
+Your task is to rewrite the user's prompt following the Veo Shot Planning Guidelines.
+## VEO 3.1 PROMPTING ESSENTIALS
+### The 5-Part Scene Formula
+Structure every Veo prompt as: [Cinematography] + [Subject] + [Action] + [Context] + [Style & Ambiance]
+1. **Cinematography**: Shot type + camera behavior
+   - Examples: "Vertical 9:16 CLOSE-UP, eye-level, SINGLE LOCKED SHOT"
+   - Examples: "MEDIUM TWO-SHOT with a slow push-in toward the subject"
+   - Examples: "Wide establishing shot, static frame"
+2. **Subject**: Who/what the shot is about
+   - Describe character details: age, clothing, position in frame
+   - Who is in focus vs background
+3. **Action**: What is happening
+   - Describe motion from beginning to end
+   - For dialogue: ordered speaker cues with exact lines in quotes
+4. **Context**: Where it happens
+   - Background description
+   - "Simple and softly blurred background"
+   - "No people walking through frame"
+5. **Style & Ambiance**: Overall mood
+   - Lighting quality
+   - Color palette
+   - Sound/audio descriptions
+### Camera Behavior - BE EXPLICIT
+For LOCKED shots:
+- "SINGLE LOCKED SHOT - NO pans, NO cuts, NO angle changes"
+- "Camera and framing remain completely static"
+For START→END interpolation:
+- "Single continuous shot interpolating naturally from the START frame to the END frame"
+- Describe the motion: "gentle push-in", "slow pan right", "subtle arc around subject"
+### Background Control
+- "Simple and softly blurred conference-room/office/street background"
+- "No people walking through frame"
+- "No new characters entering or exiting the frame"
+- "Any background figures must remain completely still and very out of focus"
+### Always Include These Negative Instructions
+- "No on-screen text or subtitles"
+- "No black bars"
+- "No camera shake" (unless specifically wanted)
+### Dialogue Shots
+For dialogue, include:
+- Ordered speaker cues with visual identifiers
+- Exact dialogue in quotes
+- Voice assignments: "natural male/female voice"
+- "Only ONE character should speak at a time; no overlapping speech"
+- "Short natural pauses between turns"
+### Emotion Through Physical Cues
+- DON'T say "sad" or "angry" - show it through posture, gesture, expression
+- Examples: "bowed and humble", "firm but controlled", "eyes downcast", "slight smile"
+## EXAMPLE PROMPTS
+### Example 1 - Simple Action:
+"Vertical 9:16 MEDIUM SHOT, eye-level, SINGLE LOCKED SHOT. A young woman in a blue dress stands in a sunlit garden, her hair gently moving in the breeze. She slowly raises her hand to touch a blooming flower, her expression soft and contemplative. Warm golden hour lighting wraps around her. Background is softly blurred foliage. No on-screen text. No people in background."
+### Example 2 - Dialogue:
+"Vertical 9:16 CLOSE-UP TWO-SHOT, SINGLE LOCKED SHOT. MR. KIM (man in dark suit) sits at head of conference table, GUXIXI (young woman in pink uniform) stands opposite. 1) MR. KIM says firmly: 'This is your final warning.' 2) GUXIXI, eyes downcast, responds quietly: 'I understand, sir.' Generate clear dialogue audio with natural male voice for Kim, female voice for Guxixi. One voice at a time with natural pauses. Soft office ambient sound. No on-screen text."
+### Example 3 - Camera Movement:
+"Vertical 9:16 MEDIUM SHOT with gentle push-in. Single continuous shot interpolating from START frame to END frame. The shot begins on a wide view of the dancer, then slowly pushes in to a close-up of her face as she completes her spin. Soft rim lighting from behind. Ambient music continues throughout. No cuts, no angle changes. No on-screen text."
+## YOUR TASK
+Transform the user's prompt into a Veo-optimized prompt that:
+1. Uses the 5-part scene formula
+2. Explicitly states camera behavior (locked OR interpolating)
+3. Describes subject and action clearly
+4. Includes negative instructions (no on-screen text, no people in background)
+5. If dialogue present: includes ordered speaker cues and voice assignments
+The user is providing a START image (and optionally END image) separately via API parameters.
+Focus the prompt on describing what happens visually and aurally - not on technical API details.
+Output ONLY the enhanced prompt. No explanations, no markdown, no labels.
+`;
+/**
+ * Configuration for LLM-based prompt enhancement.
+ *
+ * Every field is optional; enhancePromptWithLLM substitutes a default for any
+ * field that is omitted.
+ */
+export interface LLMEnhancerConfig {
+  /** The model to use for enhancement. Defaults to "gemini-2.0-flash" when omitted. */
+  model?: string;
+  /**
+   * System prompt/guide for the enhancer, sent as the request's
+   * `systemInstruction`. Defaults to the LTX-2 prompting guide when omitted.
+   */
+  systemPrompt?: string;
+  /**
+   * Maximum tokens for the enhanced prompt, sent as `maxOutputTokens`.
+   * Defaults to 1024 when omitted.
+   */
+  maxTokens?: number;
+  /**
+   * Temperature for generation (lower = more deterministic).
+   * Defaults to 0.7 when omitted.
+   */
+  temperature?: number;
+}
+/**
+ * Creates the enhancer settings shared by all built-in entries; only the
+ * system prompt differs between target video models.
+ */
+function flashEnhancerConfig(systemPrompt: string): LLMEnhancerConfig {
+  return {
+    model: "gemini-2.0-flash",
+    systemPrompt,
+    maxTokens: 1024,
+    temperature: 0.4,
+  };
+}
+/**
+ * Ready-made enhancer configurations, keyed by the name callers pass to
+ * enhancePromptWithLLM ("ltx2" for LTX-2, "veo" for Veo 3.1).
+ */
+export const LLM_ENHANCER_CONFIGS: Record<string, LLMEnhancerConfig> = {
+  ltx2: flashEnhancerConfig(LTX2_PROMPTING_GUIDE),
+  veo: flashEnhancerConfig(VEO_PROMPTING_GUIDE),
+};
+/**
+ * Enhance a prompt using Gemini LLM.
+ *
+ * Best-effort: when the request fails or the model returns no text, the
+ * problem is logged and the original prompt is returned unchanged, so callers
+ * never see a rejection from the API call.
+ *
+ * @param prompt - The user's original prompt
+ * @param configOrName - Either a config name (e.g., "ltx2") or a custom
+ *   LLMEnhancerConfig. An unknown name logs a warning and falls back to "ltx2".
+ * @param images - Optional array of image paths/URLs. The images are NOT read
+ *   or uploaded; each one is only mentioned to the model as a text placeholder.
+ * @returns The enhanced prompt, or the original prompt if enhancement fails
+ */
+export async function enhancePromptWithLLM(
+  prompt: string,
+  configOrName: string | LLMEnhancerConfig = "ltx2",
+  images?: string[]
+): Promise<string> {
+  // Resolve config - ltx2 is always available as default
+  let config: LLMEnhancerConfig;
+  if (typeof configOrName === "string") {
+    // Own-property check: a bare index lookup would also match inherited
+    // Object.prototype members such as "constructor" or "toString".
+    const isKnownName = Object.prototype.hasOwnProperty.call(
+      LLM_ENHANCER_CONFIGS,
+      configOrName
+    );
+    if (!isKnownName) {
+      console.warn(
+        `LLM enhancer config '${configOrName}' not found, using 'ltx2'.`
+      );
+    }
+    // The trailing {} keeps the per-field defaults below in charge even if
+    // the "ltx2" entry has been removed at runtime.
+    config =
+      (isKnownName
+        ? LLM_ENHANCER_CONFIGS[configOrName]
+        : LLM_ENHANCER_CONFIGS["ltx2"]) ?? {};
+  } else {
+    config = configOrName;
+  }
+  const model = config.model || "gemini-2.0-flash";
+  const systemPrompt = config.systemPrompt || LTX2_PROMPTING_GUIDE;
+  // Build content parts
+  const contents: string[] = [];
+  // For now, just mention each image in the prompt.
+  // Full image support would require reading and encoding the image.
+  for (const imagePath of images ?? []) {
+    contents.push(`[Image provided: ${imagePath}]`);
+  }
+  // Add the user's prompt
+  contents.push(`User's original prompt:\n${prompt}\n\nEnhanced prompt:`);
+  try {
+    const response = await ai.models.generateContent({
+      model,
+      contents: contents.join("\n"),
+      config: {
+        systemInstruction: systemPrompt,
+        // `||` is deliberate here: 0 is not a usable output limit.
+        maxOutputTokens: config.maxTokens || 1024,
+        // `??` so an explicit temperature of 0 (fully deterministic) is honored.
+        temperature: config.temperature ?? 0.7,
+      },
+    });
+    // Join every text part of the first candidate; reading only parts[0]
+    // would truncate a response that the API split across several parts.
+    const parts = response.candidates?.[0]?.content?.parts ?? [];
+    const enhancedPrompt = parts
+      .map((part) => part.text ?? "")
+      .join("")
+      .trim();
+    if (!enhancedPrompt) {
+      console.warn("LLM enhancement returned empty, using original prompt");
+      return prompt;
+    }
+    return enhancedPrompt;
+  } catch (error: unknown) {
+    // Narrow before reading `.message`: a non-Error throw (string, null, ...)
+    // must not make the fallback path itself throw.
+    const reason = error instanceof Error ? error.message : String(error);
+    console.error(`LLM prompt enhancement failed: ${reason}`);
+    // Fall back to original prompt on error
+    return prompt;
+  }
+}
+/**
+ * Reports whether a non-empty GEMINI_API_KEY is present in the environment.
+ */
+export function isLLMEnhancerAvailable(): boolean {
+  const { GEMINI_API_KEY: apiKey } = process.env;
+  return apiKey !== undefined && apiKey !== "";
+}

package/src/utils/prompt-enhancer-presets.ts ADDED Viewed

@@ -0,0 +1,303 @@
+/**
+ * Prompt Enhancer Presets
+ *
+ * A factory/dictionary of predefined prompt enhancer configurations.
+ * Organized by use case: image generation, video generation, and specialized styles.
+ *
+ * Based on best practices from LTX-2 prompting guide and industry standards.
+ */
+import { PromptEnhancer, type PromptEnhancerConfig } from "./prompt-enhancer";
+// =============================================================================
+// IMAGE GENERATION PRESETS
+// =============================================================================
+/**
+ * Presets optimized for image generation (Gemini Imagen, Flux, SD, etc.)
+ *
+ * Each key is a preset name; these entries are merged into
+ * PROMPT_ENHANCER_PRESETS and are therefore resolvable through
+ * getPromptEnhancer / resolveEnhancer. Every image preset sets only
+ * `styleGuide` and `negativeElements` (no `prefix` or `suffix`).
+ * NOTE(review): how PromptEnhancer applies these fields to a prompt is defined
+ * in ./prompt-enhancer, which is not visible here - confirm there.
+ */
+export const IMAGE_PRESETS: Record<string, PromptEnhancerConfig> = {
+  /**
+   * Cinematic still photography style
+   */
+  cinematic: {
+    styleGuide:
+      "cinematic composition, dramatic lighting, shallow depth of field, film grain, 4K, hyperdetailed",
+    negativeElements:
+      "blurry, low quality, pixelated, distorted, watermark, text, logo",
+  },
+  /**
+   * Photorealistic, DSLR-quality imagery
+   */
+  photorealistic: {
+    styleGuide:
+      "photorealistic, ultra-detailed, natural lighting, DSLR quality, 8K resolution, sharp focus",
+    negativeElements:
+      "cartoon, anime, illustration, painting, blurry, distorted, artificial",
+  },
+  /**
+   * Anime and manga illustration style
+   */
+  anime: {
+    styleGuide:
+      "anime style, vibrant colors, cel-shaded, detailed linework, Studio Ghibli inspired",
+    negativeElements:
+      "photorealistic, 3D render, Western cartoon, blurry, low detail",
+  },
+  /**
+   * Abstract and modern art
+   */
+  abstract: {
+    styleGuide:
+      "abstract art, geometric shapes, bold colors, modern art, contemporary, expressive brushstrokes",
+    negativeElements:
+      "photorealistic, detailed faces, text, logo, realistic proportions",
+  },
+  /**
+   * Vintage and retro aesthetic
+   */
+  vintage: {
+    styleGuide:
+      "vintage aesthetic, retro, film grain, 1970s color palette, warm tones, nostalgic, analog photography",
+    negativeElements:
+      "modern, digital, clean, sharp, contemporary, high contrast",
+  },
+  /**
+   * Fashion editorial photography
+   */
+  fashion_editorial: {
+    styleGuide:
+      "fashion editorial, high-end photography, professional lighting, Vogue style, elegant composition, soft rim light",
+    negativeElements:
+      "casual, amateur, blurry, bad lighting, unflattering angle",
+  },
+  /**
+   * Fantasy illustration
+   */
+  fantasy: {
+    styleGuide:
+      "fantasy art, epic composition, magical atmosphere, detailed, painterly, dramatic lighting, ethereal",
+    negativeElements: "modern, realistic, mundane, blurry, low detail",
+  },
+  /**
+   * Minimalist design
+   */
+  minimalist: {
+    styleGuide:
+      "minimalist design, clean lines, negative space, simple composition, modern, elegant, refined",
+    negativeElements: "cluttered, busy, detailed, ornate, complex, messy",
+  },
+};
+// =============================================================================
+// VIDEO GENERATION PRESETS
+// Based on LTX-2 Prompting Guide best practices
+// =============================================================================
+/**
+ * Presets optimized for video generation (LTX, Luma, Veo, etc.)
+ * Following LTX-2 guidelines:
+ * - Establish the shot with cinematography terms
+ * - Set the scene with lighting/atmosphere
+ * - Describe action as natural sequence
+ * - Define camera movements clearly
+ *
+ * Each key is a preset name; these entries are merged into
+ * PROMPT_ENHANCER_PRESETS and are therefore resolvable through
+ * getPromptEnhancer / resolveEnhancer. All presets set `styleGuide` and
+ * `negativeElements`; some also set a `prefix`, and only `ltx2` sets a `suffix`.
+ * NOTE(review): how PromptEnhancer applies these fields to a prompt is defined
+ * in ./prompt-enhancer, which is not visible here - confirm there.
+ */
+export const VIDEO_PRESETS: Record<string, PromptEnhancerConfig> = {
+  /**
+   * LTX-2 specific preset based on official prompting guide.
+   * Key principles:
+   * - Establish shot with cinematography terms
+   * - Set scene with lighting/atmosphere
+   * - Describe action as natural flowing sequence
+   * - Use clean, readable camera language
+   * - Focus on emotive moments, atmosphere, and stylized aesthetics
+   */
+  ltx2: {
+    // No prefix - LTX-2 wants complete flowing scenes, not formulaic starts
+    suffix:
+      "The camera captures this with thoughtful cinematography, shallow depth of field, and natural motion.",
+    styleGuide:
+      "cinematic composition, present tense, flowing narrative, clear camera language, atmospheric lighting, emotive expressions through physical cues",
+    negativeElements:
+      "text, logos, signage, brand names, complex physics, chaotic motion, jumping, juggling, conflicting light sources, overloaded scene, too many characters, internal emotional states without visual cues",
+  },
+  /**
+   * Cinematic video with dramatic camera work
+   */
+  cinematic_video: {
+    prefix: "A cinematic shot.",
+    styleGuide:
+      "smooth camera movement, professional cinematography, dramatic lighting, shallow depth of field, film grain, cinematic color grading",
+    negativeElements:
+      "shaky camera, low quality, pixelated, glitchy, jerky motion, text overlay",
+  },
+  /**
+   * Documentary-style realistic footage
+   */
+  documentary: {
+    prefix: "Documentary footage.",
+    styleGuide:
+      "natural lighting, handheld camera feel, authentic, observational, steady tracking shot",
+    negativeElements:
+      "staged, artificial, over-produced, fantasy elements, special effects",
+  },
+  /**
+   * Action-packed dynamic shots
+   */
+  action: {
+    prefix: "An action packed shot.",
+    styleGuide:
+      "dynamic camera movement, motion blur, intense, fast-paced, dramatic angles, handheld feel",
+    negativeElements:
+      "static, slow, boring, calm, peaceful, stable tripod shot",
+  },
+  /**
+   * Slow and atmospheric
+   */
+  atmospheric: {
+    styleGuide:
+      "slow camera movement, atmospheric, moody lighting, fog or mist, ambient, lingering shot, soft focus background",
+    negativeElements:
+      "fast motion, harsh lighting, chaotic, busy, cluttered scene",
+  },
+  /**
+   * Talking head / dialogue scene
+   */
+  dialogue: {
+    prefix: "A warm, intimate scene.",
+    styleGuide:
+      "medium close-up, soft lighting, natural conversation, over-the-shoulder shots, subtle camera movement, realistic acting",
+    negativeElements:
+      "wide shot, action, fast movement, loud, chaotic, unrealistic",
+  },
+  /**
+   * Product showcase / commercial
+   */
+  commercial: {
+    styleGuide:
+      "smooth dolly movement, professional studio lighting, clean background, product focus, slow rotation, sharp detail, premium aesthetic",
+    negativeElements:
+      "cluttered, dark, amateur, shaky, low quality, distraction",
+  },
+  /**
+   * Animation / stylized motion
+   */
+  animated: {
+    styleGuide:
+      "animated style, expressive movement, stylized timing, smooth transitions, character animation, vibrant colors",
+    negativeElements: "photorealistic, live action, jerky, static, dull colors",
+  },
+  /**
+   * Wide establishing shot for scene setting
+   */
+  establishing: {
+    prefix: "Wide establishing shot.",
+    styleGuide:
+      "expansive view, slow pan or static frame, epic scale, atmospheric, cinematic composition, golden hour lighting",
+    negativeElements:
+      "close-up, cluttered, indoor, tight framing, rapid movement",
+  },
+  /**
+   * Intimate close-up for emotional moments
+   */
+  closeup_emotional: {
+    prefix: "An intimate close-up.",
+    styleGuide:
+      "tight framing on face, shallow depth of field, emotional expression, soft lighting, subtle movement, eyes in focus",
+    negativeElements:
+      "wide shot, distant, cold lighting, static expression, fast movement",
+  },
+};
+// =============================================================================
+// COMBINED PRESETS DICTIONARY
+// =============================================================================
+/**
+ * Single lookup table holding every preset: image presets first, then video
+ * presets. On a name collision the video preset would win, because it is
+ * copied last.
+ */
+export const PROMPT_ENHANCER_PRESETS: Record<string, PromptEnhancerConfig> =
+  Object.assign({}, IMAGE_PRESETS, VIDEO_PRESETS);
+// =============================================================================
+// HELPER FUNCTIONS
+// =============================================================================
+/**
+ * Get a PromptEnhancer for a named preset.
+ *
+ * Only names defined directly on PROMPT_ENHANCER_PRESETS count as presets.
+ *
+ * @param presetName - Name of an image or video preset (e.g., "cinematic", "ltx2")
+ * @returns A PromptEnhancer built from the preset's config, or undefined if
+ *   the preset doesn't exist
+ */
+export function getPromptEnhancer(
+  presetName: string
+): PromptEnhancer | undefined {
+  // Own-property check: a bare index lookup on a plain object also resolves
+  // inherited Object.prototype members ("constructor", "toString", ...), which
+  // are truthy and would be handed to PromptEnhancer.fromConfig as a config.
+  const isPreset = Object.prototype.hasOwnProperty.call(
+    PROMPT_ENHANCER_PRESETS,
+    presetName
+  );
+  if (!isPreset) {
+    return undefined;
+  }
+  const config = PROMPT_ENHANCER_PRESETS[presetName];
+  return config ? PromptEnhancer.fromConfig(config) : undefined;
+}
+/**
+ * Names of every registered preset (image and video), in definition order.
+ */
+export function listEnhancerPresets(): string[] {
+  return Object.entries(PROMPT_ENHANCER_PRESETS).map(
+    ([presetName]) => presetName
+  );
+}
+/**
+ * Names of the presets defined in IMAGE_PRESETS.
+ */
+export function listImageEnhancerPresets(): string[] {
+  const imagePresetNames = Object.keys(IMAGE_PRESETS);
+  return imagePresetNames;
+}
+/**
+ * Names of the presets defined in VIDEO_PRESETS.
+ */
+export function listVideoEnhancerPresets(): string[] {
+  const videoPresetNames = Object.keys(VIDEO_PRESETS);
+  return videoPresetNames;
+}
+/**
+ * Turn a preset name or an inline config into a PromptEnhancer.
+ *
+ * Missing input (undefined, empty string) and unknown preset names both yield
+ * the PASSTHROUGH enhancer; an unknown name additionally logs a warning.
+ */
+export function resolveEnhancer(
+  input: string | PromptEnhancerConfig | undefined
+): PromptEnhancer {
+  // Nothing requested
+  if (!input) {
+    return PromptEnhancer.PASSTHROUGH;
+  }
+  // Inline config object
+  if (typeof input !== "string") {
+    return PromptEnhancer.fromConfig(input);
+  }
+  // Preset name
+  const preset = getPromptEnhancer(input);
+  if (preset) {
+    return preset;
+  }
+  console.warn(
+    `Prompt enhancer preset '${input}' not found, using passthrough.`
+  );
+  return PromptEnhancer.PASSTHROUGH;
+}