@mixio-pro/kalaasetu-mcp 1.2.1 → 2.0.1-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/fal-config.json +106 -0
- package/package.json +2 -1
- package/src/index.ts +0 -9
- package/src/tools/fal/config.ts +120 -23
- package/src/tools/fal/generate.ts +370 -84
- package/src/tools/fal/index.ts +2 -7
- package/src/tools/fal/models.ts +163 -29
- package/src/tools/fal/storage.ts +9 -2
- package/src/tools/gemini.ts +106 -26
- package/src/tools/image-to-video.ts +359 -129
- package/src/tools/perplexity.ts +61 -61
- package/src/tools/youtube.ts +8 -3
- package/src/utils/llm-prompt-enhancer.ts +302 -0
- package/src/utils/prompt-enhancer-presets.ts +303 -0
- package/src/utils/prompt-enhancer.ts +186 -0
package/src/tools/perplexity.ts
CHANGED
@@ -4,21 +4,26 @@ import { safeToolExecute } from "../utils/tool-wrapper";
 export const perplexityImages = {
   name: "perplexityImages",
   description:
-    "
+    "Search for and discover real-world images using the Perplexity Sonar API. " +
+    "This tool provides a summarized text description of findings along with a verified list of image URLs. " +
+    "Citations in the text are mapped [N] to the numbered images in the list. " +
+    "Ideal for finding visual inspiration, reference photos, or stock-like images from the web.",
   parameters: z.object({
-    query: z
+    query: z
+      .string()
+      .describe(
+        "Descriptive search terms (e.g., 'SpaceX Starship launch photos')."
+      ),
     image_domain_filter: z
       .array(z.string())
       .optional()
       .describe(
-        "
+        "Filter results by domain. Use 'domain.com' to include, or '-domain.com' to exclude. (e.g., ['wikimedia.org', '-pinterest.com'])."
       ),
     image_format_filter: z
       .array(z.string())
       .optional()
-      .describe(
-        "A list of allowed image formats. E.g., ['jpg', 'png', 'gif']."
-      ),
+      .describe("Target specific formats: 'jpg', 'png', 'gif', etc."),
   }),
   timeoutMs: 300000,
   execute: async (args: {
@@ -33,28 +38,26 @@ export const perplexityImages = {
     }
 
     const url = "https://api.perplexity.ai/chat/completions";
-    const headers = {
+    const headers: Record<string, string> = {
       Authorization: `Bearer ${apiKey}`,
       "Content-Type": "application/json",
       accept: "application/json",
     };
 
-    const payload
+    const payload = {
       model: "sonar",
       messages: [
         { role: "user", content: `Show me images of ${args.query}` },
       ],
       return_images: true,
+      ...(args.image_domain_filter
+        ? { image_domain_filter: args.image_domain_filter }
+        : {}),
+      ...(args.image_format_filter
+        ? { image_format_filter: args.image_format_filter }
+        : {}),
     };
 
-    if (args.image_domain_filter) {
-      payload.image_domain_filter = args.image_domain_filter;
-    }
-
-    if (args.image_format_filter) {
-      payload.image_format_filter = args.image_format_filter;
-    }
-
     const res = await fetch(url, {
       method: "POST",
       headers: headers,
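Note on the payload change above: the new code builds the request with conditional object spread instead of mutating `payload` after the fact, which keeps the object literal's inferred type closed and omits absent keys entirely. A minimal standalone sketch of the pattern, assuming an illustrative `buildImagePayload` helper that is not part of the package:

// Illustrative sketch only; helper and argument shape are hypothetical.
interface ImageSearchArgs {
  query: string;
  image_domain_filter?: string[];
  image_format_filter?: string[];
}

function buildImagePayload(args: ImageSearchArgs) {
  return {
    model: "sonar",
    messages: [{ role: "user", content: `Show me images of ${args.query}` }],
    return_images: true,
    // Spreading {} when a filter is absent omits the key entirely, so
    // `undefined` never appears in the serialized request body.
    ...(args.image_domain_filter
      ? { image_domain_filter: args.image_domain_filter }
      : {}),
    ...(args.image_format_filter
      ? { image_format_filter: args.image_format_filter }
      : {}),
  };
}

// buildImagePayload({ query: "aurora borealis" }) yields no *_filter keys.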
@@ -66,51 +69,49 @@ export const perplexityImages = {
       throw new Error(`Perplexity API request failed: ${res.status} ${text}`);
     }
 
-    const data
+    const data = (await res.json()) as any;
     let content = data.choices?.[0]?.message?.content;
-    const images = data.images;
-    const citations = data.citations;
+    const images = (data.images || []) as any[];
+    const citations = (data.citations || []) as string[];
 
-    if (
+    if (images.length === 0) {
       return `No direct image URLs found in the API response. The text content was: ${content}`;
     }
 
     // Create a map of origin_url -> new 1-based index
-    const originUrlToImageIndex:
-    images.forEach((img
+    const originUrlToImageIndex: Record<string, number> = {};
+    images.forEach((img, index) => {
       if (img.origin_url) {
         originUrlToImageIndex[img.origin_url] = index + 1;
       }
     });
 
     // Create a map of old citation index -> new image index
-    const oldToNewCitationMap:
-
-
-
-
-
-    });
-    }
+    const oldToNewCitationMap: Record<number, number> = {};
+    citations.forEach((citationUrl, index) => {
+      if (originUrlToImageIndex[citationUrl]) {
+        oldToNewCitationMap[index + 1] = originUrlToImageIndex[citationUrl];
+      }
+    });
 
     // Replace citations in the content
     if (content && typeof content === "string") {
       content = content
-        .replace(/\[(\d+)\]/g, (
+        .replace(/\[(\d+)\]/g, (_match, oldIndexStr) => {
           const oldIndex = parseInt(oldIndexStr, 10);
           const newIndex = oldToNewCitationMap[oldIndex];
           if (newIndex) {
             return `[${newIndex}]`;
           }
-          return "";
+          return "";
         })
         .replace(/(\s\s+)/g, " ")
-        .trim();
+        .trim();
     }
 
     // Build the final formatted output
     let output = content + "\n\n--- Images ---\n";
-    images.forEach((img
+    images.forEach((img, index) => {
       output += `${index + 1}. ${img.image_url}\n (Source: ${
         img.origin_url
       })\n`;
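The remapping logic above renumbers Perplexity's `[N]` citation markers so they point at positions in the returned image list, dropping markers with no matching image. A self-contained sketch of that logic, with sample data invented for illustration:

// Illustrative data; real values come from the Perplexity response.
const images = [{ origin_url: "https://a.example/img" }];
const citations = ["https://a.example/img", "https://b.example/page"];

const originUrlToImageIndex: Record<string, number> = {};
images.forEach((img, index) => {
  originUrlToImageIndex[img.origin_url] = index + 1;
});

const oldToNewCitationMap: Record<number, number> = {};
citations.forEach((citationUrl, index) => {
  if (originUrlToImageIndex[citationUrl]) {
    oldToNewCitationMap[index + 1] = originUrlToImageIndex[citationUrl];
  }
});

// "[1]" maps to image 1 and survives; "[2]" has no image and is dropped.
const text = "See the launch [1] and the report [2].";
const remapped = text
  .replace(/\[(\d+)\]/g, (_m, n) => {
    const mapped = oldToNewCitationMap[parseInt(n, 10)];
    return mapped ? `[${mapped}]` : "";
  })
  .replace(/(\s\s+)/g, " ")
  .trim();
// -> "See the launch [1] and the report ."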
@@ -124,14 +125,16 @@ export const perplexityImages = {
 export const perplexityVideos = {
   name: "perplexityVideos",
   description:
-    "
+    "Search for web videos (e.g., from YouTube, Vimeo) using Perplexity Sonar Pro. " +
+    "Provides a textual summary of the content found and direct links to the videos. " +
+    "Perfect for finding research material or specific clips based on natural language queries.",
   parameters: z.object({
-    query: z.string().describe("The search query for videos."),
+    query: z.string().describe("The natural language search query for videos."),
     search_domain_filter: z
       .array(z.string())
       .optional()
       .describe(
-        "
+        "Optional: Restrict search to specific domains (e.g., ['youtube.com']) or exclude them with '-' prefix."
       ),
   }),
   timeoutMs: 300000,
@@ -143,24 +146,23 @@ export const perplexityVideos = {
     }
 
     const url = "https://api.perplexity.ai/chat/completions";
-    const headers = {
+    const headers: Record<string, string> = {
       Authorization: `Bearer ${apiKey}`,
       "Content-Type": "application/json",
       accept: "application/json",
     };
 
-    const payload
+    const payload = {
       model: "sonar-pro",
       messages: [
         { role: "user", content: `Show me videos of ${args.query}` },
       ],
       media_response: { overrides: { return_videos: true } },
+      ...(args.search_domain_filter
+        ? { search_domain_filter: args.search_domain_filter }
+        : {}),
     };
 
-    if (args.search_domain_filter) {
-      payload.search_domain_filter = args.search_domain_filter;
-    }
-
     const res = await fetch(url, {
       method: "POST",
       headers: headers,
@@ -172,12 +174,12 @@ export const perplexityVideos = {
       throw new Error(`Perplexity API request failed: ${res.status} ${text}`);
     }
 
-    const data
+    const data = (await res.json()) as any;
     let content = data.choices?.[0]?.message?.content;
-    const videos = data.videos;
-    const citations = data.citations;
+    const videos = (data.videos || []) as any[];
+    const citations = (data.citations || []) as string[];
 
-    if (
+    if (videos.length === 0) {
       return `No direct video URLs found in the API response. Full API Response: ${JSON.stringify(
         data,
         null,
@@ -186,41 +188,39 @@ export const perplexityVideos = {
     }
 
     // Create a map of video url -> new 1-based index
-    const urlToVideoIndex:
-    videos.forEach((video
+    const urlToVideoIndex: Record<string, number> = {};
+    videos.forEach((video, index) => {
       if (video.url) {
         urlToVideoIndex[video.url] = index + 1;
       }
     });
 
     // Create a map of old citation index -> new video index
-    const oldToNewCitationMap:
-
-
-
-
-
-    });
-    }
+    const oldToNewCitationMap: Record<number, number> = {};
+    citations.forEach((citationUrl, index) => {
+      if (urlToVideoIndex[citationUrl]) {
+        oldToNewCitationMap[index + 1] = urlToVideoIndex[citationUrl];
+      }
+    });
 
     // Replace citations in the content
     if (content && typeof content === "string") {
       content = content
-        .replace(/\[(\d+)\]/g, (
+        .replace(/\[(\d+)\]/g, (_match, oldIndexStr) => {
           const oldIndex = parseInt(oldIndexStr, 10);
           const newIndex = oldToNewCitationMap[oldIndex];
           if (newIndex) {
             return `[${newIndex}]`;
           }
-          return "";
+          return "";
         })
         .replace(/(\s\s+)/g, " ")
-        .trim();
+        .trim();
     }
 
     // Build the final formatted output
     let output = content + "\n\n--- Videos ---\n";
-    videos.forEach((video
+    videos.forEach((video, index) => {
       output += `${index + 1}. ${video.url}\n`;
     });
 
package/src/tools/youtube.ts
CHANGED
@@ -9,16 +9,21 @@ const ai = new GoogleGenAI({
 export const analyzeYoutubeVideo = {
   name: "analyzeYoutubeVideo",
   description:
-    "
+    "Perform deep semantic analysis of a YouTube video using Google's Gemini 2.5 Flash model. " +
+    "Use this to summarize long videos, identify specific scenes, extract information, or ask visual questions about the video's content. " +
+    "This tool treats the YouTube URL as a first-class video stream for the model. " +
+    "ONLY USE WHEN WORKING WITH GOOGLE/GEMINI MODELS.",
   parameters: z.object({
     youtube_url: z
       .string()
       .describe(
-        "
+        "The full URL of the YouTube video (e.g., 'https://www.youtube.com/watch?v=dQw4w9WgXcQ')."
       ),
     prompt: z
       .string()
-      .describe(
+      .describe(
+        "Instruction or question about the video content (e.g., 'Summarize the main points' or 'What color was the car?')."
+      ),
   }),
   timeoutMs: 300000,
   execute: async (args: { youtube_url: string; prompt: string }) => {
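The youtube.ts changes are metadata-only: richer tool and parameter descriptions. zod's `.describe()` stores the string on the schema itself, where MCP tooling can surface it as parameter documentation. A small sketch with a hypothetical schema trimmed from the one above:

import { z } from "zod";

// Hypothetical, trimmed version of the schema above.
const parameters = z.object({
  youtube_url: z
    .string()
    .describe("The full URL of the YouTube video."),
});

// The description rides along on the schema.
console.log(parameters.shape.youtube_url.description);
// -> "The full URL of the YouTube video."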
package/src/utils/llm-prompt-enhancer.ts
ADDED
@@ -0,0 +1,302 @@
+/**
+ * LLM-Powered Prompt Enhancer
+ *
+ * Uses Gemini 3 Fast to intelligently rewrite prompts based on
+ * prompting guides for specific models (e.g., LTX-2).
+ */
+
+import { GoogleGenAI } from "@google/genai";
+
+const ai = new GoogleGenAI({
+  apiKey: process.env.GEMINI_API_KEY || "",
+});
+
+/**
+ * LTX-2 Prompting Guide embedded as system context.
+ * Based on: https://ltx.io/model/model-blog/prompting-guide-for-ltx-2
+ * Complete guide with examples for optimal prompt rewriting.
+ */
+const LTX2_PROMPTING_GUIDE = `
+You are an expert prompt engineer for LTX-2, a state-of-the-art video generation model.
+Your task is to rewrite the user's prompt following the official LTX-2 prompting guide.
+
+The key is painting a complete picture of the story you're telling that flows naturally from beginning to end, covering all the elements the model needs to bring your vision to life.
+
+## EXAMPLE PROMPTS (study these patterns)
+
+### Example 1 - Action Scene:
+"An action packed, cinematic shot of a monster truck driving fast towards the camera, the truck passes the cameras it pans left to follow the trucks reckless drive. dust and motion blur is around the truck, hand held feel to the camera as it tries to track its ride into the distance. the truck then drifts and turns around, then drives back towards the camera until seen in extreme close up."
+
+### Example 2 - Dialogue Scene:
+"A warm sunny backyard. The camera starts in a tight cinematic close-up of a woman and a man in their 30s, facing each other with serious expressions. The woman, emotional and dramatic, says softly, 'That's it... Dad's lost it. And we've lost Dad.' The man exhales, slightly annoyed: 'Stop being so dramatic, Jess.' A beat. He glances aside, then mutters defensively, 'He's just having fun.' The camera slowly pans right, revealing the grandfather in the garden wearing enormous butterfly wings, waving his arms in the air like he's trying to take off. He shouts, 'Wheeeew!' as he flaps his wings with full commitment."
+
+### Example 3 - Interior Scene:
+"INT. OVEN – DAY. Static camera from inside the oven, looking outward through the slightly fogged glass door. Warm golden light glows around freshly baked cookies. The baker's face fills the frame, eyes wide with focus, his breath fogging the glass as he leans in. Subtle reflections move across the glass as steam rises. Baker (whispering dramatically): 'Today… I achieve perfection.' He leans even closer, nose nearly touching the glass. 'Golden edges. Soft center. The gods themselves will smell these cookies and weep.'"
+
+### Example 4 - Performance Scene:
+"A warm, intimate cinematic performance inside a cozy, wood-paneled bar, lit with soft amber practical lights and shallow depth of field that creates glowing bokeh in the background. The shot opens in a medium close-up on a young female singer in her 20s with short brown hair and bangs, singing into a microphone while strumming an acoustic guitar, her eyes closed and posture relaxed. The camera slowly arcs left around her, keeping her face and mic in sharp focus as two male band members playing guitars remain softly blurred behind her. Warm light wraps around her face and hair as framed photos and wooden walls drift past in the background. Ambient live music fills the space, led by her clear vocals over gentle acoustic strumming."
+
+## KEY ASPECTS TO INCLUDE
+
+1. **Establish the shot**: Use cinematography terms (wide shot, medium close-up, over-the-shoulder, static frame, handheld). Include scale or category characteristics.
+
+2. **Set the scene**: Describe lighting conditions (warm golden light, soft amber, neon glow, dramatic shadows), color palette (warm, muted, high contrast), textures, and atmospheric elements (fog, rain, dust, smoke).
+
+3. **Describe the action as a NARRATIVE SEQUENCE**: Write the core action flowing naturally from BEGINNING to END. Include what happens first, then next, then after that. Actions should progress temporally.
+
+4. **Define characters**: Include age, hairstyle, clothing, and distinguishing details. Express emotions through PHYSICAL CUES (posture, gesture, facial expression) - NOT internal states.
+
+5. **Camera movements**: Specify clearly using terms like: pans left/right, dollys back, slow push in, handheld tracking, arcs around, tilts upward, pulls back. Describe how subjects appear AFTER camera motion.
+
+6. **Audio and dialogue**: Describe ambient sounds, music quality. For speech, use quotation marks. Mention dialogue style (whispers, shouts, mutters).
+
+## FOR BEST RESULTS
+
+- Keep prompt as a SINGLE FLOWING PARAGRAPH
+- Use PRESENT TENSE verbs (speeds, roars, pans, reveals)
+- Match detail to shot scale (closeups need more detail than wide shots)
+- Write 4-8 descriptive sentences
+- Focus on camera's relationship to subject
+- Create a temporal narrative arc (beginning → middle → end)
+
+## TECHNICAL TERMS TO USE
+
+Camera language: follows, tracks, pans across, circles around, tilts upward, pushes in, pulls back, overhead view, handheld movement, over-the-shoulder, wide establishing shot, static frame
+
+Film characteristics: lens flares, film grain, shallow depth of field, bokeh
+
+Pacing: slow motion, lingering shot, continuous shot, seamless transition, dynamic movement
+
+## WHAT WORKS WELL WITH LTX-2
+
+- Cinematic compositions with thoughtful lighting and shallow depth of field
+- Emotive human moments, subtle gestures, facial nuance
+- Weather effects: fog, mist, golden hour light, soft shadows, rain, reflections
+- Clean camera language: "slow dolly in", "handheld tracking", "over-the-shoulder"
+- Stylized aesthetics: painterly, noir, analog film, fashion editorial
+- Lighting/mood control: backlighting, color palettes, soft rim light
+
+## WHAT TO AVOID
+
+- Internal emotional states without visual cues (don't say "sad", show the expression)
+- Text, logos, signage, brand names (LTX-2 can't render text)
+- Complex physics: jumping, juggling, chaotic motion
+- Too many characters or layered actions
+- Conflicting light sources
+- Over-complicated prompts
+
+## YOUR TASK
+
+Transform the user's simple prompt into a rich, cinematic LTX-2 prompt that:
+1. Establishes a clear shot type and scene
+2. Describes a NARRATIVE SEQUENCE of action from beginning to end
+3. Includes specific camera movements
+4. Sets atmosphere through lighting and color
+5. Uses present tense throughout
+6. Flows as a single cohesive paragraph
+
+Output ONLY the enhanced prompt. No explanations, no markdown, no labels - just the enhanced cinematic prompt.
+`;
+
+/**
+ * Veo 3.1 Prompting Guide for image-to-video generation.
+ * Based on Shorts / Veo Shot Planning Guidelines.
+ */
+const VEO_PROMPTING_GUIDE = `
+You are an expert prompt engineer for Google Veo 3.1, a state-of-the-art image-to-video generation model.
+Your task is to rewrite the user's prompt following the Veo Shot Planning Guidelines.
+
+## VEO 3.1 PROMPTING ESSENTIALS
+
+### The 5-Part Scene Formula
+Structure every Veo prompt as: [Cinematography] + [Subject] + [Action] + [Context] + [Style & Ambiance]
+
+1. **Cinematography**: Shot type + camera behavior
+   - Examples: "Vertical 9:16 CLOSE-UP, eye-level, SINGLE LOCKED SHOT"
+   - Examples: "MEDIUM TWO-SHOT with a slow push-in toward the subject"
+   - Examples: "Wide establishing shot, static frame"
+
+2. **Subject**: Who/what the shot is about
+   - Describe character details: age, clothing, position in frame
+   - Who is in focus vs background
+
+3. **Action**: What is happening
+   - Describe motion from beginning to end
+   - For dialogue: ordered speaker cues with exact lines in quotes
+
+4. **Context**: Where it happens
+   - Background description
+   - "Simple and softly blurred background"
+   - "No people walking through frame"
+
+5. **Style & Ambiance**: Overall mood
+   - Lighting quality
+   - Color palette
+   - Sound/audio descriptions
+
+### Camera Behavior - BE EXPLICIT
+
+For LOCKED shots:
+- "SINGLE LOCKED SHOT - NO pans, NO cuts, NO angle changes"
+- "Camera and framing remain completely static"
+
+For START→END interpolation:
+- "Single continuous shot interpolating naturally from the START frame to the END frame"
+- Describe the motion: "gentle push-in", "slow pan right", "subtle arc around subject"
+
+### Background Control
+- "Simple and softly blurred conference-room/office/street background"
+- "No people walking through frame"
+- "No new characters entering or exiting the frame"
+- "Any background figures must remain completely still and very out of focus"
+
+### Always Include These Negative Instructions
+- "No on-screen text or subtitles"
+- "No black bars"
+- "No camera shake" (unless specifically wanted)
+
+### Dialogue Shots
+For dialogue, include:
+- Ordered speaker cues with visual identifiers
+- Exact dialogue in quotes
+- Voice assignments: "natural male/female voice"
+- "Only ONE character should speak at a time; no overlapping speech"
+- "Short natural pauses between turns"
+
+### Emotion Through Physical Cues
+- DON'T say "sad" or "angry" - show it through posture, gesture, expression
+- Examples: "bowed and humble", "firm but controlled", "eyes downcast", "slight smile"
+
+## EXAMPLE PROMPTS
+
+### Example 1 - Simple Action:
+"Vertical 9:16 MEDIUM SHOT, eye-level, SINGLE LOCKED SHOT. A young woman in a blue dress stands in a sunlit garden, her hair gently moving in the breeze. She slowly raises her hand to touch a blooming flower, her expression soft and contemplative. Warm golden hour lighting wraps around her. Background is softly blurred foliage. No on-screen text. No people in background."
+
+### Example 2 - Dialogue:
+"Vertical 9:16 CLOSE-UP TWO-SHOT, SINGLE LOCKED SHOT. MR. KIM (man in dark suit) sits at head of conference table, GUXIXI (young woman in pink uniform) stands opposite. 1) MR. KIM says firmly: 'This is your final warning.' 2) GUXIXI, eyes downcast, responds quietly: 'I understand, sir.' Generate clear dialogue audio with natural male voice for Kim, female voice for Guxixi. One voice at a time with natural pauses. Soft office ambient sound. No on-screen text."
+
+### Example 3 - Camera Movement:
+"Vertical 9:16 MEDIUM SHOT with gentle push-in. Single continuous shot interpolating from START frame to END frame. The shot begins on a wide view of the dancer, then slowly pushes in to a close-up of her face as she completes her spin. Soft rim lighting from behind. Ambient music continues throughout. No cuts, no angle changes. No on-screen text."
+
+## YOUR TASK
+
+Transform the user's prompt into a Veo-optimized prompt that:
+1. Uses the 5-part scene formula
+2. Explicitly states camera behavior (locked OR interpolating)
+3. Describes subject and action clearly
+4. Includes negative instructions (no on-screen text, no people in background)
+5. If dialogue present: includes ordered speaker cues and voice assignments
+
+The user is providing a START image (and optionally END image) separately via API parameters.
+Focus the prompt on describing what happens visually and aurally - not on technical API details.
+
+Output ONLY the enhanced prompt. No explanations, no markdown, no labels.
+`;
+
+/**
+ * Configuration for LLM-based prompt enhancement.
+ */
+export interface LLMEnhancerConfig {
+  /** The model to use for enhancement */
+  model?: string;
+  /** System prompt/guide for the enhancer */
+  systemPrompt?: string;
+  /** Maximum tokens for the enhanced prompt */
+  maxTokens?: number;
+  /** Temperature for generation (lower = more deterministic) */
+  temperature?: number;
+}
+
+/**
+ * Built-in enhancer configurations for different video models.
+ */
+export const LLM_ENHANCER_CONFIGS: Record<string, LLMEnhancerConfig> = {
+  ltx2: {
+    model: "gemini-2.0-flash",
+    systemPrompt: LTX2_PROMPTING_GUIDE,
+    maxTokens: 1024,
+    temperature: 0.4,
+  },
+  veo: {
+    model: "gemini-2.0-flash",
+    systemPrompt: VEO_PROMPTING_GUIDE,
+    maxTokens: 1024,
+    temperature: 0.4,
+  },
+};
+
+/**
+ * Enhance a prompt using Gemini LLM.
+ *
+ * @param prompt - The user's original prompt
+ * @param configOrName - Either a config name (e.g., "ltx2") or a custom LLMEnhancerConfig
+ * @param images - Optional array of image paths/URLs to include in context
+ * @returns The enhanced prompt
+ */
+export async function enhancePromptWithLLM(
+  prompt: string,
+  configOrName: string | LLMEnhancerConfig = "ltx2",
+  images?: string[]
+): Promise<string> {
+  // Resolve config - ltx2 is always available as default
+  let config: LLMEnhancerConfig;
+
+  if (typeof configOrName === "string") {
+    const lookedUp = LLM_ENHANCER_CONFIGS[configOrName];
+    config = lookedUp ?? LLM_ENHANCER_CONFIGS["ltx2"]!;
+  } else {
+    config = configOrName;
+  }
+
+  const model = config.model || "gemini-2.0-flash";
+  const systemPrompt = config.systemPrompt || LTX2_PROMPTING_GUIDE;
+
+  // Build content parts
+  const contents: any[] = [];
+
+  // Add images if provided
+  if (images && images.length > 0) {
+    for (const imagePath of images) {
+      // For now, just mention the image in the prompt
+      // Full image support would require reading and encoding the image
+      contents.push(`[Image provided: ${imagePath}]`);
+    }
+  }
+
+  // Add the user's prompt
+  contents.push(`User's original prompt:\n${prompt}\n\nEnhanced prompt:`);
+
+  try {
+    const response = await ai.models.generateContent({
+      model,
+      contents: contents.join("\n"),
+      config: {
+        systemInstruction: systemPrompt,
+        maxOutputTokens: config.maxTokens || 1024,
+        temperature: config.temperature || 0.7,
+      },
+    });
+
+    // Extract text from response
+    const enhancedPrompt =
+      response.candidates?.[0]?.content?.parts?.[0]?.text?.trim();
+
+    if (!enhancedPrompt) {
+      console.warn("LLM enhancement returned empty, using original prompt");
+      return prompt;
+    }
+
+    return enhancedPrompt;
+  } catch (error: any) {
+    console.error(`LLM prompt enhancement failed: ${error.message}`);
+    // Fall back to original prompt on error
+    return prompt;
+  }
+}
+
+/**
+ * Check if Gemini API key is configured.
+ */
+export function isLLMEnhancerAvailable(): boolean {
+  return !!process.env.GEMINI_API_KEY;
+}