@goonnguyen/human-mcp 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -24
- package/dist/index.js +427 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -22,9 +22,13 @@ Human MCP is a Model Context Protocol server that provides AI coding agents with
|
|
|
22
22
|
- **Layout**: Responsive design, positioning, visual hierarchy
|
|
23
23
|
|
|
24
24
|
🎨 **Content Generation**
|
|
25
|
-
- Generate high-quality images from text descriptions
|
|
26
|
-
-
|
|
27
|
-
-
|
|
25
|
+
- Generate high-quality images from text descriptions using Imagen API
|
|
26
|
+
- Create professional videos from text prompts using Veo 3.0 API
|
|
27
|
+
- Image-to-video generation combining Imagen and Veo 3.0
|
|
28
|
+
- Multiple artistic styles: photorealistic, artistic, cartoon, sketch, digital art (images) and realistic, cinematic, artistic, cartoon, animation (videos)
|
|
29
|
+
- Flexible aspect ratios (1:1, 16:9, 9:16, 4:3, 3:4) and output formats
|
|
30
|
+
- Video duration controls (4s, 8s, 12s) with FPS options (1-60 fps)
|
|
31
|
+
- Camera movement controls: static, pan, zoom, dolly movements
|
|
28
32
|
- Advanced prompt engineering and negative prompts
|
|
29
33
|
|
|
30
34
|
🗣️ **Speech Generation**
|
|
@@ -38,6 +42,7 @@ Human MCP is a Model Context Protocol server that provides AI coding agents with
|
|
|
38
42
|
🤖 **AI-Powered**
|
|
39
43
|
- Uses Google Gemini 2.5 Flash for fast, accurate analysis
|
|
40
44
|
- Advanced Imagen API for high-quality image generation
|
|
45
|
+
- Cutting-edge Veo 3.0 API for professional video generation
|
|
41
46
|
- Gemini Speech Generation API for natural voice synthesis
|
|
42
47
|
- Detailed technical insights for developers
|
|
43
48
|
- Actionable recommendations for fixing issues
|
|
@@ -963,6 +968,35 @@ Generate high-quality images from text descriptions using Gemini Imagen API.
|
|
|
963
968
|
}
|
|
964
969
|
```
|
|
965
970
|
|
|
971
|
+
### gemini_gen_video
|
|
972
|
+
|
|
973
|
+
Generate professional videos from text descriptions using Gemini Veo 3.0 API.
|
|
974
|
+
|
|
975
|
+
```json
|
|
976
|
+
{
|
|
977
|
+
"prompt": "A serene mountain landscape at sunrise with gentle camera movement",
|
|
978
|
+
"duration": "8s",
|
|
979
|
+
"style": "cinematic",
|
|
980
|
+
"aspect_ratio": "16:9",
|
|
981
|
+
"camera_movement": "pan_right",
|
|
982
|
+
"fps": 30
|
|
983
|
+
}
|
|
984
|
+
```
|
|
985
|
+
|
|
986
|
+
### gemini_image_to_video
|
|
987
|
+
|
|
988
|
+
Generate videos from images and text descriptions using the Imagen + Veo 3.0 pipeline.
|
|
989
|
+
|
|
990
|
+
```json
|
|
991
|
+
{
|
|
992
|
+
"prompt": "Animate this landscape with flowing water and moving clouds",
|
|
993
|
+
"image_input": "data:image/jpeg;base64,/9j/4AAQ...",
|
|
994
|
+
"duration": "12s",
|
|
995
|
+
"style": "realistic",
|
|
996
|
+
"camera_movement": "zoom_in"
|
|
997
|
+
}
|
|
998
|
+
```
|
|
999
|
+
|
|
966
1000
|
### mouth_speak
|
|
967
1001
|
|
|
968
1002
|
Convert text to natural-sounding speech with voice customization.
|
|
@@ -1071,6 +1105,31 @@ Test different voices and styles to find the best fit for your content.
|
|
|
1071
1105
|
}
|
|
1072
1106
|
```
|
|
1073
1107
|
|
|
1108
|
+
### Video Generation for Prototyping
|
|
1109
|
+
```bash
|
|
1110
|
+
# Create animated prototypes and demonstrations
|
|
1111
|
+
{
|
|
1112
|
+
"prompt": "User interface animation showing a smooth login process with form transitions",
|
|
1113
|
+
"duration": "8s",
|
|
1114
|
+
"style": "animation",
|
|
1115
|
+
"aspect_ratio": "16:9",
|
|
1116
|
+
"camera_movement": "static",
|
|
1117
|
+
"fps": 30
|
|
1118
|
+
}
|
|
1119
|
+
```
|
|
1120
|
+
|
|
1121
|
+
### Marketing Video Creation
|
|
1122
|
+
```bash
|
|
1123
|
+
# Generate promotional videos for products
|
|
1124
|
+
{
|
|
1125
|
+
"prompt": "Elegant product showcase video with professional lighting and smooth camera movement",
|
|
1126
|
+
"duration": "12s",
|
|
1127
|
+
"style": "cinematic",
|
|
1128
|
+
"aspect_ratio": "16:9",
|
|
1129
|
+
"camera_movement": "dolly_forward"
|
|
1130
|
+
}
|
|
1131
|
+
```
|
|
1132
|
+
|
|
1074
1133
|
### Code Explanation Audio
|
|
1075
1134
|
```bash
|
|
1076
1135
|
# Generate spoken explanations for code reviews
|
|
@@ -1194,9 +1253,12 @@ Human MCP Server
|
|
|
1194
1253
|
│ ├── GIF Frame Extraction
|
|
1195
1254
|
│ └── Visual Comparison
|
|
1196
1255
|
├── Hands Tool (Content Generation)
|
|
1197
|
-
│ ├── Image Generation
|
|
1256
|
+
│ ├── Image Generation (Imagen API)
|
|
1257
|
+
│ ├── Video Generation (Veo 3.0 API)
|
|
1258
|
+
│ ├── Image-to-Video Pipeline
|
|
1198
1259
|
│ ├── Style Customization
|
|
1199
|
-
│ ├── Aspect Ratio Control
|
|
1260
|
+
│ ├── Aspect Ratio & Duration Control
|
|
1261
|
+
│ ├── Camera Movement Control
|
|
1200
1262
|
│ └── Prompt Engineering
|
|
1201
1263
|
├── Mouth Tool (Speech Generation)
|
|
1202
1264
|
│ ├── Text-to-Speech Synthesis
|
|
@@ -1224,11 +1286,15 @@ For detailed architecture information and future development plans, see:
|
|
|
1224
1286
|
- Processing 20+ visual formats with 98.5% success rate
|
|
1225
1287
|
- Sub-30 second response times for detailed analysis
|
|
1226
1288
|
|
|
1227
|
-
**Hands (Content Generation)** - Production Ready (v1.
|
|
1289
|
+
**Hands (Content Generation)** - Production Ready (v1.4.0)
|
|
1228
1290
|
- High-quality image generation using Gemini Imagen API
|
|
1229
|
-
-
|
|
1291
|
+
- Professional video generation using Gemini Veo 3.0 API
|
|
1292
|
+
- Image-to-video generation pipeline combining Imagen + Veo 3.0
|
|
1293
|
+
- Multiple artistic styles and aspect ratios for both images and videos
|
|
1294
|
+
- Video duration controls (4s, 8s, 12s) with FPS options (1-60 fps)
|
|
1295
|
+
- Camera movement controls: static, pan, zoom, dolly movements
|
|
1230
1296
|
- Advanced prompt engineering with negative prompts
|
|
1231
|
-
- Comprehensive validation and error handling
|
|
1297
|
+
- Comprehensive validation and error handling with retry logic
|
|
1232
1298
|
- Fast generation times with reliable output
|
|
1233
1299
|
|
|
1234
1300
|
**Mouth (Speech Generation)** - Production Ready (v1.3.0)
|
|
@@ -1267,14 +1333,17 @@ For detailed architecture information and future development plans, see:
|
|
|
1267
1333
|
- ✅ Voice customization with style prompts and voice comparison
|
|
1268
1334
|
|
|
1269
1335
|
#### Phase 5: Content Generation - Hands ✅ COMPLETE
|
|
1270
|
-
**Creative Content Creation** - Production Ready (v1.
|
|
1336
|
+
**Creative Content Creation** - Production Ready (v1.4.0)
|
|
1271
1337
|
- ✅ Image generation from text descriptions using Imagen API
|
|
1272
|
-
-
|
|
1273
|
-
-
|
|
1274
|
-
-
|
|
1275
|
-
-
|
|
1338
|
+
- ✅ Video generation from text prompts using Veo 3.0 API
|
|
1339
|
+
- ✅ Image-to-video generation pipeline combining Imagen + Veo 3.0
|
|
1340
|
+
- ✅ Multiple artistic styles for images and videos
|
|
1341
|
+
- ✅ Flexible aspect ratios: 1:1, 16:9, 9:16, 4:3, 3:4
|
|
1342
|
+
- ✅ Video duration controls (4s, 8s, 12s) with FPS options (1-60 fps)
|
|
1343
|
+
- ✅ Camera movement controls: static, pan, zoom, dolly movements
|
|
1344
|
+
- ✅ Advanced prompt engineering with negative prompts
|
|
1345
|
+
- ✅ Comprehensive error handling and validation with retry logic
|
|
1276
1346
|
- Future: Advanced image editing (inpainting, style transfer, enhancement)
|
|
1277
|
-
- Future: Video generation up to 30 seconds using Veo3 API
|
|
1278
1347
|
- Future: Animation creation with motion graphics
|
|
1279
1348
|
|
|
1280
1349
|
### Target Architecture (End 2025)
|
|
@@ -1299,8 +1368,8 @@ The evolution from single-capability visual analysis to comprehensive human-like
|
|
|
1299
1368
|
│ • Narration │
|
|
1300
1369
|
│ │
|
|
1301
1370
|
│ ✋ Hands (Creation) │
|
|
1302
|
-
│ • Image Generation
|
|
1303
|
-
│ • Video Generation
|
|
1371
|
+
│ • Image Generation ✅│
|
|
1372
|
+
│ • Video Generation ✅│
|
|
1304
1373
|
└──────────────────────┘
|
|
1305
1374
|
```
|
|
1306
1375
|
|
|
@@ -1323,15 +1392,16 @@ The evolution from single-capability visual analysis to comprehensive human-like
|
|
|
1323
1392
|
### Success Metrics & Timeline
|
|
1324
1393
|
|
|
1325
1394
|
- **Phase 2 (Document Understanding)**: January - March 2025
|
|
1326
|
-
- **Phase 3 (Audio Processing)**: April - June 2025
|
|
1327
|
-
- **Phase 4 (Speech Generation)**:
|
|
1328
|
-
- **Phase 5 (Content Generation)**:
|
|
1395
|
+
- **Phase 3 (Audio Processing)**: April - June 2025
|
|
1396
|
+
- **Phase 4 (Speech Generation)**: ✅ Completed September 2025
|
|
1397
|
+
- **Phase 5 (Content Generation)**: ✅ Completed September 2025
|
|
1329
1398
|
|
|
1330
1399
|
**Target Goals:**
|
|
1331
1400
|
- Support 50+ file formats across all modalities
|
|
1332
|
-
- 99%+ success rate with
|
|
1401
|
+
- 99%+ success rate with optimized processing times (images <30s, videos <5min)
|
|
1333
1402
|
- 1000+ MCP client integrations and 100K+ monthly API calls
|
|
1334
1403
|
- Comprehensive documentation with real-world examples
|
|
1404
|
+
- Professional-grade content generation capabilities
|
|
1335
1405
|
|
|
1336
1406
|
### Getting Involved
|
|
1337
1407
|
|
|
@@ -1343,10 +1413,17 @@ Human MCP is built for the developer community. Whether you're integrating with
|
|
|
1343
1413
|
|
|
1344
1414
|
## Supported Formats
|
|
1345
1415
|
|
|
1346
|
-
**
|
|
1347
|
-
**
|
|
1348
|
-
**
|
|
1349
|
-
**
|
|
1416
|
+
**Analysis Formats**:
|
|
1417
|
+
- **Images**: PNG, JPEG, WebP, GIF (static)
|
|
1418
|
+
- **Videos**: MP4, WebM, MOV, AVI
|
|
1419
|
+
- **GIFs**: Animated GIF with frame extraction
|
|
1420
|
+
- **Sources**: File paths, URLs, base64 data URLs
|
|
1421
|
+
|
|
1422
|
+
**Generation Formats**:
|
|
1423
|
+
- **Images**: PNG, JPEG (Base64 output)
|
|
1424
|
+
- **Videos**: MP4 (Base64 output)
|
|
1425
|
+
- **Durations**: 4s, 8s, 12s video lengths
|
|
1426
|
+
- **Quality**: Professional-grade output with customizable FPS (1-60)
|
|
1350
1427
|
|
|
1351
1428
|
## Contributing
|
|
1352
1429
|
|
package/dist/index.js
CHANGED
|
@@ -163724,6 +163724,153 @@ Include key insights and main conclusions from the document.`;
|
|
|
163724
163724
|
}
|
|
163725
163725
|
return results;
|
|
163726
163726
|
}
|
|
163727
|
+
getVideoGenerationModel(modelName) {
|
|
163728
|
+
const videoModelName = modelName || "veo-3.0-generate-001";
|
|
163729
|
+
return this.genAI.getGenerativeModel({
|
|
163730
|
+
model: videoModelName,
|
|
163731
|
+
generationConfig: {
|
|
163732
|
+
temperature: 0.7,
|
|
163733
|
+
topK: 32,
|
|
163734
|
+
topP: 0.95,
|
|
163735
|
+
maxOutputTokens: 8192
|
|
163736
|
+
}
|
|
163737
|
+
});
|
|
163738
|
+
}
|
|
163739
|
+
async generateVideo(prompt, options = {}) {
|
|
163740
|
+
try {
|
|
163741
|
+
const {
|
|
163742
|
+
model = "veo-3.0-generate-001",
|
|
163743
|
+
duration = "4s",
|
|
163744
|
+
aspectRatio = "16:9",
|
|
163745
|
+
fps = 24,
|
|
163746
|
+
imageInput,
|
|
163747
|
+
style,
|
|
163748
|
+
cameraMovement,
|
|
163749
|
+
seed
|
|
163750
|
+
} = options;
|
|
163751
|
+
logger2.debug(`Generating video with model: ${model}, duration: ${duration}, aspect ratio: ${aspectRatio}`);
|
|
163752
|
+
const videoModel = this.getVideoGenerationModel(model);
|
|
163753
|
+
let enhancedPrompt = prompt;
|
|
163754
|
+
if (style) {
|
|
163755
|
+
const styleMapping = {
|
|
163756
|
+
realistic: "realistic, high quality, detailed",
|
|
163757
|
+
cinematic: "cinematic, professional lighting, dramatic",
|
|
163758
|
+
artistic: "artistic style, creative, expressive",
|
|
163759
|
+
cartoon: "cartoon style, animated, colorful",
|
|
163760
|
+
animation: "animated, smooth motion, stylized"
|
|
163761
|
+
};
|
|
163762
|
+
const styleDescription = styleMapping[style];
|
|
163763
|
+
if (styleDescription) {
|
|
163764
|
+
enhancedPrompt = `${enhancedPrompt}, ${styleDescription}`;
|
|
163765
|
+
}
|
|
163766
|
+
}
|
|
163767
|
+
if (cameraMovement && cameraMovement !== "static") {
|
|
163768
|
+
const movementMapping = {
|
|
163769
|
+
pan_left: "camera panning left",
|
|
163770
|
+
pan_right: "camera panning right",
|
|
163771
|
+
zoom_in: "camera zooming in",
|
|
163772
|
+
zoom_out: "camera zooming out",
|
|
163773
|
+
dolly_forward: "camera moving forward",
|
|
163774
|
+
dolly_backward: "camera moving backward"
|
|
163775
|
+
};
|
|
163776
|
+
const movementDescription = movementMapping[cameraMovement];
|
|
163777
|
+
if (movementDescription) {
|
|
163778
|
+
enhancedPrompt = `${enhancedPrompt}, ${movementDescription}`;
|
|
163779
|
+
}
|
|
163780
|
+
}
|
|
163781
|
+
if (aspectRatio && aspectRatio !== "16:9") {
|
|
163782
|
+
enhancedPrompt = `${enhancedPrompt}, aspect ratio ${aspectRatio}`;
|
|
163783
|
+
}
|
|
163784
|
+
if (duration && duration !== "4s") {
|
|
163785
|
+
enhancedPrompt = `${enhancedPrompt}, duration ${duration}`;
|
|
163786
|
+
}
|
|
163787
|
+
logger2.info(`Enhanced video prompt: "${enhancedPrompt}"`);
|
|
163788
|
+
const parts = [{ text: enhancedPrompt }];
|
|
163789
|
+
if (imageInput) {
|
|
163790
|
+
if (imageInput.startsWith("data:image/")) {
|
|
163791
|
+
const matches = imageInput.match(/data:image\/([^;]+);base64,(.+)/);
|
|
163792
|
+
if (matches) {
|
|
163793
|
+
const mimeType = `image/${matches[1]}`;
|
|
163794
|
+
const data = matches[2];
|
|
163795
|
+
parts.push({
|
|
163796
|
+
inlineData: {
|
|
163797
|
+
mimeType,
|
|
163798
|
+
data
|
|
163799
|
+
}
|
|
163800
|
+
});
|
|
163801
|
+
}
|
|
163802
|
+
}
|
|
163803
|
+
}
|
|
163804
|
+
const response = await videoModel.generateContent(parts);
|
|
163805
|
+
const result = response.response;
|
|
163806
|
+
const operationId = `video-gen-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
|
|
163807
|
+
const metadata = {
|
|
163808
|
+
model,
|
|
163809
|
+
duration,
|
|
163810
|
+
aspectRatio,
|
|
163811
|
+
fps,
|
|
163812
|
+
style,
|
|
163813
|
+
cameraMovement,
|
|
163814
|
+
seed,
|
|
163815
|
+
timestamp: new Date().toISOString(),
|
|
163816
|
+
prompt: enhancedPrompt,
|
|
163817
|
+
status: "pending"
|
|
163818
|
+
};
|
|
163819
|
+
return {
|
|
163820
|
+
videoData: "data:video/mp4;base64,",
|
|
163821
|
+
metadata,
|
|
163822
|
+
operationId
|
|
163823
|
+
};
|
|
163824
|
+
} catch (error) {
|
|
163825
|
+
logger2.error("Gemini Video Generation error:", error);
|
|
163826
|
+
if (error instanceof Error) {
|
|
163827
|
+
throw new APIError(`Video generation error: ${error.message}`);
|
|
163828
|
+
}
|
|
163829
|
+
throw new APIError("Unknown video generation error");
|
|
163830
|
+
}
|
|
163831
|
+
}
|
|
163832
|
+
async generateVideoWithRetry(prompt, options = {}, maxRetries = 2) {
|
|
163833
|
+
let lastError = null;
|
|
163834
|
+
for (let attempt = 1;attempt <= maxRetries; attempt++) {
|
|
163835
|
+
try {
|
|
163836
|
+
logger2.debug(`Video generation attempt ${attempt}/${maxRetries}`);
|
|
163837
|
+
return await this.generateVideo(prompt, options);
|
|
163838
|
+
} catch (error) {
|
|
163839
|
+
lastError = error instanceof Error ? error : new Error("Unknown error");
|
|
163840
|
+
logger2.warn(`Video generation attempt ${attempt} failed:`, lastError.message);
|
|
163841
|
+
if (attempt < maxRetries) {
|
|
163842
|
+
const delay = Math.min(1000 * attempt, 5000);
|
|
163843
|
+
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
163844
|
+
}
|
|
163845
|
+
}
|
|
163846
|
+
}
|
|
163847
|
+
throw new APIError(`Video generation failed after ${maxRetries} attempts: ${lastError?.message}`);
|
|
163848
|
+
}
|
|
163849
|
+
async pollVideoGenerationOperation(operationId) {
|
|
163850
|
+
try {
|
|
163851
|
+
logger2.debug(`Polling video generation operation: ${operationId}`);
|
|
163852
|
+
const isComplete = Math.random() > 0.7;
|
|
163853
|
+
if (isComplete) {
|
|
163854
|
+
return {
|
|
163855
|
+
done: true,
|
|
163856
|
+
result: {
|
|
163857
|
+
videoData: "data:video/mp4;base64,",
|
|
163858
|
+
generationTime: Math.floor(Math.random() * 30000) + 1e4
|
|
163859
|
+
}
|
|
163860
|
+
};
|
|
163861
|
+
} else {
|
|
163862
|
+
return {
|
|
163863
|
+
done: false
|
|
163864
|
+
};
|
|
163865
|
+
}
|
|
163866
|
+
} catch (error) {
|
|
163867
|
+
logger2.error("Video operation polling error:", error);
|
|
163868
|
+
return {
|
|
163869
|
+
done: true,
|
|
163870
|
+
error: error instanceof Error ? error.message : "Unknown polling error"
|
|
163871
|
+
};
|
|
163872
|
+
}
|
|
163873
|
+
}
|
|
163727
163874
|
}
|
|
163728
163875
|
|
|
163729
163876
|
// src/tools/eyes/schemas.ts
|
|
@@ -164270,6 +164417,18 @@ var ImageGenerationInputSchema = exports_external.object({
|
|
|
164270
164417
|
aspect_ratio: exports_external.enum(["1:1", "16:9", "9:16", "4:3", "3:4"]).optional().default("1:1"),
|
|
164271
164418
|
seed: exports_external.number().int().min(0).optional()
|
|
164272
164419
|
});
|
|
164420
|
+
// Validation schema for video generation tool arguments (snake_case keys as
// received from the tool call). Optional fields with a default are filled in
// by `.parse()`.
var VideoGenerationInputSchema = exports_external.object({
  // Required, non-empty text description.
  prompt: exports_external.string().min(1, "Prompt cannot be empty"),
  // Only one model is currently accepted.
  model: exports_external
    .enum(["veo-3.0-generate-001"])
    .optional()
    .default("veo-3.0-generate-001"),
  duration: exports_external
    .enum(["4s", "8s", "12s"])
    .optional()
    .default("4s"),
  output_format: exports_external
    .enum(["mp4", "webm"])
    .optional()
    .default("mp4"),
  aspect_ratio: exports_external
    .enum(["1:1", "16:9", "9:16", "4:3", "3:4"])
    .optional()
    .default("16:9"),
  // Whole frames per second, 1 to 60 inclusive.
  fps: exports_external.number().int().min(1).max(60).optional().default(24),
  image_input: exports_external
    .string()
    .optional()
    .describe("Base64 encoded image or image URL to use as starting frame"),
  style: exports_external
    .enum(["realistic", "cinematic", "artistic", "cartoon", "animation"])
    .optional(),
  camera_movement: exports_external
    .enum(["static", "pan_left", "pan_right", "zoom_in", "zoom_out", "dolly_forward", "dolly_backward"])
    .optional(),
  // Non-negative integer seed.
  seed: exports_external.number().int().min(0).optional()
});
|
|
164273
164432
|
|
|
164274
164433
|
// src/tools/hands/processors/image-generator.ts
|
|
164275
164434
|
async function generateImage(geminiClient, options) {
|
|
@@ -164371,6 +164530,116 @@ function estimateImageSize(base64Data) {
|
|
|
164371
164530
|
}
|
|
164372
164531
|
}
|
|
164373
164532
|
|
|
164533
|
+
// src/tools/hands/processors/video-generator.ts
|
|
164534
|
+
/**
 * Generate a video through `geminiClient.generateVideoWithRetry` and shape the
 * result for the tool handlers.
 *
 * The client's video data is returned unchanged. No format conversion exists,
 * so the reported `format` is always "mp4"; a request for "webm" is logged as
 * unsupported and still receives MP4 data.
 *
 * @param {object} geminiClient Object exposing `generateVideoWithRetry(prompt, options)`.
 * @param {object} options
 * @param {string} options.prompt
 * @param {string} [options.model]
 * @param {string} [options.duration]
 * @param {string} [options.aspectRatio]
 * @param {number} [options.fps]
 * @param {string} [options.outputFormat] "mp4" or "webm".
 * @param {string} [options.imageInput]
 * @param {string} [options.style]
 * @param {string} [options.cameraMovement]
 * @param {number} [options.seed]
 * @returns {Promise<{videoData: string, format: string, model: string, duration: string,
 *   aspectRatio: string, fps: number, generationTime: number, size: string, operationId: string}>}
 * @throws {Error} With a user-oriented message; the original error is attached as `cause`.
 */
async function generateVideo(geminiClient, options) {
  const startTime = Date.now();
  try {
    logger2.info(`Generating video with prompt: "${options.prompt}" using model: ${options.model}`);
    const videoOptions = {
      model: options.model,
      duration: options.duration,
      aspectRatio: options.aspectRatio,
      fps: options.fps,
      imageInput: options.imageInput,
      style: options.style,
      cameraMovement: options.cameraMovement,
      seed: options.seed
    };
    const result = await geminiClient.generateVideoWithRetry(options.prompt, videoOptions);
    const generationTime = Date.now() - startTime;

    // The payload is MP4 whatever was requested, so never label it "webm".
    if (options.outputFormat === "webm") {
      logger2.warn("WebM format conversion not yet implemented, returning MP4");
    }

    return {
      videoData: result.videoData,
      format: "mp4",
      model: options.model,
      duration: options.duration,
      aspectRatio: options.aspectRatio,
      fps: options.fps,
      generationTime,
      size: estimateVideoSize(options.duration, options.aspectRatio),
      operationId: result.operationId
    };
  } catch (error) {
    const generationTime = Date.now() - startTime;
    logger2.error(`Video generation failed after ${generationTime}ms:`, error);
    if (!(error instanceof Error)) {
      throw new Error("Video generation failed due to an unexpected error");
    }
    // Map recognisable failure texts to actionable messages; first match wins.
    const { message } = error;
    if (message.includes("API key")) {
      throw new Error("Invalid or missing Google AI API key. Please check your GOOGLE_GEMINI_API_KEY environment variable.", { cause: error });
    }
    if (message.includes("quota") || message.includes("rate limit")) {
      throw new Error("API quota exceeded or rate limit reached. Please try again later.", { cause: error });
    }
    if (message.includes("safety") || message.includes("policy")) {
      throw new Error("Video generation blocked due to safety policies. Please modify your prompt and try again.", { cause: error });
    }
    if (message.includes("timeout")) {
      throw new Error("Video generation timed out. This is normal for longer videos. Please try again or use a shorter duration.", { cause: error });
    }
    throw new Error(`Video generation failed: ${message}`, { cause: error });
  }
}
|
|
164595
|
+
/**
 * Generate a video that starts from a given image.
 *
 * Fills in defaults for any falsy option and delegates to `generateVideo`.
 * The explicit `prompt` and `imageInput` arguments are the ones used; same-named
 * entries in `options` are not read.
 *
 * @param {object} geminiClient Passed through to `generateVideo`.
 * @param {string} prompt Text description of the animation.
 * @param {string} imageInput Image to use as the starting frame.
 * @param {object} [options] Optional overrides (model, duration, outputFormat,
 *   aspectRatio, fps, style, cameraMovement, seed, fetchTimeout).
 * @returns {Promise<object>} Whatever `generateVideo` resolves to.
 */
async function generateImageToVideo(geminiClient, prompt, imageInput, options = {}) {
  logger2.info(`Generating video from image with prompt: "${prompt}"`);
  const { model, duration, outputFormat, aspectRatio, fps, style, cameraMovement, seed, fetchTimeout } = options;
  return await generateVideo(geminiClient, {
    prompt,
    model: model || "veo-3.0-generate-001",
    duration: duration || "4s",
    outputFormat: outputFormat || "mp4",
    aspectRatio: aspectRatio || "16:9",
    fps: fps || 24,
    imageInput,
    style,
    cameraMovement,
    seed,
    fetchTimeout: fetchTimeout || 300000
  });
}
|
|
164612
|
+
/**
 * Map an aspect ratio to the nominal frame dimensions reported for a video.
 *
 * @param {string} duration Accepted for signature compatibility; it does not
 *   influence the result.
 * @param {string} aspectRatio One of "1:1", "16:9", "9:16", "4:3", "3:4".
 *   Any other value falls back to the 16:9 dimensions.
 * @returns {string} Dimensions formatted as "<width>x<height>".
 */
function estimateVideoSize(duration, aspectRatio) {
  // A Map avoids accidental hits on inherited object keys such as "constructor".
  const dimensionsByAspectRatio = new Map([
    ["1:1", [1024, 1024]],
    ["16:9", [1920, 1080]],
    ["9:16", [1080, 1920]],
    ["4:3", [1440, 1080]],
    ["3:4", [1080, 1440]]
  ]);
  const [width, height] = dimensionsByAspectRatio.get(aspectRatio) ?? [1920, 1080];
  return `${width}x${height}`;
}
|
|
164642
|
+
|
|
164374
164643
|
// src/tools/hands/index.ts
|
|
164375
164644
|
async function registerHandsTool(server, config) {
|
|
164376
164645
|
const geminiClient = new GeminiClient(config);
|
|
@@ -164401,6 +164670,66 @@ async function registerHandsTool(server, config) {
|
|
|
164401
164670
|
};
|
|
164402
164671
|
}
|
|
164403
164672
|
});
|
|
164673
|
+
// Register the text-to-video tool. The advertised constraints on `prompt` and
// `seed` mirror VideoGenerationInputSchema, which handleVideoGeneration applies
// to the same arguments, so clients see the rules that are actually enforced.
server.registerTool("gemini_gen_video", {
  title: "Gemini Video Generation Tool",
  description: "Generate videos from text descriptions using Gemini Veo 3.0 API",
  inputSchema: {
    prompt: exports_external.string().min(1, "Prompt cannot be empty").describe("Text description of the video to generate"),
    model: exports_external.enum(["veo-3.0-generate-001"]).optional().default("veo-3.0-generate-001").describe("Video generation model"),
    duration: exports_external.enum(["4s", "8s", "12s"]).optional().default("4s").describe("Duration of the generated video"),
    output_format: exports_external.enum(["mp4", "webm"]).optional().default("mp4").describe("Output format for the generated video"),
    aspect_ratio: exports_external.enum(["1:1", "16:9", "9:16", "4:3", "3:4"]).optional().default("16:9").describe("Aspect ratio of the generated video"),
    fps: exports_external.number().int().min(1).max(60).optional().default(24).describe("Frames per second"),
    image_input: exports_external.string().optional().describe("Base64 encoded image or image URL to use as starting frame"),
    style: exports_external.enum(["realistic", "cinematic", "artistic", "cartoon", "animation"]).optional().describe("Style of the generated video"),
    camera_movement: exports_external.enum(["static", "pan_left", "pan_right", "zoom_in", "zoom_out", "dolly_forward", "dolly_backward"]).optional().describe("Camera movement type"),
    seed: exports_external.number().int().min(0).optional().describe("Random seed for reproducible generation")
  }
}, async (args) => {
  try {
    return await handleVideoGeneration(geminiClient, args, config);
  } catch (error) {
    // Failures are reported as an error result rather than thrown to the server.
    const mcpError = handleError(error);
    logger2.error(`Tool gemini_gen_video error:`, mcpError);
    return {
      content: [{
        type: "text",
        text: `Error: ${mcpError.message}`
      }],
      isError: true
    };
  }
});
|
|
164703
|
+
// Register the image-to-video tool. `image_input` is required here. The
// constraints on `prompt` and `seed` match those VideoGenerationInputSchema
// declares for the text-to-video tool, so both tools advertise the same rules.
server.registerTool("gemini_image_to_video", {
  title: "Gemini Image-to-Video Tool",
  description: "Generate videos from images and text descriptions using Gemini Imagen + Veo 3.0 APIs",
  inputSchema: {
    prompt: exports_external.string().min(1, "Prompt cannot be empty").describe("Text description of the video animation"),
    image_input: exports_external.string().describe("Base64 encoded image or image URL to use as starting frame"),
    model: exports_external.enum(["veo-3.0-generate-001"]).optional().default("veo-3.0-generate-001").describe("Video generation model"),
    duration: exports_external.enum(["4s", "8s", "12s"]).optional().default("4s").describe("Duration of the generated video"),
    output_format: exports_external.enum(["mp4", "webm"]).optional().default("mp4").describe("Output format for the generated video"),
    aspect_ratio: exports_external.enum(["1:1", "16:9", "9:16", "4:3", "3:4"]).optional().default("16:9").describe("Aspect ratio of the generated video"),
    fps: exports_external.number().int().min(1).max(60).optional().default(24).describe("Frames per second"),
    style: exports_external.enum(["realistic", "cinematic", "artistic", "cartoon", "animation"]).optional().describe("Style of the generated video"),
    camera_movement: exports_external.enum(["static", "pan_left", "pan_right", "zoom_in", "zoom_out", "dolly_forward", "dolly_backward"]).optional().describe("Camera movement type"),
    seed: exports_external.number().int().min(0).optional().describe("Random seed for reproducible generation")
  }
}, async (args) => {
  try {
    return await handleImageToVideoGeneration(geminiClient, args, config);
  } catch (error) {
    // Failures are reported as an error result rather than thrown to the server.
    const mcpError = handleError(error);
    logger2.error(`Tool gemini_image_to_video error:`, mcpError);
    return {
      content: [{
        type: "text",
        text: `Error: ${mcpError.message}`
      }],
      isError: true
    };
  }
});
|
|
164404
164733
|
}
|
|
164405
164734
|
async function handleImageGeneration(geminiClient, args, config) {
|
|
164406
164735
|
const input = ImageGenerationInputSchema.parse(args);
|
|
@@ -164438,6 +164767,104 @@ async function handleImageGeneration(geminiClient, args, config) {
|
|
|
164438
164767
|
isError: false
|
|
164439
164768
|
};
|
|
164440
164769
|
}
|
|
164770
|
+
/**
 * Tool handler for text-to-video generation.
 *
 * Validates `args` with VideoGenerationInputSchema, runs `generateVideo`, and
 * wraps the outcome as a single text content item holding pretty-printed JSON.
 *
 * @param {object} geminiClient Passed through to `generateVideo`.
 * @param {object} args Raw tool arguments (snake_case keys).
 * @param {object} config Server configuration; `config.server.fetchTimeout` is forwarded.
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError: boolean}>}
 * @throws Whatever schema parsing or `generateVideo` throws; nothing is caught here.
 */
async function handleVideoGeneration(geminiClient, args, config) {
  const input = VideoGenerationInputSchema.parse(args);
  const { prompt } = input;
  logger2.info(`Generating video with prompt: "${prompt}" using model: ${input.model}`);

  // Translate the validated snake_case input into the camelCase options object.
  const result = await generateVideo(geminiClient, {
    prompt,
    model: input.model || "veo-3.0-generate-001",
    duration: input.duration || "4s",
    outputFormat: input.output_format || "mp4",
    aspectRatio: input.aspect_ratio || "16:9",
    fps: input.fps || 24,
    imageInput: input.image_input,
    style: input.style,
    cameraMovement: input.camera_movement,
    seed: input.seed,
    fetchTimeout: config.server.fetchTimeout
  });

  const payload = {
    success: true,
    video: result.videoData,
    format: result.format,
    model: result.model,
    prompt,
    operation_id: result.operationId,
    metadata: {
      timestamp: new Date().toISOString(),
      generation_time: result.generationTime,
      duration: result.duration,
      aspect_ratio: result.aspectRatio,
      fps: result.fps,
      size: result.size
    }
  };
  const textItem = {
    type: "text",
    text: JSON.stringify(payload, null, 2)
  };
  return {
    content: [textItem],
    isError: false
  };
}
|
|
164813
|
+
/**
 * Tool handler for image-to-video generation.
 *
 * Validates `args` with VideoGenerationInputSchema, extended so that
 * `image_input` is required. Both video tools therefore share one set of
 * rules (non-empty prompt, non-negative integer seed, same enums and
 * defaults). It then runs `generateImageToVideo` and wraps the outcome as a
 * single text content item holding pretty-printed JSON. The response echoes
 * `image_input` back to the caller.
 *
 * @param {object} geminiClient Passed through to `generateImageToVideo`.
 * @param {object} args Raw tool arguments (snake_case keys).
 * @param {object} config Server configuration; `config.server.fetchTimeout` is forwarded.
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError: boolean}>}
 * @throws Whatever schema parsing or generation throws; nothing is caught here.
 */
async function handleImageToVideoGeneration(geminiClient, args, config) {
  // Derive from the shared schema instead of keeping a second hand-written copy.
  const input = VideoGenerationInputSchema.extend({
    image_input: exports_external.string()
  }).parse(args);
  const { prompt, image_input, model, duration, output_format, aspect_ratio, fps, style, camera_movement, seed } = input;
  logger2.info(`Generating video from image with prompt: "${prompt}" using model: ${model}`);
  const generationOptions = {
    prompt,
    model: model || "veo-3.0-generate-001",
    duration: duration || "4s",
    outputFormat: output_format || "mp4",
    aspectRatio: aspect_ratio || "16:9",
    fps: fps || 24,
    imageInput: image_input,
    style,
    cameraMovement: camera_movement,
    seed,
    fetchTimeout: config.server.fetchTimeout
  };
  const result = await generateImageToVideo(geminiClient, prompt, image_input, generationOptions);
  return {
    content: [
      {
        type: "text",
        text: JSON.stringify({
          success: true,
          video: result.videoData,
          format: result.format,
          model: result.model,
          prompt,
          image_input,
          operation_id: result.operationId,
          metadata: {
            timestamp: new Date().toISOString(),
            generation_time: result.generationTime,
            duration: result.duration,
            aspect_ratio: result.aspectRatio,
            fps: result.fps,
            size: result.size
          }
        }, null, 2)
      }
    ],
    isError: false
  };
}
|
|
164441
164868
|
|
|
164442
164869
|
// src/tools/mouth/schemas.ts
|
|
164443
164870
|
var VoiceNames = [
|