@mixio-pro/kalaasetu-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +127 -0
- package/package.json +57 -0
- package/src/index.ts +50 -0
- package/src/tools/gemini.ts +395 -0
- package/src/tools/hunyuan-avatar.ts +102 -0
- package/src/tools/image-to-video.ts +231 -0
- package/src/tools/infinitalk.ts +96 -0
- package/src/tools/perplexity.ts +190 -0
- package/src/tools/youtube.ts +52 -0
- package/src/utils/fal.utils.ts +45 -0
- package/src/utils/index.ts +1 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Kalaasetu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,127 @@
# kalaasetu-mcp

Kalaasetu MCP Server - A powerful Model Context Protocol server providing various AI tools for content generation and analysis.

## Features

### 🎨 Gemini Tools
- **Text to Image**: Generate images from text prompts
- **Image Editing**: Edit existing images with AI
- **Image Analysis**: Analyze and describe images
- **Text-to-Speech**: Generate natural speech from text
- **Video Analysis**: Analyze video content

### 📹 Video Tools
- **Image-to-Video** (Vertex AI Veo): Generate videos from images using Google's Veo models
- **Infinitalk** (FAL AI): Create talking avatar videos with lip-sync from image and audio

### 🎬 YouTube Tools
- **YouTube Analyzer**: Analyze YouTube videos and extract insights

### 🔍 Perplexity Search Tools
- **Image Search**: Search for images using Perplexity AI with domain and format filters
- **Video Search**: Search for videos using Perplexity AI with domain filters

## Installation

To install dependencies:

```bash
bun install
```

## Configuration

Set up the required API keys as environment variables:

```bash
# For Gemini and Vertex AI tools
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your/credentials.json"
export GEMINI_API_KEY="your-gemini-api-key"

# For FAL AI Infinitalk
export FAL_KEY="your-fal-api-key"

# For Perplexity Search Tools
export PERPLEXITY_API_KEY="your-perplexity-api-key"
```

## Running the Server

To run:

```bash
bun run src/index.ts
```

## Tool: Infinitalk

Generate talking avatar videos from images and audio using FAL AI's Infinitalk model.

### Parameters

- `image_url` (required): URL of the input image
- `audio_url` (required): URL of the audio file for lip-sync
- `prompt` (required): Text description guiding video generation
- `num_frames` (optional): Number of frames (41-721, default: 145)
- `resolution` (optional): Video resolution - "480p" or "720p" (default: "480p")
- `seed` (optional): Random seed for reproducibility (default: 42)
- `acceleration` (optional): Generation speed - "none", "regular", or "high" (default: "regular")
- `fal_key` (optional): FAL API key (uses FAL_KEY env var if not provided)

### Example Usage

```json
{
  "image_url": "https://example.com/portrait.png",
  "audio_url": "https://example.com/speech.mp3",
  "prompt": "A person speaking professionally in a podcast setting",
  "resolution": "720p",
  "num_frames": 200
}
```

## Tool: Perplexity Images

Search for images using Perplexity AI with advanced filtering options.

### Parameters

- `query` (required): The search query for images
- `image_domain_filter` (optional): Array of domains to include or exclude (prefix with '-' to exclude)
  - Example: `["wikimedia.org", "-gettyimages.com"]`
- `image_format_filter` (optional): Array of allowed image formats
  - Example: `["jpg", "png", "gif"]`

### Example Usage

```json
{
  "query": "mountain landscapes",
  "image_domain_filter": ["wikimedia.org", "-gettyimages.com"],
  "image_format_filter": ["jpg", "png"]
}
```

## Tool: Perplexity Videos

Search for videos using Perplexity AI with domain filtering.

### Parameters

- `query` (required): The search query for videos
- `search_domain_filter` (optional): Array of domains to limit search (use '-' prefix to exclude)
  - Example: `["youtube.com"]` or `["-tiktok.com"]`

### Example Usage

```json
{
  "query": "yoga for beginners",
  "search_domain_filter": ["youtube.com"]
}
```

## Project Info

This project was created using `bun init` in bun v1.3.1. [Bun](https://bun.com) is a fast all-in-one JavaScript runtime.
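The README above only covers running the server directly with Bun. For readers wiring it into an MCP client over stdio, a registration along the following lines should work; this is a sketch, not something the package documents. The `mcpServers` key and file location follow the convention used by several desktop MCP clients (an assumption), the path is a placeholder, and the env variable names are the ones listed in the Configuration section.

```json
{
  "mcpServers": {
    "kalaasetu": {
      "command": "bun",
      "args": ["run", "/path/to/kalaasetu-mcp/src/index.ts"],
      "env": {
        "GEMINI_API_KEY": "your-gemini-api-key",
        "FAL_KEY": "your-fal-api-key",
        "PERPLEXITY_API_KEY": "your-perplexity-api-key"
      }
    }
  }
}
```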
package/package.json
ADDED
@@ -0,0 +1,57 @@
{
  "name": "@mixio-pro/kalaasetu-mcp",
  "version": "1.0.0",
  "description": "A powerful Model Context Protocol server providing AI tools for content generation and analysis",
  "type": "module",
  "module": "src/index.ts",
  "main": "src/index.ts",
  "bin": {
    "kalaasetu-mcp": "./src/index.ts"
  },
  "files": [
    "src",
    "README.md",
    "LICENSE"
  ],
  "scripts": {
    "start": "bun run src/index.ts",
    "dev": "bun --watch src/index.ts"
  },
  "keywords": [
    "mcp",
    "model-context-protocol",
    "ai",
    "gemini",
    "perplexity",
    "video-generation",
    "image-generation",
    "tts",
    "vertex-ai",
    "fal-ai"
  ],
  "author": "Kalaasetu",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "https://github.com/mixiopro/kalaasetu-mcp.git"
  },
  "bugs": {
    "url": "https://github.com/mixiopro/kalaasetu-mcp/issues"
  },
  "homepage": "https://github.com/mixiopro/kalaasetu-mcp#readme",
  "devDependencies": {
    "@types/bun": "latest"
  },
  "peerDependencies": {
    "typescript": "^5"
  },
  "dependencies": {
    "@fal-ai/client": "^1.7.2",
    "@google/genai": "^1.28.0",
    "@types/wav": "^1.0.4",
    "fastmcp": "^3.22.0",
    "google-auth-library": "^10.5.0",
    "wav": "^1.0.2",
    "zod": "^4.1.12"
  }
}
package/src/index.ts
ADDED
@@ -0,0 +1,50 @@
#!/usr/bin/env bun
import { FastMCP } from "fastmcp";
import {
  geminiTextToImage,
  geminiEditImage,
  geminiAnalyzeImages,
  geminiSingleSpeakerTts,
  geminiAnalyzeVideos
} from "./tools/gemini";
import { analyzeYoutubeVideo } from "./tools/youtube";
import { imageToVideo } from "./tools/image-to-video";
import { infinitalk } from "./tools/infinitalk";
import { hunyuanAvatar } from "./tools/hunyuan-avatar";
import { perplexityImages, perplexityVideos } from "./tools/perplexity";

const server = new FastMCP({
  name: "Kalaasetu MCP Server",
  version: "1.0.0",
});

// Gemini Image Tools
server.addTool(geminiTextToImage);
server.addTool(geminiEditImage);
server.addTool(geminiAnalyzeImages);

// Gemini TTS Tool
server.addTool(geminiSingleSpeakerTts);

// Gemini Video Analysis Tool
server.addTool(geminiAnalyzeVideos);

// YouTube Analyzer Tool
server.addTool(analyzeYoutubeVideo);

// Vertex AI Image-to-Video Tool
server.addTool(imageToVideo);

// FAL AI Infinitalk Tool
server.addTool(infinitalk);

// FAL AI Hunyuan Avatar Tool
server.addTool(hunyuanAvatar);

// Perplexity Search Tools
server.addTool(perplexityImages);
server.addTool(perplexityVideos);

server.start({
  transportType: "stdio",
});
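Every tool registered above is a plain object with a `name`, a `description`, zod `parameters`, and an async `execute`, as the files below show. A minimal sketch of adding one more tool in the same style; the `echo` tool is hypothetical and purely illustrative, and it assumes the `server` instance from src/index.ts above.

```ts
import { z } from "zod";

// Hypothetical example tool, same shape as the tools in src/tools/.
const echoTool = {
  name: "echo",
  description: "Return the provided text unchanged (illustration only).",
  parameters: z.object({
    text: z.string().describe("Text to echo back"),
  }),
  execute: async (args: { text: string }) => {
    return args.text;
  },
};

// Registered exactly like the package's own tools.
server.addTool(echoTool);
```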
package/src/tools/gemini.ts
ADDED
@@ -0,0 +1,395 @@
import { z } from "zod";
import { GoogleGenAI, createPartFromUri, createUserContent } from "@google/genai";
import * as fs from "fs";
import * as wav from "wav";

const ai = new GoogleGenAI({
  apiKey: process.env.GEMINI_API_KEY || "",
});

function fileToGenerativePart(filePath: string) {
  if (!fs.existsSync(filePath)) {
    throw new Error(`File not found: ${filePath}`);
  }
  const imageBytes = fs.readFileSync(filePath);
  return {
    inlineData: {
      data: Buffer.from(imageBytes).toString("base64"),
      mimeType: "image/jpeg",
    },
  };
}

// Helper function to save WAV file
function saveWaveFile(
  filename: string,
  pcmData: Buffer,
  channels = 1,
  rate = 24000,
  sampleWidth = 2,
): Promise<void> {
  return new Promise((resolve, reject) => {
    const writer = new wav.FileWriter(filename, {
      channels,
      sampleRate: rate,
      bitDepth: sampleWidth * 8,
    });

    writer.on('finish', resolve);
    writer.on('error', reject);

    writer.write(pcmData);
    writer.end();
  });
}

// Helper function to check if URL is YouTube URL
function isYouTubeUrl(url: string): boolean {
  return url.includes('youtube.com/watch') || url.includes('youtu.be');
}

// Helper function to get file size in bytes
function getFileSize(filePath: string): number {
  const stats = fs.statSync(filePath);
  return stats.size;
}

// Helper function to upload file to Gemini API
async function uploadFileToGemini(filePath: string): Promise<any> {
  try {
    const uploadedFile = await ai.files.upload({
      file: filePath,
    });

    // Wait for file processing to complete
    let getFile = await ai.files.get({ name: uploadedFile.name! });
    while (getFile.state === 'PROCESSING') {
      await new Promise(resolve => setTimeout(resolve, 3000));
      getFile = await ai.files.get({ name: uploadedFile.name! });
    }

    if (getFile.state === 'FAILED') {
      throw new Error('File processing failed');
    }

    return getFile;
  } catch (error: any) {
    throw new Error(`File upload failed: ${error.message}`);
  }
}

// Helper function to process video input intelligently
async function processVideoInput(input: string, config?: { fps?: number; startOffset?: string; endOffset?: string }): Promise<any> {
  if (isYouTubeUrl(input)) {
    return {
      fileData: {
        fileUri: input,
        mimeType: 'video/*',
        videoMetadata: config ? {
          fps: config.fps,
          startOffset: config.startOffset,
          endOffset: config.endOffset
        } : undefined
      }
    };
  } else {
    // Local file processing - use File Upload API
    if (!fs.existsSync(input)) {
      throw new Error(`Video file not found: ${input}`);
    }

    // Upload file to Gemini API
    const uploadedFile = await uploadFileToGemini(input);

    return uploadedFile;
  }
}

export const geminiTextToImage = {
  name: "geminiTextToImage",
  description: "Generate images from text prompts using Gemini 2.5 Flash Image model",
  parameters: z.object({
    prompt: z.string().describe("Text description of the image to generate"),
    aspect_ratio: z.string().optional().describe("Aspect ratio: 1:1, 3:4, 4:3, 9:16, or 16:9"),
    output_path: z.string().optional().describe("File path to save the generated image"),
  }),
  execute: async (args: { prompt: string; aspect_ratio?: string; output_path?: string }) => {
    try {
      const response = await ai.models.generateContent({
        model: "gemini-2.5-flash-image",
        contents: args.prompt,
        config: {
          imageConfig: {
            aspectRatio: args.aspect_ratio || "1:1",
          },
        },
      });

      let result = "";
      if (response.candidates && response.candidates[0]?.content?.parts) {
        for (const part of response.candidates[0].content.parts) {
          if (part.text) {
            result += part.text;
          } else if (part.inlineData?.data) {
            const imageData = part.inlineData.data;
            if (args.output_path) {
              fs.writeFileSync(args.output_path, Buffer.from(imageData, "base64"));
              result += `\nImage saved to: ${args.output_path}`;
            } else {
              result += `\nGenerated image (base64): ${imageData.substring(0, 100)}...`;
            }
          }
        }
      }
      return result || "Image generation completed but no response received";
    } catch (error: any) {
      throw new Error(`Image generation failed: ${error.message}`);
    }
  },
};

export const geminiEditImage = {
  name: "geminiEditImage",
  description: "Edit existing images with text instructions using Gemini 2.5 Flash Image Preview",
  parameters: z.object({
    image_path: z.string().describe("Path to the source image file"),
    prompt: z.string().describe("Text instructions for editing the image"),
    output_path: z.string().optional().describe("File path to save the edited image"),
    reference_images: z.array(z.string()).optional().describe("Additional image paths for reference"),
  }),
  execute: async (args: { image_path: string; prompt: string; output_path?: string; reference_images?: string[] }) => {
    try {
      const contents: any[] = [args.prompt, fileToGenerativePart(args.image_path)];

      if (args.reference_images) {
        for (const refPath of args.reference_images) {
          contents.push(fileToGenerativePart(refPath));
        }
      }

      const response = await ai.models.generateContent({
        model: "gemini-2.5-flash-image-preview",
        contents: contents,
      });

      let result = "";
      if (response.candidates && response.candidates[0]?.content?.parts) {
        for (const part of response.candidates[0].content.parts) {
          if (part.text) {
            result += part.text;
          } else if (part.inlineData?.data) {
            const imageData = part.inlineData.data;
            if (args.output_path) {
              fs.writeFileSync(args.output_path, Buffer.from(imageData, "base64"));
              result += `\nEdited image saved to: ${args.output_path}`;
            } else {
              result += `\nEdited image (base64): ${imageData.substring(0, 100)}...`;
            }
          }
        }
      }
      return result || "Image editing completed but no response received";
    } catch (error: any) {
      throw new Error(`Image editing failed: ${error.message}`);
    }
  },
};

export const geminiAnalyzeImages = {
  name: "geminiAnalyzeImages",
  description: "Analyze and describe images using Gemini 2.5 Pro with advanced multimodal understanding",
  parameters: z.object({
    image_paths: z.array(z.string()).describe("Array of image file paths to analyze"),
    prompt: z.string().describe("Text prompt or question about the images"),
  }),
  execute: async (args: { image_paths: string[]; prompt: string }) => {
    try {
      // Handle array parsing
      if (!args.image_paths) {
        throw new Error("Image paths not provided");
      }

      // Convert to array if passed as string
      let imagePaths: string[];
      if (typeof args.image_paths === 'string') {
        const strValue = args.image_paths as string;
        if (strValue.startsWith('[') && strValue.endsWith(']')) {
          try {
            imagePaths = JSON.parse(strValue);
          } catch {
            throw new Error("Invalid image_paths format");
          }
        } else {
          imagePaths = [strValue];
        }
      } else if (Array.isArray(args.image_paths)) {
        imagePaths = args.image_paths;
      } else {
        throw new Error("Invalid image_paths: must be array or string");
      }

      if (imagePaths.length === 0) {
        throw new Error("At least one image path must be provided");
      }

      const contents: any[] = [args.prompt];

      for (const imagePath of imagePaths) {
        contents.push(fileToGenerativePart(imagePath));
      }

      const response = await ai.models.generateContent({
        model: "gemini-2.5-pro",
        contents: contents,
      });

      let result = "";
      if (response.candidates && response.candidates[0]?.content?.parts) {
        for (const part of response.candidates[0].content.parts) {
          if (part.text) {
            result += part.text;
          }
        }
      }
      return result || "Analysis completed but no text response received";
    } catch (error: any) {
      throw new Error(`Image analysis failed: ${error.message}`);
    }
  },
};

export const geminiSingleSpeakerTts = {
  name: "geminiSingleSpeakerTts",
  description: "Generate single speaker voice audio from text using Gemini 2.5 Pro Preview TTS model",
  parameters: z.object({
    text: z.string().describe("Text to convert to speech"),
    voice_name: z.string().describe("Voice name from supported options (e.g., 'Kore', 'Zephyr', 'Puck', etc.)"),
    output_path: z.string().optional().describe("Output WAV file path (optional, defaults to timestamp-based filename)"),
  }),
  execute: async (args: { text: string; voice_name: string; output_path?: string }) => {
    try {
      const response = await ai.models.generateContent({
        model: "gemini-2.5-pro-preview-tts",
        contents: [{ parts: [{ text: args.text }] }],
        config: {
          responseModalities: ['AUDIO'],
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: {
                voiceName: args.voice_name
              },
            },
          },
        },
      });

      const data = response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
      if (!data) {
        throw new Error("No audio data received from Gemini API");
      }

      const audioBuffer = Buffer.from(data, 'base64');

      // Generate output filename if not provided
      const outputPath = args.output_path || `voice_output_${Date.now()}.wav`;

      await saveWaveFile(outputPath, audioBuffer);

      return `Audio generated successfully: ${outputPath}`;
    } catch (error: any) {
      throw new Error(`Voice generation failed: ${error.message}`);
    }
  },
};

export const geminiAnalyzeVideos = {
  name: "geminiAnalyzeVideos",
  description: "Analyze and understand video content using Gemini 2.5 Flash model. Intelligently handles YouTube URLs and local videos (files <20MB processed inline, ≥20MB uploaded via File API). Supports timestamp queries, clipping, and custom frame rates with default 5 FPS for local videos to optimize processing.",
  parameters: z.object({
    video_inputs: z.array(z.string()).describe("Array of video inputs - mix of local file paths and YouTube URLs (max 10 videos). Local files <20MB processed inline, larger files uploaded via File API automatically."),
    prompt: z.string().describe("Text prompt or question about the videos. Use MM:SS format for timestamp references (e.g., 'What happens at 01:30?')."),
    fps: z.number().optional().describe("Frame rate for video processing (default: 5 FPS for local videos to reduce file size, 1 FPS for YouTube URLs)"),
    start_offset: z.string().optional().describe("Clip start time in seconds with 's' suffix (e.g., '40s')"),
    end_offset: z.string().optional().describe("Clip end time in seconds with 's' suffix (e.g., '80s')"),
    media_resolution: z.string().optional().describe("Media resolution: 'default' or 'low' (low resolution uses ~100 tokens/sec vs 300 tokens/sec)"),
  }),
  execute: async (args: { video_inputs: string[]; prompt: string; fps?: number; start_offset?: string; end_offset?: string; media_resolution?: string }) => {
    try {
      // Handle array parsing
      if (!args.video_inputs) {
        throw new Error("Video inputs not provided");
      }

      // Convert to array if passed as string
      let videoInputs: string[];
      if (typeof args.video_inputs === 'string') {
        const strValue = args.video_inputs as string;
        if (strValue.startsWith('[') && strValue.endsWith(']')) {
          try {
            videoInputs = JSON.parse(strValue);
          } catch {
            throw new Error("Invalid video_inputs format");
          }
        } else {
          videoInputs = [strValue];
        }
      } else if (Array.isArray(args.video_inputs)) {
        videoInputs = args.video_inputs;
      } else {
        throw new Error("Invalid video_inputs: must be array or string");
      }

      if (videoInputs.length === 0) {
        throw new Error("At least one video input must be provided");
      }

      if (videoInputs.length > 10) {
        throw new Error("Maximum 10 videos per request allowed for Gemini 2.5+ models");
      }

      // Prepare video parts for content
      const videoParts: any[] = [];

      // Process each video input
      for (const videoInput of videoInputs) {
        const videoConfig = {
          fps: args.fps || (isYouTubeUrl(videoInput) ? 1 : 5), // Default 5 FPS for local, 1 FPS for YouTube
          startOffset: args.start_offset,
          endOffset: args.end_offset
        };

        const videoPart = await processVideoInput(videoInput, videoConfig);
        videoParts.push(videoPart);
      }

      // Build content using createUserContent and createPartFromUri for uploaded files
      const contentParts: any[] = [args.prompt];

      for (const videoPart of videoParts) {
        if (videoPart.uri && videoPart.mimeType) {
          contentParts.push(createPartFromUri(videoPart.uri, videoPart.mimeType));
        }
      }

      const finalContents = createUserContent(contentParts);

      const response = await ai.models.generateContent({
        model: 'gemini-2.5-pro',
        contents: finalContents,
      });

      let result = "";
      if (response.candidates && response.candidates[0]?.content?.parts) {
        for (const part of response.candidates[0].content.parts) {
          if (part.text) {
            result += part.text;
          }
        }
      }

      return result || "Video analysis completed but no text response received";
    } catch (error: any) {
      throw new Error(`Video analysis failed: ${error.message}`);
    }
  },
};
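Since each export above is a plain object, the tools can be exercised outside MCP for quick local checks. A sketch of driving the TTS tool from a Bun script, assuming GEMINI_API_KEY is set and the script lives next to src/; the output path is illustrative.

```ts
import { geminiSingleSpeakerTts } from "./tools/gemini";

// Quick manual test: writes a WAV file and prints the tool's result string.
const message = await geminiSingleSpeakerTts.execute({
  text: "Hello from the Kalaasetu MCP server.",
  voice_name: "Kore",
  output_path: "hello.wav", // illustrative output path
});
console.log(message);
```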
package/src/tools/hunyuan-avatar.ts
ADDED
@@ -0,0 +1,102 @@
import { z } from "zod";
import { callFalModel } from "../utils/fal.utils";

/**
 * Calculate number of frames based on audio duration at 25 FPS
 * Adds 1 second buffer to ensure complete audio coverage
 */
function calculateFramesFromAudioDuration(audioDurationSeconds: number): number {
  const totalDuration = audioDurationSeconds + 1; // Add 1 second buffer
  const frames = Math.round(totalDuration * 25); // 25 FPS

  // Clamp to valid range (129-401 frames)
  return Math.max(129, Math.min(401, frames));
}

/**
 * FAL AI Hunyuan Avatar - High-Fidelity Audio-Driven Human Animation
 */
export const hunyuanAvatar = {
  name: "hunyuan_avatar",
  description: "Generate high-fidelity audio-driven human animation videos using FAL AI Hunyuan Avatar. Creates realistic talking avatar animations from an image and audio file.",
  parameters: z.object({
    image_url: z.string().describe("Public URL of the reference image for the avatar."),
    audio_url: z.string().describe("Public URL of the audio file to drive the animation."),
    audio_duration_seconds: z.number().optional().describe("Duration of the audio in seconds. If provided, will automatically calculate optimal frames (audio duration + 1 second buffer at 25 FPS)."),
    text: z.string().optional().describe("Text prompt describing the scene. Default: 'A cat is singing.'"),
    num_frames: z.number().optional().describe("Number of video frames to generate at 25 FPS. Range: 129 to 401. If not provided and audio_duration_seconds is given, will be calculated automatically. Default: 129"),
    num_inference_steps: z.number().optional().describe("Number of inference steps for sampling. Higher values give better quality but take longer. Range: 30 to 50. Default: 30"),
    turbo_mode: z.boolean().optional().describe("If true, the video will be generated faster with no noticeable degradation in visual quality. Default: true"),
    seed: z.number().optional().describe("Random seed for generation."),
    fal_key: z.string().optional().describe("FAL API key. If not provided, will use FAL_KEY environment variable."),
  }),
  execute: async (args: {
    image_url: string;
    audio_url: string;
    audio_duration_seconds?: number;
    text?: string;
    num_frames?: number;
    num_inference_steps?: number;
    turbo_mode?: boolean;
    seed?: number;
    fal_key?: string;
  }) => {
    // Calculate frames from audio duration if provided and num_frames not specified
    let calculatedFrames = args.num_frames;
    if (args.audio_duration_seconds !== undefined && args.num_frames === undefined) {
      calculatedFrames = calculateFramesFromAudioDuration(args.audio_duration_seconds);
    }

    // Validate num_frames range if provided
    if (calculatedFrames !== undefined && (calculatedFrames < 129 || calculatedFrames > 401)) {
      throw new Error("num_frames must be between 129 and 401");
    }

    // Validate num_inference_steps range if provided
    if (args.num_inference_steps !== undefined && (args.num_inference_steps < 30 || args.num_inference_steps > 50)) {
      throw new Error("num_inference_steps must be between 30 and 50");
    }

    // Build input payload
    const input: any = {
      image_url: args.image_url,
      audio_url: args.audio_url,
    };

    // Add optional parameters if provided
    if (args.text !== undefined) {
      input.text = args.text;
    }
    if (calculatedFrames !== undefined) {
      input.num_frames = calculatedFrames;
    }
    if (args.num_inference_steps !== undefined) {
      input.num_inference_steps = args.num_inference_steps;
    }
    if (args.turbo_mode !== undefined) {
      input.turbo_mode = args.turbo_mode;
    }
    if (args.seed !== undefined) {
      input.seed = args.seed;
    }

    const result = await callFalModel("fal-ai/hunyuan-avatar", input, { falKey: args.fal_key });

    // Extract video data from the response
    const videoData = result.data?.video;

    if (!videoData || !videoData.url) {
      throw new Error(`No video data in completed response: ${JSON.stringify(result.data)}`);
    }

    const videoUrl = videoData.url;
    const fileDetails = videoData.file_name && videoData.file_size !== undefined
      ? `\nFile: ${videoData.file_name} (${(videoData.file_size / 1024 / 1024).toFixed(2)} MB)`
      : "";
    const requestIdInfo = result.requestId ? `\nRequest ID: ${result.requestId}` : "";


    return videoUrl;
    // return `✅ Hunyuan Avatar video generated successfully!\n\nVideo URL: ${videoUrl}${fileDetails}${requestIdInfo}`;
  },
};
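A usage sketch for the Hunyuan Avatar tool; the URLs are placeholders and FAL_KEY is assumed to be set. Passing audio_duration_seconds lets the tool derive num_frames itself as (duration + 1) × 25, clamped to the 129-401 range the parameters describe.

```ts
import { hunyuanAvatar } from "./tools/hunyuan-avatar";

// 12 s of audio -> (12 + 1) * 25 = 325 frames, within the 129-401 range.
const hunyuanUrl = await hunyuanAvatar.execute({
  image_url: "https://example.com/portrait.png", // placeholder
  audio_url: "https://example.com/speech.mp3",   // placeholder
  audio_duration_seconds: 12,
  text: "A person speaking calmly to the camera",
  turbo_mode: true,
});
console.log(hunyuanUrl); // the tool returns the generated video URL
```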
package/src/tools/image-to-video.ts
ADDED
@@ -0,0 +1,231 @@
import { z } from "zod";
import * as fs from "fs";
import { GoogleAuth } from "google-auth-library";
import { exec } from "child_process";
import * as path from "path";

async function wait(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

async function fetchAccessToken(): Promise<string> {
  try {
    const auth = new GoogleAuth({ scopes: ["https://www.googleapis.com/auth/cloud-platform"] });
    const client = await auth.getClient();
    const token = await client.getAccessToken();
    if (!token || !token.token || typeof token.token !== "string") {
      throw new Error("No token from GoogleAuth");
    }
    return token.token;
  } catch (e: any) {
    // Fallback to gcloud
    return await new Promise((resolve, reject) => {
      exec("gcloud auth print-access-token", (err, stdout, stderr) => {
        if (err) {
          reject(new Error(`Failed to fetch an access token (ADC and gcloud): ${stderr || err.message}`));
          return;
        }
        const t = (stdout || "").trim();
        if (!t) {
          reject(new Error("Failed to fetch an access token: empty token from gcloud"));
          return;
        }
        resolve(t);
      });
    });
  }
}

function fileToBase64(filePath: string): { data: string; mimeType: string } {
  if (!fs.existsSync(filePath)) {
    throw new Error(`File not found: ${filePath}`);
  }
  const buf = fs.readFileSync(filePath);
  const data = Buffer.from(buf).toString("base64");
  // Default to PNG if not sure, similar to existing code
  const mimeType = "image/png";
  return { data, mimeType };
}

export const imageToVideo = {
  name: "imageToVideo",
  description: "Generate videos from an image as starting first frame using Vertex Veo models (predictLongRunning + fetchPredictOperation).",
  parameters: z.object({
    prompt: z.string().describe("Text description for the video"),
    image_path: z.string().optional().describe("Path to source image for image-to-video generation"),
    aspect_ratio: z.string().optional().describe("Video aspect ratio: '16:9' or '9:16' (default: '9:16')"),
    duration_seconds: z.string().optional().describe("Video duration in seconds: '4', '6', or '8' (default: '6')"),
    resolution: z.string().optional().describe("Video resolution: '720p' or '1080p' (default: '720p')"),
    negative_prompt: z.string().optional().describe("Text describing what not to include in the video"),
    person_generation: z.string().optional().describe("Controls generation of people: 'allow_adult' (default for image-to-video) or 'allow_all'"),
    reference_images: z.array(z.string()).optional().describe("Additional image paths for reference (max 3)"),
    output_path: z.string().optional().describe("Output MP4 file path (if multiple predictions, index suffix is added)"),
    project_id: z.string().optional().describe("GCP Project ID (default: mixio-pro)"),
    location_id: z.string().optional().describe("Vertex region (default: us-central1)"),
    model_id: z.string().optional().describe("Model ID (default: veo-3.1-fast-generate-preview)"),
  }),
  execute: async (args: {
    prompt: string;
    image_path?: string;
    aspect_ratio?: string;
    duration_seconds?: string;
    resolution?: string;
    negative_prompt?: string;
    person_generation?: string;
    reference_images?: string[];
    output_path?: string;
    project_id?: string;
    location_id?: string;
    model_id?: string;
  }) => {
    const projectId = args.project_id || "mixio-pro";
    const location = args.location_id || "us-central1";
    const modelId = args.model_id || "veo-3.1-fast-generate-preview";

    const token = await fetchAccessToken();

    const url = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:predictLongRunning`;

    let imagePart: any = undefined;
    if (args.image_path) {
      const { data, mimeType } = fileToBase64(args.image_path);
      imagePart = {
        image: {
          bytesBase64Encoded: data,
          mimeType,
        },
      };
    }

    let referenceImages: any[] | undefined = undefined;
    if (args.reference_images) {
      let refImages: string[];
      if (typeof args.reference_images === 'string') {
        const strValue = args.reference_images as string;
        if (strValue.startsWith("[") && strValue.endsWith("]")) {
          try {
            refImages = JSON.parse(strValue);
          } catch {
            throw new Error("Invalid reference_images format");
          }
        } else {
          refImages = [strValue];
        }
      } else if (Array.isArray(args.reference_images)) {
        refImages = args.reference_images;
      } else {
        throw new Error("Invalid reference_images: must be array or string");
      }

      if (refImages.length > 0) {
        referenceImages = refImages.slice(0, 3).map((p) => {
          const { data, mimeType } = fileToBase64(p);
          return {
            image: {
              bytesBase64Encoded: data,
              mimeType,
            },
            referenceType: "asset",
          };
        });
      }
    }

    const personGeneration = args.person_generation || (args.image_path ? "allow_adult" : "allow_all");

    const instances: any[] = [
      {
        prompt: args.prompt,
        ...(imagePart || {}),
        ...(referenceImages ? { referenceImages } : {}),
      },
    ];

    const parameters: any = {
      aspectRatio: args.aspect_ratio || "9:16",
      durationSeconds: parseInt(args.duration_seconds || "6"),
      resolution: args.resolution || "720p",
      negativePrompt: args.negative_prompt,
      generateAudio: false,
      personGeneration,
    };

    const res = await fetch(url, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${token}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ instances, parameters }),
    });

    if (!res.ok) {
      const text = await res.text();
      throw new Error(`Vertex request failed: ${res.status} ${text}`);
    }

    const op: any = await res.json();
    const name: string = (op as any).name || (op as any).operation || "";
    if (!name) {
      throw new Error("Vertex did not return an operation name for long-running request");
    }

    let current: any = op;
    let done = !!(op as any).done;
    let tries = 0;

    // Poll using fetchPredictOperation as per Vertex recommendation
    const fetchUrl = `https://${location}-aiplatform.googleapis.com/v1/projects/${projectId}/locations/${location}/publishers/google/models/${modelId}:fetchPredictOperation`;
    while (!done && tries < 60) {
      await wait(10000);
      const poll = await fetch(fetchUrl, {
        method: "POST",
        headers: {
          Authorization: `Bearer ${token}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({ operationName: name }),
      });
      if (!poll.ok) {
        const text = await poll.text();
        throw new Error(`Vertex operation poll failed: ${poll.status} ${text}`);
      }
      current = await poll.json();
      done = !!(current as any).done || !!(current as any).response;
      tries++;
    }

    const resp = (current as any).response || current;
    // Decode from response.videos[].bytesBase64Encoded only
    const outputs: string[] = [];
    const saveVideo = (base64: string, index: number) => {
      if (!base64) return;
      const filePath = args.output_path
        ? (index === 0 ? args.output_path : args.output_path.replace(/\.mp4$/i, `_${index}.mp4`))
        : `video_output_${Date.now()}${index === 0 ? '' : '_' + index}.mp4`;
      const absPath = path.resolve(filePath);
      const buf = Buffer.from(base64, 'base64');
      fs.writeFileSync(absPath, buf);
      outputs.push(absPath);
    };

    if (Array.isArray(resp?.videos) && resp.videos.length > 0) {
      for (let i = 0; i < resp.videos.length; i++) {
        const v = resp.videos[i] || {};
        if (typeof v.bytesBase64Encoded === 'string') {
          saveVideo(v.bytesBase64Encoded, i);
        }
      }
    }
    if (outputs.length > 0) {
      return `Video(s) saved: ${outputs.join(', ')}`;
    }

    // If nothing saved, return a concise summary plus head/tail snippets of JSON
    let jsonStr = "";
    try { jsonStr = JSON.stringify(resp); } catch {}
    const head150 = jsonStr ? jsonStr.slice(0, 150) : "";
    const tail50 = jsonStr ? jsonStr.slice(Math.max(0, jsonStr.length - 50)) : "";
    return `Vertex operation done but no videos array present. operationName=${name}. json_head150=${head150} json_tail50=${tail50}`;
  },
};
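A usage sketch for the Vertex Veo tool; it assumes Application Default Credentials (or gcloud) are available and that the hard-coded default project is overridden with your own project_id. Paths, prompt, and the project name are illustrative.

```ts
import { imageToVideo } from "./tools/image-to-video";

// Generates a short vertical clip from a still image and saves it locally.
const summary = await imageToVideo.execute({
  prompt: "Slow push-in on the subject, soft window light",
  image_path: "./portrait.png",   // illustrative local file
  aspect_ratio: "9:16",
  duration_seconds: "6",
  output_path: "./portrait_clip.mp4",
  project_id: "your-gcp-project", // replaces the default "mixio-pro"
});
console.log(summary); // e.g. "Video(s) saved: /abs/path/portrait_clip.mp4"
```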
package/src/tools/infinitalk.ts
ADDED
@@ -0,0 +1,96 @@
import { z } from "zod";
import { callFalModel } from "../utils/fal.utils";

/**
 * Calculate number of frames based on audio duration at 25 FPS
 * Adds 1 second buffer to ensure complete audio coverage
 */
function calculateFramesFromAudioDuration(audioDurationSeconds: number): number {
  const totalDuration = audioDurationSeconds + 1; // Add 1 second buffer
  const frames = Math.round(totalDuration * 25); // 25 FPS

  // Clamp to valid range (41-721 frames)
  return Math.max(41, Math.min(721, frames));
}

/**
 * FAL AI Infinitalk - Generate talking avatar video from image and audio
 */
export const infinitalk = {
  name: "infinitalk",
  description: "Generate a talking avatar video from an image and audio file using FAL AI Infinitalk. The avatar lip-syncs to the provided audio with natural facial expressions.",
  parameters: z.object({
    image_url: z.string().describe("Public URL of the input image. If the input image does not match the chosen aspect ratio, it is resized and center cropped."),
    audio_url: z.string().describe("The Public URL of the audio file for lip-sync generation."),
    audio_duration_seconds: z.number().optional().describe("Duration of the audio in seconds. If provided, will automatically calculate optimal frames (audio duration + 1 second buffer at 25 FPS)."),
    prompt: z.string().describe("The text prompt to guide video generation (e.g., 'A woman with colorful hair talking on a podcast')"),
    num_frames: z.number().optional().describe("Number of frames to generate. Must be between 41 to 721. If not provided and audio_duration_seconds is given, will be calculated automatically. Default: 145"),
    resolution: z.enum(["480p", "720p"]).optional().describe("Resolution of the video to generate. Default: '480p'"),
    seed: z.number().optional().describe("Random seed for reproducibility. If not provided, a random seed is chosen. Default: 42"),
    acceleration: z.enum(["none", "regular", "high"]).optional().describe("The acceleration level to use for generation. Default: 'regular'"),
    fal_key: z.string().optional().describe("FAL API key. If not provided, will use FAL_KEY environment variable."),
  }),
  execute: async (args: {
    image_url: string;
    audio_url: string;
    audio_duration_seconds?: number;
    prompt: string;
    num_frames?: number;
    resolution?: "480p" | "720p";
    seed?: number;
    acceleration?: "none" | "regular" | "high";
    fal_key?: string;
  }) => {
    // Calculate frames from audio duration if provided and num_frames not specified
    let calculatedFrames = args.num_frames;
    if (args.audio_duration_seconds !== undefined && args.num_frames === undefined) {
      calculatedFrames = calculateFramesFromAudioDuration(args.audio_duration_seconds);
    }

    // Validate num_frames range if provided
    if (calculatedFrames !== undefined && (calculatedFrames < 41 || calculatedFrames > 721)) {
      throw new Error("num_frames must be between 41 and 721");
    }

    // Build input payload
    const input: any = {
      image_url: args.image_url,
      audio_url: args.audio_url,
      prompt: args.prompt,
    };

    // Add optional parameters if provided
    if (calculatedFrames !== undefined) {
      input.num_frames = calculatedFrames;
    }

    input.resolution = args.resolution || '480p';

    if (args.seed !== undefined) {
      input.seed = args.seed;
    }
    if (args.acceleration !== undefined) {
      input.acceleration = args.acceleration;
    }

    const result = await callFalModel("fal-ai/infinitalk", input, { falKey: args.fal_key });

    // Extract video data from the response
    const videoData = result.data?.video;
    const seed = result.data?.seed;

    if (!videoData || !videoData.url) {
      throw new Error(`No video data in completed response: ${JSON.stringify(result.data)}`);
    }

    const videoUrl = videoData.url;
    const fileDetails = videoData.file_name && videoData.file_size !== undefined
      ? `\nFile: ${videoData.file_name} (${(videoData.file_size / 1024 / 1024).toFixed(2)} MB)`
      : "";
    const seedInfo = seed !== undefined ? `\nSeed: ${seed}` : "";
    const requestIdInfo = result.requestId ? `\nRequest ID: ${result.requestId}` : "";

    return videoUrl;
    // return `✅ Infinitalk video generated successfully!\n\nVideo URL: ${videoUrl}${fileDetails}${seedInfo}${requestIdInfo}`;
  },
};
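The Infinitalk tool mirrors the Hunyuan flow but clamps frames to 41-721 and always sends a resolution (defaulting to 480p). A usage sketch with placeholder URLs, assuming FAL_KEY is set:

```ts
import { infinitalk } from "./tools/infinitalk";

// 8 s of audio -> (8 + 1) * 25 = 225 frames, within the 41-721 range.
const infinitalkUrl = await infinitalk.execute({
  image_url: "https://example.com/host.png",  // placeholder
  audio_url: "https://example.com/intro.mp3", // placeholder
  audio_duration_seconds: 8,
  prompt: "A woman with colorful hair talking on a podcast",
  resolution: "720p",
  acceleration: "regular",
});
console.log(infinitalkUrl); // the tool returns the generated video URL
```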
package/src/tools/perplexity.ts
ADDED
@@ -0,0 +1,190 @@
import { z } from "zod";

export const perplexityImages = {
  name: "perplexityImages",
  description: "Searches for images using the Perplexity API. Returns a formatted text response that includes a summary and a numbered list of image URLs with citations mapped to the text.",
  parameters: z.object({
    query: z.string().describe("The search query for images."),
    image_domain_filter: z.array(z.string()).optional().describe("A list of domains to include or exclude. To exclude, prefix with '-'. E.g., ['wikimedia.org', '-gettyimages.com']."),
    image_format_filter: z.array(z.string()).optional().describe("A list of allowed image formats. E.g., ['jpg', 'png', 'gif']."),
  }),
  execute: async (args: { query: string; image_domain_filter?: string[]; image_format_filter?: string[] }) => {
    const apiKey = process.env.PERPLEXITY_API_KEY;
    if (!apiKey) {
      throw new Error("PERPLEXITY_API_KEY environment variable is not set.");
    }

    const url = "https://api.perplexity.ai/chat/completions";
    const headers = {
      "Authorization": `Bearer ${apiKey}`,
      "Content-Type": "application/json",
      "accept": "application/json"
    };

    const payload: any = {
      model: "sonar",
      messages: [
        { role: "user", content: `Show me images of ${args.query}` }
      ],
      return_images: true
    };

    if (args.image_domain_filter) {
      payload.image_domain_filter = args.image_domain_filter;
    }

    if (args.image_format_filter) {
      payload.image_format_filter = args.image_format_filter;
    }

    const res = await fetch(url, {
      method: "POST",
      headers: headers,
      body: JSON.stringify(payload),
    });

    if (!res.ok) {
      const text = await res.text();
      throw new Error(`Perplexity API request failed: ${res.status} ${text}`);
    }

    const data: any = await res.json();
    let content = data.choices?.[0]?.message?.content;
    const images = data.images;
    const citations = data.citations;

    if (!images || images.length === 0) {
      return `No direct image URLs found in the API response. The text content was: ${content}`;
    }

    // Create a map of origin_url -> new 1-based index
    const originUrlToImageIndex: { [key: string]: number } = {};
    images.forEach((img: any, index: number) => {
      if (img.origin_url) {
        originUrlToImageIndex[img.origin_url] = index + 1;
      }
    });

    // Create a map of old citation index -> new image index
    const oldToNewCitationMap: { [key: number]: number } = {};
    if (citations && Array.isArray(citations)) {
      citations.forEach((citationUrl: string, index: number) => {
        if (originUrlToImageIndex[citationUrl]) {
          oldToNewCitationMap[index + 1] = originUrlToImageIndex[citationUrl];
        }
      });
    }

    // Replace citations in the content
    if (content && typeof content === 'string') {
      content = content.replace(/\[(\d+)\]/g, (match: string, oldIndexStr: string) => {
        const oldIndex = parseInt(oldIndexStr, 10);
        const newIndex = oldToNewCitationMap[oldIndex];
        if (newIndex) {
          return `[${newIndex}]`;
        }
        return ''; // Remove citation if it doesn't correspond to an image
      }).replace(/(\s\s+)/g, ' ').trim(); // Clean up extra spaces
    }

    // Build the final formatted output
    let output = content + "\n\n--- Images ---\n";
    images.forEach((img: any, index: number) => {
      output += `${index + 1}. ${img.image_url}\n (Source: ${img.origin_url})\n`;
    });

    return output;
  },
};

export const perplexityVideos = {
  name: "perplexityVideos",
  description: "Searches for videos using the Perplexity API. Returns a formatted text response that includes a summary and a numbered list of video URLs with citations mapped to the text.",
  parameters: z.object({
    query: z.string().describe("The search query for videos."),
    search_domain_filter: z.array(z.string()).optional().describe("A list of domains to limit the search to (e.g., ['youtube.com']). Use a '-' prefix to exclude a domain."),
  }),
  execute: async (args: { query: string; search_domain_filter?: string[] }) => {
    const apiKey = process.env.PERPLEXITY_API_KEY;
    if (!apiKey) {
      throw new Error("PERPLEXITY_API_KEY environment variable is not set.");
    }

    const url = "https://api.perplexity.ai/chat/completions";
    const headers = {
      "Authorization": `Bearer ${apiKey}`,
      "Content-Type": "application/json",
      "accept": "application/json"
    };

    const payload: any = {
      model: "sonar-pro",
      messages: [
        { role: "user", content: `Show me videos of ${args.query}` }
      ],
      media_response: { overrides: { return_videos: true } }
    };

    if (args.search_domain_filter) {
      payload.search_domain_filter = args.search_domain_filter;
    }

    const res = await fetch(url, {
      method: "POST",
      headers: headers,
      body: JSON.stringify(payload),
    });

    if (!res.ok) {
      const text = await res.text();
      throw new Error(`Perplexity API request failed: ${res.status} ${text}`);
    }

    const data: any = await res.json();
    let content = data.choices?.[0]?.message?.content;
    const videos = data.videos;
    const citations = data.citations;

    if (!videos || videos.length === 0) {
      return `No direct video URLs found in the API response. Full API Response: ${JSON.stringify(data, null, 2)}`;
    }

    // Create a map of video url -> new 1-based index
    const urlToVideoIndex: { [key: string]: number } = {};
    videos.forEach((video: any, index: number) => {
      if (video.url) {
        urlToVideoIndex[video.url] = index + 1;
      }
    });

    // Create a map of old citation index -> new video index
    const oldToNewCitationMap: { [key: number]: number } = {};
    if (citations && Array.isArray(citations)) {
      citations.forEach((citationUrl: string, index: number) => {
        if (urlToVideoIndex[citationUrl]) {
          oldToNewCitationMap[index + 1] = urlToVideoIndex[citationUrl];
        }
      });
    }

    // Replace citations in the content
    if (content && typeof content === 'string') {
      content = content.replace(/\[(\d+)\]/g, (match: string, oldIndexStr: string) => {
        const oldIndex = parseInt(oldIndexStr, 10);
        const newIndex = oldToNewCitationMap[oldIndex];
        if (newIndex) {
          return `[${newIndex}]`;
        }
        return ''; // Remove citation if it doesn't correspond to a video
      }).replace(/(\s\s+)/g, ' ').trim(); // Clean up extra spaces
    }

    // Build the final formatted output
    let output = content + "\n\n--- Videos ---\n";
    videos.forEach((video: any, index: number) => {
      output += `${index + 1}. ${video.url}\n`;
    });

    return output;
  },
};
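Both Perplexity tools return a single formatted string: the model's answer with citation markers renumbered to match the media list appended below it. A sketch of calling the image search directly, with PERPLEXITY_API_KEY assumed set and filters purely illustrative:

```ts
import { perplexityImages } from "./tools/perplexity";

// Returns text shaped like:
// "<summary with [1]-style citations>\n\n--- Images ---\n1. <image_url>\n (Source: <origin_url>)\n..."
const formatted = await perplexityImages.execute({
  query: "mountain landscapes",
  image_domain_filter: ["wikimedia.org", "-gettyimages.com"],
  image_format_filter: ["jpg", "png"],
});
console.log(formatted);
```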
package/src/tools/youtube.ts
ADDED
@@ -0,0 +1,52 @@
import { z } from "zod";
import { GoogleGenAI } from "@google/genai";

const ai = new GoogleGenAI({
  apiKey: process.env.GEMINI_API_KEY || "",
});

export const analyzeYoutubeVideo = {
  name: "analyzeYoutubeVideo",
  description: "Analyze YouTube videos for content using the correct GenAI JS API approach with FileData fileUri. Perfect for extracting stock media content, analyzing video content, or getting descriptions of YouTube videos",
  parameters: z.object({
    youtube_url: z.string().describe("YouTube video URL to analyze (format: https://www.youtube.com/watch?v=VIDEO_ID)"),
    prompt: z.string().describe("Analysis prompt or question about the YouTube video content"),
  }),
  execute: async (args: { youtube_url: string; prompt: string }) => {
    try {
      // Validate YouTube URL format
      if (!args.youtube_url || (!args.youtube_url.includes('youtube.com/watch') && !args.youtube_url.includes('youtu.be'))) {
        throw new Error("Invalid YouTube URL format. Expected: https://www.youtube.com/watch?v=VIDEO_ID");
      }

      // Create content using the correct FileData approach with fileUri
      const response = await ai.models.generateContent({
        model: 'models/gemini-2.5-flash',
        contents: {
          parts: [
            {
              fileData: {
                fileUri: args.youtube_url
              }
            },
            { text: args.prompt }
          ]
        }
      });

      let result = "";
      if (response.candidates && response.candidates[0]?.content?.parts) {
        for (const part of response.candidates[0].content.parts) {
          if (part.text) {
            result += part.text;
          }
        }
      }

      return result || "YouTube video analysis completed but no text response received";

    } catch (error: any) {
      throw new Error(`YouTube video analysis failed: ${error.message}`);
    }
  },
};
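A sketch of calling the YouTube analyzer directly; the URL is a placeholder and GEMINI_API_KEY is assumed to be set.

```ts
import { analyzeYoutubeVideo } from "./tools/youtube";

// Asks Gemini to summarize a public YouTube video by URL (no local download involved).
const analysis = await analyzeYoutubeVideo.execute({
  youtube_url: "https://www.youtube.com/watch?v=VIDEO_ID", // placeholder
  prompt: "Summarize the main topics covered in this video.",
});
console.log(analysis);
```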
package/src/utils/fal.utils.ts
ADDED
@@ -0,0 +1,45 @@
import { fal } from "@fal-ai/client";

export async function callFalModel(
  modelName: string,
  input: any,
  options: { falKey?: string; logs?: boolean } = {}
) {
  const { falKey, logs = true } = options;
  const key = falKey || process.env.FAL_KEY;
  if (!key) {
    throw new Error("FAL_KEY is required. Provide it via fal_key parameter or FAL_KEY environment variable.");
  }

  fal.config({
    credentials: key,
  });

  console.log(`[${modelName}] Submitting request to FAL AI...`);

  try {
    const result = await fal.subscribe(modelName, {
      input,
      logs,
      onQueueUpdate: (update) => {
        if (update.status === "IN_PROGRESS") {
          console.log(`[${modelName}] Status: ${update.status}`);
          if (logs && "logs" in update && update.logs) {
            update.logs.forEach((log) => {
              console.log(`[${modelName}] ${log.message}`);
            });
          }
        } else if (update.status === "IN_QUEUE") {
          console.log(`[${modelName}] Status: ${update.status} - Waiting in queue...`);
        }
      },
    });

    console.log(`[${modelName}] Generation completed successfully`);

    return result;
  } catch (error: any) {
    console.error(`[${modelName}] Error:`, error);
    throw new Error(`FAL AI ${modelName} generation failed: ${error.message || JSON.stringify(error)}`);
  }
}
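callFalModel is the shared wrapper both FAL tools use: it configures credentials, subscribes to the model's queue, and logs progress until the result arrives. A sketch of reusing it directly with one of the endpoints already used in this package (placeholder URLs, FAL_KEY assumed set):

```ts
import { callFalModel } from "./utils/fal.utils";

// The same call the infinitalk tool makes internally; the output URL lives on result.data.video.url.
const result = await callFalModel("fal-ai/infinitalk", {
  image_url: "https://example.com/host.png",  // placeholder
  audio_url: "https://example.com/intro.mp3", // placeholder
  prompt: "A person talking in a studio",
  resolution: "480p",
});
console.log(result.data?.video?.url);
```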
package/src/utils/index.ts
ADDED
@@ -0,0 +1 @@
export * from './fal.utils'