media-agent-mcp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- media_agent_mcp/__init__.py +7 -0
- media_agent_mcp/ai_models/__init__.py +17 -0
- media_agent_mcp/ai_models/seed16.py +151 -0
- media_agent_mcp/ai_models/seedance.py +258 -0
- media_agent_mcp/ai_models/seededit.py +94 -0
- media_agent_mcp/ai_models/seedream.py +136 -0
- media_agent_mcp/media_selectors/__init__.py +9 -0
- media_agent_mcp/media_selectors/image_selector.py +119 -0
- media_agent_mcp/media_selectors/video_selector.py +159 -0
- media_agent_mcp/server.py +405 -0
- media_agent_mcp/storage/__init__.py +8 -0
- media_agent_mcp/storage/tos_client.py +98 -0
- media_agent_mcp/video/__init__.py +9 -0
- media_agent_mcp/video/processor.py +337 -0
- media_agent_mcp-0.1.0.dist-info/METADATA +495 -0
- media_agent_mcp-0.1.0.dist-info/RECORD +18 -0
- media_agent_mcp-0.1.0.dist-info/WHEEL +4 -0
- media_agent_mcp-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,119 @@
|
|
1
|
+
"""Image selection module.
|
2
|
+
|
3
|
+
This module provides functionality for selecting the best image from multiple options.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import json
|
7
|
+
import logging
|
8
|
+
import os
|
9
|
+
import sys
|
10
|
+
from typing import List
|
11
|
+
|
12
|
+
|
13
|
+
from media_agent_mcp.ai_models.seed16 import process_vlm_task
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
|
18
|
+
def select_best_image(image_urls: List[str], prompt: str) -> dict:
    """Select the best image from a list of candidate images.

    Sends each candidate image plus the target description to the VLM and
    asks it to pick the most suitable one.

    Args:
        image_urls: URLs of the candidate images to choose from.
        prompt: The ideal image description (selection criterion).

    Returns:
        dict with keys:
            choice: index string reported by the model, or None on failure.
            reason: the model's explanation, or an error description.
            url: the chosen image URL; falls back to the first URL (or None)
                when selection or response parsing fails.
    """
    try:
        if not image_urls:
            # Fix: this branch previously returned a bare string, breaking the
            # documented dict return shape used by every other exit path (and
            # by the sibling select_best_video).
            return {
                "choice": None,
                "reason": "Error: No images provided",
                "url": None
            }

        system_prompt = f"""
        # role
        You are a images evaluate agent, aim to choose the most suitable image according to the desciption

        # selection criteria (highest → lowest priority)
        1. **Semantic relevance** – the main subject, setting, and mood must align with the scene description.
        2. **Tone consistency** – lighting, color palette, realism, and cultural cues should match the overall style (vertical vlog, warm, authentic, etc.).
        3. **Technical quality** – sharp focus on the key subject, no obvious artifacts or distortion.

        # NEVER choose an image that …
        - contains **garbled text**, unreadable characters, or **large areas of tiny text**.
        - is **off-topic** (food, culture, or location does not match the description).
        - shows strong motion blur, warped anatomy, broken perspective, or clear AI artifacts.
        - uses lighting or colors that clash with the established warm, look.
        - Do not choose images do not conform to physical logic.

        # output format, strictly json
        {{
            "choice": "the index"
            "reason": "explain why to choose this, and the disadvantage of other images"
        }}
        # example
        Images:image1, image2, image3
        Desciption:“Show a slightly messy Nasi Padang takeaway box.”
        → 输出:{{
            "choice": "3",
            "reason": "3 is the most suitable image, as it aligns with the description of a messy Nasi Padang takeaway box."
        }}
        """

        # One user message per candidate so the model can reference images by index.
        messages = [{"role": "system", "content": system_prompt}]
        messages += [
            {"role": "user", "content": [
                {"type": "text", "text": f"this is image {i}"},
                {"type": "image_url", "image_url": {"url": image_urls[i]}}
            ]} for i in range(len(image_urls))
        ]
        messages.append({"role": "user", "content": f"Image Prompt:{prompt}\n Please choose the best image according to the prompt above."})

        try:
            response = process_vlm_task(messages, is_json=True)
            logger.info(f"model response: {response}")

        except Exception as e:
            # VLM call failed: degrade gracefully to the first candidate.
            logger.error(f"[VLM]Error selecting image: {e}")
            return {
                "choice": None,
                "reason": f"Error selecting image: {str(e)}",
                "url": image_urls[0] if image_urls else None
            }

        try:
            response_json = json.loads(response)
            # NOTE(review): candidates are labelled "this is image 0..n-1"
            # above, but the system prompt's example answers with what looks
            # like a 1-based index — confirm which convention the model
            # follows; an off-by-one here raises IndexError and lands in the
            # parsing-error branch below.
            choosed_url = image_urls[int(response_json['choice'])]

            return {
                "choice": response_json['choice'],
                "reason": response_json['reason'],
                "url": choosed_url
            }

        except Exception as e:
            # Malformed / out-of-range model answer: fall back to first image.
            logger.error(f"[VLM]Error parsing response: {e}")
            return {
                "choice": None,
                "reason": f"Error parsing response: {str(e)}",
                "url": image_urls[0]
            }

    except Exception as e:
        # Catch-all so callers always receive the documented dict shape.
        logger.error(f"Error selecting image: {e}")
        return {
            "choice": None,
            "reason": f"Error selecting image: {str(e)}",
            "url": image_urls[0] if image_urls else None
        }
|
108
|
+
|
109
|
+
|
110
|
+
if __name__ == '__main__':
    # Manual smoke test: ask the selector to pick between two sample portraits.
    candidate_urls = [
        "https://carey.tos-ap-southeast-1.bytepluses.com/Art%20Portrait/Art%20Portrait/Art%20Portrait/Art%20Portrait%20(1).jpg",
        "https://carey.tos-ap-southeast-1.bytepluses.com/Art%20Portrait/Art%20Portrait/Art%20Portrait/Art%20Portrait%20(2).jpg"
    ]
    description = "一个女人愁眉苦脸"

    selection = select_best_image(candidate_urls, description)
    print(f"The best image selected is: {selection}")
|
@@ -0,0 +1,159 @@
|
|
1
|
+
"""Image selection module.
|
2
|
+
|
3
|
+
This module provides functionality for selecting the best image from multiple options.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import json
|
7
|
+
import logging
|
8
|
+
import os
|
9
|
+
import sys
|
10
|
+
from typing import List
|
11
|
+
|
12
|
+
from media_agent_mcp.ai_models.seed16 import process_vlm_task
|
13
|
+
|
14
|
+
logger = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
def select_best_video(video_urls: List[str], prompt: str) -> dict:
    """Select the best video from a list of candidate videos.

    Sends each candidate video plus the target description to the VLM and
    asks it to pick the most suitable one.

    Args:
        video_urls: URLs of the candidate videos to choose from.
        prompt: The ideal video description (selection criterion).

    Returns:
        dict with keys:
            choice: index string reported by the model, or None on failure.
            reason: the model's explanation, or an error description.
            url: the chosen video URL; falls back to the first URL (or None)
                when selection or response parsing fails.
    """
    try:
        if not video_urls:
            # Fix: error text previously said "images" — this is the video selector.
            return {
                "choice": None,
                "reason": "Error: No videos provided",
                "url": None
            }

        system_prompt = f"""
        # Role
        You are an efficient Video Curation AI Agent.

        # Core Mission
        Your sole objective is: based on the user's prompt, to select the single most relevant and highest-quality video from a given list of candidate videos, and directly output its index number from the list.

        # Workflow
        Although your final output is very simple, you must still strictly follow the internal thought process below to ensure the accuracy of your selection:

        Analyze User Prompt:

        Identify the core theme, extract keywords, and determine the user's intent and sentiment.

        Analyze Candidate Videos:

        You will receive a list containing images extracted from multiple video frames, with the videos numbered sequentially (1st, 2nd, ...).

        Carefully analyze all available information for each video, such as its theme, visuals, accuracy, and transcript (text content).

        Evaluate & Score:

        Based on your understanding of the user's needs, internally score and rank each candidate video according to the "Evaluation Criteria" below.

        Final Decision:

        Based on the overall score, select the single highest-ranking and best-matching video.

        # Evaluation Criteria
        When making your decision, you must internally consider the following dimensions in order of importance:

        # Content Relevance - [Highest Priority]
        - Is the core content of the video highly consistent with the theme of the user's prompt?

        # User Intent Alignment
        - Does the style and type of the video match the user's potential intent (e.g., to learn, to be entertained, to be inspired)?

        # Information Quality & Depth
        - Does the video provide valuable and accurate information?

        ## Overall Video Quality
        - Are the items generated in the video correct?
        - Is the text garbled?
        - Are the objects logical?
        - Is the video is consistent?

        ## Do not choose videos like:
        1. Please avoid selecting videos that are physically illogical, like a person with three arms or a person with a head on the chest.

        # Constraints & Rules
        Absolutely do not output any extra text, explanations, reasons, punctuation, or sentences. Only a number is needed.
        You must choose from the video list provided to you. Fabricating information is strictly forbidden.

        # output format, strictly json
        {{
            "choice": "the index"
            "reason": "explain why to choose this, and the disadvantage of other videos"
        }}
        # example
        Images:video1, video2, video3
        Description:“Show a slightly messy Nasi Padang takeaway box.”
        → 输出:{{
            "choice": "3",
            "reason": "3 is the most suitable image, as it aligns with the description of a messy Nasi Padang takeaway box. and 1 and 2 are not suitable because they are too clean and do not match the description."
        }}
        """

        # One user message per candidate so the model can reference videos by index.
        messages = [{"role": "system", "content": system_prompt}]
        messages += [
            {"role": "user", "content": [
                {"type": "text", "text": f"this is video {i}"},
                {"type": "video_url", "video_url": {"url": video_urls[i]}}
            ]} for i in range(len(video_urls))
        ]
        messages.append({"role": "user", "content": f"Video Prompt:{prompt}\n Please choose the best video according to the prompt above."})

        try:
            response = process_vlm_task(messages, is_json=True)
            logger.info(f"model response: {response}")

        except Exception as e:
            # VLM call failed: degrade gracefully to the first candidate.
            # Fix: log/error text said "image" in the video selector.
            logger.error(f"[VLM]Error selecting video: {e}")
            return {
                "choice": None,
                "reason": f"Error selecting video: {str(e)}",
                "url": video_urls[0] if video_urls else None
            }

        try:
            response_json = json.loads(response)
            # NOTE(review): candidates are labelled "this is video 0..n-1"
            # above, but the system prompt's example answers with what looks
            # like a 1-based index — confirm which convention the model
            # follows; an off-by-one here raises IndexError and lands in the
            # parsing-error branch below.
            choosed_url = video_urls[int(response_json['choice'])]

            return {
                "choice": response_json['choice'],
                "reason": response_json['reason'],
                "url": choosed_url
            }

        except Exception as e:
            # Malformed / out-of-range model answer: fall back to first video.
            logger.error(f"[VLM]Error parsing response: {e}")
            return {
                "choice": None,
                "reason": f"Error parsing response: {str(e)}",
                "url": video_urls[0]
            }

    except Exception as e:
        # Catch-all so callers always receive the documented dict shape.
        logger.error(f"Error selecting video: {e}")
        return {
            "choice": None,
            "reason": f"Error selecting video: {str(e)}",
            "url": video_urls[0] if video_urls else None
        }
|
148
|
+
|
149
|
+
|
150
|
+
if __name__ == '__main__':
    # Manual smoke test: ask the selector to pick between two sample videos.
    # Fix: locals were named "images"/"best_image" and the printed message
    # said "image" even though this is the video selector demo.
    videos = [
        "https://carey.tos-ap-southeast-1.bytepluses.com/demo/02175205870921200000000000000000000ffffc0a85094bda733.mp4",
        "https://carey.tos-ap-southeast-1.bytepluses.com/demo/02175205817458400000000000000000000ffffc0a850948120ae.mp4"
    ]
    prompt = "生成一个很开心的自拍的男生"

    best_video = select_best_video(videos, prompt)
    print(f"The best video selected is: {best_video}")
|
@@ -0,0 +1,405 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""Media Agent MCP Server - A Model Context Protocol server for media processing.
|
3
|
+
|
4
|
+
This server provides 9 tools for media processing:
|
5
|
+
1. TOS - Save content as URL
|
6
|
+
2. Video Concat - Concatenate two videos
|
7
|
+
3. Video Last Frame - Get the last frame from a video
|
8
|
+
4. Seedream - Creating images (AI model)
|
9
|
+
5. Seedance (lite & pro) - Creating videos (AI model)
|
10
|
+
6. Seededit - Maintain the main character (AI model)
|
11
|
+
7. Seed1.6 (VLM) - Do vision tasks in workflow (AI model)
|
12
|
+
8. Image Selector - Choose the best one from images
|
13
|
+
9. Video Selector - Choose the best video from videos
|
14
|
+
"""
|
15
|
+
|
16
|
+
import argparse
|
17
|
+
import logging
|
18
|
+
from typing import Optional, Dict, Any
|
19
|
+
import json
|
20
|
+
|
21
|
+
from mcp.server.fastmcp import FastMCP
|
22
|
+
|
23
|
+
# Import modules
|
24
|
+
from media_agent_mcp.storage import upload_to_tos
|
25
|
+
from media_agent_mcp.video import concat_videos, extract_last_frame
|
26
|
+
from media_agent_mcp.ai_models.seedream import generate_image
|
27
|
+
from media_agent_mcp.ai_models.seedance import generate_video
|
28
|
+
from media_agent_mcp.ai_models.seededit import seededit
|
29
|
+
from media_agent_mcp.media_selectors.image_selector import select_best_image
|
30
|
+
from media_agent_mcp.media_selectors.video_selector import select_best_video
|
31
|
+
|
32
|
+
# Configure logging
|
33
|
+
logging.basicConfig(level=logging.INFO)
|
34
|
+
logger = logging.getLogger(__name__)
|
35
|
+
|
36
|
+
# Initialize FastMCP server (will be configured in main function)
|
37
|
+
mcp = FastMCP("Media Agent MCP")
|
38
|
+
|
39
|
+
|
40
|
+
@mcp.tool()
|
41
|
+
def video_concat_tool(video_urls: list[str]) -> str:
    """
    Concatenate multiple videos from URLs and upload to TOS.

    Args:
        video_urls: List of video URLs to concatenate in order

    Returns:
        JSON string with status, data, and message
    """
    try:
        outcome = concat_videos(video_urls)
        # Newer helpers return a ready-made response dict; pass it straight through.
        if isinstance(outcome, dict):
            return json.dumps(outcome)
        # Legacy helpers return either a result URL or an "Error: ..." string.
        if outcome.startswith("Error:"):
            payload = {
                "status": "error",
                "data": None,
                "message": outcome
            }
        else:
            payload = {
                "status": "success",
                "data": {"url": outcome},
                "message": "Videos concatenated successfully"
            }
        return json.dumps(payload)
    except Exception as e:
        logger.error(f"Error in video_concat_tool: {str(e)}")
        return json.dumps({
            "status": "error",
            "data": None,
            "message": f"Error: {str(e)}"
        })
|
76
|
+
|
77
|
+
|
78
|
+
@mcp.tool()
|
79
|
+
def video_last_frame_tool(video_url: str) -> str:
    """
    Extract the last frame from a video file and upload to TOS.

    Args:
        video_url: URL or path to the video file

    Returns:
        JSON string with status, data, and message
    """
    try:
        outcome = extract_last_frame(video_url)
        # Newer helpers return a ready-made response dict; pass it straight through.
        if isinstance(outcome, dict):
            return json.dumps(outcome)
        # Legacy helpers return either a frame URL or an "Error: ..." string.
        if outcome.startswith("Error:"):
            payload = {
                "status": "error",
                "data": None,
                "message": outcome
            }
        else:
            payload = {
                "status": "success",
                "data": {"url": outcome},
                "message": "Last frame extracted successfully"
            }
        return json.dumps(payload)

    except Exception as e:
        logger.error(f"Error in video_last_frame_tool: {str(e)}")
        return json.dumps({
            "status": "error",
            "data": None,
            "message": f"Error: {str(e)}"
        })
|
116
|
+
|
117
|
+
|
118
|
+
@mcp.tool()
|
119
|
+
def seedream_generate_image_tool(prompt: str, size: str = "1024x1024") -> str:
    """
    Generate an image using Seedream AI model.

    Args:
        prompt: Text description of the image to generate
        size: Size of the image (e.g., "1024x1024")

    Returns:
        JSON string with status, data, and message
    """
    try:
        outcome = generate_image(prompt, size=size)
        # Newer helpers return a ready-made response dict; pass it straight through.
        if isinstance(outcome, dict):
            return json.dumps(outcome)
        # Legacy helpers return either an image URL or an "Error: ..." string.
        if outcome.startswith("Error:"):
            payload = {
                "status": "error",
                "data": None,
                "message": outcome
            }
        else:
            payload = {
                "status": "success",
                "data": {"image_url": outcome},
                "message": "Image generated successfully"
            }
        return json.dumps(payload)
    except Exception as e:
        logger.error(f"Error in seedream_generate_image_tool: {str(e)}")
        return json.dumps({
            "status": "error",
            "data": None,
            "message": f"Error: {str(e)}"
        })
|
155
|
+
|
156
|
+
|
157
|
+
@mcp.tool()
|
158
|
+
def seedance_generate_video_tool(prompt: str, first_frame_image: str,
                                 last_frame_image: str = None, duration: int = 5,
                                 resolution: str = "720p") -> str:
    """
    Generate a video using Seedance AI model with first/last frame images.

    Args:
        prompt: Text description of the video to generate (optional for image-to-video)
        first_frame_image: URL or base64 of the first frame image
        last_frame_image: URL or base64 of the last frame image (optional)
        duration: Duration of the video in seconds (5 or 10)
        resolution: Video resolution (480p, 720p)

    Returns:
        JSON string with status, data, and message
    """
    try:
        # Guard clause: at least one of prompt / first frame is required.
        if not prompt and not first_frame_image:
            return json.dumps({
                "status": "error",
                "data": None,
                "message": "Error: Either prompt or first_frame_image must be provided"
            })

        outcome = generate_video(
            prompt=prompt,
            first_frame_image=first_frame_image,
            last_frame_image=last_frame_image,
            duration=duration,
            resolution=resolution
        )

        # Newer helpers return a ready-made response dict; pass it straight through.
        if isinstance(outcome, dict):
            return json.dumps(outcome)
        # Legacy helpers return either a video URL or an "Error: ..." string.
        if outcome.startswith("Error:"):
            payload = {
                "status": "error",
                "data": None,
                "message": outcome
            }
        else:
            payload = {
                "status": "success",
                "data": {"url": outcome},
                "message": "Video generated successfully"
            }
        return json.dumps(payload)
    except Exception as e:
        logger.error(f"Error in seedance_generate_video_tool: {str(e)}")
        return json.dumps({
            "status": "error",
            "data": None,
            "message": f"Error: {str(e)}"
        })
|
213
|
+
|
214
|
+
|
215
|
+
@mcp.tool()
|
216
|
+
def seededit_tool(image_url: str, prompt: str, seed: int = -1, scale: float = 0.5, charactor_keep: bool = False) -> str:
    """
    Edit an image using the Seededit model.

    Args:
        image_url: Input image URL for editing
        prompt: Text prompt for image editing
        seed: Random seed for reproducibility (-1 for random)
        scale: Influence degree of text description (0-1)
        charactor_keep: whether to keep the main charactor in this image, if you wanna change the main charactor, please keep False

    Returns:
        JSON string with status, data, and message
    """
    try:
        outcome = seededit(
            image_url=image_url,
            prompt=prompt,
            charactor_keep=charactor_keep,
            return_url=True,
            scale=scale,
            seed=seed
        )

        # Newer helpers return a ready-made response dict; pass it straight through.
        if isinstance(outcome, dict):
            return json.dumps(outcome)
        # Legacy helpers return either an image URL or an "Error: ..." string.
        if outcome.startswith("Error:"):
            payload = {
                "status": "error",
                "data": None,
                "message": outcome
            }
        else:
            payload = {
                "status": "success",
                "data": {"image_url": outcome},
                "message": "Image editing completed successfully"
            }
        return json.dumps(payload)
    except Exception as e:
        logger.error(f"Error in seededit_tool: {str(e)}")
        return json.dumps({
            "status": "error",
            "data": None,
            "message": f"Error: {str(e)}"
        })
|
263
|
+
|
264
|
+
@mcp.tool()
|
265
|
+
def vlm_vision_task_tool(messages: list) -> str:
    """
    Perform vision-language tasks using VLM model.

    Args:
        messages: OpenAI-compatible messages format

    Returns:
        JSON string with status, data, and message
    """
    try:
        # Imported lazily, matching the original module's behavior.
        from media_agent_mcp.ai_models.seed16 import process_vlm_task

        outcome = process_vlm_task(messages)
        # Newer helpers return a ready-made response dict; pass it straight through.
        if isinstance(outcome, dict):
            return json.dumps(outcome)
        # Legacy helpers return either a result string or an "Error: ..." string.
        if outcome.startswith("Error:"):
            payload = {
                "status": "error",
                "data": None,
                "message": outcome
            }
        else:
            payload = {
                "status": "success",
                "data": {"result": outcome},
                "message": "Vision task completed successfully"
            }
        return json.dumps(payload)

    except Exception as e:
        logger.error(f"Error in vlm_vision_task_tool: {str(e)}")
        return json.dumps({
            "status": "error",
            "data": None,
            "message": f"Error: {str(e)}"
        })
|
303
|
+
|
304
|
+
|
305
|
+
@mcp.tool()
|
306
|
+
def image_selector_tool(image_paths: list[str], prompt: str) -> str:
    """
    Select the best image from multiple options using VLM model.

    Args:
        image_paths: List of paths to image files
        prompt: Selection criteria prompt

    Returns:
        JSON string with status, data, and message
    """
    try:
        selection = select_best_image(image_paths, prompt)
        payload = {
            "status": "success",
            "data": selection,
            "message": "Image selection completed successfully"
        }
    except Exception as e:
        payload = {
            "status": "error",
            "data": None,
            "message": f"Image selection failed: {str(e)}"
        }
    return json.dumps(payload)
|
330
|
+
|
331
|
+
|
332
|
+
@mcp.tool()
|
333
|
+
def video_selector_tool(video_paths: list[str], prompt: str) -> str:
    """
    Select the best video from multiple options using VLM model.

    Args:
        video_paths: List of paths to videos to choose from
        prompt: Selection criteria prompt

    Returns:
        JSON string with status, data, and message
    """
    try:
        selection = select_best_video(video_paths, prompt)
        payload = {
            "status": "success",
            "data": selection,
            "message": "Video selection completed successfully"
        }
    except Exception as e:
        payload = {
            "status": "error",
            "data": None,
            "message": f"Video selection failed: {str(e)}"
        }
    return json.dumps(payload)
|
357
|
+
|
358
|
+
|
359
|
+
|
360
|
+
def main():
    """Main entry point for the MCP server.

    Parses CLI arguments, logs the available tools, and starts the FastMCP
    server on the chosen transport (stdio by default).
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Media Agent MCP Server')
    parser.add_argument('--transport', type=str, choices=['sse', 'stdio'], default='stdio',
                        help='Transport method: sse or stdio (default: stdio)')
    parser.add_argument('--host', type=str, default='127.0.0.1',
                        help='Host for SSE transport (default: 127.0.0.1)')
    parser.add_argument('--port', type=int, default=8000,
                        help='Port for SSE transport (default: 8000)')
    parser.add_argument('--version', action='store_true',
                        help='Show version information')

    args = parser.parse_args()

    if args.version:
        print("Media Agent MCP Server v0.1.0")
        return

    logger.info("Starting Media Agent MCP Server...")
    logger.info(f"Transport: {args.transport}")
    if args.transport == 'sse':
        logger.info(f"SSE Server will run on {args.host}:{args.port}")

    logger.info("Available tools:")
    logger.info(" 1. video_last_frame_tool - Extract last frame from video and upload to TOS")
    logger.info(" 2. video_concat_tool - Concatenate two videos")
    logger.info(" 3. seedream_generate_image_tool - Generate images with AI (direct URL return)")
    logger.info(" 4. seedance_generate_video_tool - Generate videos with AI (async with polling)")
    # Fix: log the tool name actually registered in this module
    # (was "seededit_maintain_character_tool", which does not exist).
    logger.info(" 5. seededit_tool - Edit images while maintaining character")
    logger.info(" 6. vlm_vision_task_tool - Perform vision tasks with OpenAI-compatible messages")
    logger.info(" 7. image_selector_tool - Select best image using VLM model")
    logger.info(" 8. video_selector_tool - Select best video using VLM model")
    # NOTE(review): tos_save_content_tool is advertised here but no such tool
    # is registered in this module — confirm whether it was removed or lives
    # elsewhere.
    logger.info(" 9. tos_save_content_tool - Save content to TOS and return URL")

    # Start the server with specified transport
    if args.transport == 'sse':
        # NOTE(review): --host/--port are parsed and logged but never handed
        # to FastMCP, so mcp.run(transport="sse") binds to FastMCP's defaults.
        # Confirm and wire them through (e.g. via FastMCP settings) if custom
        # binding is required.
        logger.info(f"Starting SSE server on {args.host}:{args.port}")
        mcp.run(transport="sse")
    else:
        # Default stdio transport
        mcp.run(transport="stdio")


if __name__ == "__main__":
    main()
|