mimo-multimodal-mcp 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ # Xiaomi MiMo API Key
2
+ # Get your API key from: https://platform.xiaomimimo.com
3
+ MIMO_API_KEY=your_api_key_here
4
+
5
+ # MiMo API Base URL (optional, defaults to https://api.xiaomimimo.com/v1)
6
+ # MIMO_API_BASE=https://api.xiaomimimo.com/v1
@@ -0,0 +1,24 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [created]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ environment: pypi
11
+ permissions:
12
+ id-token: write
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ - name: Set up Python
16
+ uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.12"
19
+ - name: Install build tools
20
+ run: pip install build
21
+ - name: Build package
22
+ run: python -m build
23
+ - name: Publish to PyPI
24
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,33 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ .venv/
8
+ venv/
9
+ ENV/
10
+
11
+ # Distribution
12
+ dist/
13
+ build/
14
+ *.egg-info/
15
+ *.egg
16
+
17
+ # Environment
18
+ .env
19
+ .env.local
20
+ .venv/key
21
+
22
+ # IDE
23
+ .idea/
24
+ .vscode/
25
+ *.swp
26
+ *.swo
27
+
28
+ # OS
29
+ .DS_Store
30
+ Thumbs.db
31
+
32
+ # Lock files (optional, uncomment if needed)
33
+ # uv.lock
@@ -0,0 +1,172 @@
1
+ Metadata-Version: 2.4
2
+ Name: mimo-multimodal-mcp
3
+ Version: 0.3.0
4
+ Summary: MCP server for Xiaomi MiMo multimodal understanding (image, audio, video)
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: mcp[cli]>=1.2.0
7
+ Requires-Dist: openai>=1.0.0
8
+ Requires-Dist: python-dotenv>=1.0.0
9
+ Description-Content-Type: text/markdown
10
+
11
+ # MiMo Multimodal Understanding MCP Server
12
+
13
+ MCP server for Xiaomi MiMo multimodal understanding API (image, audio, video).
14
+
15
+ ## Features
16
+
17
+ - **Image Understanding**: Single/multiple images, URL and local file support
18
+ - **Audio Understanding**: Single/multiple audio files, URL and local file support
19
+ - **Video Understanding**: Single/multiple videos, URL and local file support, configurable fps and resolution
20
+
21
+ ## Setup
22
+
23
+ ### 1. Install dependencies
24
+
25
+ ```bash
26
+ uv sync
27
+ ```
28
+
29
+ ### 2. Configure API Key
30
+
31
+ Copy `.env.example` to `.env` and fill in your API key:
32
+
33
+ ```bash
34
+ cp .env.example .env
35
+ ```
36
+
37
+ Or set the environment variable directly:
38
+
39
+ ```bash
40
+ export MIMO_API_KEY=your_api_key_here
41
+ ```
42
+
43
+ Get your API key from: https://platform.xiaomimimo.com
44
+
45
+ ### 3. (Optional) Configure API Base URL
46
+
47
+ By default, the server uses `https://api.xiaomimimo.com/v1`. To use a different API endpoint:
48
+
49
+ ```bash
50
+ export MIMO_API_BASE=https://your-custom-endpoint/v1
51
+ ```
52
+
53
+ Or add it to your `.env` file:
54
+
55
+ ```
56
+ MIMO_API_BASE=https://your-custom-endpoint/v1
57
+ ```
58
+
59
+ ## Usage
60
+
61
+ ### Development mode (with MCP Inspector)
62
+
63
+ ```bash
64
+ uv run mcp dev src/mimo_multimodal_mcp/server.py
65
+ ```
66
+
67
+ ### Install to Claude Desktop
68
+
69
+ ```bash
70
+ uv run mcp install src/mimo_multimodal_mcp/server.py
71
+ ```
72
+
73
+ ### Direct execution
74
+
75
+ ```bash
76
+ uv run python src/mimo_multimodal_mcp/server.py
77
+ ```
78
+
79
+ ## Tools
80
+
81
+ ### `understand_image`
82
+
83
+ Analyze images using Xiaomi MiMo multimodal model.
84
+
85
+ | Parameter | Type | Required | Description |
86
+ |-----------|------|----------|-------------|
87
+ | `prompt` | string | Yes | Image understanding task description |
88
+ | `image_url` | string | No | Single image URL or base64-encoded `data:image/...` URI |
89
+ | `image_path` | string | No | Single local image file path |
90
+ | `image_urls` | list[string] | No | Multiple image URLs |
91
+ | `image_paths` | list[string] | No | Multiple local image file paths |
92
+ | `system_prompt` | string | No | Custom system prompt |
93
+ | `max_tokens` | integer | No | Max output length (default: 32768) |
94
+
95
+ - **Supported formats**: JPEG, PNG, GIF, WebP
96
+ - **Size limit**: 10MB
97
+
98
+ ### `understand_audio`
99
+
100
+ Analyze audio using Xiaomi MiMo multimodal model.
101
+
102
+ | Parameter | Type | Required | Description |
103
+ |-----------|------|----------|-------------|
104
+ | `prompt` | string | Yes | Audio understanding task description |
105
+ | `audio_url` | string | No | Single audio URL |
106
+ | `audio_path` | string | No | Single local audio file path |
107
+ | `audio_urls` | list[string] | No | Multiple audio URLs |
108
+ | `audio_paths` | list[string] | No | Multiple local audio file paths |
109
+ | `system_prompt` | string | No | Custom system prompt |
110
+ | `max_tokens` | integer | No | Max output length (default: 32768) |
111
+
112
+ - **Supported formats**: MP3, WAV, FLAC, M4A, OGG
113
+ - **Size limit**: URL 100MB, Base64 50MB
114
+
115
+ ### `understand_video`
116
+
117
+ Analyze video using Xiaomi MiMo multimodal model.
118
+
119
+ | Parameter | Type | Required | Description |
120
+ |-----------|------|----------|-------------|
121
+ | `prompt` | string | Yes | Video understanding task description |
122
+ | `video_url` | string | No | Single video URL |
123
+ | `video_path` | string | No | Single local video file path |
124
+ | `video_urls` | list[string] | No | Multiple video URLs |
125
+ | `video_paths` | list[string] | No | Multiple local video file paths |
126
+ | `fps` | float | No | Frames per second, range [0.1, 10], default: 2 |
127
+ | `media_resolution` | string | No | Resolution: "default" or "max" |
128
+ | `system_prompt` | string | No | Custom system prompt |
129
+ | `max_tokens` | integer | No | Max output length (default: 32768) |
130
+
131
+ - **Supported formats**: MP4, MOV, AVI, WMV
132
+ - **Size limit**: URL 300MB, Base64 50MB
133
+
134
+ ## Examples
135
+
136
+ ### Image Understanding
137
+
138
+ ```python
139
+ # URL
140
+ await understand_image(prompt="Describe this image", image_url="https://example.com/image.jpg")
141
+
142
+ # Local file
143
+ await understand_image(prompt="What text is in this?", image_path="/path/to/screenshot.png")
144
+
145
+ # Multiple images
146
+ await understand_image(prompt="Compare these", image_urls=["url1", "url2"])
147
+ ```
148
+
149
+ ### Audio Understanding
150
+
151
+ ```python
152
+ # URL
153
+ await understand_audio(prompt="Transcribe this audio", audio_url="https://example.com/audio.wav")
154
+
155
+ # Local file
156
+ await understand_audio(prompt="What is being said?", audio_path="/path/to/audio.mp3")
157
+ ```
158
+
159
+ ### Video Understanding
160
+
161
+ ```python
162
+ # URL with default settings
163
+ await understand_video(prompt="Describe this video", video_url="https://example.com/video.mp4")
164
+
165
+ # URL with custom fps and resolution
166
+ await understand_video(
167
+ prompt="Describe the action",
168
+ video_url="https://example.com/video.mp4",
169
+ fps=5.0,
170
+ media_resolution="max"
171
+ )
172
+ ```
@@ -0,0 +1,162 @@
1
+ # MiMo Multimodal Understanding MCP Server
2
+
3
+ MCP server for Xiaomi MiMo multimodal understanding API (image, audio, video).
4
+
5
+ ## Features
6
+
7
+ - **Image Understanding**: Single/multiple images, URL and local file support
8
+ - **Audio Understanding**: Single/multiple audio files, URL and local file support
9
+ - **Video Understanding**: Single/multiple videos, URL and local file support, configurable fps and resolution
10
+
11
+ ## Setup
12
+
13
+ ### 1. Install dependencies
14
+
15
+ ```bash
16
+ uv sync
17
+ ```
18
+
19
+ ### 2. Configure API Key
20
+
21
+ Copy `.env.example` to `.env` and fill in your API key:
22
+
23
+ ```bash
24
+ cp .env.example .env
25
+ ```
26
+
27
+ Or set the environment variable directly:
28
+
29
+ ```bash
30
+ export MIMO_API_KEY=your_api_key_here
31
+ ```
32
+
33
+ Get your API key from: https://platform.xiaomimimo.com
34
+
35
+ ### 3. (Optional) Configure API Base URL
36
+
37
+ By default, the server uses `https://api.xiaomimimo.com/v1`. To use a different API endpoint:
38
+
39
+ ```bash
40
+ export MIMO_API_BASE=https://your-custom-endpoint/v1
41
+ ```
42
+
43
+ Or add it to your `.env` file:
44
+
45
+ ```
46
+ MIMO_API_BASE=https://your-custom-endpoint/v1
47
+ ```
48
+
49
+ ## Usage
50
+
51
+ ### Development mode (with MCP Inspector)
52
+
53
+ ```bash
54
+ uv run mcp dev src/mimo_multimodal_mcp/server.py
55
+ ```
56
+
57
+ ### Install to Claude Desktop
58
+
59
+ ```bash
60
+ uv run mcp install src/mimo_multimodal_mcp/server.py
61
+ ```
62
+
63
+ ### Direct execution
64
+
65
+ ```bash
66
+ uv run python src/mimo_multimodal_mcp/server.py
67
+ ```
68
+
69
+ ## Tools
70
+
71
+ ### `understand_image`
72
+
73
+ Analyze images using Xiaomi MiMo multimodal model.
74
+
75
+ | Parameter | Type | Required | Description |
76
+ |-----------|------|----------|-------------|
77
+ | `prompt` | string | Yes | Image understanding task description |
78
+ | `image_url` | string | No | Single image URL or base64-encoded `data:image/...` URI |
79
+ | `image_path` | string | No | Single local image file path |
80
+ | `image_urls` | list[string] | No | Multiple image URLs |
81
+ | `image_paths` | list[string] | No | Multiple local image file paths |
82
+ | `system_prompt` | string | No | Custom system prompt |
83
+ | `max_tokens` | integer | No | Max output length (default: 32768) |
84
+
85
+ - **Supported formats**: JPEG, PNG, GIF, WebP
86
+ - **Size limit**: 10MB
87
+
88
+ ### `understand_audio`
89
+
90
+ Analyze audio using Xiaomi MiMo multimodal model.
91
+
92
+ | Parameter | Type | Required | Description |
93
+ |-----------|------|----------|-------------|
94
+ | `prompt` | string | Yes | Audio understanding task description |
95
+ | `audio_url` | string | No | Single audio URL |
96
+ | `audio_path` | string | No | Single local audio file path |
97
+ | `audio_urls` | list[string] | No | Multiple audio URLs |
98
+ | `audio_paths` | list[string] | No | Multiple local audio file paths |
99
+ | `system_prompt` | string | No | Custom system prompt |
100
+ | `max_tokens` | integer | No | Max output length (default: 32768) |
101
+
102
+ - **Supported formats**: MP3, WAV, FLAC, M4A, OGG
103
+ - **Size limit**: URL 100MB, Base64 50MB
104
+
105
+ ### `understand_video`
106
+
107
+ Analyze video using Xiaomi MiMo multimodal model.
108
+
109
+ | Parameter | Type | Required | Description |
110
+ |-----------|------|----------|-------------|
111
+ | `prompt` | string | Yes | Video understanding task description |
112
+ | `video_url` | string | No | Single video URL |
113
+ | `video_path` | string | No | Single local video file path |
114
+ | `video_urls` | list[string] | No | Multiple video URLs |
115
+ | `video_paths` | list[string] | No | Multiple local video file paths |
116
+ | `fps` | float | No | Frames per second, range [0.1, 10], default: 2 |
117
+ | `media_resolution` | string | No | Resolution: "default" or "max" |
118
+ | `system_prompt` | string | No | Custom system prompt |
119
+ | `max_tokens` | integer | No | Max output length (default: 32768) |
120
+
121
+ - **Supported formats**: MP4, MOV, AVI, WMV
122
+ - **Size limit**: URL 300MB, Base64 50MB
123
+
124
+ ## Examples
125
+
126
+ ### Image Understanding
127
+
128
+ ```python
129
+ # URL
130
+ await understand_image(prompt="Describe this image", image_url="https://example.com/image.jpg")
131
+
132
+ # Local file
133
+ await understand_image(prompt="What text is in this?", image_path="/path/to/screenshot.png")
134
+
135
+ # Multiple images
136
+ await understand_image(prompt="Compare these", image_urls=["url1", "url2"])
137
+ ```
138
+
139
+ ### Audio Understanding
140
+
141
+ ```python
142
+ # URL
143
+ await understand_audio(prompt="Transcribe this audio", audio_url="https://example.com/audio.wav")
144
+
145
+ # Local file
146
+ await understand_audio(prompt="What is being said?", audio_path="/path/to/audio.mp3")
147
+ ```
148
+
149
+ ### Video Understanding
150
+
151
+ ```python
152
+ # URL with default settings
153
+ await understand_video(prompt="Describe this video", video_url="https://example.com/video.mp4")
154
+
155
+ # URL with custom fps and resolution
156
+ await understand_video(
157
+ prompt="Describe the action",
158
+ video_url="https://example.com/video.mp4",
159
+ fps=5.0,
160
+ media_resolution="max"
161
+ )
162
+ ```
@@ -0,0 +1,21 @@
1
+ [project]
2
+ name = "mimo-multimodal-mcp"
3
+ version = "0.3.0"
4
+ description = "MCP server for Xiaomi MiMo multimodal understanding (image, audio, video)"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "mcp[cli]>=1.2.0",
9
+ "openai>=1.0.0",
10
+ "python-dotenv>=1.0.0",
11
+ ]
12
+
13
+ [project.scripts]
14
+ mimo-multimodal-mcp = "mimo_multimodal_mcp.server:main"
15
+
16
+ [build-system]
17
+ requires = ["hatchling"]
18
+ build-backend = "hatchling.build"
19
+
20
+ [tool.hatch.build.targets.wheel]
21
+ packages = ["src/mimo_multimodal_mcp"]
@@ -0,0 +1,5 @@
1
+ """MiMo Multimodal Understanding MCP Server."""
2
+
3
+ from .server import mcp
4
+
5
+ __all__ = ["mcp"]