pipecat-ai-mcp-server 0.0.4__tar.gz → 0.0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. pipecat_ai_mcp_server-0.0.11/.claude/skills/pipecat/SKILL.md +35 -0
  2. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/.github/ISSUE_TEMPLATE/1-bug_report.yml +0 -9
  3. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/.github/ISSUE_TEMPLATE/2-question.yml +0 -9
  4. pipecat_ai_mcp_server-0.0.11/CHANGELOG.md +109 -0
  5. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/PKG-INFO +35 -57
  6. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/README.md +24 -52
  7. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/pyproject.toml +10 -2
  8. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_ai_mcp_server.egg-info/PKG-INFO +35 -57
  9. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_ai_mcp_server.egg-info/SOURCES.txt +7 -2
  10. pipecat_ai_mcp_server-0.0.11/src/pipecat_ai_mcp_server.egg-info/requires.txt +22 -0
  11. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_mcp_server/agent.py +72 -33
  12. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_mcp_server/agent_ipc.py +3 -8
  13. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_mcp_server/bot.py +12 -0
  14. pipecat_ai_mcp_server-0.0.11/src/pipecat_mcp_server/processors/kokoro_tts.py +177 -0
  15. pipecat_ai_mcp_server-0.0.11/src/pipecat_mcp_server/processors/screen_capture/__init__.py +11 -0
  16. pipecat_ai_mcp_server-0.0.11/src/pipecat_mcp_server/processors/screen_capture/base_capture_backend.py +86 -0
  17. pipecat_ai_mcp_server-0.0.11/src/pipecat_mcp_server/processors/screen_capture/linux_x11_capture_backend.py +257 -0
  18. pipecat_ai_mcp_server-0.0.11/src/pipecat_mcp_server/processors/screen_capture/macos_capture_backend.py +303 -0
  19. pipecat_ai_mcp_server-0.0.11/src/pipecat_mcp_server/processors/screen_capture/screen_capture_processor.py +147 -0
  20. pipecat_ai_mcp_server-0.0.11/src/pipecat_mcp_server/processors/vision.py +62 -0
  21. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_mcp_server/server.py +46 -0
  22. pipecat_ai_mcp_server-0.0.4/.claude/skills/pipecat/SKILL.md +0 -26
  23. pipecat_ai_mcp_server-0.0.4/CHANGELOG.md +0 -38
  24. pipecat_ai_mcp_server-0.0.4/src/pipecat_ai_mcp_server.egg-info/requires.txt +0 -11
  25. pipecat_ai_mcp_server-0.0.4/src/pipecat_mcp_server/processors/screen_capture.py +0 -240
  26. pipecat_ai_mcp_server-0.0.4/uv.lock +0 -7265
  27. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/.github/ISSUE_TEMPLATE/3-feature_request.yml +0 -0
  28. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  29. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  30. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/.github/workflows/build.yaml +0 -0
  31. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/.github/workflows/format.yaml +0 -0
  32. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/.github/workflows/publish.yaml +0 -0
  33. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/.gitignore +0 -0
  34. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/CHANGELOG.md.template +0 -0
  35. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/LICENSE +0 -0
  36. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/pipecat.png +0 -0
  37. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/setup.cfg +0 -0
  38. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_ai_mcp_server.egg-info/dependency_links.txt +0 -0
  39. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_ai_mcp_server.egg-info/entry_points.txt +0 -0
  40. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_ai_mcp_server.egg-info/top_level.txt +0 -0
  41. {pipecat_ai_mcp_server-0.0.4 → pipecat_ai_mcp_server-0.0.11}/src/pipecat_mcp_server/processors/__init__.py +0 -0
@@ -0,0 +1,35 @@
1
+ ---
2
+ name: pipecat
3
+ description: Start a voice conversation using the Pipecat MCP server
4
+ ---
5
+
6
+ Start a voice conversation using the Pipecat MCP server.
7
+
8
+ ## Flow
9
+
10
+ 1. Print a nicely formatted message with bullet points in the terminal with the following information:
11
+ - The voice session is starting
12
+ - Once ready, they can connect via the transport of their choice (Pipecat Playground, Daily room, or phone call)
13
+ - Models are downloaded on the first user connection, so the first connection may take a moment
14
+ - If the connection is not established and the user cannot hear any audio, they should check the terminal for errors from the Pipecat MCP server
15
+ 2. Call `start()` to initialize the voice agent
16
+ 3. Greet the user with `speak()`, then call `listen()` to wait for input
17
+ 4. When the user asks you to perform a task:
18
+ - Acknowledge the request with `speak()` (do NOT call `listen()` yet)
19
+ - Perform the work (edit files, run commands, etc.)
20
+ - IMPORTANT: Call `speak()` frequently to give progress updates — after each significant step (e.g., "Reading the file now", "Making the change", "Done with the first file, moving to the next one"). Never let more than a few tool calls go by in silence.
21
+ - Once the task is complete, use `speak()` to report the result
22
+ - Only then call `listen()` to wait for the next user input
23
+ 5. When the user asks a simple question or makes conversation (no task to perform), respond with `speak()` then immediately call `listen()`
24
+ 6. If the user wants to end the conversation, ask for verbal confirmation before stopping. When in doubt, keep listening.
25
+ 7. Once confirmed, say goodbye with `speak()`, then call `stop()`
26
+
27
+ The key principle: `listen()` means "I'm done and ready for the user to talk." Never call it while you still have work to do or updates to communicate.
28
+
29
+ ## Guidelines
30
+
31
+ - Keep all responses and progress updates to 1-2 short sentences. Brevity is critical for voice.
32
+ - When the user asks you to perform a task (e.g., edit a file, create a PR), verbally acknowledge the request first, then start working on it. Do not work in silence.
33
+ - Before any change (files, PRs, issues, etc.), show the proposed change in the terminal, use `speak()` to ask for verbal confirmation, then call `listen()` to get the user's response before proceeding.
34
+ - When using `list_windows()` and `screen_capture()`, if there are multiple windows for the same app or you're unsure which window the user wants, ask for clarification before capturing.
35
+ - Always call `stop()` when the conversation ends.
@@ -50,15 +50,6 @@ body:
50
50
  validations:
51
51
  required: true
52
52
 
53
- - type: input
54
- id: browser
55
- attributes:
56
- label: Browser
57
- description: Which browser are you using?
58
- placeholder: e.g., Chrome 139.0.7258.127
59
- validations:
60
- required: true
61
-
62
53
  - type: textarea
63
54
  id: description
64
55
  attributes:
@@ -50,15 +50,6 @@ body:
50
50
  validations:
51
51
  required: false
52
52
 
53
- - type: input
54
- id: browser
55
- attributes:
56
- label: Browser
57
- description: Which browser are you using?
58
- placeholder: e.g., Chrome 139.0.7258.127
59
- validations:
60
- required: false
61
-
62
53
  - type: textarea
63
54
  id: question
64
55
  attributes:
@@ -0,0 +1,109 @@
1
+ # Changelog
2
+
3
+ All notable changes to **Pipecat MCP Server** will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.0.11] - 2026-02-02
9
+
10
+ ### Added
11
+
12
+ - New `capture_screenshot()` MCP tool that captures the current screen frame and
13
+ returns an image path. This allows the agent to visually analyze what's on
14
+ screen and help with debugging, UI feedback, and more.
15
+
16
+ ## [0.0.10] - 2026-02-01
17
+
18
+ ### Added
19
+
20
+ - New `list_windows()` MCP tool to list all open windows with title, app name,
21
+ and window ID.
22
+
23
+ - New `screen_capture(window_id)` MCP tool to start or switch screen capture to
24
+ a specific window or full screen during a voice conversation.
25
+
26
+ ### Changed
27
+
28
+ - Screen capture dependencies are now included by default (no longer an optional
29
+ `[screen]` extra).
30
+
31
+ - Screen capture is no longer configured via environment variables
32
+ (`PIPECAT_MCP_SERVER_SCREEN_CAPTURE`, `PIPECAT_MCP_SERVER_SCREEN_WINDOW`).
33
+ Use the `list_windows()` and `screen_capture()` tools instead.
34
+
35
+ ## [0.0.9] - 2026-01-31
36
+
37
+ ### Changed
38
+
39
+ - Linux X11 screen capture backend using python-xlib.
40
+
41
+ - Native macOS screen capture using ScreenCaptureKit. Supports true window-level
42
+ capture not affected by overlapping windows.
43
+
44
+ ## [0.0.8] - 2026-01-31
45
+
46
+ ### Changed
47
+
48
+ - Updated to Pipecat >= 0.0.101.
49
+
50
+ ## [0.0.7] - 2026-01-31
51
+
52
+ ### Changed
53
+
54
+ - `KokoroTTSService` now uses `kokoro-onnx`.
55
+
56
+ ## [0.0.6] - 2026-01-29
57
+
58
+ ### Added
59
+
60
+ - Added `KokoroTTSService` processor.
61
+
62
+ - Added noise cancellation with `RNNoiseFilter`.
63
+
64
+ - Simplified the `/pipecat` skill instructions.
65
+
66
+ ### Changed
67
+
68
+ - Replaced third-party STT/TTS services (Deepgram, Cartesia) with local models:
69
+ Faster Whisper for speech-to-text and Kokoro for text-to-speech. No API keys
70
+ required.
71
+
72
+ ## [0.0.5] - 2026-01-28
73
+
74
+ ### Fixed
75
+
76
+ - Fixed an issue that would cause an MCP session to crash and would force the
77
+ MCP client to reconnect each time.
78
+
79
+ ## [0.0.4] - 2026-01-26
80
+
81
+ ### Fixed
82
+
83
+ - Fixed an issue where Daily clients couldn't reconnect after disconnecting.
84
+
85
+ ## [0.0.3] - 2026-01-26
86
+
87
+ ### Fixed
88
+
89
+ - Fixed premature exit of the `/pipecat` skill when user responds with phrases
90
+ like "no", "nothing", or "that's it" instead of explicit ending phrases.
91
+
92
+ - Fixed an issue where WebRTC clients couldn't reconnect after disconnecting.
93
+ The agent now properly handles disconnect/reconnect cycles.
94
+
95
+ - Fixed an issue where `pipecat-mcp-server` could hang indefinitely after
96
+ pressing Ctrl-C.
97
+
98
+ ## [0.0.2] - 2026-01-26
99
+
100
+ ### Fixed
101
+
102
+ - Fixed an issue that would cause the Pipecat agent to not load if the optional
103
+ `daily` dependency was not installed.
104
+
105
+ - Added missing support for `telnyx`, `plivo` and `exotel` telephony providers.
106
+
107
+ ## [0.0.1] - 2026-01-26
108
+
109
+ Initial public release.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pipecat-ai-mcp-server
3
- Version: 0.0.4
3
+ Version: 0.0.11
4
4
  Summary: Pipecat MCP server for your AI agents
5
5
  License-Expression: BSD-2-Clause
6
6
  Project-URL: Homepage, https://pipecat.ai
@@ -18,15 +18,21 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
18
  Requires-Python: >=3.10
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
+ Requires-Dist: kokoro-onnx<1,>=0.5.0
21
22
  Requires-Dist: loguru<1,>=0.7.0
22
23
  Requires-Dist: mcp>=1.0.0
23
- Requires-Dist: pipecat-ai[cartesia,deepgram,local-smart-turn-v3,runner,silero,webrtc,websocket]>=0.0.100
24
+ Requires-Dist: pip>=25.3
25
+ Requires-Dist: pipecat-ai[cartesia,deepgram,local-smart-turn-v3,rnnoise,runner,silero,webrtc,websocket]>=0.0.101
26
+ Requires-Dist: pipecat-ai[mlx-whisper]>=0.0.100; sys_platform == "darwin"
27
+ Requires-Dist: pipecat-ai[whisper]>=0.0.100; sys_platform != "darwin"
28
+ Requires-Dist: pyobjc-framework-CoreMedia>=11.0; sys_platform == "darwin"
29
+ Requires-Dist: pyobjc-framework-Quartz>=11.0; sys_platform == "darwin"
30
+ Requires-Dist: pyobjc-framework-ScreenCaptureKit>=11.0; sys_platform == "darwin"
24
31
  Requires-Dist: python-dotenv<2,>=1.0.0
32
+ Requires-Dist: python-xlib>=0.33; sys_platform == "linux"
33
+ Requires-Dist: requests<3,>=2.32.5
25
34
  Provides-Extra: daily
26
35
  Requires-Dist: daily-python~=0.23.0; extra == "daily"
27
- Provides-Extra: screen
28
- Requires-Dist: mss>=10.0.0; extra == "screen"
29
- Requires-Dist: pywinctl>=0.4; extra == "screen"
30
36
  Dynamic: license-file
31
37
 
32
38
  <h1><div align="center">
@@ -39,16 +45,18 @@ Dynamic: license-file
39
45
 
40
46
  Pipecat MCP Server gives your AI agents a voice using [Pipecat](https://github.com/pipecat-ai/pipecat). It should work with any [MCP](https://modelcontextprotocol.io/)-compatible client:
41
47
 
42
- The Pipecat MCP Server exposes **voice-related tools** (`start`, `listen`, `speak`, `stop`) to MCP-compatible clients, but **it does not itself provide microphone or speaker access**.
48
+ The Pipecat MCP Server exposes **voice-related** and **screen capture** tools to MCP-compatible clients, but **it does not itself provide microphone or speaker access**.
43
49
 
44
- Audio input/output is handled by a **separate audio transport**, such as:
50
+ Audio input/output is handled by a **separate audio/video transport**, such as:
45
51
 
46
52
  - **Pipecat Playground** (local browser UI)
47
53
  - **Daily** (WebRTC room)
48
54
  - **Phone providers** (Twilio, Telnyx, etc.)
49
55
 
50
56
  > **MCP clients like Cursor, Claude Code, and Codex control the agent, but they are not audio devices.**
51
- > To hear or speak, you must also connect via one of the audio transports.
57
+ > To hear, speak, or see, you must connect via one of the audio/video transports.
58
+
59
+ <p align="center"><video src="https://github.com/user-attachments/assets/0ad14e37-2de7-46df-870a-167aa667df16" width="500" controls></video></p>
52
60
 
53
61
  ## 🧭 Getting started
54
62
 
@@ -56,9 +64,8 @@ Audio input/output is handled by a **separate audio transport**, such as:
56
64
 
57
65
  - Python 3.10 or later
58
66
  - [uv](https://docs.astral.sh/uv/getting-started/installation/) package manager
59
- - API keys for third-party services (Speech-to-Text, Text-to-Speech, ...)
60
67
 
61
- By default, the voice agent uses [Deepgram](https://deepgram.com) for speech-to-text and [Cartesia](https://cartesia.ai/) for text-to-speech.
68
+ By default, the voice agent uses local models (no API keys required): [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) for speech-to-text and [Kokoro](https://github.com/hexgrad/kokoro) for text-to-speech. The Whisper models are approximately 1.5 GB and are downloaded automatically on the first connection, so the initial startup may take a moment.
62
69
 
63
70
  ### Installation
64
71
 
@@ -82,14 +89,7 @@ uv tool install -e /path/to/repo/pipecat-mcp-server
82
89
 
83
90
  ## Running the server
84
91
 
85
- First, set your API keys as environment variables:
86
-
87
- ```bash
88
- export DEEPGRAM_API_KEY=your-deepgram-key
89
- export CARTESIA_API_KEY=your-cartesia-key
90
- ```
91
-
92
- Then start the server:
92
+ Start the server:
93
93
 
94
94
  ```bash
95
95
  pipecat-mcp-server
@@ -109,6 +109,20 @@ The [Pipecat skill](.claude/skills/pipecat/SKILL.md) provides a better voice con
109
109
 
110
110
  Alternatively, just tell your agent something like `Let's have a voice conversation`. In this case, the agent won't ask for verbal confirmation before making changes.
111
111
 
112
+ ## 🖥️ Screen Capture & Analysis
113
+
114
+ Screen capture lets you stream your screen (or a specific window) to your configured transport, and ask the agent to help with what it sees.
115
+
116
+ For example:
117
+ - *"capture my browser window"* — starts streaming that window
118
+ - *"what's causing this error?"* — the agent analyzes the screen and helps debug
119
+ - *"how does this UI look?"* — get feedback on your design
120
+
121
+ **Supported platforms:**
122
+
123
+ - **macOS** — uses ScreenCaptureKit for true window-level capture (not affected by overlapping windows)
124
+ - **Linux (X11)** — uses Xlib for window and full-screen capture
125
+
112
126
  ## 💻 MCP Client: Claude Code
113
127
 
114
128
  ### Adding the MCP server
@@ -230,11 +244,11 @@ First, install the server with the Daily dependency:
230
244
  uv tool install pipecat-ai-mcp-server[daily]
231
245
  ```
232
246
 
233
- Then, set the `DAILY_API_KEY` environment variable to your Daily API key and `DAILY_SAMPLE_ROOM_URL` to your desired Daily room URL and pass the `-d` argument to `pipecat-mcp-server`.
247
+ Then, set the `DAILY_API_KEY` environment variable to your Daily API key and `DAILY_ROOM_URL` to your desired Daily room URL and pass the `-d` argument to `pipecat-mcp-server`.
234
248
 
235
249
  ```bash
236
250
  export DAILY_API_KEY=your-daily-api-key
237
- export DAILY_SAMPLE_ROOM_URL=your-daily-room
251
+ export DAILY_ROOM_URL=your-daily-room
238
252
 
239
253
  pipecat-mcp-server -d
240
254
  ```
@@ -271,45 +285,9 @@ pipecat-mcp-server -t twilio -x your-proxy.ngrok.app
271
285
 
272
286
  Configure your provider's phone number to point to your ngrok URL, then call your number to connect.
273
287
 
274
- ## 🧪 Screen Capture (Experimental)
275
-
276
- You can enable screen capture to stream your screen (or a specific window) to the Pipecat Playground or Daily room. This lets you see what's happening on your computer remotely while having a voice conversation with the agent.
277
-
278
- First, install the server with the screen capture dependency:
279
-
280
- ```bash
281
- uv tool install "pipecat-ai-mcp-server[screen]"
282
- ```
283
-
284
- Then, define the following environment variables:
285
-
286
- | Variable | Description |
287
- |-------------------------------------|--------------------------------------------------------------------|
288
- | `PIPECAT_MCP_SERVER_SCREEN_CAPTURE` | Set to any value (e.g., `1`) to enable screen capture |
289
- | `PIPECAT_MCP_SERVER_SCREEN_WINDOW` | Optional. Window name to capture (partial match, case-insensitive) |
290
-
291
- For example, to capture your entire primary monitor:
292
-
293
- ```bash
294
- export PIPECAT_MCP_SERVER_SCREEN_CAPTURE=1
295
-
296
- pipecat-mcp-server
297
- ```
298
-
299
- And to capture a specific window:
300
-
301
- ```bash
302
- export PIPECAT_MCP_SERVER_SCREEN_CAPTURE=1
303
- export PIPECAT_MCP_SERVER_SCREEN_WINDOW="claude"
304
-
305
- pipecat-mcp-server
306
- ```
307
-
308
- > ℹ️ **Note:** Window capture is based on window coordinates, not content. If another window overlaps the target, the overlapping content will be captured. The capture region updates dynamically if the window is moved. If the specified window is not found, capture falls back to the full screen.
309
-
310
288
  ## 📚 What's Next?
311
289
 
312
- - **Customize services**: Edit `agent.py` to use different STT/TTS providers (ElevenLabs, OpenAI, etc.)
290
+ - **Customize services**: Edit `agent.py` to use different STT/TTS providers
313
291
  - **Change transport**: Configure for Twilio, WebRTC, or other transports
314
292
  - **Add to your project**: Use this as a template for voice-enabled MCP tools
315
293
  - **Learn more**: Check out [Pipecat's docs](https://docs.pipecat.ai/) for advanced features
@@ -8,16 +8,18 @@
8
8
 
9
9
  Pipecat MCP Server gives your AI agents a voice using [Pipecat](https://github.com/pipecat-ai/pipecat). It should work with any [MCP](https://modelcontextprotocol.io/)-compatible client:
10
10
 
11
- The Pipecat MCP Server exposes **voice-related tools** (`start`, `listen`, `speak`, `stop`) to MCP-compatible clients, but **it does not itself provide microphone or speaker access**.
11
+ The Pipecat MCP Server exposes **voice-related** and **screen capture** tools to MCP-compatible clients, but **it does not itself provide microphone or speaker access**.
12
12
 
13
- Audio input/output is handled by a **separate audio transport**, such as:
13
+ Audio input/output is handled by a **separate audio/video transport**, such as:
14
14
 
15
15
  - **Pipecat Playground** (local browser UI)
16
16
  - **Daily** (WebRTC room)
17
17
  - **Phone providers** (Twilio, Telnyx, etc.)
18
18
 
19
19
  > **MCP clients like Cursor, Claude Code, and Codex control the agent, but they are not audio devices.**
20
- > To hear or speak, you must also connect via one of the audio transports.
20
+ > To hear, speak, or see, you must connect via one of the audio/video transports.
21
+
22
+ <p align="center"><video src="https://github.com/user-attachments/assets/0ad14e37-2de7-46df-870a-167aa667df16" width="500" controls></video></p>
21
23
 
22
24
  ## 🧭 Getting started
23
25
 
@@ -25,9 +27,8 @@ Audio input/output is handled by a **separate audio transport**, such as:
25
27
 
26
28
  - Python 3.10 or later
27
29
  - [uv](https://docs.astral.sh/uv/getting-started/installation/) package manager
28
- - API keys for third-party services (Speech-to-Text, Text-to-Speech, ...)
29
30
 
30
- By default, the voice agent uses [Deepgram](https://deepgram.com) for speech-to-text and [Cartesia](https://cartesia.ai/) for text-to-speech.
31
+ By default, the voice agent uses local models (no API keys required): [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) for speech-to-text and [Kokoro](https://github.com/hexgrad/kokoro) for text-to-speech. The Whisper models are approximately 1.5 GB and are downloaded automatically on the first connection, so the initial startup may take a moment.
31
32
 
32
33
  ### Installation
33
34
 
@@ -51,14 +52,7 @@ uv tool install -e /path/to/repo/pipecat-mcp-server
51
52
 
52
53
  ## Running the server
53
54
 
54
- First, set your API keys as environment variables:
55
-
56
- ```bash
57
- export DEEPGRAM_API_KEY=your-deepgram-key
58
- export CARTESIA_API_KEY=your-cartesia-key
59
- ```
60
-
61
- Then start the server:
55
+ Start the server:
62
56
 
63
57
  ```bash
64
58
  pipecat-mcp-server
@@ -78,6 +72,20 @@ The [Pipecat skill](.claude/skills/pipecat/SKILL.md) provides a better voice con
78
72
 
79
73
  Alternatively, just tell your agent something like `Let's have a voice conversation`. In this case, the agent won't ask for verbal confirmation before making changes.
80
74
 
75
+ ## 🖥️ Screen Capture & Analysis
76
+
77
+ Screen capture lets you stream your screen (or a specific window) to your configured transport, and ask the agent to help with what it sees.
78
+
79
+ For example:
80
+ - *"capture my browser window"* — starts streaming that window
81
+ - *"what's causing this error?"* — the agent analyzes the screen and helps debug
82
+ - *"how does this UI look?"* — get feedback on your design
83
+
84
+ **Supported platforms:**
85
+
86
+ - **macOS** — uses ScreenCaptureKit for true window-level capture (not affected by overlapping windows)
87
+ - **Linux (X11)** — uses Xlib for window and full-screen capture
88
+
81
89
  ## 💻 MCP Client: Claude Code
82
90
 
83
91
  ### Adding the MCP server
@@ -199,11 +207,11 @@ First, install the server with the Daily dependency:
199
207
  uv tool install pipecat-ai-mcp-server[daily]
200
208
  ```
201
209
 
202
- Then, set the `DAILY_API_KEY` environment variable to your Daily API key and `DAILY_SAMPLE_ROOM_URL` to your desired Daily room URL and pass the `-d` argument to `pipecat-mcp-server`.
210
+ Then, set the `DAILY_API_KEY` environment variable to your Daily API key and `DAILY_ROOM_URL` to your desired Daily room URL and pass the `-d` argument to `pipecat-mcp-server`.
203
211
 
204
212
  ```bash
205
213
  export DAILY_API_KEY=your-daily-api-key
206
- export DAILY_SAMPLE_ROOM_URL=your-daily-room
214
+ export DAILY_ROOM_URL=your-daily-room
207
215
 
208
216
  pipecat-mcp-server -d
209
217
  ```
@@ -240,45 +248,9 @@ pipecat-mcp-server -t twilio -x your-proxy.ngrok.app
240
248
 
241
249
  Configure your provider's phone number to point to your ngrok URL, then call your number to connect.
242
250
 
243
- ## 🧪 Screen Capture (Experimental)
244
-
245
- You can enable screen capture to stream your screen (or a specific window) to the Pipecat Playground or Daily room. This lets you see what's happening on your computer remotely while having a voice conversation with the agent.
246
-
247
- First, install the server with the screen capture dependency:
248
-
249
- ```bash
250
- uv tool install "pipecat-ai-mcp-server[screen]"
251
- ```
252
-
253
- Then, define the following environment variables:
254
-
255
- | Variable | Description |
256
- |-------------------------------------|--------------------------------------------------------------------|
257
- | `PIPECAT_MCP_SERVER_SCREEN_CAPTURE` | Set to any value (e.g., `1`) to enable screen capture |
258
- | `PIPECAT_MCP_SERVER_SCREEN_WINDOW` | Optional. Window name to capture (partial match, case-insensitive) |
259
-
260
- For example, to capture your entire primary monitor:
261
-
262
- ```bash
263
- export PIPECAT_MCP_SERVER_SCREEN_CAPTURE=1
264
-
265
- pipecat-mcp-server
266
- ```
267
-
268
- And to capture a specific window:
269
-
270
- ```bash
271
- export PIPECAT_MCP_SERVER_SCREEN_CAPTURE=1
272
- export PIPECAT_MCP_SERVER_SCREEN_WINDOW="claude"
273
-
274
- pipecat-mcp-server
275
- ```
276
-
277
- > ℹ️ **Note:** Window capture is based on window coordinates, not content. If another window overlaps the target, the overlapping content will be captured. The capture region updates dynamically if the window is moved. If the specified window is not found, capture falls back to the full screen.
278
-
279
251
  ## 📚 What's Next?
280
252
 
281
- - **Customize services**: Edit `agent.py` to use different STT/TTS providers (ElevenLabs, OpenAI, etc.)
253
+ - **Customize services**: Edit `agent.py` to use different STT/TTS providers
282
254
  - **Change transport**: Configure for Twilio, WebRTC, or other transports
283
255
  - **Add to your project**: Use this as a template for voice-enabled MCP tools
284
256
  - **Learn more**: Check out [Pipecat's docs](https://docs.pipecat.ai/) for advanced features
@@ -20,10 +20,19 @@ classifiers = [
20
20
  "Topic :: Scientific/Engineering :: Artificial Intelligence"
21
21
  ]
22
22
  dependencies = [
23
+ "kokoro-onnx>=0.5.0,<1",
23
24
  "loguru>=0.7.0,<1",
24
25
  "mcp>=1.0.0",
25
- "pipecat-ai[cartesia,deepgram,local-smart-turn-v3,runner,silero,webrtc,websocket]>=0.0.100",
26
+ "pip>=25.3",
27
+ "pipecat-ai[cartesia,deepgram,local-smart-turn-v3,rnnoise,runner,silero,webrtc,websocket]>=0.0.101",
28
+ "pipecat-ai[mlx-whisper]>=0.0.100; sys_platform == 'darwin'",
29
+ "pipecat-ai[whisper]>=0.0.100; sys_platform != 'darwin'",
30
+ "pyobjc-framework-CoreMedia>=11.0; sys_platform == 'darwin'",
31
+ "pyobjc-framework-Quartz>=11.0; sys_platform == 'darwin'",
32
+ "pyobjc-framework-ScreenCaptureKit>=11.0; sys_platform == 'darwin'",
26
33
  "python-dotenv>=1.0.0,<2",
34
+ "python-xlib>=0.33; sys_platform == 'linux'",
35
+ "requests>=2.32.5,<3",
27
36
  ]
28
37
 
29
38
  [project.urls]
@@ -35,7 +44,6 @@ Changelog = "https://github.com/pipecat-ai/pipecat-mcp-server/blob/main/CHANGELO
35
44
 
36
45
  [project.optional-dependencies]
37
46
  daily = [ "daily-python~=0.23.0" ]
38
- screen = [ "mss>=10.0.0", "pywinctl>=0.4" ]
39
47
 
40
48
  [dependency-groups]
41
49
  dev = [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pipecat-ai-mcp-server
3
- Version: 0.0.4
3
+ Version: 0.0.11
4
4
  Summary: Pipecat MCP server for your AI agents
5
5
  License-Expression: BSD-2-Clause
6
6
  Project-URL: Homepage, https://pipecat.ai
@@ -18,15 +18,21 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
18
  Requires-Python: >=3.10
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
+ Requires-Dist: kokoro-onnx<1,>=0.5.0
21
22
  Requires-Dist: loguru<1,>=0.7.0
22
23
  Requires-Dist: mcp>=1.0.0
23
- Requires-Dist: pipecat-ai[cartesia,deepgram,local-smart-turn-v3,runner,silero,webrtc,websocket]>=0.0.100
24
+ Requires-Dist: pip>=25.3
25
+ Requires-Dist: pipecat-ai[cartesia,deepgram,local-smart-turn-v3,rnnoise,runner,silero,webrtc,websocket]>=0.0.101
26
+ Requires-Dist: pipecat-ai[mlx-whisper]>=0.0.100; sys_platform == "darwin"
27
+ Requires-Dist: pipecat-ai[whisper]>=0.0.100; sys_platform != "darwin"
28
+ Requires-Dist: pyobjc-framework-CoreMedia>=11.0; sys_platform == "darwin"
29
+ Requires-Dist: pyobjc-framework-Quartz>=11.0; sys_platform == "darwin"
30
+ Requires-Dist: pyobjc-framework-ScreenCaptureKit>=11.0; sys_platform == "darwin"
24
31
  Requires-Dist: python-dotenv<2,>=1.0.0
32
+ Requires-Dist: python-xlib>=0.33; sys_platform == "linux"
33
+ Requires-Dist: requests<3,>=2.32.5
25
34
  Provides-Extra: daily
26
35
  Requires-Dist: daily-python~=0.23.0; extra == "daily"
27
- Provides-Extra: screen
28
- Requires-Dist: mss>=10.0.0; extra == "screen"
29
- Requires-Dist: pywinctl>=0.4; extra == "screen"
30
36
  Dynamic: license-file
31
37
 
32
38
  <h1><div align="center">
@@ -39,16 +45,18 @@ Dynamic: license-file
39
45
 
40
46
  Pipecat MCP Server gives your AI agents a voice using [Pipecat](https://github.com/pipecat-ai/pipecat). It should work with any [MCP](https://modelcontextprotocol.io/)-compatible client:
41
47
 
42
- The Pipecat MCP Server exposes **voice-related tools** (`start`, `listen`, `speak`, `stop`) to MCP-compatible clients, but **it does not itself provide microphone or speaker access**.
48
+ The Pipecat MCP Server exposes **voice-related** and **screen capture** tools to MCP-compatible clients, but **it does not itself provide microphone or speaker access**.
43
49
 
44
- Audio input/output is handled by a **separate audio transport**, such as:
50
+ Audio input/output is handled by a **separate audio/video transport**, such as:
45
51
 
46
52
  - **Pipecat Playground** (local browser UI)
47
53
  - **Daily** (WebRTC room)
48
54
  - **Phone providers** (Twilio, Telnyx, etc.)
49
55
 
50
56
  > **MCP clients like Cursor, Claude Code, and Codex control the agent, but they are not audio devices.**
51
- > To hear or speak, you must also connect via one of the audio transports.
57
+ > To hear, speak, or see, you must connect via one of the audio/video transports.
58
+
59
+ <p align="center"><video src="https://github.com/user-attachments/assets/0ad14e37-2de7-46df-870a-167aa667df16" width="500" controls></video></p>
52
60
 
53
61
  ## 🧭 Getting started
54
62
 
@@ -56,9 +64,8 @@ Audio input/output is handled by a **separate audio transport**, such as:
56
64
 
57
65
  - Python 3.10 or later
58
66
  - [uv](https://docs.astral.sh/uv/getting-started/installation/) package manager
59
- - API keys for third-party services (Speech-to-Text, Text-to-Speech, ...)
60
67
 
61
- By default, the voice agent uses [Deepgram](https://deepgram.com) for speech-to-text and [Cartesia](https://cartesia.ai/) for text-to-speech.
68
+ By default, the voice agent uses local models (no API keys required): [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) for speech-to-text and [Kokoro](https://github.com/hexgrad/kokoro) for text-to-speech. The Whisper models are approximately 1.5 GB and are downloaded automatically on the first connection, so the initial startup may take a moment.
62
69
 
63
70
  ### Installation
64
71
 
@@ -82,14 +89,7 @@ uv tool install -e /path/to/repo/pipecat-mcp-server
82
89
 
83
90
  ## Running the server
84
91
 
85
- First, set your API keys as environment variables:
86
-
87
- ```bash
88
- export DEEPGRAM_API_KEY=your-deepgram-key
89
- export CARTESIA_API_KEY=your-cartesia-key
90
- ```
91
-
92
- Then start the server:
92
+ Start the server:
93
93
 
94
94
  ```bash
95
95
  pipecat-mcp-server
@@ -109,6 +109,20 @@ The [Pipecat skill](.claude/skills/pipecat/SKILL.md) provides a better voice con
109
109
 
110
110
  Alternatively, just tell your agent something like `Let's have a voice conversation`. In this case, the agent won't ask for verbal confirmation before making changes.
111
111
 
112
+ ## 🖥️ Screen Capture & Analysis
113
+
114
+ Screen capture lets you stream your screen (or a specific window) to your configured transport, and ask the agent to help with what it sees.
115
+
116
+ For example:
117
+ - *"capture my browser window"* — starts streaming that window
118
+ - *"what's causing this error?"* — the agent analyzes the screen and helps debug
119
+ - *"how does this UI look?"* — get feedback on your design
120
+
121
+ **Supported platforms:**
122
+
123
+ - **macOS** — uses ScreenCaptureKit for true window-level capture (not affected by overlapping windows)
124
+ - **Linux (X11)** — uses Xlib for window and full-screen capture
125
+
112
126
  ## 💻 MCP Client: Claude Code
113
127
 
114
128
  ### Adding the MCP server
@@ -230,11 +244,11 @@ First, install the server with the Daily dependency:
230
244
  uv tool install pipecat-ai-mcp-server[daily]
231
245
  ```
232
246
 
233
- Then, set the `DAILY_API_KEY` environment variable to your Daily API key and `DAILY_SAMPLE_ROOM_URL` to your desired Daily room URL and pass the `-d` argument to `pipecat-mcp-server`.
247
+ Then, set the `DAILY_API_KEY` environment variable to your Daily API key and `DAILY_ROOM_URL` to your desired Daily room URL and pass the `-d` argument to `pipecat-mcp-server`.
234
248
 
235
249
  ```bash
236
250
  export DAILY_API_KEY=your-daily-api-key
237
- export DAILY_SAMPLE_ROOM_URL=your-daily-room
251
+ export DAILY_ROOM_URL=your-daily-room
238
252
 
239
253
  pipecat-mcp-server -d
240
254
  ```
@@ -271,45 +285,9 @@ pipecat-mcp-server -t twilio -x your-proxy.ngrok.app
271
285
 
272
286
  Configure your provider's phone number to point to your ngrok URL, then call your number to connect.
273
287
 
274
- ## 🧪 Screen Capture (Experimental)
275
-
276
- You can enable screen capture to stream your screen (or a specific window) to the Pipecat Playground or Daily room. This lets you see what's happening on your computer remotely while having a voice conversation with the agent.
277
-
278
- First, install the server with the screen capture dependency:
279
-
280
- ```bash
281
- uv tool install "pipecat-ai-mcp-server[screen]"
282
- ```
283
-
284
- Then, define the following environment variables:
285
-
286
- | Variable | Description |
287
- |-------------------------------------|--------------------------------------------------------------------|
288
- | `PIPECAT_MCP_SERVER_SCREEN_CAPTURE` | Set to any value (e.g., `1`) to enable screen capture |
289
- | `PIPECAT_MCP_SERVER_SCREEN_WINDOW` | Optional. Window name to capture (partial match, case-insensitive) |
290
-
291
- For example, to capture your entire primary monitor:
292
-
293
- ```bash
294
- export PIPECAT_MCP_SERVER_SCREEN_CAPTURE=1
295
-
296
- pipecat-mcp-server
297
- ```
298
-
299
- And to capture a specific window:
300
-
301
- ```bash
302
- export PIPECAT_MCP_SERVER_SCREEN_CAPTURE=1
303
- export PIPECAT_MCP_SERVER_SCREEN_WINDOW="claude"
304
-
305
- pipecat-mcp-server
306
- ```
307
-
308
- > ℹ️ **Note:** Window capture is based on window coordinates, not content. If another window overlaps the target, the overlapping content will be captured. The capture region updates dynamically if the window is moved. If the specified window is not found, capture falls back to the full screen.
309
-
310
288
  ## 📚 What's Next?
311
289
 
312
- - **Customize services**: Edit `agent.py` to use different STT/TTS providers (ElevenLabs, OpenAI, etc.)
290
+ - **Customize services**: Edit `agent.py` to use different STT/TTS providers
313
291
  - **Change transport**: Configure for Twilio, WebRTC, or other transports
314
292
  - **Add to your project**: Use this as a template for voice-enabled MCP tools
315
293
  - **Learn more**: Check out [Pipecat's docs](https://docs.pipecat.ai/) for advanced features