voice-mcp-server 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +193 -0
- package/build/index.js +51 -0
- package/config/config.yaml +25 -0
- package/config/microphone/live_mic.yaml +1 -0
- package/config/speaker/elevenlabs_speaker.yaml +3 -0
- package/config/speaker/kokoro_speaker.yaml +3 -0
- package/config/stt/mlx_whisper_large_v3.yaml +2 -0
- package/config/vad/ptt_vad.yaml +8 -0
- package/config/vad/silero_vad.yaml +7 -0
- package/package.json +40 -0
- package/requirements.txt +126 -0
- package/src/adapters_real/__init__.py +0 -0
- package/src/adapters_real/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/kokoro_speaker.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/live_mic.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/ptt_vad.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/queue_llm.cpython-312.pyc +0 -0
- package/src/adapters_real/__pycache__/whisper_stt.cpython-312.pyc +0 -0
- package/src/adapters_real/echo_llm.py +28 -0
- package/src/adapters_real/elevenlabs_speaker.py +117 -0
- package/src/adapters_real/kokoro_speaker.py +122 -0
- package/src/adapters_real/live_mic.py +64 -0
- package/src/adapters_real/live_speaker.py +66 -0
- package/src/adapters_real/ptt_vad.py +36 -0
- package/src/adapters_real/queue_llm.py +36 -0
- package/src/adapters_real/silero_vad.py +43 -0
- package/src/adapters_real/wav_mic.py +17 -0
- package/src/adapters_real/whisper_stt.py +32 -0
- package/src/daemon/__init__.py +0 -0
- package/src/daemon/audio_server.py +363 -0
- package/src/index.ts +63 -0
- package/src/mcp_server.py +254 -0
- package/src/simulation/__init__.py +0 -0
- package/src/simulation/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/engine.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/models.cpython-312.pyc +0 -0
- package/src/simulation/__pycache__/ports.cpython-312.pyc +0 -0
- package/src/simulation/adapters.py +131 -0
- package/src/simulation/engine.py +242 -0
- package/src/simulation/models.py +25 -0
- package/src/simulation/ports.py +57 -0
- package/src/simulation/tests/__init__.py +0 -0
- package/src/simulation/tests/test_scenarios.py +510 -0
- package/tsconfig.json +15 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Erick Vazquez Santillan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# 🎙️ Voice MCP Server
|
|
4
|
+
|
|
5
|
+
**Give your AI agents a voice, real ears, and the ability to handle interruptions in real-time.**
|
|
6
|
+
|
|
7
|
+
[](https://www.npmjs.com/package/voice-mcp-server)
|
|
8
|
+
[](#-target-environment)
|
|
9
|
+
[](https://python.org)
|
|
10
|
+
[](https://modelcontextprotocol.io/)
|
|
11
|
+
|
|
12
|
+
</div>
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## 💡 The Pitch
|
|
17
|
+
|
|
18
|
+
Typical AI assistants generate passive blocks of text. **Voice MCP Server** changes the paradigm by granting [Model Context Protocol (MCP)](https://modelcontextprotocol.io/) compatible agents (like Gemini, Claude, or Cursor) the ability to actively **speak and listen in real-time**.
|
|
19
|
+
|
|
20
|
+
Instead of a standard text response, the AI can initiate a **bidirectional voice loop** with you. Featuring blazing-fast local transcription and true human-like **barge-in (interruption) detection**, your AI will naturally stop talking the moment you interrupt it, transcribe what you said, and gracefully pivot the conversation.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## 🏗️ Architecture: The Best of Both Worlds
|
|
25
|
+
|
|
26
|
+
To make this server universally compatible via NPM while securely wielding heavy Python machine-learning models, Voice MCP Server utilizes a hybrid architecture.
|
|
27
|
+
|
|
28
|
+
```mermaid
|
|
29
|
+
graph TD
|
|
30
|
+
Client[MCP Client<br>Cursor, Claude, Gemini] <-->|stdio| NodeBridge(Node.js Bridge<br>npx voice-mcp-server)
|
|
31
|
+
|
|
32
|
+
subgraph Core System
|
|
33
|
+
NodeBridge <-->|Spawns & Pipes stdio| PythonEngine[Python Core Engine<br>fastmcp]
|
|
34
|
+
PythonEngine <-->|Unix Domain Sockets<br>~/Library/Application Support/VoiceMCP| AudioDaemon[Audio & ML Daemon]
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
subgraph Hot-Swappable Adapters
|
|
38
|
+
AudioDaemon -.-> TTS[TTS: say / kokoro / elevenlabs]
|
|
39
|
+
AudioDaemon -.-> STT[STT: mlx_whisper_large_v3]
|
|
40
|
+
AudioDaemon -.-> VAD[VAD: silero / ptt]
|
|
41
|
+
AudioDaemon -.-> MIC[Mic: pyaudio]
|
|
42
|
+
end
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
1. **The Entry Point (Node.js):** Distributed as a standard NPM package (`voice-mcp-server`). Running `npx voice-mcp-server` spins up a TypeScript bridge that smoothly interfaces with standard MCP `stdio` requirements. It automatically locates your Python environment and spawns the core engine.
|
|
46
|
+
2. **The Core Engine (Python):** Powered by the `fastmcp` framework, a Python daemon manages complex agent logic, ML inference, and tool execution.
|
|
47
|
+
3. **Firewall-Friendly Sockets:** The internal audio daemon communicates entirely via Unix Domain Sockets isolated in `~/Library/Application Support/VoiceMCP`. This ensures zero annoying macOS *"Do you want to allow this application to accept network connections?"* popups during local development.
|
|
48
|
+
|
|
49
|
+
-----
|
|
50
|
+
|
|
51
|
+
## 💻 Target Environment
|
|
52
|
+
|
|
53
|
+
> [!WARNING]
|
|
54
|
+
> **Apple Silicon Required:** Because this project heavily relies on `mlx_whisper_large_v3` for hardware-accelerated local Speech-to-Text and macOS-native audio commands (like `say`), it is currently highly optimized for, and restricted to, **macOS systems with M-series chips (M1/M2/M3/M4)**.
|
|
55
|
+
|
|
56
|
+
-----
|
|
57
|
+
|
|
58
|
+
## 🧩 Hot-Swappable Hardware & AI Adapters
|
|
59
|
+
|
|
60
|
+
The system is built on a highly modular adapter pattern configured via `hydra` YAML files. **The AI can dynamically swap these out at runtime without restarting the server.**
|
|
61
|
+
|
|
62
|
+
| Component | Available Adapters | Description |
|
|
63
|
+
| :--- | :--- | :--- |
|
|
64
|
+
| **🔊 Speakers (TTS)** | `live_speaker` | Blazing-fast, zero-latency native macOS `say` command. |
|
|
65
|
+
| | `kokoro_speaker` | High-quality, emotive local ML Text-to-Speech. |
|
|
66
|
+
| | `elevenlabs_speaker` | Premium cloud-based ultra-realistic voices. |
|
|
67
|
+
| **🎙️ Microphones** | `live_mic` | Direct hardware integration via PyAudio. |
|
|
68
|
+
| **🤫 VAD (Activity)** | `silero_vad` | Conversational mode powered by Silero, heavily optimized for 1-second barge-ins. |
|
|
69
|
+
| | `ptt_vad` | Manual Push-to-Talk / Walkie-Talkie mode for noisy environments. |
|
|
70
|
+
| **📝 STT (Transcription)**| `mlx_whisper_large_v3`| Blazing fast local transcription leveraging Apple's MLX framework. |
|
|
71
|
+
|
|
72
|
+
-----
|
|
73
|
+
|
|
74
|
+
## 🛠️ Tools Exposed to the LLM
|
|
75
|
+
|
|
76
|
+
Once connected, the server equips your AI agent with two powerful MCP tools:
|
|
77
|
+
|
|
78
|
+
### 1. `voice_converse`
|
|
79
|
+
|
|
80
|
+
The core communication loop. The AI calls this tool and passes a string of text it wants to say.
|
|
81
|
+
|
|
82
|
+
1. The server renders and plays the TTS.
|
|
83
|
+
2. The server instantly activates the microphone and listens for the user's reply via VAD.
|
|
84
|
+
3. The server transcribes the audio and returns the text to the AI.
|
|
85
|
+
|
|
86
|
+
**Interrupt Handling (Barge-in):** If the user interrupts the AI mid-sentence, playback instantly stops. The server captures the interruption, transcribes it, and returns the response alongside a `was_interrupted: true` flag. This allows the AI to organically realize it was cut off and address the interruption naturally.
|
|
87
|
+
|
|
88
|
+
### 2. `configure_audio_engine`
|
|
89
|
+
|
|
90
|
+
Grants the AI meta-awareness over its own hardware and software stack. If you ask your AI, *"Switch to a more realistic voice"* or *"Change to push-to-talk mode,"* it can autonomously call this tool to swap out the active Hydra configuration on the fly.
|
|
91
|
+
|
|
92
|
+
-----
|
|
93
|
+
|
|
94
|
+
## 🚀 Installation & Setup
|
|
95
|
+
|
|
96
|
+
Since this project bridges Node.js and Python, you will need to set up the Python environment with the required ML dependencies, followed by the NPM bridge.
|
|
97
|
+
|
|
98
|
+
### 1. Prerequisites
|
|
99
|
+
|
|
100
|
+
* **Node.js** (v18+)
|
|
101
|
+
* **Python** (3.10+)
|
|
102
|
+
* **macOS** with an Apple Silicon (M1/M2/M3/M4) chip — required; see the warning above
|
|
103
|
+
|
|
104
|
+
### 2. Setup the Python Environment
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# Clone the repository
|
|
108
|
+
git clone https://github.com/your-username/voice-mcp-server.git
|
|
109
|
+
cd voice-mcp-server
|
|
110
|
+
|
|
111
|
+
# Create and activate a virtual environment
|
|
112
|
+
python3 -m venv venv
|
|
113
|
+
source venv/bin/activate
|
|
114
|
+
|
|
115
|
+
# Install the heavy ML dependencies
|
|
116
|
+
pip install -r requirements.txt
|
|
117
|
+
|
|
118
|
+
# Download required ML models locally
|
|
119
|
+
# Kokoro, Silero VAD, and MLX Whisper will automatically pull their weights on the first run.
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### 3. Install the NPM Bridge
|
|
123
|
+
|
|
124
|
+
Install the package globally, or simply rely on `npx` when configuring your client:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# Build the TypeScript entry point
|
|
128
|
+
npm install
|
|
129
|
+
npm run build
|
|
130
|
+
|
|
131
|
+
# Install globally
|
|
132
|
+
npm install -g .
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
-----
|
|
136
|
+
|
|
137
|
+
## 🔌 Usage: MCP Client Configuration
|
|
138
|
+
|
|
139
|
+
You can seamlessly plug Voice MCP Server into any standard MCP-compatible client.
|
|
140
|
+
|
|
141
|
+
### For Gemini CLI
|
|
142
|
+
|
|
143
|
+
Add the server to your global configuration:
|
|
144
|
+
```bash
|
|
145
|
+
gemini mcp add voice-mcp-server node /absolute/path/to/voice_mcp_server/build/index.js
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### For Cursor
|
|
149
|
+
|
|
150
|
+
1. Navigate to **Cursor Settings > Features > MCP Server**
|
|
151
|
+
2. Click **+ Add New MCP Server**
|
|
152
|
+
3. **Name:** `Voice`
|
|
153
|
+
4. **Type:** `command`
|
|
154
|
+
5. **Command:** `node /absolute/path/to/voice_mcp_server/build/index.js`
|
|
155
|
+
|
|
156
|
+
### For Claude Desktop
|
|
157
|
+
|
|
158
|
+
Add the following to your `claude_desktop_config.json` (usually located at `~/Library/Application Support/Claude/claude_desktop_config.json`):
|
|
159
|
+
|
|
160
|
+
```json
|
|
161
|
+
{
|
|
162
|
+
"mcpServers": {
|
|
163
|
+
"voice-mcp": {
|
|
164
|
+
"command": "node",
|
|
165
|
+
"args": ["/absolute/path/to/voice_mcp_server/build/index.js"]
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
-----
|
|
172
|
+
|
|
173
|
+
## 💡 Example Prompt
|
|
174
|
+
|
|
175
|
+
Once connected, test the server by sending this prompt to your AI:
|
|
176
|
+
|
|
177
|
+
> *"Let's test your voice capabilities! Please use the `voice_converse` tool to introduce yourself and tell me a story about a brave robot. If I interrupt you while you are speaking, stop the story and acknowledge my interruption in your next response."*
|
|
178
|
+
|
|
179
|
+
-----
|
|
180
|
+
|
|
181
|
+
## 🤝 Contributing
|
|
182
|
+
|
|
183
|
+
Contributions, pull requests, and bug reports are highly welcome! Whether you want to add support for Windows/Linux by removing the MLX dependency, build new STT adapters (like Groq or Deepgram), or improve the TTS engines, please open an issue or submit a PR.
|
|
184
|
+
|
|
185
|
+
1. Fork the Project
|
|
186
|
+
2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
|
|
187
|
+
3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
|
|
188
|
+
4. Push to the Branch (`git push origin feature/AmazingFeature`)
|
|
189
|
+
5. Open a Pull Request
|
|
190
|
+
|
|
191
|
+
## 📄 License
|
|
192
|
+
|
|
193
|
+
This project is open-sourced under the MIT License.
|
package/build/index.js
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { spawn } from "node:child_process";
|
|
3
|
+
import { join, dirname } from "node:path";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
import { existsSync } from "node:fs";
|
|
6
|
+
// Get the directory of the current module
|
|
7
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
8
|
+
const __dirname = dirname(__filename);
|
|
9
|
+
// Root of the project
|
|
10
|
+
const projectRoot = join(__dirname, "..");
|
|
11
|
+
// Path to the Python script
|
|
12
|
+
const pythonScriptPath = join(projectRoot, "src", "mcp_server.py");
|
|
13
|
+
/**
|
|
14
|
+
* Locate the best Python executable to use.
|
|
15
|
+
* Priority:
|
|
16
|
+
* 1. Local venv inside the project
|
|
17
|
+
* 2. System python3
|
|
18
|
+
*/
|
|
19
|
+
function getPythonExecutable() {
|
|
20
|
+
const venvPath = join(projectRoot, "venv", "bin", "python3");
|
|
21
|
+
if (existsSync(venvPath)) {
|
|
22
|
+
return venvPath;
|
|
23
|
+
}
|
|
24
|
+
return "python3";
|
|
25
|
+
}
|
|
26
|
+
const pythonExecutable = getPythonExecutable();
|
|
27
|
+
/**
|
|
28
|
+
* Start the Python MCP Server and bridge standard I/O.
|
|
29
|
+
*/
|
|
30
|
+
function startBridge() {
|
|
31
|
+
const pythonProcess = spawn(pythonExecutable, [pythonScriptPath], {
|
|
32
|
+
stdio: ["pipe", "pipe", "inherit"],
|
|
33
|
+
env: {
|
|
34
|
+
...process.env,
|
|
35
|
+
// Ensure Python output isn't buffered
|
|
36
|
+
PYTHONUNBUFFERED: "1",
|
|
37
|
+
},
|
|
38
|
+
});
|
|
39
|
+
// Pipe our stdin into Python's stdin
|
|
40
|
+
process.stdin.pipe(pythonProcess.stdin);
|
|
41
|
+
// Pipe Python's stdout back to our stdout
|
|
42
|
+
pythonProcess.stdout.pipe(process.stdout);
|
|
43
|
+
// Handle process termination
|
|
44
|
+
pythonProcess.on("exit", (code) => {
|
|
45
|
+
process.exit(code ?? 0);
|
|
46
|
+
});
|
|
47
|
+
// Forward signals
|
|
48
|
+
process.on("SIGINT", () => pythonProcess.kill("SIGINT"));
|
|
49
|
+
process.on("SIGTERM", () => pythonProcess.kill("SIGTERM"));
|
|
50
|
+
}
|
|
51
|
+
startBridge();
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# --- Global Adapter Configuration ---
|
|
2
|
+
# Use these defaults to instantly swap out hardware and AI models.
|
|
3
|
+
# The configurations for each adapter are loaded dynamically from their respective folders.
|
|
4
|
+
defaults:
|
|
5
|
+
# Available microphones: live_mic (PyAudio real hardware)
|
|
6
|
+
- microphone: live_mic
|
|
7
|
+
|
|
8
|
+
# Available speakers: kokoro_speaker (local TTS), elevenlabs_speaker (cloud TTS), live_speaker (macOS 'say')
|
|
9
|
+
- speaker: kokoro_speaker
|
|
10
|
+
|
|
11
|
+
# Available VADs:
|
|
12
|
+
# - ptt_vad: Walkie-Talkie mode (Hold 'Shift' to talk. Instant response. Ignores TV/noise).
|
|
13
|
+
# - silero_vad: Conversational AI mode (Listens automatically. Tuned for 1-second barge-ins).
|
|
14
|
+
- vad: ptt_vad
|
|
15
|
+
|
|
16
|
+
# Available STTs: mlx_whisper_large_v3 (Apple Silicon GPU), whisper_stt (Google Cloud API)
|
|
17
|
+
- stt: mlx_whisper_large_v3
|
|
18
|
+
|
|
19
|
+
- _self_
|
|
20
|
+
|
|
21
|
+
# --- Global Engine Configuration ---
|
|
22
|
+
# Note: VAD-specific tuning (like barge-in thresholds and probability)
|
|
23
|
+
# is now colocated inside the specific VAD yaml files (e.g. config/vad/ptt_vad.yaml).
|
|
24
|
+
config:
|
|
25
|
+
vad_silence_grace_ms: 100
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
_target_: adapters_real.live_mic.LiveMicrophone
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
_target_: adapters_real.ptt_vad.PushToTalkVAD
|
|
2
|
+
key_name: "shift"
|
|
3
|
+
|
|
4
|
+
# PTT specific tuning (tuned for instant response)
|
|
5
|
+
vad_probability_threshold: 0.50
|
|
6
|
+
vad_bargein_threshold_ms: 100
|
|
7
|
+
endpointing_patience_normal_ms: 300
|
|
8
|
+
endpointing_patience_interrupted_ms: 300
|
package/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "voice-mcp-server",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "An MCP server to allow LLMs to speak and listen via bidirectional voice loops",
|
|
5
|
+
"main": "build/index.js",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"bin": {
|
|
8
|
+
"voice-mcp-server": "./build/index.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"build",
|
|
12
|
+
"src",
|
|
13
|
+
"config",
|
|
14
|
+
"requirements.txt",
|
|
15
|
+
"tsconfig.json"
|
|
16
|
+
],
|
|
17
|
+
"scripts": {
|
|
18
|
+
"build": "tsc && chmod +x build/index.js",
|
|
19
|
+
"start": "node build/index.js",
|
|
20
|
+
"prepublishOnly": "npm run build"
|
|
21
|
+
},
|
|
22
|
+
"keywords": [
|
|
23
|
+
"mcp",
|
|
24
|
+
"voice",
|
|
25
|
+
"llm",
|
|
26
|
+
"audio",
|
|
27
|
+
"tts",
|
|
28
|
+
"stt",
|
|
29
|
+
"vad"
|
|
30
|
+
],
|
|
31
|
+
"author": "Erick Vazquez Santillan",
|
|
32
|
+
"license": "MIT",
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"@modelcontextprotocol/sdk": "^1.5.0"
|
|
35
|
+
},
|
|
36
|
+
"devDependencies": {
|
|
37
|
+
"@types/node": "^20.0.0",
|
|
38
|
+
"typescript": "^5.0.0"
|
|
39
|
+
}
|
|
40
|
+
}
|
package/requirements.txt
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
accelerate==1.13.0
|
|
2
|
+
addict==2.4.0
|
|
3
|
+
annotated-doc==0.0.4
|
|
4
|
+
annotated-types==0.7.0
|
|
5
|
+
antlr4-python3-runtime==4.9.3
|
|
6
|
+
anyio==4.13.0
|
|
7
|
+
attrs==26.1.0
|
|
8
|
+
babel==2.18.0
|
|
9
|
+
blis==1.3.3
|
|
10
|
+
catalogue==2.0.10
|
|
11
|
+
certifi==2026.2.25
|
|
12
|
+
cffi==2.0.0
|
|
13
|
+
charset-normalizer==3.4.6
|
|
14
|
+
click==8.3.1
|
|
15
|
+
cloudpathlib==0.23.0
|
|
16
|
+
confection==1.3.3
|
|
17
|
+
cryptography==46.0.6
|
|
18
|
+
csvw==3.7.0
|
|
19
|
+
curated-tokenizers==0.0.9
|
|
20
|
+
curated-transformers==0.1.1
|
|
21
|
+
cymem==2.0.13
|
|
22
|
+
dlinfo==2.0.0
|
|
23
|
+
docopt==0.6.2
|
|
24
|
+
elevenlabs==2.40.0
|
|
25
|
+
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
|
|
26
|
+
espeakng-loader==0.2.4
|
|
27
|
+
fastapi==0.135.2
|
|
28
|
+
filelock==3.25.2
|
|
29
|
+
fsspec==2026.3.0
|
|
30
|
+
h11==0.16.0
|
|
31
|
+
hf-xet==1.4.2
|
|
32
|
+
httpcore==1.0.9
|
|
33
|
+
httpx==0.28.1
|
|
34
|
+
httpx-sse==0.4.3
|
|
35
|
+
huggingface_hub==1.8.0
|
|
36
|
+
hydra-core==1.3.2
|
|
37
|
+
idna==3.11
|
|
38
|
+
isodate==0.7.2
|
|
39
|
+
Jinja2==3.1.6
|
|
40
|
+
joblib==1.5.3
|
|
41
|
+
jsonschema==4.26.0
|
|
42
|
+
jsonschema-specifications==2025.9.1
|
|
43
|
+
kokoro==0.9.4
|
|
44
|
+
language-tags==1.2.0
|
|
45
|
+
llvmlite==0.46.0
|
|
46
|
+
loguru==0.7.3
|
|
47
|
+
markdown-it-py==4.0.0
|
|
48
|
+
MarkupSafe==3.0.3
|
|
49
|
+
mcp==1.26.0
|
|
50
|
+
mdurl==0.1.2
|
|
51
|
+
misaki==0.9.4
|
|
52
|
+
mlx==0.31.1
|
|
53
|
+
mlx-metal==0.31.1
|
|
54
|
+
mlx-whisper==0.4.3
|
|
55
|
+
more-itertools==10.8.0
|
|
56
|
+
mpmath==1.3.0
|
|
57
|
+
murmurhash==1.0.15
|
|
58
|
+
networkx==3.6.1
|
|
59
|
+
num2words==0.5.14
|
|
60
|
+
numba==0.64.0
|
|
61
|
+
numpy==1.26.4
|
|
62
|
+
omegaconf==2.3.0
|
|
63
|
+
packaging==26.0
|
|
64
|
+
phonemizer-fork==3.3.2
|
|
65
|
+
preshed==3.0.13
|
|
66
|
+
psutil==7.2.2
|
|
67
|
+
PyAudio==0.2.14
|
|
68
|
+
pycparser==3.0
|
|
69
|
+
pydantic==2.12.5
|
|
70
|
+
pydantic-settings==2.13.1
|
|
71
|
+
pydantic_core==2.41.5
|
|
72
|
+
Pygments==2.20.0
|
|
73
|
+
PyJWT==2.12.1
|
|
74
|
+
pynput==1.8.1
|
|
75
|
+
pyobjc-core==12.1
|
|
76
|
+
pyobjc-framework-ApplicationServices==12.1
|
|
77
|
+
pyobjc-framework-Cocoa==12.1
|
|
78
|
+
pyobjc-framework-CoreText==12.1
|
|
79
|
+
pyobjc-framework-Quartz==12.1
|
|
80
|
+
pyparsing==3.3.2
|
|
81
|
+
python-dateutil==2.9.0.post0
|
|
82
|
+
python-dotenv==1.2.2
|
|
83
|
+
python-multipart==0.0.22
|
|
84
|
+
PyYAML==6.0.3
|
|
85
|
+
rdflib==7.6.0
|
|
86
|
+
referencing==0.37.0
|
|
87
|
+
regex==2026.3.32
|
|
88
|
+
requests==2.33.1
|
|
89
|
+
rfc3986==1.5.0
|
|
90
|
+
rich==14.3.3
|
|
91
|
+
rpds-py==0.30.0
|
|
92
|
+
safetensors==0.7.0
|
|
93
|
+
scipy==1.17.1
|
|
94
|
+
segments==2.4.0
|
|
95
|
+
setuptools==81.0.0
|
|
96
|
+
shellingham==1.5.4
|
|
97
|
+
silero-vad==6.2.1
|
|
98
|
+
six==1.17.0
|
|
99
|
+
smart_open==7.5.1
|
|
100
|
+
soundfile==0.13.1
|
|
101
|
+
spacy==3.8.14
|
|
102
|
+
spacy-curated-transformers==0.3.1
|
|
103
|
+
spacy-legacy==3.0.12
|
|
104
|
+
spacy-loggers==1.0.5
|
|
105
|
+
srsly==2.5.3
|
|
106
|
+
sse-starlette==3.3.4
|
|
107
|
+
starlette==1.0.0
|
|
108
|
+
sympy==1.14.0
|
|
109
|
+
termcolor==3.3.0
|
|
110
|
+
thinc==8.3.13
|
|
111
|
+
tiktoken==0.12.0
|
|
112
|
+
tokenizers==0.22.2
|
|
113
|
+
torch==2.11.0
|
|
114
|
+
torchaudio==2.11.0
|
|
115
|
+
tqdm==4.67.3
|
|
116
|
+
transformers==5.4.0
|
|
117
|
+
typer==0.24.1
|
|
118
|
+
typing-inspection==0.4.2
|
|
119
|
+
typing_extensions==4.15.0
|
|
120
|
+
uritemplate==4.2.0
|
|
121
|
+
urllib3==2.6.3
|
|
122
|
+
uvicorn==0.42.0
|
|
123
|
+
wasabi==1.1.3
|
|
124
|
+
weasel==1.0.0
|
|
125
|
+
websockets==16.0
|
|
126
|
+
wrapt==2.1.2
|
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from simulation.ports import ILLMBridge
|
|
2
|
+
|
|
3
|
+
class EchoLLMBridge(ILLMBridge):
    """Test double for an LLM bridge that echoes the user's speech back.

    Request latency is simulated via ``tick``: a response only becomes
    available once the accumulated wait reaches ``latency_ms``.
    """

    def __init__(self, latency_ms: int = 0):
        # Context dict of the most recent request (None until one is made).
        self.last_call = None
        # True while a simulated request is in flight.
        self.is_requesting = False
        self.latency_ms = latency_ms
        # Milliseconds waited so far on the in-flight request.
        self.current_wait = 0

    def call_mcp_tool(self, context: dict) -> dict:
        # Synchronous convenience path: issue the request, then poll once.
        # Note this yields a response immediately only when latency_ms is 0.
        self.start_request(context)
        return self.get_response()

    def start_request(self, context: dict):
        # Record the context and reset the latency clock.
        self.last_call = context
        self.is_requesting = True
        self.current_wait = 0

    def tick(self, ms: int):
        # Advance simulated time only while a request is pending.
        if not self.is_requesting:
            return
        self.current_wait += ms

    def get_response(self) -> dict | None:
        # Nothing in flight, or still waiting out the latency: no response.
        if not self.is_requesting or self.current_wait < self.latency_ms:
            return None
        self.is_requesting = False
        heard = self.last_call.get("user_speech", "")
        return {"text": f"I heard you say: {heard}", "expect_reply": True}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
3
|
+
import subprocess
|
|
4
|
+
import httpx
|
|
5
|
+
from simulation.ports import ISpeaker
|
|
6
|
+
from simulation.models import VirtualAudioFrame
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
|
|
9
|
+
load_dotenv()
|
|
10
|
+
|
|
11
|
+
class ElevenLabsSpeaker(ISpeaker):
    """TTS speaker backed by the ElevenLabs HTTP API.

    Renders speech to a temp MP3 and plays it with macOS ``afplay``.
    Falls back to the native ``say`` command when no API key is set or
    the API call fails. Word-level progress (used for barge-in
    transcripts) is estimated from elapsed wall time at an assumed
    words-per-minute rate, since there is no real word-boundary feedback.
    """

    def __init__(self, wpm=150, voice_id="aEO01A4wXwd1O8GPgGlF"):
        self.wpm = wpm
        # Assumed speaking rate, used to estimate how many words were
        # actually spoken before an interruption (see flush()).
        self.words_per_ms = (wpm / 60) / 1000
        self.current_text = ""
        self.words = []
        # Active playback subprocess (afplay or say), or None when idle.
        self.process = None
        # Wall-clock time at which playback started.
        self.start_time = 0
        self.voice_id = voice_id
        self.api_key = os.getenv("ELEVENLABS_API_KEY")
        # NOTE(review): a fixed path means concurrent instances would
        # clobber each other's audio — confirm single-instance usage.
        self.temp_file = "/tmp/elevenlabs_output.mp3"

    def _play(self, argv):
        """Start a playback subprocess for *argv* and record the start time.

        Shared by the API path and both ``say`` fallback paths — the
        original code repeated this Popen boilerplate three times.
        """
        self.start_time = time.time()
        self.process = subprocess.Popen(
            argv,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        )

    def speak(self, text: str):
        """Render *text* to speech and begin playback (non-blocking)."""
        if not text.strip():
            return

        self.current_text = text
        self.words = text.split()

        if not self.api_key:
            print("Warning: No ELEVENLABS_API_KEY found, falling back to 'say'")
            self._play(["say", text])
            return

        url = f"https://api.elevenlabs.io/v1/text-to-speech/{self.voice_id}"
        headers = {
            "Accept": "audio/mpeg",
            "Content-Type": "application/json",
            "xi-api-key": self.api_key
        }
        data = {
            "text": text,
            "model_id": "eleven_multilingual_v2",
            "voice_settings": {
                "stability": 0.5,
                "similarity_boost": 0.5
            }
        }

        try:
            with httpx.Client() as client:
                response = client.post(url, json=data, headers=headers, timeout=10.0)
                response.raise_for_status()

            with open(self.temp_file, "wb") as f:
                f.write(response.content)

            # Play the downloaded audio.
            self._play(["afplay", self.temp_file])
        except Exception as e:
            # Best-effort fallback: keep the conversation going locally
            # even when the cloud TTS is unavailable.
            print(f"ElevenLabs API Error: {e}")
            self._play(["say", text])

    def play_frame(self, frame: VirtualAudioFrame):
        # Real playback is driven by the subprocess; simulation frames
        # are ignored by this adapter.
        pass

    def tick(self, ms: int):
        # Progress is measured in wall-clock time; nothing to do per tick.
        pass

    def is_speaking(self) -> bool:
        """Return True while the playback subprocess is still running.

        Also lazily clears internal state once playback has finished.
        """
        if self.process is None:
            return False
        is_running = self.process.poll() is None
        if not is_running:
            self.current_text = ""
            self.words = []
            self.process = None
        return is_running

    def has_started_audio(self) -> bool:
        # The subprocess begins emitting audio as soon as it is running.
        return self.is_speaking()

    def flush(self) -> str:
        """Stop playback immediately and return the words estimated spoken.

        Called on barge-in: the caller needs to know roughly how much of
        the utterance the user actually heard before interrupting.
        """
        if not self.is_speaking():
            return ""

        # Immediately kill the playback process and reap it so no
        # zombie is left behind.
        self.process.kill()
        self.process.wait()

        # Estimate progress from elapsed time at the assumed rate.
        elapsed_ms = (time.time() - self.start_time) * 1000
        words_spoken = int(elapsed_ms * self.words_per_ms)

        spoken = " ".join(self.words[:words_spoken])

        self.current_text = ""
        self.words = []
        self.process = None

        return spoken
|