deepslate-livekit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # NodeJS stuff
10
+ node_modules/
11
+
12
+ # Virtual environments
13
+ .venv
14
+
15
+ # IDE files
16
+ .idea/
17
+
18
+ # Development files
19
+ .env.local
20
+ .no-update
21
+
22
+ # Local settings
23
+ .claude/
24
+ CLAUDE.local.md
25
+
@@ -0,0 +1,252 @@
1
+ Metadata-Version: 2.4
2
+ Name: deepslate-livekit
3
+ Version: 0.1.0
4
+ Summary: LiveKit plugin for deepslate.eu
5
+ Project-URL: Documentation, https://docs.deepslate.eu/
6
+ Project-URL: Website, https://deepslate.eu/
7
+ Project-URL: Source, https://github.com/deepslate-labs/deepslate-sdks
8
+ Keywords: ai,audio,deepslate,livekit,realtime,video,voice
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Topic :: Multimedia :: Sound/Audio
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.11
17
+ Requires-Dist: deepslate-core==0.1.0
18
+ Requires-Dist: livekit-agents>=1.3.8
19
+ Description-Content-Type: text/markdown
20
+
21
+ # deepslate-livekit
22
+
23
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
24
+ [![Documentation](https://img.shields.io/badge/docs-deepslate.eu-green)](https://docs.deepslate.eu/)
25
+ [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
26
+
27
+ LiveKit Agents plugin for [Deepslate's](https://deepslate.eu/) realtime voice AI API.
28
+
29
+ `deepslate-livekit` provides a `RealtimeModel` implementation for the [LiveKit Agents](https://github.com/livekit/agents) framework, enabling seamless integration with Deepslate's unified voice AI infrastructure — speech-to-speech streaming, server-side VAD, LLM inference, and optional ElevenLabs TTS, all in a single WebSocket connection.
30
+
31
+ ---
32
+
33
+ ## Features
34
+
35
+ - **Realtime Voice AI Streaming** — Low-latency bidirectional audio streaming over WebSockets
36
+ - **Server-side VAD** — Voice Activity Detection handled by Deepslate with configurable sensitivity
37
+ - **Function Tools** — Define and invoke tools using LiveKit's `@function_tool()` decorator
38
+ - **ElevenLabs TTS Integration** — Server-side text-to-speech with automatic context truncation on interruption
39
+ - **Automatic Interruption Handling** — Truncates the in-flight response when users interrupt
40
+
41
+ ---
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install deepslate-livekit
47
+ ```
48
+
49
+ ### Requirements
50
+
51
+ - Python 3.11 or higher
52
+
53
+ ### Dependencies (installed automatically)
54
+
55
+ - `deepslate-core` — Shared Deepslate models and base client
56
+ - `livekit-agents>=1.3.8` — LiveKit Agents framework
57
+
58
+ ---
59
+
60
+ ## Prerequisites
61
+
62
+ ### Deepslate Account
63
+
64
+ Sign up at [deepslate.eu](https://deepslate.eu) and set the following environment variables:
65
+
66
+ ```bash
67
+ DEEPSLATE_VENDOR_ID=your_vendor_id
68
+ DEEPSLATE_ORGANIZATION_ID=your_organization_id
69
+ DEEPSLATE_API_KEY=your_api_key
70
+ ```
71
+
72
+ ### ElevenLabs TTS (Optional)
73
+
74
+ For server-side text-to-speech with automatic interruption handling:
75
+
76
+ ```bash
77
+ ELEVENLABS_API_KEY=your_elevenlabs_api_key
78
+ ELEVENLABS_VOICE_ID=your_voice_id
79
+ ELEVENLABS_MODEL_ID=eleven_turbo_v2 # optional
80
+ ```
81
+
82
+ > **Note:** You can alternatively use LiveKit's built-in client-side TTS. However, context truncation on interruption only works with server-side TTS configured via `ElevenLabsTtsConfig`.
83
+
84
+ ---
85
+
86
+ ## Quick Start
87
+
88
+ ```python
89
+ from livekit import agents
90
+ from livekit.agents import AgentServer, AgentSession, Agent, room_io
91
+
92
+ from deepslate.livekit import RealtimeModel, ElevenLabsTtsConfig
93
+
94
+
95
+ class Assistant(Agent):
96
+ def __init__(self) -> None:
97
+ super().__init__(instructions="You are a helpful voice AI assistant.")
98
+
99
+
100
+ server = AgentServer()
101
+
102
+
103
+ @server.rtc_session()
104
+ async def my_agent(ctx: agents.JobContext):
105
+ session = AgentSession(
106
+ llm=RealtimeModel(
107
+ tts_config=ElevenLabsTtsConfig.from_env()
108
+ ),
109
+ )
110
+
111
+ await session.start(
112
+ room=ctx.room,
113
+ agent=Assistant(),
114
+ room_options=room_io.RoomOptions(),
115
+ )
116
+
117
+ await session.generate_reply(
118
+ instructions="Greet the user and offer your assistance."
119
+ )
120
+
121
+
122
+ if __name__ == "__main__":
123
+ agents.cli.run_app(server)
124
+ ```
125
+
126
+ ---
127
+
128
+ ## Configuration
129
+
130
+ ### `RealtimeModel`
131
+
132
+ | Parameter | Type | Default | Description |
133
+ |--------------------------|-----------------------|----------------------------------|---------------------------------------------------------|
134
+ | `vendor_id` | `str` | env: `DEEPSLATE_VENDOR_ID` | Deepslate vendor ID |
135
+ | `organization_id` | `str` | env: `DEEPSLATE_ORGANIZATION_ID` | Deepslate organization ID |
136
+ | `api_key` | `str` | env: `DEEPSLATE_API_KEY` | Deepslate API key |
137
+ | `base_url` | `str` | `"https://app.deepslate.eu"` | Base URL for Deepslate API |
138
+ | `system_prompt` | `str` | `"You are a helpful assistant."` | System prompt for the model |
139
+ | `generate_reply_timeout` | `float` | `30.0` | Timeout in seconds for `generate_reply` (0 = no limit) |
140
+ | `tts_config` | `ElevenLabsTtsConfig` | `None` | TTS configuration (enables server-side audio output) |
141
+
142
+ You can also pass a `VadConfig` instance to tune voice activity detection — see [VAD Configuration](#vad-configuration) below.
143
+
144
+ ### VAD Configuration
145
+
146
+ ```python
147
+ from deepslate.livekit import RealtimeModel, VadConfig
148
+
149
+ llm = RealtimeModel(
150
+ vad_config=VadConfig(
151
+ confidence_threshold=0.5, # 0.0–1.0: minimum confidence to classify as speech
152
+ min_volume=0.01, # 0.0–1.0: minimum volume to classify as speech
153
+ start_duration_ms=200, # ms of speech required to trigger start
154
+ stop_duration_ms=500, # ms of silence required to trigger stop
155
+ backbuffer_duration_ms=1000 # ms of audio buffered before detection triggers
156
+ )
157
+ )
158
+ ```
159
+
160
+ | Parameter | Type | Default | Description |
161
+ |------------------------------|---------|---------|-----------------------------------------------------------|
162
+ | `confidence_threshold` | `float` | `0.5` | Minimum confidence to consider audio as speech (0.0–1.0) |
163
+ | `min_volume` | `float` | `0.01` | Minimum volume threshold (0.0–1.0) |
164
+ | `start_duration_ms` | `int` | `200` | Duration of speech required to detect start (ms) |
165
+ | `stop_duration_ms` | `int` | `500` | Duration of silence required to detect end (ms) |
166
+ | `backbuffer_duration_ms` | `int` | `1000` | Audio buffer captured before speech detection triggers |
167
+
168
+ **Tuning tips:**
169
+ - **Noisy environments:** Increase `confidence_threshold` (0.6–0.8) and `min_volume` (0.02–0.05)
170
+ - **Lower latency:** Decrease `start_duration_ms` (100–150) and `stop_duration_ms` (200–300)
171
+ - **Natural pacing:** Slightly increase `stop_duration_ms` (600–800)
172
+
173
+ ### `ElevenLabsTtsConfig`
174
+
175
+ | Parameter | Type | Default | Description |
176
+ |------------|----------------------|---------------------------|----------------------------------------------------------------------|
177
+ | `api_key` | `str` | env: `ELEVENLABS_API_KEY` | ElevenLabs API key |
178
+ | `voice_id` | `str` | env: `ELEVENLABS_VOICE_ID` | Voice ID (e.g., `'21m00Tcm4TlvDq8ikWAM'` for Rachel) |
179
+ | `model_id` | `str \| None` | env: `ELEVENLABS_MODEL_ID` | Model ID, e.g., `'eleven_turbo_v2'`; uses ElevenLabs default if unset |
180
+ | `location` | `ElevenLabsLocation` | `ElevenLabsLocation.US` | Regional API endpoint (US works with all accounts; EU/INDIA require enterprise) |
181
+
182
+ Use `ElevenLabsTtsConfig.from_env()` to load from environment variables.
183
+
184
+ ---
185
+
186
+ ## Function Tools
187
+
188
+ Use LiveKit's `@function_tool()` decorator to expose tools to the model:
189
+
190
+ ```python
191
+ from livekit.agents import Agent, function_tool, RunContext
192
+ from deepslate.livekit import RealtimeModel
193
+
194
+
195
+ class Assistant(Agent):
196
+ def __init__(self) -> None:
197
+ super().__init__(instructions="You are a helpful assistant.")
198
+
199
+ @function_tool()
200
+ async def get_weather(self, context: RunContext, location: str) -> str:
201
+ """Get the current weather for a given city."""
202
+ # Your implementation here
203
+ return f"It's sunny and 22°C in {location}."
204
+ ```
205
+
206
+ ---
207
+
208
+ ## Examples
209
+
210
+ The [`examples/`](examples/) directory contains a ready-to-run agent you can use as a starting point.
211
+
212
+ ### `chat_agent.py` — Voice assistant with function tools
213
+
214
+ A fully working LiveKit agent that demonstrates:
215
+ - Connecting to a LiveKit room
216
+ - Server-side ElevenLabs TTS with interruption handling
217
+ - Two example function tools: `lookup_weather` and `get_current_location`
218
+
219
+ ```
220
+ packages/livekit/examples/
221
+ ├── chat_agent.py # The agent
222
+ └── .env.example # Required environment variables
223
+ ```
224
+
225
+ **Setup:**
226
+
227
+ ```bash
228
+ # 1. Install dependencies
229
+ pip install deepslate-livekit python-dotenv
230
+
231
+ # 2. Configure credentials
232
+ cd packages/livekit/examples
233
+ cp .env.example .env
234
+ # Edit .env and fill in your credentials
235
+
236
+ # 3. Run
237
+ python chat_agent.py dev
238
+ ```
239
+
240
+ ---
241
+
242
+ ## Documentation
243
+
244
+ - [Deepslate Documentation](https://docs.deepslate.eu/)
245
+ - [LiveKit Agents Documentation](https://docs.livekit.io/agents/)
246
+ - [API Reference](https://docs.deepslate.eu/api-reference/)
247
+
248
+ ---
249
+
250
+ ## License
251
+
252
+ Apache License 2.0 — see [LICENSE](../../LICENSE) for details.
@@ -0,0 +1,232 @@
1
+ # deepslate-livekit
2
+
3
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
4
+ [![Documentation](https://img.shields.io/badge/docs-deepslate.eu-green)](https://docs.deepslate.eu/)
5
+ [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
6
+
7
+ LiveKit Agents plugin for [Deepslate's](https://deepslate.eu/) realtime voice AI API.
8
+
9
+ `deepslate-livekit` provides a `RealtimeModel` implementation for the [LiveKit Agents](https://github.com/livekit/agents) framework, enabling seamless integration with Deepslate's unified voice AI infrastructure — speech-to-speech streaming, server-side VAD, LLM inference, and optional ElevenLabs TTS, all in a single WebSocket connection.
10
+
11
+ ---
12
+
13
+ ## Features
14
+
15
+ - **Realtime Voice AI Streaming** — Low-latency bidirectional audio streaming over WebSockets
16
+ - **Server-side VAD** — Voice Activity Detection handled by Deepslate with configurable sensitivity
17
+ - **Function Tools** — Define and invoke tools using LiveKit's `@function_tool()` decorator
18
+ - **ElevenLabs TTS Integration** — Server-side text-to-speech with automatic context truncation on interruption
19
+ - **Automatic Interruption Handling** — Truncates the in-flight response when users interrupt
20
+
21
+ ---
22
+
23
+ ## Installation
24
+
25
+ ```bash
26
+ pip install deepslate-livekit
27
+ ```
28
+
29
+ ### Requirements
30
+
31
+ - Python 3.11 or higher
32
+
33
+ ### Dependencies (installed automatically)
34
+
35
+ - `deepslate-core` — Shared Deepslate models and base client
36
+ - `livekit-agents>=1.3.8` — LiveKit Agents framework
37
+
38
+ ---
39
+
40
+ ## Prerequisites
41
+
42
+ ### Deepslate Account
43
+
44
+ Sign up at [deepslate.eu](https://deepslate.eu) and set the following environment variables:
45
+
46
+ ```bash
47
+ DEEPSLATE_VENDOR_ID=your_vendor_id
48
+ DEEPSLATE_ORGANIZATION_ID=your_organization_id
49
+ DEEPSLATE_API_KEY=your_api_key
50
+ ```
51
+
52
+ ### ElevenLabs TTS (Optional)
53
+
54
+ For server-side text-to-speech with automatic interruption handling:
55
+
56
+ ```bash
57
+ ELEVENLABS_API_KEY=your_elevenlabs_api_key
58
+ ELEVENLABS_VOICE_ID=your_voice_id
59
+ ELEVENLABS_MODEL_ID=eleven_turbo_v2 # optional
60
+ ```
61
+
62
+ > **Note:** You can alternatively use LiveKit's built-in client-side TTS. However, context truncation on interruption only works with server-side TTS configured via `ElevenLabsTtsConfig`.
63
+
64
+ ---
65
+
66
+ ## Quick Start
67
+
68
+ ```python
69
+ from livekit import agents
70
+ from livekit.agents import AgentServer, AgentSession, Agent, room_io
71
+
72
+ from deepslate.livekit import RealtimeModel, ElevenLabsTtsConfig
73
+
74
+
75
+ class Assistant(Agent):
76
+ def __init__(self) -> None:
77
+ super().__init__(instructions="You are a helpful voice AI assistant.")
78
+
79
+
80
+ server = AgentServer()
81
+
82
+
83
+ @server.rtc_session()
84
+ async def my_agent(ctx: agents.JobContext):
85
+ session = AgentSession(
86
+ llm=RealtimeModel(
87
+ tts_config=ElevenLabsTtsConfig.from_env()
88
+ ),
89
+ )
90
+
91
+ await session.start(
92
+ room=ctx.room,
93
+ agent=Assistant(),
94
+ room_options=room_io.RoomOptions(),
95
+ )
96
+
97
+ await session.generate_reply(
98
+ instructions="Greet the user and offer your assistance."
99
+ )
100
+
101
+
102
+ if __name__ == "__main__":
103
+ agents.cli.run_app(server)
104
+ ```
105
+
106
+ ---
107
+
108
+ ## Configuration
109
+
110
+ ### `RealtimeModel`
111
+
112
+ | Parameter | Type | Default | Description |
113
+ |--------------------------|-----------------------|----------------------------------|---------------------------------------------------------|
114
+ | `vendor_id` | `str` | env: `DEEPSLATE_VENDOR_ID` | Deepslate vendor ID |
115
+ | `organization_id` | `str` | env: `DEEPSLATE_ORGANIZATION_ID` | Deepslate organization ID |
116
+ | `api_key` | `str` | env: `DEEPSLATE_API_KEY` | Deepslate API key |
117
+ | `base_url` | `str` | `"https://app.deepslate.eu"` | Base URL for Deepslate API |
118
+ | `system_prompt` | `str` | `"You are a helpful assistant."` | System prompt for the model |
119
+ | `generate_reply_timeout` | `float` | `30.0` | Timeout in seconds for `generate_reply` (0 = no limit) |
120
+ | `tts_config` | `ElevenLabsTtsConfig` | `None` | TTS configuration (enables server-side audio output) |
121
+
122
+ You can also pass a `VadConfig` instance to tune voice activity detection — see [VAD Configuration](#vad-configuration) below.
123
+
124
+ ### VAD Configuration
125
+
126
+ ```python
127
+ from deepslate.livekit import RealtimeModel, VadConfig
128
+
129
+ llm = RealtimeModel(
130
+ vad_config=VadConfig(
131
+ confidence_threshold=0.5, # 0.0–1.0: minimum confidence to classify as speech
132
+ min_volume=0.01, # 0.0–1.0: minimum volume to classify as speech
133
+ start_duration_ms=200, # ms of speech required to trigger start
134
+ stop_duration_ms=500, # ms of silence required to trigger stop
135
+ backbuffer_duration_ms=1000 # ms of audio buffered before detection triggers
136
+ )
137
+ )
138
+ ```
139
+
140
+ | Parameter | Type | Default | Description |
141
+ |------------------------------|---------|---------|-----------------------------------------------------------|
142
+ | `confidence_threshold` | `float` | `0.5` | Minimum confidence to consider audio as speech (0.0–1.0) |
143
+ | `min_volume` | `float` | `0.01` | Minimum volume threshold (0.0–1.0) |
144
+ | `start_duration_ms` | `int` | `200` | Duration of speech required to detect start (ms) |
145
+ | `stop_duration_ms` | `int` | `500` | Duration of silence required to detect end (ms) |
146
+ | `backbuffer_duration_ms` | `int` | `1000` | Audio buffer captured before speech detection triggers |
147
+
148
+ **Tuning tips:**
149
+ - **Noisy environments:** Increase `confidence_threshold` (0.6–0.8) and `min_volume` (0.02–0.05)
150
+ - **Lower latency:** Decrease `start_duration_ms` (100–150) and `stop_duration_ms` (200–300)
151
+ - **Natural pacing:** Slightly increase `stop_duration_ms` (600–800)
152
+
153
+ ### `ElevenLabsTtsConfig`
154
+
155
+ | Parameter | Type | Default | Description |
156
+ |------------|----------------------|---------------------------|----------------------------------------------------------------------|
157
+ | `api_key` | `str` | env: `ELEVENLABS_API_KEY` | ElevenLabs API key |
158
+ | `voice_id` | `str` | env: `ELEVENLABS_VOICE_ID` | Voice ID (e.g., `'21m00Tcm4TlvDq8ikWAM'` for Rachel) |
159
+ | `model_id` | `str \| None` | env: `ELEVENLABS_MODEL_ID` | Model ID, e.g., `'eleven_turbo_v2'`; uses ElevenLabs default if unset |
160
+ | `location` | `ElevenLabsLocation` | `ElevenLabsLocation.US` | Regional API endpoint (US works with all accounts; EU/INDIA require enterprise) |
161
+
162
+ Use `ElevenLabsTtsConfig.from_env()` to load from environment variables.
163
+
164
+ ---
165
+
166
+ ## Function Tools
167
+
168
+ Use LiveKit's `@function_tool()` decorator to expose tools to the model:
169
+
170
+ ```python
171
+ from livekit.agents import Agent, function_tool, RunContext
172
+ from deepslate.livekit import RealtimeModel
173
+
174
+
175
+ class Assistant(Agent):
176
+ def __init__(self) -> None:
177
+ super().__init__(instructions="You are a helpful assistant.")
178
+
179
+ @function_tool()
180
+ async def get_weather(self, context: RunContext, location: str) -> str:
181
+ """Get the current weather for a given city."""
182
+ # Your implementation here
183
+ return f"It's sunny and 22°C in {location}."
184
+ ```
185
+
186
+ ---
187
+
188
+ ## Examples
189
+
190
+ The [`examples/`](examples/) directory contains a ready-to-run agent you can use as a starting point.
191
+
192
+ ### `chat_agent.py` — Voice assistant with function tools
193
+
194
+ A fully working LiveKit agent that demonstrates:
195
+ - Connecting to a LiveKit room
196
+ - Server-side ElevenLabs TTS with interruption handling
197
+ - Two example function tools: `lookup_weather` and `get_current_location`
198
+
199
+ ```
200
+ packages/livekit/examples/
201
+ ├── chat_agent.py # The agent
202
+ └── .env.example # Required environment variables
203
+ ```
204
+
205
+ **Setup:**
206
+
207
+ ```bash
208
+ # 1. Install dependencies
209
+ pip install deepslate-livekit python-dotenv
210
+
211
+ # 2. Configure credentials
212
+ cd packages/livekit/examples
213
+ cp .env.example .env
214
+ # Edit .env and fill in your credentials
215
+
216
+ # 3. Run
217
+ python chat_agent.py dev
218
+ ```
219
+
220
+ ---
221
+
222
+ ## Documentation
223
+
224
+ - [Deepslate Documentation](https://docs.deepslate.eu/)
225
+ - [LiveKit Agents Documentation](https://docs.livekit.io/agents/)
226
+ - [API Reference](https://docs.deepslate.eu/api-reference/)
227
+
228
+ ---
229
+
230
+ ## License
231
+
232
+ Apache License 2.0 — see [LICENSE](../../LICENSE) for details.
@@ -0,0 +1,18 @@
1
+ # LiveKit server connection
2
+ LIVEKIT_URL=ws://localhost:7880
3
+ LIVEKIT_API_KEY=devkey
4
+ LIVEKIT_API_SECRET=secret-must-be-at-least-32-characters-long
5
+
6
+ # Deepslate API credentials (required)
7
+ # Sign up at https://deepslate.eu to obtain these.
8
+ DEEPSLATE_VENDOR_ID=
9
+ DEEPSLATE_ORGANIZATION_ID=
10
+ DEEPSLATE_API_KEY=
11
+
12
+ # Optional: override the Deepslate WebSocket endpoint (useful for local dev)
13
+ # DEEPSLATE_WS_URL=
14
+
15
+ # ElevenLabs TTS (optional — omit to disable server-side TTS)
16
+ ELEVENLABS_API_KEY=
17
+ ELEVENLABS_VOICE_ID=
18
+ # ELEVENLABS_MODEL_ID=eleven_turbo_v2
@@ -0,0 +1,102 @@
1
+ """
2
+ Deepslate + LiveKit Agents — Chat Agent Example
3
+ ================================================
4
+ A voice AI assistant that joins a LiveKit room and responds to speech.
5
+ Includes two example function tools (weather lookup and location detection)
6
+ to demonstrate Deepslate's function-calling support.
7
+
8
+ Setup
9
+ -----
10
+ 1. Copy .env.example to .env and fill in your credentials.
11
+ 2. Start a local LiveKit server (or point LIVEKIT_URL at a hosted one).
12
+ 3. Run: python chat_agent.py dev
13
+
14
+ Requirements
15
+ ------------
16
+ pip install deepslate-livekit python-dotenv
17
+ """
18
+
19
+ import os
20
+ import random
21
+ from typing import Any
22
+
23
+ from dotenv import load_dotenv
24
+ from livekit import agents
25
+ from livekit.agents import Agent, AgentServer, AgentSession, RunContext, function_tool, room_io
26
+
27
+ from deepslate.livekit import ElevenLabsTtsConfig, RealtimeModel
28
+
29
+ # Load .env from the examples directory, then allow .env.local to override
30
+ # (useful for keeping secrets out of version control).
31
+ _script_dir = os.path.dirname(os.path.realpath(__file__))
32
+ load_dotenv(os.path.join(_script_dir, ".env"))
33
+ load_dotenv(os.path.join(_script_dir, ".env.local"), override=True)
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Agent definition
38
+ # ---------------------------------------------------------------------------
39
+
40
+ class Assistant(Agent):
41
+ def __init__(self) -> None:
42
+ super().__init__(instructions="You are a helpful voice AI assistant.")
43
+
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Function tools
47
+ # ---------------------------------------------------------------------------
48
+
49
+ @function_tool()
50
+ async def lookup_weather(
51
+ _context: RunContext,
52
+ location: str,
53
+ ) -> dict[str, Any]:
54
+ """Get the current weather for a given location."""
55
+ return {
56
+ "location": location,
57
+ "temperature_celsius": random.randint(10, 35),
58
+ "precipitation": random.choice(["none", "light", "moderate", "heavy"]),
59
+ "air_pressure_hpa": random.randint(900, 1100),
60
+ }
61
+
62
+
63
+ @function_tool()
64
+ async def get_current_location(
65
+ _context: RunContext,
66
+ ) -> str:
67
+ """Get the user's current location."""
68
+ return "Berlin"
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Server + session
73
+ # ---------------------------------------------------------------------------
74
+
75
+ server = AgentServer()
76
+
77
+
78
+ @server.rtc_session()
79
+ async def my_agent(ctx: agents.JobContext):
80
+ session = AgentSession(
81
+ llm=RealtimeModel(
82
+ # DEEPSLATE_WS_URL can be set to override the default endpoint,
83
+ # which is useful for local development/testing.
84
+ ws_url=os.environ.get("DEEPSLATE_WS_URL"),
85
+ tts_config=ElevenLabsTtsConfig.from_env(),
86
+ ),
87
+ tools=[lookup_weather, get_current_location],
88
+ )
89
+
90
+ await session.start(
91
+ room=ctx.room,
92
+ agent=Assistant(),
93
+ room_options=room_io.RoomOptions(),
94
+ )
95
+
96
+ await session.generate_reply(
97
+ instructions="Greet the user and offer your assistance."
98
+ )
99
+
100
+
101
+ if __name__ == "__main__":
102
+ agents.cli.run_app(server)
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "deepslate-livekit"
7
+ version = "0.1.0"
8
+ description = "LiveKit plugin for deepslate.eu"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "deepslate-core==0.1.0",
13
+ "livekit-agents>=1.3.8",
14
+ ]
15
+ keywords = ["voice", "ai", "realtime", "audio", "video", "livekit", "deepslate"]
16
+ classifiers = [
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: Apache Software License",
19
+ "Topic :: Multimedia :: Sound/Audio",
20
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3 :: Only",
24
+ ]
25
+
26
+ [project.urls]
27
+ Documentation = "https://docs.deepslate.eu/"
28
+ Website = "https://deepslate.eu/"
29
+ Source = "https://github.com/deepslate-labs/deepslate-sdks"
30
+
31
+ [tool.hatch.build.targets.wheel]
32
+ packages = ["src/deepslate/livekit"]
33
+
34
+ [tool.uv.sources]
35
+ deepslate-core = { workspace = true }
@@ -0,0 +1,27 @@
1
+ import importlib.metadata
2
+
3
+ try:
4
+ __version__ = importlib.metadata.version("deepslate-livekit")
5
+ except importlib.metadata.PackageNotFoundError:
6
+ __version__ = "unknown"
7
+
8
+ from deepslate.core.options import (
9
+ DeepslateOptions,
10
+ ElevenLabsLocation,
11
+ ElevenLabsTtsConfig,
12
+ VadConfig,
13
+ )
14
+
15
+ from .realtime import DeepslateRealtimeSession, RealtimeModel
16
+
17
+ from . import _plugin # noqa: F401 – triggers Plugin.register_plugin on import
18
+
19
+ __all__ = [
20
+ "__version__",
21
+ "DeepslateOptions",
22
+ "ElevenLabsLocation",
23
+ "ElevenLabsTtsConfig",
24
+ "VadConfig",
25
+ "RealtimeModel",
26
+ "DeepslateRealtimeSession",
27
+ ]
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("deepslate.livekit")
@@ -0,0 +1,18 @@
1
+ from livekit.agents import Plugin
2
+
3
+ import importlib.metadata
4
+
5
+ from ._log import logger
6
+
7
+ try:
8
+ __version__ = importlib.metadata.version("deepslate-livekit")
9
+ except importlib.metadata.PackageNotFoundError:
10
+ __version__ = "unknown"
11
+
12
+
13
+ class DeepslatePlugin(Plugin):
14
+ def __init__(self) -> None:
15
+ super().__init__("deepslate_livekit", __version__, "deepslate_livekit", logger)
16
+
17
+
18
+ Plugin.register_plugin(DeepslatePlugin())
@@ -0,0 +1,6 @@
1
+ from ._model import DeepslateRealtimeSession, RealtimeModel
2
+
3
+ __all__ = [
4
+ "RealtimeModel",
5
+ "DeepslateRealtimeSession",
6
+ ]
@@ -0,0 +1,630 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import contextlib
5
+ import json
6
+ import os
7
+ import time
8
+ from dataclasses import dataclass
9
+ from types import SimpleNamespace
10
+ from typing import Any, Literal
11
+
12
+ import aiohttp
13
+ from livekit import rtc
14
+ from livekit.agents import llm, utils, FunctionTool, NOT_GIVEN, NotGivenOr
15
+ from livekit.agents.llm import (
16
+ FunctionCall,
17
+ GenerationCreatedEvent,
18
+ InputSpeechStartedEvent,
19
+ MessageGeneration,
20
+ RawFunctionTool,
21
+ ToolChoice,
22
+ ToolContext,
23
+ )
24
+ from livekit.agents.llm.tool_context import (
25
+ get_raw_function_info,
26
+ is_function_tool,
27
+ is_raw_function_tool,
28
+ )
29
+
30
import importlib.metadata

# Version string used in the client User-Agent header; falls back to
# "unknown" when the distribution is not installed (e.g. source tree).
try:
    __version__ = importlib.metadata.version("deepslate-livekit")
except importlib.metadata.PackageNotFoundError:
    __version__ = "unknown"

from deepslate.core import (
    BaseDeepslateClient,
    DeepslateOptions,
    DeepslateSession,
    DeepslateSessionListener,
    ElevenLabsTtsConfig,
    FunctionToolDict,
    TriggerMode,
    VadConfig,
)

from .._log import logger

# Default Deepslate API endpoint; overridable via RealtimeModel(base_url=...).
DEEPSLATE_BASE_URL = "https://app.deepslate.eu"
51
+
52
+
53
@dataclass
class _ResponseGeneration:
    """Internal state for a response being generated."""

    # Stream of per-message generations consumed by the agent runtime.
    message_ch: utils.aio.Chan["MessageGeneration"]
    # Stream of tool/function call requests emitted by the model.
    function_ch: utils.aio.Chan["FunctionCall"]
    # Streaming text fragments of the assistant reply.
    text_ch: utils.aio.Chan[str]
    # Streaming TTS audio frames (closed immediately when audio output is disabled).
    audio_ch: utils.aio.Chan[rtc.AudioFrame]
    # Resolved when the generation is fully closed.
    done_fut: asyncio.Future[None]
    # Unique id ("resp_...") for this generation.
    response_id: str
    # time.time() at generation creation.
    created_timestamp: float
    # time.time() of the first text/audio token received, if any.
    first_token_timestamp: float | None = None
    # Accumulated transcript of the reply so far.
    audio_transcript: str = ""
66
+
67
+
68
class RealtimeModel(llm.RealtimeModel):
    """Real-time language model using Deepslate.

    Connects to Deepslate's WebSocket API for streaming LLM responses.
    Audio format is auto-detected from the first audio frame.
    """

    def __init__(
        self,
        vendor_id: str | None = None,
        organization_id: str | None = None,
        api_key: str | None = None,
        base_url: str = DEEPSLATE_BASE_URL,
        system_prompt: str = "You are a helpful assistant.",
        temperature: float = 1.0,
        generate_reply_timeout: float = 30.0,
        # VAD configuration
        vad_confidence_threshold: float = 0.5,
        vad_min_volume: float = 0.01,
        vad_start_duration_ms: int = 200,
        vad_stop_duration_ms: int = 500,
        vad_backbuffer_duration_ms: int = 1000,
        # TTS configuration
        tts_config: ElevenLabsTtsConfig | None = None,
        http_session: aiohttp.ClientSession | None = None,
        # Internal use only - direct WebSocket URL (bypass standard auth)
        ws_url: str | None = None,
    ):
        """Initialize a Deepslate RealtimeModel.

        Args:
            vendor_id: Deepslate vendor ID. Falls back to DEEPSLATE_VENDOR_ID env var.
            organization_id: Deepslate organization ID. Falls back to DEEPSLATE_ORGANIZATION_ID env var.
            api_key: Deepslate API key. Falls back to DEEPSLATE_API_KEY env var.
            base_url: Base URL for Deepslate API.
            system_prompt: System prompt for the model.
            temperature: Sampling temperature (0.0 to 2.0). Higher values produce more random output.
            generate_reply_timeout: Timeout in seconds for generate_reply (0 = no timeout).
            vad_confidence_threshold: VAD confidence threshold (0.0 to 1.0).
            vad_min_volume: VAD minimum volume threshold (0.0 to 1.0).
            vad_start_duration_ms: Duration of speech to detect start (milliseconds).
            vad_stop_duration_ms: Duration of silence to detect end (milliseconds).
            vad_backbuffer_duration_ms: Audio buffer duration before speech detection (milliseconds).
            tts_config: ElevenLabs TTS configuration. When provided, audio output is enabled
                and Deepslate will use ElevenLabs for text-to-speech synthesis.
                When None (default), only text output is provided.
            http_session: Optional shared aiohttp session.
            ws_url: Internal use only — direct WebSocket URL that bypasses
                standard credential resolution.

        Raises:
            ValueError: if a required credential is missing (and ws_url is not set).
        """
        super().__init__(
            capabilities=llm.RealtimeCapabilities(
                message_truncation=True,
                turn_detection=True,
                user_transcription=True,
                auto_tool_reply_generation=True,
                # Audio output is only available when a TTS config is supplied.
                audio_output=tts_config is not None,
                manual_function_calls=False,
            )
        )

        self._tts_config = tts_config

        if ws_url:
            # Direct WebSocket URL bypasses credential-based auth; empty
            # credentials are acceptable in this internal mode.
            deepslate_vendor_id = vendor_id or ""
            deepslate_organization_id = organization_id or ""
            deepslate_api_key = api_key or ""
        else:
            deepslate_vendor_id = self._require(
                vendor_id, "vendor_id", "DEEPSLATE_VENDOR_ID", "vendor ID"
            )
            deepslate_organization_id = self._require(
                organization_id,
                "organization_id",
                "DEEPSLATE_ORGANIZATION_ID",
                "organization ID",
            )
            deepslate_api_key = self._require(
                api_key, "api_key", "DEEPSLATE_API_KEY", "API key"
            )

        self._opts = DeepslateOptions(
            vendor_id=deepslate_vendor_id,
            organization_id=deepslate_organization_id,
            api_key=deepslate_api_key,
            base_url=base_url,
            system_prompt=system_prompt,
            temperature=temperature,
            ws_url=ws_url,
            generate_reply_timeout=generate_reply_timeout,
        )

        self._vad_config = VadConfig(
            confidence_threshold=vad_confidence_threshold,
            min_volume=vad_min_volume,
            start_duration_ms=vad_start_duration_ms,
            stop_duration_ms=vad_stop_duration_ms,
            backbuffer_duration_ms=vad_backbuffer_duration_ms,
        )

        self._client = BaseDeepslateClient(
            opts=self._opts,
            user_agent=f"DeepslateLiveKit/{__version__}",
            http_session=http_session,
        )

    @staticmethod
    def _require(value: str | None, param: str, env_var: str, label: str) -> str:
        """Resolve a credential from the argument or an environment variable.

        Raises:
            ValueError: if neither source provides a non-empty value.
        """
        resolved = value or os.environ.get(env_var)
        if not resolved:
            raise ValueError(
                f"Deepslate {label} is required. "
                f"Provide it via the {param} parameter or set the {env_var} environment variable."
            )
        return resolved

    @property
    def provider(self) -> str:
        """Provider identifier reported to the agents framework."""
        return "deepslate"

    def session(self) -> "DeepslateRealtimeSession":
        """Create a new Deepslate real-time session."""
        return DeepslateRealtimeSession(realtime_model=self)

    def update_options(
        self,
        *,
        system_prompt: NotGivenOr[str] = NOT_GIVEN,
        temperature: NotGivenOr[float] = NOT_GIVEN,
    ) -> None:
        """Update model options.

        Changes take effect on the next session initialization (e.g., after reconnect).
        To apply immediately to an active session use
        ``DeepslateRealtimeSession.update_instructions()`` or send a
        ``ReconfigureSessionRequest`` via the session.
        """
        if utils.is_given(system_prompt):
            self._opts.system_prompt = system_prompt
        if utils.is_given(temperature):
            self._opts.temperature = temperature

    async def aclose(self) -> None:
        """Release the underlying HTTP client resources."""
        await self._client.aclose()
209
+
210
class DeepslateRealtimeSession(
    llm.RealtimeSession[
        Literal[
            "deepslate_server_event_received",
            "deepslate_client_event_sent",
            "user_transcription",
            "audio_transcript",
        ]
    ],
    DeepslateSessionListener,
):
    """A session for the Deepslate Realtime API.

    Wraps ``DeepslateSession`` from deepslate-core and translates its
    callbacks into LiveKit agent events and channel writes. All
    protobuf details are encapsulated in the core session; this class
    contains only LiveKit-specific logic.
    """

    def __init__(self, realtime_model: RealtimeModel):
        super().__init__(realtime_model)
        self._realtime_model = realtime_model
        self._opts = realtime_model._opts

        # LiveKit context
        self._tools = llm.ToolContext.empty()
        self._chat_ctx = llm.ChatContext.empty()
        self._instructions: str | None = None

        # Generation tracking
        self._current_generation: _ResponseGeneration | None = None
        self._response_created_futures: dict[str, asyncio.Future[GenerationCreatedEvent]] = {}
        self._pending_user_generation: bool = False
        self._pending_user_text: str | None = None

        # Conversation query tracking: query_id → Future[str]
        self._pending_queries: dict[str, asyncio.Future[str]] = {}

        # Tool state
        self._tools_dicts: list[FunctionToolDict] = []
        self._tool_choice: ToolChoice | None = None

        # Core session — owns the WebSocket lifecycle
        self._session = DeepslateSession(
            client=realtime_model._client,
            options=realtime_model._opts,
            vad_config=realtime_model._vad_config,
            tts_config=realtime_model._tts_config,
            listener=self,
        )
        self._session.start()

    @property
    def chat_ctx(self) -> llm.ChatContext:
        """A copy of the current chat context."""
        return self._chat_ctx.copy()

    @property
    def tools(self) -> ToolContext:
        """A copy of the current tool context."""
        return self._tools.copy()

    async def update_instructions(self, instructions: str) -> None:
        """Update system prompt for the next session initialization."""
        self._instructions = instructions
        self._opts.system_prompt = instructions
        logger.debug("instructions updated (will take effect on next session)")

    async def update_chat_ctx(self, chat_ctx: llm.ChatContext) -> None:
        """Capture new user messages and handle function call outputs."""
        existing_ids = {item.id for item in self._chat_ctx.items}

        for item in chat_ctx.items:
            if item.id not in existing_ids:
                if item.type == "message" and item.role == "user":
                    # Buffer the text; it is flushed by generate_reply().
                    if text := item.text_content:
                        self._pending_user_text = text
                elif item.type == "function_call_output":
                    await self._session.send_tool_response(item.call_id, item.output)

        self._chat_ctx = chat_ctx.copy()

    async def update_tools(self, tools: list[FunctionTool | RawFunctionTool | Any]) -> None:
        """Sync tool definitions to the server."""
        tools_dicts = []
        for tool in tools:
            if is_function_tool(tool):
                schema = llm.utils.build_legacy_openai_schema(tool, internally_tagged=True)
                tools_dicts.append({
                    "type": "function",
                    "function": {
                        "name": schema["name"],
                        "description": schema.get("description", ""),
                        "parameters": schema.get("parameters", {}),
                    },
                })
            elif is_raw_function_tool(tool):
                info = get_raw_function_info(tool)
                tools_dicts.append({
                    "type": "function",
                    "function": {
                        "name": info.name,
                        "description": info.raw_schema.get("description", ""),
                        "parameters": info.raw_schema.get("parameters", {}),
                    },
                })

        self._tools_dicts = tools_dicts
        self._tools = llm.ToolContext(tools)
        await self._sync_tool_choice()
        logger.debug(f"updated tools: {[t.get('function', {}).get('name') for t in tools_dicts]}")

    async def update_options(
        self, *, tool_choice: NotGivenOr[ToolChoice | None] = NOT_GIVEN
    ) -> None:
        """Apply a tool_choice constraint."""
        if not utils.is_given(tool_choice):
            logger.warning("Tool choice constraint not given")
            return
        self._tool_choice = tool_choice
        await self._sync_tool_choice()

    def _effective_tools_dicts(self) -> list[FunctionToolDict]:
        """Return the tools list filtered by the current tool_choice."""
        tc = self._tool_choice
        if tc == "none":
            return []
        if isinstance(tc, dict):  # NamedToolChoice
            name = tc.get("function", {}).get("name")
            return [t for t in self._tools_dicts if t.get("function", {}).get("name") == name]
        # "auto", "required", None → send all tools
        return self._tools_dicts

    async def _sync_tool_choice(self) -> None:
        """Push the effective tool list (after applying tool_choice) to the server."""
        await self._session.update_tools(self._effective_tools_dicts())

    async def push_audio(self, frame: rtc.AudioFrame) -> None:
        """Push an audio frame to Deepslate."""
        await self._session.send_audio(
            frame.data.tobytes(),
            frame.sample_rate,
            frame.num_channels,
        )

    def push_video(self, frame: rtc.VideoFrame) -> None:
        """Video input is not supported by Deepslate."""
        logger.warning("Deepslate does not support video input")

    async def send_text(
        self,
        text: str,
        mode: TriggerMode = TriggerMode.NO_TRIGGER,
    ) -> None:
        """Send text input to Deepslate."""
        await self._session.initialize()
        await self._session.send_text(text, trigger=mode)

    async def speak_direct(self, text: str, include_in_history: bool = True) -> None:
        """Bypass the LLM and speak text directly via TTS."""
        await self._session.initialize()
        await self._session.send_direct_speech(text, include_in_history)

    async def query_conversation(
        self,
        prompt: str | None = None,
        instructions: str | None = None,
    ) -> str:
        """Run a one-shot side-channel inference against the current conversation.

        Returns the model's complete text reply.
        """
        query_id = utils.shortuuid("query_")
        fut: asyncio.Future[str] = asyncio.get_running_loop().create_future()
        self._pending_queries[query_id] = fut

        try:
            await self._session.initialize()
            await self._session.send_conversation_query(query_id, prompt, instructions)
        except Exception:
            # Sending failed — drop the pending future so it cannot leak.
            self._pending_queries.pop(query_id, None)
            raise
        return await fut

    async def export_chat_history(self, await_pending: bool = False) -> None:
        """Request the server to export the current chat history.

        The result is delivered via the ``chat_history_exported`` event.
        """
        await self._session.export_chat_history(await_pending)

    async def generate_reply(
        self, *, instructions: NotGivenOr[str] = NOT_GIVEN
    ) -> GenerationCreatedEvent:
        """Request the model to generate a reply."""
        fut: asyncio.Future[GenerationCreatedEvent] = asyncio.Future()
        request_id = utils.shortuuid("gen_")
        self._response_created_futures[request_id] = fut
        self._pending_user_generation = True

        if utils.is_given(instructions):
            self._instructions = instructions

        if self._pending_user_text:
            if utils.is_given(instructions):
                # Send the buffered user text without triggering, then trigger
                # explicitly with the per-reply instructions.
                await self._session.send_text(
                    self._pending_user_text,
                    trigger=TriggerMode.NO_TRIGGER,
                )
                await self._session.trigger_inference(instructions=instructions)
            else:
                await self._session.initialize()
                await self._session.send_text(
                    self._pending_user_text,
                    trigger=TriggerMode.IMMEDIATE,
                )
            self._pending_user_text = None
        else:
            await self._session.initialize()
            await self._session.trigger_inference(
                instructions=instructions if utils.is_given(instructions) else None
            )

        timeout = self._opts.generate_reply_timeout

        if timeout > 0:
            try:
                return await asyncio.wait_for(fut, timeout=timeout)
            except asyncio.TimeoutError:
                # Remove the stale future so a later generation does not
                # resolve a request that has already timed out.
                self._response_created_futures.pop(request_id, None)
                raise TimeoutError(f"generate_reply timed out after {timeout}s")
        else:
            return await fut

    def commit_audio(self) -> None:
        """Deepslate uses server-side VAD for auto-commit."""
        pass

    def clear_audio(self) -> None:
        """Audio buffer clearing is not yet supported by the Deepslate backend."""
        logger.warning("clear_audio not yet supported by Deepslate backend")

    def interrupt(self) -> None:
        """Interrupt the current generation."""
        if self._current_generation:
            self._close_current_generation()

    def truncate(
        self,
        *,
        message_id: str,
        modalities: list[Literal["text", "audio"]],
        audio_end_ms: int,
        audio_transcript: NotGivenOr[str] = NOT_GIVEN,
    ) -> None:
        """Deepslate handles truncation server-side automatically."""
        pass

    async def aclose(self) -> None:
        """Close the session."""
        if self._current_generation:
            with contextlib.suppress(asyncio.InvalidStateError):
                self._current_generation.done_fut.set_result(None)
        await self._session.close()

    async def on_text_fragment(self, text: str) -> None:
        """Core-session callback: stream a text fragment into the generation."""
        if self._current_generation is None:
            self._create_generation()
        if self._current_generation is None:
            return
        self._current_generation.text_ch.send_nowait(text)
        self._current_generation.audio_transcript += text
        if self._current_generation.first_token_timestamp is None:
            self._current_generation.first_token_timestamp = time.time()

    async def on_audio_chunk(
        self,
        pcm_bytes: bytes,
        sample_rate: int,
        channels: int,
        transcript: str | None,
    ) -> None:
        """Core-session callback: stream a TTS audio chunk into the generation."""
        if self._current_generation is None:
            self._create_generation()
        if self._current_generation is None:
            return

        # 16-bit PCM: 2 bytes per sample, samples interleaved across channels.
        # Dividing by (2 * channels) keeps the frame consistent for multi-channel
        # audio (the old `// 2` was only correct for mono).
        frame = rtc.AudioFrame(
            data=pcm_bytes,
            sample_rate=sample_rate,
            num_channels=channels,
            samples_per_channel=len(pcm_bytes) // (2 * channels),
        )
        self._current_generation.audio_ch.send_nowait(frame)

        if self._current_generation.first_token_timestamp is None:
            self._current_generation.first_token_timestamp = time.time()

        if transcript:
            self._current_generation.audio_transcript += transcript
            self.emit("audio_transcript", transcript)

    async def on_tool_call(self, call_id: str, name: str, params: dict) -> None:
        """Core-session callback: forward a tool call and close the generation."""
        if self._current_generation is None:
            self._create_generation()
        if self._current_generation is None:
            return
        self._current_generation.function_ch.send_nowait(
            FunctionCall(
                call_id=call_id,
                name=name,
                arguments=json.dumps(params),
            )
        )
        logger.debug(f"tool call request: {name}({call_id})")
        self._close_current_generation()

    async def on_response_begin(self) -> None:
        """Core-session callback: a model response is starting."""
        if self._current_generation is None:
            self._create_generation()

    async def on_response_end(self) -> None:
        """Core-session callback: the model response finished."""
        self._close_current_generation()

    async def on_playback_buffer_clear(self) -> None:
        """Core-session callback: playback was cleared (user barge-in)."""
        if self._current_generation is not None:
            self.emit("input_speech_started", InputSpeechStartedEvent())
            self._close_current_generation()

    async def on_user_transcription(self, text: str, language: str | None, turn_id: int) -> None:
        """Core-session callback: forward a user speech transcription."""
        self.emit(
            "user_transcription",
            SimpleNamespace(text=text, language=language or ""),
        )

    async def on_chat_history(self, messages) -> None:
        """Core-session callback: deliver an exported chat history."""
        self.emit("chat_history_exported", messages)

    async def on_conversation_query_result(self, query_id: str, text: str) -> None:
        """Core-session callback: resolve the matching query_conversation() future."""
        fut = self._pending_queries.pop(query_id, None)
        if fut is not None and not fut.done():
            fut.set_result(text)
        else:
            logger.warning(
                f"received conversation_query_result for unknown query_id: '{query_id}'"
            )

    async def on_error(self, category: str, message: str, trace_id: str | None) -> None:
        """Core-session callback: surface a server-reported error."""
        trace_suffix = f" (trace_id={trace_id})" if trace_id else ""
        error_msg = f"[Deepslate] {category}: {message}{trace_suffix}"
        logger.error(error_msg)
        self.emit(
            "error",
            llm.RealtimeModelError(
                timestamp=time.time(),
                label=self._realtime_model.label,
                error=RuntimeError(error_msg),
                recoverable=False,
            ),
        )

    async def on_fatal_error(self, e: Exception) -> None:
        """Core-session callback: surface a fatal transport/client error."""
        self.emit(
            "error",
            llm.RealtimeModelError(
                timestamp=time.time(),
                label=self._realtime_model.label,
                error=e,
                recoverable=False,
            ),
        )

    def _create_generation(self) -> None:
        """Open a new _ResponseGeneration and emit generation_created."""
        is_user_initiated = self._pending_user_generation
        self._pending_user_generation = False

        response_id = utils.shortuuid("resp_")
        self._current_generation = _ResponseGeneration(
            message_ch=utils.aio.Chan(),
            function_ch=utils.aio.Chan(),
            text_ch=utils.aio.Chan(),
            audio_ch=utils.aio.Chan(),
            done_fut=asyncio.Future(),
            response_id=response_id,
            created_timestamp=time.time(),
        )

        has_audio = self._realtime_model._tts_config is not None
        msg_modalities: asyncio.Future[list[Literal["text", "audio"]]] = asyncio.Future()
        if has_audio:
            msg_modalities.set_result(["audio", "text"])
        else:
            # Text-only sessions: close the audio channel right away so
            # consumers do not wait for frames that will never arrive.
            msg_modalities.set_result(["text"])
            self._current_generation.audio_ch.close()

        self._current_generation.message_ch.send_nowait(
            MessageGeneration(
                message_id=response_id,
                text_stream=self._current_generation.text_ch,
                audio_stream=self._current_generation.audio_ch,
                modalities=msg_modalities,
            )
        )

        generation_ev = GenerationCreatedEvent(
            message_stream=self._current_generation.message_ch,
            function_stream=self._current_generation.function_ch,
            user_initiated=is_user_initiated,
            response_id=response_id,
        )

        self.emit("generation_created", generation_ev)

        # Resolve every awaiting generate_reply() call with this event.
        for fut in list(self._response_created_futures.values()):
            if not fut.done():
                fut.set_result(generation_ev)
        self._response_created_futures.clear()

    def _close_current_generation(self) -> None:
        """Close all channels of the active generation and drop it."""
        if self._current_generation is None:
            return
        self._current_generation.text_ch.close()
        self._current_generation.audio_ch.close()
        self._current_generation.function_ch.close()
        self._current_generation.message_ch.close()
        with contextlib.suppress(asyncio.InvalidStateError):
            self._current_generation.done_fut.set_result(None)
        self._current_generation = None