roomkit-0.1.1-py3-none-any.whl → roomkit-0.2.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
- roomkit/AGENTS.md +17 -0
- roomkit/__init__.py +45 -0
- roomkit/_version.py +1 -1
- roomkit/channels/voice.py +728 -0
- roomkit/core/_channel_ops.py +7 -0
- roomkit/core/_inbound.py +4 -0
- roomkit/core/framework.py +177 -1
- roomkit/core/hooks.py +32 -6
- roomkit/llms.txt +4 -2
- roomkit/models/enums.py +12 -0
- roomkit/sources/__init__.py +4 -4
- roomkit/sources/sse.py +226 -0
- roomkit/voice/__init__.py +99 -0
- roomkit/voice/backends/__init__.py +1 -0
- roomkit/voice/backends/base.py +264 -0
- roomkit/voice/backends/fastrtc.py +467 -0
- roomkit/voice/backends/mock.py +302 -0
- roomkit/voice/base.py +115 -0
- roomkit/voice/events.py +140 -0
- roomkit/voice/stt/__init__.py +1 -0
- roomkit/voice/stt/base.py +58 -0
- roomkit/voice/stt/deepgram.py +214 -0
- roomkit/voice/stt/mock.py +40 -0
- roomkit/voice/tts/__init__.py +1 -0
- roomkit/voice/tts/base.py +58 -0
- roomkit/voice/tts/elevenlabs.py +329 -0
- roomkit/voice/tts/mock.py +51 -0
- {roomkit-0.1.1.dist-info → roomkit-0.2.1.dist-info}/METADATA +26 -6
- {roomkit-0.1.1.dist-info → roomkit-0.2.1.dist-info}/RECORD +31 -14
- {roomkit-0.1.1.dist-info → roomkit-0.2.1.dist-info}/WHEEL +1 -1
- {roomkit-0.1.1.dist-info → roomkit-0.2.1.dist-info}/licenses/LICENSE +0 -0
roomkit/voice/__init__.py

```diff
@@ -0,0 +1,99 @@
+"""Voice support for RoomKit (STT, TTS, streaming audio)."""
+
+from __future__ import annotations
+
+from typing import Any
+
+from roomkit.voice.backends.base import VoiceBackend
+from roomkit.voice.base import (
+    AudioChunk,
+    BargeInCallback,
+    PartialTranscriptionCallback,
+    SpeechEndCallback,
+    SpeechStartCallback,
+    TranscriptionResult,
+    VADAudioLevelCallback,
+    VADSilenceCallback,
+    VoiceCapability,
+    VoiceSession,
+    VoiceSessionState,
+)
+from roomkit.voice.events import (
+    BargeInEvent,
+    PartialTranscriptionEvent,
+    TTSCancelledEvent,
+    VADAudioLevelEvent,
+    VADSilenceEvent,
+)
+from roomkit.voice.stt.base import STTProvider
+from roomkit.voice.tts.base import TTSProvider
+
+__all__ = [
+    # Base types
+    "AudioChunk",
+    "TranscriptionResult",
+    "VoiceBackend",
+    "VoiceCapability",
+    "VoiceSession",
+    "VoiceSessionState",
+    # Callback types
+    "BargeInCallback",
+    "PartialTranscriptionCallback",
+    "SpeechEndCallback",
+    "SpeechStartCallback",
+    "VADAudioLevelCallback",
+    "VADSilenceCallback",
+    # Event types (RFC §19)
+    "BargeInEvent",
+    "PartialTranscriptionEvent",
+    "TTSCancelledEvent",
+    "VADAudioLevelEvent",
+    "VADSilenceEvent",
+    # Providers
+    "STTProvider",
+    "TTSProvider",
+]
+
+# Optional providers (lazy imports to avoid requiring dependencies)
+
+
+def get_deepgram_provider() -> type:
+    """Get DeepgramSTTProvider class (requires httpx, websockets)."""
+    from roomkit.voice.stt.deepgram import DeepgramSTTProvider
+
+    return DeepgramSTTProvider
+
+
+def get_deepgram_config() -> type:
+    """Get DeepgramConfig class."""
+    from roomkit.voice.stt.deepgram import DeepgramConfig
+
+    return DeepgramConfig
+
+
+def get_elevenlabs_provider() -> type:
+    """Get ElevenLabsTTSProvider class (requires httpx, websockets)."""
+    from roomkit.voice.tts.elevenlabs import ElevenLabsTTSProvider
+
+    return ElevenLabsTTSProvider
+
+
+def get_elevenlabs_config() -> type:
+    """Get ElevenLabsConfig class."""
+    from roomkit.voice.tts.elevenlabs import ElevenLabsConfig
+
+    return ElevenLabsConfig
+
+
+def get_fastrtc_backend() -> type:
+    """Get FastRTCVoiceBackend class (requires fastrtc, numpy)."""
+    from roomkit.voice.backends.fastrtc import FastRTCVoiceBackend
+
+    return FastRTCVoiceBackend
+
+
+def get_mount_fastrtc_voice() -> Any:
+    """Get mount_fastrtc_voice function (requires fastrtc, numpy)."""
+    from roomkit.voice.backends.fastrtc import mount_fastrtc_voice
+
+    return mount_fastrtc_voice
```
roomkit/voice/backends/__init__.py

```diff
@@ -0,0 +1 @@
+"""Voice transport backends."""
```
roomkit/voice/backends/base.py

```diff
@@ -0,0 +1,264 @@
+"""VoiceBackend abstract base class."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from collections.abc import AsyncIterator
+from typing import Any
+
+from roomkit.voice.base import (
+    AudioChunk,
+    BargeInCallback,
+    PartialTranscriptionCallback,
+    SpeechEndCallback,
+    SpeechStartCallback,
+    VADAudioLevelCallback,
+    VADSilenceCallback,
+    VoiceCapability,
+    VoiceSession,
+)
+
+
+class VoiceBackend(ABC):
+    """Abstract base class for voice transport backends.
+
+    VoiceBackend handles the transport layer for real-time audio:
+    - Managing voice session connections
+    - Voice Activity Detection (VAD) callbacks
+    - Streaming audio to/from clients
+
+    The backend is framework-agnostic. Web framework integration (FastAPI routes,
+    WebSocket endpoints) is the responsibility of the application layer.
+
+    Example usage with a hypothetical WebRTC backend:
+        backend = WebRTCVoiceBackend()
+
+        # Register VAD callbacks
+        backend.on_speech_start(handle_speech_start)
+        backend.on_speech_end(handle_speech_end)
+
+        # Connect a participant
+        session = await backend.connect("room-1", "user-1", "voice-channel")
+
+        # Stream audio to the client
+        await backend.send_audio(session, audio_chunks)
+
+        # Disconnect
+        await backend.disconnect(session)
+    """
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Backend name (e.g., 'webrtc', 'websocket', 'livekit')."""
+        ...
+
+    @abstractmethod
+    async def connect(
+        self,
+        room_id: str,
+        participant_id: str,
+        channel_id: str,
+        *,
+        metadata: dict[str, Any] | None = None,
+    ) -> VoiceSession:
+        """Create a new voice session for a participant.
+
+        Args:
+            room_id: The room to join.
+            participant_id: The participant's ID.
+            channel_id: The voice channel ID.
+            metadata: Optional session metadata.
+
+        Returns:
+            A VoiceSession representing the connection.
+        """
+        ...
+
+    @abstractmethod
+    async def disconnect(self, session: VoiceSession) -> None:
+        """End a voice session.
+
+        Args:
+            session: The session to disconnect.
+        """
+        ...
+
+    @abstractmethod
+    def on_speech_start(self, callback: SpeechStartCallback) -> None:
+        """Register a callback for when VAD detects speech starting.
+
+        The callback receives the VoiceSession where speech was detected.
+
+        Args:
+            callback: Function called when speech starts.
+        """
+        ...
+
+    @abstractmethod
+    def on_speech_end(self, callback: SpeechEndCallback) -> None:
+        """Register a callback for when VAD detects speech ending.
+
+        The callback receives the VoiceSession and the audio bytes
+        captured during the speech segment.
+
+        Args:
+            callback: Function called when speech ends with audio data.
+        """
+        ...
+
+    @abstractmethod
+    async def send_audio(
+        self,
+        session: VoiceSession,
+        audio: bytes | AsyncIterator[AudioChunk],
+    ) -> None:
+        """Send audio to a voice session.
+
+        Args:
+            session: The target session.
+            audio: Raw audio bytes or an async iterator of AudioChunks
+                for streaming.
+        """
+        ...
+
+    def get_session(self, session_id: str) -> VoiceSession | None:
+        """Get a session by ID.
+
+        Override for backends that track sessions internally.
+
+        Args:
+            session_id: The session ID to look up.
+
+        Returns:
+            The VoiceSession if found, None otherwise.
+        """
+        return None
+
+    def list_sessions(self, room_id: str) -> list[VoiceSession]:
+        """List all active sessions in a room.
+
+        Override for backends that track sessions internally.
+
+        Args:
+            room_id: The room to list sessions for.
+
+        Returns:
+            List of active VoiceSessions in the room.
+        """
+        return []
+
+    async def close(self) -> None:
+        """Release backend resources.
+
+        Override in subclasses that need cleanup.
+        """
+
+    # -------------------------------------------------------------------------
+    # Enhanced voice capabilities (RFC §19)
+    # -------------------------------------------------------------------------
+
+    @property
+    def capabilities(self) -> VoiceCapability:
+        """Declare supported capabilities.
+
+        Override to enable features like interruption, partial STT, etc.
+        By default, no optional capabilities are supported.
+
+        Returns:
+            Flags indicating supported capabilities.
+        """
+        return VoiceCapability.NONE
+
+    def on_partial_transcription(
+        self, callback: PartialTranscriptionCallback
+    ) -> None:
+        """Register callback for partial transcription results.
+
+        Only called if capabilities includes PARTIAL_STT.
+        Backends that support streaming STT should call this callback
+        with interim results as they become available.
+
+        Args:
+            callback: Function called with (session, text, confidence, is_stable).
+        """
+        pass  # Default no-op, override if supported
+
+    def on_vad_silence(self, callback: VADSilenceCallback) -> None:
+        """Register callback for silence detection.
+
+        Only called if capabilities includes VAD_SILENCE.
+        Backends should call this when silence is detected after speech,
+        potentially before the full speech_end event.
+
+        Args:
+            callback: Function called with (session, silence_duration_ms).
+        """
+        pass  # Default no-op, override if supported
+
+    def on_vad_audio_level(self, callback: VADAudioLevelCallback) -> None:
+        """Register callback for audio level updates.
+
+        Only called if capabilities includes VAD_AUDIO_LEVEL.
+        Backends should call this periodically (e.g., 10Hz) with
+        current audio level for UI feedback.
+
+        Args:
+            callback: Function called with (session, level_db, is_speech).
+        """
+        pass  # Default no-op, override if supported
+
+    def on_barge_in(self, callback: BargeInCallback) -> None:
+        """Register callback for barge-in detection.
+
+        Only called if capabilities includes BARGE_IN.
+        Backends should call this when user starts speaking while
+        audio is being played (TTS interruption).
+
+        Args:
+            callback: Function called with (session).
+        """
+        pass  # Default no-op, override if supported
+
+    async def cancel_audio(self, session: VoiceSession) -> bool:
+        """Cancel ongoing audio playback for a session.
+
+        Only works if capabilities includes INTERRUPTION.
+        Used for barge-in handling to stop TTS playback.
+
+        Args:
+            session: The session to cancel audio for.
+
+        Returns:
+            True if audio was cancelled, False if nothing was playing.
+        """
+        return False  # Default no-op, override if supported
+
+    def is_playing(self, session: VoiceSession) -> bool:
+        """Check if audio is currently being sent to the session.
+
+        Used for barge-in detection to know if interruption is possible.
+
+        Args:
+            session: The session to check.
+
+        Returns:
+            True if audio is currently playing, False otherwise.
+        """
+        return False  # Default: assume not playing
+
+    async def send_transcription(
+        self, session: VoiceSession, text: str, role: str = "user"
+    ) -> None:
+        """Send transcription text to the client for UI display.
+
+        Optional method for backends that support sending text updates.
+        Called by VoiceChannel after STT transcription to show the user
+        what they said, and after AI response to show what the assistant said.
+
+        Args:
+            session: The voice session to send to.
+            text: The transcribed or response text.
+            role: Either "user" (transcription) or "assistant" (AI response).
+        """
+        pass  # Default no-op, override if supported
```