roomkit 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
1
+ """Voice support for RoomKit (STT, TTS, streaming audio)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from roomkit.voice.backends.base import VoiceBackend
8
+ from roomkit.voice.base import (
9
+ AudioChunk,
10
+ BargeInCallback,
11
+ PartialTranscriptionCallback,
12
+ SpeechEndCallback,
13
+ SpeechStartCallback,
14
+ TranscriptionResult,
15
+ VADAudioLevelCallback,
16
+ VADSilenceCallback,
17
+ VoiceCapability,
18
+ VoiceSession,
19
+ VoiceSessionState,
20
+ )
21
+ from roomkit.voice.events import (
22
+ BargeInEvent,
23
+ PartialTranscriptionEvent,
24
+ TTSCancelledEvent,
25
+ VADAudioLevelEvent,
26
+ VADSilenceEvent,
27
+ )
28
+ from roomkit.voice.stt.base import STTProvider
29
+ from roomkit.voice.tts.base import TTSProvider
30
+
31
# Public API surface of the voice package. Optional, dependency-heavy
# providers (Deepgram, ElevenLabs, FastRTC) are deliberately NOT listed
# here; they are exposed through the lazy get_* accessors defined below.
__all__ = [
    # Base types
    "AudioChunk",
    "TranscriptionResult",
    "VoiceBackend",
    "VoiceCapability",
    "VoiceSession",
    "VoiceSessionState",
    # Callback types
    "BargeInCallback",
    "PartialTranscriptionCallback",
    "SpeechEndCallback",
    "SpeechStartCallback",
    "VADAudioLevelCallback",
    "VADSilenceCallback",
    # Event types (RFC §19)
    "BargeInEvent",
    "PartialTranscriptionEvent",
    "TTSCancelledEvent",
    "VADAudioLevelEvent",
    "VADSilenceEvent",
    # Providers
    "STTProvider",
    "TTSProvider",
]
56
+
57
+ # Optional providers (lazy imports to avoid requiring dependencies)
58
+
59
+
60
def get_deepgram_provider() -> type:
    """Return the DeepgramSTTProvider class.

    Imported lazily so the optional httpx/websockets dependencies are only
    required when Deepgram STT is actually used.
    """
    from roomkit.voice.stt.deepgram import DeepgramSTTProvider as provider_cls

    return provider_cls
65
+
66
+
67
def get_deepgram_config() -> type:
    """Return the DeepgramConfig class.

    Imported lazily alongside the Deepgram provider so importing this
    package never pulls in its optional dependencies.
    """
    from roomkit.voice.stt.deepgram import DeepgramConfig as config_cls

    return config_cls
72
+
73
+
74
def get_elevenlabs_provider() -> type:
    """Return the ElevenLabsTTSProvider class.

    Imported lazily so the optional httpx/websockets dependencies are only
    required when ElevenLabs TTS is actually used.
    """
    from roomkit.voice.tts.elevenlabs import ElevenLabsTTSProvider as tts_cls

    return tts_cls
79
+
80
+
81
def get_elevenlabs_config() -> type:
    """Return the ElevenLabsConfig class.

    Imported lazily alongside the ElevenLabs provider so importing this
    package never pulls in its optional dependencies.
    """
    from roomkit.voice.tts.elevenlabs import ElevenLabsConfig as config_cls

    return config_cls
86
+
87
+
88
def get_fastrtc_backend() -> type:
    """Return the FastRTCVoiceBackend class.

    Imported lazily so the optional fastrtc/numpy dependencies are only
    required when the FastRTC backend is actually used.
    """
    from roomkit.voice.backends.fastrtc import FastRTCVoiceBackend as backend_cls

    return backend_cls
93
+
94
+
95
def get_mount_fastrtc_voice() -> Any:
    """Return the mount_fastrtc_voice helper function.

    Imported lazily so the optional fastrtc/numpy dependencies are only
    required when the FastRTC backend is actually used.
    """
    from roomkit.voice.backends.fastrtc import mount_fastrtc_voice as mount_fn

    return mount_fn
@@ -0,0 +1 @@
1
+ """Voice transport backends."""
@@ -0,0 +1,264 @@
1
+ """VoiceBackend abstract base class."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from collections.abc import AsyncIterator
7
+ from typing import Any
8
+
9
+ from roomkit.voice.base import (
10
+ AudioChunk,
11
+ BargeInCallback,
12
+ PartialTranscriptionCallback,
13
+ SpeechEndCallback,
14
+ SpeechStartCallback,
15
+ VADAudioLevelCallback,
16
+ VADSilenceCallback,
17
+ VoiceCapability,
18
+ VoiceSession,
19
+ )
20
+
21
+
22
class VoiceBackend(ABC):
    """Abstract base class for voice transport backends.

    A VoiceBackend owns the transport layer for real-time audio:
    - managing voice session connections,
    - Voice Activity Detection (VAD) callbacks,
    - streaming audio to and from clients.

    The backend is framework-agnostic; wiring it into a web framework
    (FastAPI routes, WebSocket endpoints) belongs to the application layer.

    Example usage with a hypothetical WebRTC backend:
        backend = WebRTCVoiceBackend()

        # Register VAD callbacks
        backend.on_speech_start(handle_speech_start)
        backend.on_speech_end(handle_speech_end)

        # Connect a participant
        session = await backend.connect("room-1", "user-1", "voice-channel")

        # Stream audio to the client
        await backend.send_audio(session, audio_chunks)

        # Disconnect
        await backend.disconnect(session)
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Backend name (e.g., 'webrtc', 'websocket', 'livekit')."""

    @abstractmethod
    async def connect(
        self,
        room_id: str,
        participant_id: str,
        channel_id: str,
        *,
        metadata: dict[str, Any] | None = None,
    ) -> VoiceSession:
        """Open a new voice session for a participant.

        Args:
            room_id: The room to join.
            participant_id: ID of the joining participant.
            channel_id: The voice channel ID.
            metadata: Optional session metadata.

        Returns:
            A VoiceSession describing the live connection.
        """

    @abstractmethod
    async def disconnect(self, session: VoiceSession) -> None:
        """Tear down a voice session.

        Args:
            session: The session to disconnect.
        """

    @abstractmethod
    def on_speech_start(self, callback: SpeechStartCallback) -> None:
        """Register a callback fired when VAD detects the start of speech.

        The callback receives the VoiceSession in which speech was detected.

        Args:
            callback: Invoked when speech starts.
        """

    @abstractmethod
    def on_speech_end(self, callback: SpeechEndCallback) -> None:
        """Register a callback fired when VAD detects the end of speech.

        The callback receives the VoiceSession together with the audio
        bytes captured over the speech segment.

        Args:
            callback: Invoked with the session and the captured audio data.
        """

    @abstractmethod
    async def send_audio(
        self,
        session: VoiceSession,
        audio: bytes | AsyncIterator[AudioChunk],
    ) -> None:
        """Deliver audio to a voice session.

        Args:
            session: The target session.
            audio: Raw audio bytes, or an async iterator of AudioChunks
                for streaming delivery.
        """

    def get_session(self, session_id: str) -> VoiceSession | None:
        """Look up a session by its ID.

        Backends that track sessions internally should override this;
        the default implementation knows about no sessions.

        Args:
            session_id: ID of the session to find.

        Returns:
            The matching VoiceSession, or None when unknown.
        """
        return None

    def list_sessions(self, room_id: str) -> list[VoiceSession]:
        """Enumerate the active sessions in a room.

        Backends that track sessions internally should override this;
        the default implementation reports no sessions.

        Args:
            room_id: Room whose sessions are requested.

        Returns:
            Active VoiceSessions in the room (empty by default).
        """
        return []

    async def close(self) -> None:
        """Release any resources held by the backend.

        The default does nothing; override in subclasses needing cleanup.
        """

    # -------------------------------------------------------------------------
    # Enhanced voice capabilities (RFC §19)
    # -------------------------------------------------------------------------

    @property
    def capabilities(self) -> VoiceCapability:
        """Capability flags advertised by this backend.

        Override to enable features such as interruption or partial STT;
        by default no optional capability is advertised.

        Returns:
            Flags indicating supported capabilities.
        """
        return VoiceCapability.NONE

    def on_partial_transcription(self, callback: PartialTranscriptionCallback) -> None:
        """Register a callback for partial (interim) transcription results.

        Only invoked if capabilities includes PARTIAL_STT. Backends with
        streaming STT should call the callback as interim results arrive.

        Args:
            callback: Invoked with (session, text, confidence, is_stable).
        """
        # Default no-op; override in backends that support partial STT.

    def on_vad_silence(self, callback: VADSilenceCallback) -> None:
        """Register a callback for post-speech silence detection.

        Only invoked if capabilities includes VAD_SILENCE. Backends should
        fire this when silence follows speech, potentially ahead of the
        full speech_end event.

        Args:
            callback: Invoked with (session, silence_duration_ms).
        """
        # Default no-op; override in backends that detect silence.

    def on_vad_audio_level(self, callback: VADAudioLevelCallback) -> None:
        """Register a callback for periodic audio level updates.

        Only invoked if capabilities includes VAD_AUDIO_LEVEL. Backends
        should fire this periodically (e.g., 10Hz) with the current audio
        level for UI feedback.

        Args:
            callback: Invoked with (session, level_db, is_speech).
        """
        # Default no-op; override in backends that report audio levels.

    def on_barge_in(self, callback: BargeInCallback) -> None:
        """Register a callback for barge-in detection.

        Only invoked if capabilities includes BARGE_IN. Backends should
        fire this when the user starts speaking while audio is being
        played (TTS interruption).

        Args:
            callback: Invoked with (session).
        """
        # Default no-op; override in backends that detect barge-in.

    async def cancel_audio(self, session: VoiceSession) -> bool:
        """Stop any in-flight audio playback for a session.

        Only effective if capabilities includes INTERRUPTION; used by
        barge-in handling to cut TTS playback short.

        Args:
            session: Session whose playback should be cancelled.

        Returns:
            True if audio was cancelled, False if nothing was playing.
        """
        return False  # No playback tracking by default.

    def is_playing(self, session: VoiceSession) -> bool:
        """Report whether audio is currently being sent to the session.

        Used by barge-in detection to decide whether interruption is
        possible.

        Args:
            session: Session to inspect.

        Returns:
            True while audio is playing, False otherwise.
        """
        return False  # Assume idle unless a subclass tracks playback.

    async def send_transcription(
        self, session: VoiceSession, text: str, role: str = "user"
    ) -> None:
        """Push transcription text to the client for UI display.

        Optional hook for backends able to send text updates. VoiceChannel
        calls it after STT transcription to echo what the user said, and
        after the AI response to show what the assistant said.

        Args:
            session: The voice session to send to.
            text: The transcribed or response text.
            role: Either "user" (transcription) or "assistant" (AI response).
        """
        # Default no-op; override in backends that support text updates.