roomkit 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,302 @@
+ """Mock voice backend for testing."""
+
+ from __future__ import annotations
+
+ from collections.abc import AsyncIterator
+ from dataclasses import dataclass, field
+ from typing import Any
+ from uuid import uuid4
+
+ from roomkit.voice.backends.base import VoiceBackend
+ from roomkit.voice.base import (
+     AudioChunk,
+     BargeInCallback,
+     PartialTranscriptionCallback,
+     SpeechEndCallback,
+     SpeechStartCallback,
+     VADAudioLevelCallback,
+     VADSilenceCallback,
+     VoiceCapability,
+     VoiceSession,
+     VoiceSessionState,
+ )
+
+
+ @dataclass
+ class MockVoiceCall:
+     """Record of a call made to MockVoiceBackend."""
+
+     method: str
+     args: dict[str, Any] = field(default_factory=dict)
+
+
+ class MockVoiceBackend(VoiceBackend):
+     """Mock voice backend for testing.
+
+     Tracks all method calls and provides helpers to simulate VAD events.
+
+     Example:
+         backend = MockVoiceBackend()
+
+         # Track calls
+         session = await backend.connect("room-1", "user-1", "voice-1")
+         assert backend.calls[-1].method == "connect"
+
+         # Simulate speech events
+         await backend.simulate_speech_start(session)
+         await backend.simulate_speech_end(session, b"audio-data")
+
+         # Simulate enhanced events (RFC §19)
+         await backend.simulate_partial_transcription(session, "Hello", 0.8, False)
+         await backend.simulate_vad_silence(session, 500)
+         await backend.simulate_barge_in(session)
+     """
+
+     def __init__(
+         self,
+         *,
+         capabilities: VoiceCapability = VoiceCapability.NONE,
+     ) -> None:
+         """Initialize MockVoiceBackend.
+
+         Args:
+             capabilities: Optional capabilities to enable for testing.
+                 Defaults to NONE. Set to test capability-dependent behavior.
+         """
+         self._sessions: dict[str, VoiceSession] = {}
+         self._speech_start_callbacks: list[SpeechStartCallback] = []
+         self._speech_end_callbacks: list[SpeechEndCallback] = []
+         # Enhanced callbacks (RFC §19)
+         self._partial_transcription_callbacks: list[PartialTranscriptionCallback] = []
+         self._vad_silence_callbacks: list[VADSilenceCallback] = []
+         self._vad_audio_level_callbacks: list[VADAudioLevelCallback] = []
+         self._barge_in_callbacks: list[BargeInCallback] = []
+         # Tracking
+         self.calls: list[MockVoiceCall] = []
+         self.sent_audio: list[tuple[str, bytes]] = []  # (session_id, audio)
+         self.sent_transcriptions: list[tuple[str, str, str]] = []  # (session_id, text, role)
+         self._playing_sessions: set[str] = set()  # Sessions currently receiving audio
+         self._capabilities = capabilities
+
+     @property
+     def name(self) -> str:
+         return "MockVoiceBackend"
+
+     @property
+     def capabilities(self) -> VoiceCapability:
+         return self._capabilities
+
+     async def connect(
+         self,
+         room_id: str,
+         participant_id: str,
+         channel_id: str,
+         *,
+         metadata: dict[str, Any] | None = None,
+     ) -> VoiceSession:
+         session = VoiceSession(
+             id=uuid4().hex,
+             room_id=room_id,
+             participant_id=participant_id,
+             channel_id=channel_id,
+             state=VoiceSessionState.ACTIVE,
+             metadata=metadata or {},
+         )
+         self._sessions[session.id] = session
+         self.calls.append(
+             MockVoiceCall(
+                 method="connect",
+                 args={
+                     "room_id": room_id,
+                     "participant_id": participant_id,
+                     "channel_id": channel_id,
+                     "metadata": metadata,
+                 },
+             )
+         )
+         return session
+
+     async def disconnect(self, session: VoiceSession) -> None:
+         if session.id in self._sessions:
+             self._sessions[session.id] = VoiceSession(
+                 id=session.id,
+                 room_id=session.room_id,
+                 participant_id=session.participant_id,
+                 channel_id=session.channel_id,
+                 state=VoiceSessionState.ENDED,
+                 created_at=session.created_at,
+                 metadata=session.metadata,
+             )
+         self.calls.append(
+             MockVoiceCall(method="disconnect", args={"session_id": session.id})
+         )
+
+     def on_speech_start(self, callback: SpeechStartCallback) -> None:
+         self._speech_start_callbacks.append(callback)
+         self.calls.append(MockVoiceCall(method="on_speech_start"))
+
+     def on_speech_end(self, callback: SpeechEndCallback) -> None:
+         self._speech_end_callbacks.append(callback)
+         self.calls.append(MockVoiceCall(method="on_speech_end"))
+
+     async def send_audio(
+         self,
+         session: VoiceSession,
+         audio: bytes | AsyncIterator[AudioChunk],
+     ) -> None:
+         if isinstance(audio, bytes):
+             self.sent_audio.append((session.id, audio))
+         else:
+             # Collect chunks from the async iterator
+             chunks: list[bytes] = []
+             async for chunk in audio:
+                 chunks.append(chunk.data)
+             combined = b"".join(chunks)
+             self.sent_audio.append((session.id, combined))
+
+         self.calls.append(
+             MockVoiceCall(method="send_audio", args={"session_id": session.id})
+         )
+
+     def get_session(self, session_id: str) -> VoiceSession | None:
+         return self._sessions.get(session_id)
+
+     def list_sessions(self, room_id: str) -> list[VoiceSession]:
+         return [s for s in self._sessions.values() if s.room_id == room_id]
+
+     async def close(self) -> None:
+         self._sessions.clear()
+         self._playing_sessions.clear()
+         self.calls.append(MockVoiceCall(method="close"))
+
+     async def send_transcription(
+         self, session: VoiceSession, text: str, role: str = "user"
+     ) -> None:
+         self.sent_transcriptions.append((session.id, text, role))
+         self.calls.append(
+             MockVoiceCall(
+                 method="send_transcription",
+                 args={"session_id": session.id, "text": text, "role": role},
+             )
+         )
+
+     # -------------------------------------------------------------------------
+     # Enhanced voice capabilities (RFC §19)
+     # -------------------------------------------------------------------------
+
+     def on_partial_transcription(
+         self, callback: PartialTranscriptionCallback
+     ) -> None:
+         self._partial_transcription_callbacks.append(callback)
+         self.calls.append(MockVoiceCall(method="on_partial_transcription"))
+
+     def on_vad_silence(self, callback: VADSilenceCallback) -> None:
+         self._vad_silence_callbacks.append(callback)
+         self.calls.append(MockVoiceCall(method="on_vad_silence"))
+
+     def on_vad_audio_level(self, callback: VADAudioLevelCallback) -> None:
+         self._vad_audio_level_callbacks.append(callback)
+         self.calls.append(MockVoiceCall(method="on_vad_audio_level"))
+
+     def on_barge_in(self, callback: BargeInCallback) -> None:
+         self._barge_in_callbacks.append(callback)
+         self.calls.append(MockVoiceCall(method="on_barge_in"))
+
+     async def cancel_audio(self, session: VoiceSession) -> bool:
+         was_playing = session.id in self._playing_sessions
+         self._playing_sessions.discard(session.id)
+         self.calls.append(
+             MockVoiceCall(
+                 method="cancel_audio",
+                 args={"session_id": session.id, "was_playing": was_playing},
+             )
+         )
+         return was_playing
+
+     def is_playing(self, session: VoiceSession) -> bool:
+         return session.id in self._playing_sessions
+
+     # -------------------------------------------------------------------------
+     # Test helpers
+     # -------------------------------------------------------------------------
+
+     async def simulate_speech_start(self, session: VoiceSession) -> None:
+         """Simulate VAD detecting speech start.
+
+         Fires all registered on_speech_start callbacks.
+         """
+         for callback in self._speech_start_callbacks:
+             result = callback(session)
+             if hasattr(result, "__await__"):
+                 await result
+
+     async def simulate_speech_end(self, session: VoiceSession, audio: bytes) -> None:
+         """Simulate VAD detecting speech end.
+
+         Fires all registered on_speech_end callbacks with the audio data.
+         """
+         for callback in self._speech_end_callbacks:
+             result = callback(session, audio)
+             if hasattr(result, "__await__"):
+                 await result
+
+     async def simulate_partial_transcription(
+         self,
+         session: VoiceSession,
+         text: str,
+         confidence: float = 0.8,
+         is_stable: bool = False,
+     ) -> None:
+         """Simulate streaming STT partial result.
+
+         Fires all registered on_partial_transcription callbacks.
+         """
+         for callback in self._partial_transcription_callbacks:
+             result = callback(session, text, confidence, is_stable)
+             if hasattr(result, "__await__"):
+                 await result
+
+     async def simulate_vad_silence(
+         self, session: VoiceSession, silence_duration_ms: int
+     ) -> None:
+         """Simulate VAD detecting silence.
+
+         Fires all registered on_vad_silence callbacks.
+         """
+         for callback in self._vad_silence_callbacks:
+             result = callback(session, silence_duration_ms)
+             if hasattr(result, "__await__"):
+                 await result
+
+     async def simulate_vad_audio_level(
+         self,
+         session: VoiceSession,
+         level_db: float,
+         is_speech: bool = True,
+     ) -> None:
+         """Simulate audio level update.
+
+         Fires all registered on_vad_audio_level callbacks.
+         """
+         for callback in self._vad_audio_level_callbacks:
+             result = callback(session, level_db, is_speech)
+             if hasattr(result, "__await__"):
+                 await result
+
+     async def simulate_barge_in(self, session: VoiceSession) -> None:
+         """Simulate user speaking while TTS is playing (barge-in).
+
+         Fires all registered on_barge_in callbacks.
+         """
+         for callback in self._barge_in_callbacks:
+             result = callback(session)
+             if hasattr(result, "__await__"):
+                 await result
+
+     def start_playing(self, session: VoiceSession) -> None:
+         """Mark a session as playing audio (for barge-in testing)."""
+         self._playing_sessions.add(session.id)
+
+     def stop_playing(self, session: VoiceSession) -> None:
+         """Mark a session as no longer playing audio."""
+         self._playing_sessions.discard(session.id)
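A minimal pytest-style sketch of how this mock is typically driven. The import path roomkit.voice.backends.mock is an assumption inferred from the module docstring (the file path is not shown in this diff), and the test requires pytest-asyncio:

    import pytest

    from roomkit.voice.backends.mock import MockVoiceBackend  # assumed path

    @pytest.mark.asyncio
    async def test_barge_in_cancels_playback() -> None:
        backend = MockVoiceBackend()
        interrupted: list[str] = []

        async def on_barge_in(session) -> None:
            # cancel_audio returns True only if audio was actually playing.
            if await backend.cancel_audio(session):
                interrupted.append(session.id)

        backend.on_barge_in(on_barge_in)
        session = await backend.connect("room-1", "user-1", "voice-1")

        backend.start_playing(session)
        await backend.simulate_barge_in(session)

        assert interrupted == [session.id]
        assert not backend.is_playing(session)

The call log records on_barge_in, connect, and cancel_audio in that order, so a test can also assert against backend.calls.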
roomkit/voice/base.py ADDED
@@ -0,0 +1,115 @@
+ """Base models for voice support."""
+
+ from __future__ import annotations
+
+ from collections.abc import Callable
+ from dataclasses import dataclass, field
+ from datetime import UTC, datetime
+ from enum import Flag, StrEnum, auto, unique
+ from typing import Any
+
+
+ @unique
+ class VoiceSessionState(StrEnum):
+     """State of a voice session."""
+
+     CONNECTING = "connecting"
+     ACTIVE = "active"
+     PAUSED = "paused"
+     ENDED = "ended"
+
+
+ class VoiceCapability(Flag):
+     """Capabilities a VoiceBackend can support (RFC §19).
+
+     Backends declare their capabilities via the `capabilities` property.
+     This allows RoomKit to know which features are available and
+     enables integrators to choose backends based on their needs.
+
+     Example:
+         class MyBackend(VoiceBackend):
+             @property
+             def capabilities(self) -> VoiceCapability:
+                 return (
+                     VoiceCapability.INTERRUPTION
+                     | VoiceCapability.PARTIAL_STT
+                     | VoiceCapability.BARGE_IN
+                 )
+     """
+
+     NONE = 0
+     """No optional capabilities (default)."""
+
+     INTERRUPTION = auto()
+     """Backend can cancel ongoing audio playback (cancel_audio)."""
+
+     PARTIAL_STT = auto()
+     """Backend provides partial/streaming transcription results."""
+
+     VAD_SILENCE = auto()
+     """Backend emits silence detection events."""
+
+     VAD_AUDIO_LEVEL = auto()
+     """Backend emits periodic audio level events."""
+
+     BARGE_IN = auto()
+     """Backend detects and handles barge-in (user interrupts TTS)."""
+
+
+ @dataclass
+ class AudioChunk:
+     """A chunk of audio data for streaming."""
+
+     data: bytes
+     sample_rate: int = 16000
+     channels: int = 1
+     format: str = "pcm_s16le"
+     timestamp_ms: int | None = None
+     is_final: bool = False
+
+
+ def _utcnow() -> datetime:
+     """Get current UTC time (timezone-aware)."""
+     return datetime.now(UTC)
+
+
+ @dataclass
+ class VoiceSession:
+     """Active voice connection for a participant."""
+
+     id: str
+     room_id: str
+     participant_id: str
+     channel_id: str
+     state: VoiceSessionState = VoiceSessionState.CONNECTING
+     created_at: datetime = field(default_factory=_utcnow)
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class TranscriptionResult:
+     """Result from speech-to-text transcription."""
+
+     text: str
+     is_final: bool = True
+     confidence: float | None = None
+     language: str | None = None
+     words: list[dict[str, Any]] = field(default_factory=list)
+
+
+ # Type aliases for voice callbacks
+ SpeechStartCallback = Callable[[VoiceSession], Any]
+ SpeechEndCallback = Callable[[VoiceSession, bytes], Any]
+
+ # Enhanced voice callbacks (RFC §19)
+ PartialTranscriptionCallback = Callable[[VoiceSession, str, float, bool], Any]
+ """Callback for partial transcription: (session, text, confidence, is_stable)."""
+
+ VADSilenceCallback = Callable[[VoiceSession, int], Any]
+ """Callback for silence detection: (session, silence_duration_ms)."""
+
+ VADAudioLevelCallback = Callable[[VoiceSession, float, bool], Any]
+ """Callback for audio level: (session, level_db, is_speech)."""
+
+ BargeInCallback = Callable[[VoiceSession], Any]
+ """Callback for barge-in detection: (session)."""
@@ -0,0 +1,140 @@
+ """Voice event types for enhanced voice support (RFC §19)."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import UTC, datetime
+ from typing import TYPE_CHECKING, Literal
+
+ if TYPE_CHECKING:
+     from roomkit.voice.base import VoiceSession
+
+
+ def _utcnow() -> datetime:
+     """Get current UTC time (timezone-aware)."""
+     return datetime.now(UTC)
+
+
+ @dataclass(frozen=True)
+ class BargeInEvent:
+     """User started speaking while TTS was playing.
+
+     This event is fired when the VAD detects speech starting while
+     audio is being sent to the user. This allows the system to:
+     - Cancel the current TTS playback
+     - Adjust response strategy (e.g., acknowledge interruption)
+     - Track conversation dynamics
+     """
+
+     session: VoiceSession
+     """The voice session where barge-in occurred."""
+
+     interrupted_text: str
+     """The text that was being spoken when interrupted."""
+
+     audio_position_ms: int
+     """How far into the TTS audio playback (in milliseconds)."""
+
+     timestamp: datetime = field(default_factory=_utcnow)
+     """When the barge-in was detected."""
+
+
+ @dataclass(frozen=True)
+ class TTSCancelledEvent:
+     """TTS playback was cancelled.
+
+     This event is fired when TTS synthesis or playback is stopped
+     before completion. Reasons include:
+     - barge_in: User started speaking
+     - explicit: Application called interrupt()
+     - disconnect: Session ended
+     - error: TTS or playback error
+     """
+
+     session: VoiceSession
+     """The voice session where TTS was cancelled."""
+
+     reason: Literal["barge_in", "explicit", "disconnect", "error"]
+     """Why the TTS was cancelled."""
+
+     text: str
+     """The text that was being synthesized."""
+
+     audio_position_ms: int
+     """How far into playback (0 if not started)."""
+
+     timestamp: datetime = field(default_factory=_utcnow)
+     """When the cancellation occurred."""
+
+
+ @dataclass(frozen=True)
+ class PartialTranscriptionEvent:
+     """Interim transcription result during speech.
+
+     This event is fired by backends that support streaming STT,
+     providing real-time transcription updates before the final
+     result. Use cases include:
+     - Live captions/subtitles
+     - Early intent detection
+     - Visual feedback during speech
+     """
+
+     session: VoiceSession
+     """The voice session being transcribed."""
+
+     text: str
+     """The current transcription (may change in subsequent events)."""
+
+     confidence: float
+     """Confidence score (0.0 to 1.0)."""
+
+     is_stable: bool
+     """True if this portion is unlikely to change significantly."""
+
+     timestamp: datetime = field(default_factory=_utcnow)
+     """When this transcription was received."""
+
+
+ @dataclass(frozen=True)
+ class VADSilenceEvent:
+     """Silence detected after speech.
+
+     This event is fired when the VAD detects a period of silence
+     following speech. It can be used for:
+     - Early end-of-utterance detection (before full speech_end)
+     - Adaptive silence thresholds
+     - Turn-taking management
+     """
+
+     session: VoiceSession
+     """The voice session where silence was detected."""
+
+     silence_duration_ms: int
+     """Duration of silence in milliseconds."""
+
+     timestamp: datetime = field(default_factory=_utcnow)
+     """When the silence was detected."""
+
+
+ @dataclass(frozen=True)
+ class VADAudioLevelEvent:
+     """Periodic audio level update for UI feedback.
+
+     This event is fired periodically (typically 10 Hz) to provide
+     audio level information for UI visualization. Use cases include:
+     - Audio level meters
+     - Speaking indicators
+     - Noise detection
+     """
+
+     session: VoiceSession
+     """The voice session."""
+
+     level_db: float
+     """Audio level in dB (typically -60 to 0, where 0 is max)."""
+
+     is_speech: bool
+     """VAD's determination if this audio contains speech."""
+
+     timestamp: datetime = field(default_factory=_utcnow)
+     """When this measurement was taken."""
@@ -0,0 +1 @@
+ """Speech-to-text providers."""
@@ -0,0 +1,58 @@
+ """Speech-to-text provider ABC."""
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from collections.abc import AsyncIterator
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from roomkit.models.event import AudioContent
+     from roomkit.voice.base import AudioChunk, TranscriptionResult
+
+
+ class STTProvider(ABC):
+     """Speech-to-text provider."""
+
+     @property
+     def name(self) -> str:
+         """Provider name (e.g. 'whisper', 'deepgram')."""
+         return self.__class__.__name__
+
+     @abstractmethod
+     async def transcribe(self, audio: AudioContent | AudioChunk) -> str:
+         """Transcribe complete audio to text.
+
+         Args:
+             audio: Audio content (URL) or raw audio chunk.
+
+         Returns:
+             Transcribed text.
+         """
+         ...
+
+     async def transcribe_stream(
+         self, audio_stream: AsyncIterator[AudioChunk]
+     ) -> AsyncIterator[TranscriptionResult]:
+         """Stream transcription with partial results.
+
+         Override in providers that support streaming.
+         Default: buffers all audio and yields a single final result.
+         """
+         chunks: list[AudioChunk] = []
+         async for chunk in audio_stream:
+             chunks.append(chunk)
+
+         # Import at runtime: the module-level imports are TYPE_CHECKING-only.
+         from roomkit.voice.base import AudioChunk, TranscriptionResult
+
+         combined = AudioChunk(  # combine buffered chunks and transcribe once
+             data=b"".join(c.data for c in chunks),
+             sample_rate=chunks[0].sample_rate if chunks else 16000,
+         )
+         text = await self.transcribe(combined)
+
+         yield TranscriptionResult(text=text, is_final=True)
+
+     async def close(self) -> None:  # noqa: B027
+         """Release resources. Override in subclasses if needed."""