roomkit 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
- roomkit/AGENTS.md +17 -0
- roomkit/__init__.py +45 -0
- roomkit/_version.py +1 -1
- roomkit/channels/voice.py +728 -0
- roomkit/core/_channel_ops.py +7 -0
- roomkit/core/_inbound.py +4 -0
- roomkit/core/framework.py +177 -1
- roomkit/core/hooks.py +32 -6
- roomkit/llms.txt +4 -2
- roomkit/models/enums.py +12 -0
- roomkit/sources/__init__.py +4 -4
- roomkit/sources/sse.py +226 -0
- roomkit/voice/__init__.py +99 -0
- roomkit/voice/backends/__init__.py +1 -0
- roomkit/voice/backends/base.py +264 -0
- roomkit/voice/backends/fastrtc.py +467 -0
- roomkit/voice/backends/mock.py +302 -0
- roomkit/voice/base.py +115 -0
- roomkit/voice/events.py +140 -0
- roomkit/voice/stt/__init__.py +1 -0
- roomkit/voice/stt/base.py +58 -0
- roomkit/voice/stt/deepgram.py +214 -0
- roomkit/voice/stt/mock.py +40 -0
- roomkit/voice/tts/__init__.py +1 -0
- roomkit/voice/tts/base.py +58 -0
- roomkit/voice/tts/elevenlabs.py +329 -0
- roomkit/voice/tts/mock.py +51 -0
- {roomkit-0.1.1.dist-info → roomkit-0.2.1.dist-info}/METADATA +26 -6
- {roomkit-0.1.1.dist-info → roomkit-0.2.1.dist-info}/RECORD +31 -14
- {roomkit-0.1.1.dist-info → roomkit-0.2.1.dist-info}/WHEEL +1 -1
- {roomkit-0.1.1.dist-info → roomkit-0.2.1.dist-info}/licenses/LICENSE +0 -0
roomkit/voice/backends/mock.py
ADDED

@@ -0,0 +1,302 @@
"""Mock voice backend for testing."""

from __future__ import annotations

from collections.abc import AsyncIterator
from dataclasses import dataclass, field
from typing import Any
from uuid import uuid4

from roomkit.voice.backends.base import VoiceBackend
from roomkit.voice.base import (
    AudioChunk,
    BargeInCallback,
    PartialTranscriptionCallback,
    SpeechEndCallback,
    SpeechStartCallback,
    VADAudioLevelCallback,
    VADSilenceCallback,
    VoiceCapability,
    VoiceSession,
    VoiceSessionState,
)


@dataclass
class MockVoiceCall:
    """Record of a call made to MockVoiceBackend."""

    method: str
    args: dict[str, Any] = field(default_factory=dict)


class MockVoiceBackend(VoiceBackend):
    """Mock voice backend for testing.

    Tracks all method calls and provides helpers to simulate VAD events.

    Example:
        backend = MockVoiceBackend()

        # Track calls
        session = await backend.connect("room-1", "user-1", "voice-1")
        assert backend.calls[-1].method == "connect"

        # Simulate speech events
        await backend.simulate_speech_start(session)
        await backend.simulate_speech_end(session, b"audio-data")

        # Simulate enhanced events (RFC §19)
        await backend.simulate_partial_transcription(session, "Hello", 0.8, False)
        await backend.simulate_vad_silence(session, 500)
        await backend.simulate_barge_in(session)
    """

    def __init__(
        self,
        *,
        capabilities: VoiceCapability = VoiceCapability.NONE,
    ) -> None:
        """Initialize MockVoiceBackend.

        Args:
            capabilities: Optional capabilities to enable for testing.
                Defaults to NONE. Set to test capability-dependent behavior.
        """
        self._sessions: dict[str, VoiceSession] = {}
        self._speech_start_callbacks: list[SpeechStartCallback] = []
        self._speech_end_callbacks: list[SpeechEndCallback] = []
        # Enhanced callbacks (RFC §19)
        self._partial_transcription_callbacks: list[PartialTranscriptionCallback] = []
        self._vad_silence_callbacks: list[VADSilenceCallback] = []
        self._vad_audio_level_callbacks: list[VADAudioLevelCallback] = []
        self._barge_in_callbacks: list[BargeInCallback] = []
        # Tracking
        self.calls: list[MockVoiceCall] = []
        self.sent_audio: list[tuple[str, bytes]] = []  # (session_id, audio)
        self.sent_transcriptions: list[tuple[str, str, str]] = []  # (session_id, text, role)
        self._playing_sessions: set[str] = set()  # Sessions currently receiving audio
        self._capabilities = capabilities

    @property
    def name(self) -> str:
        return "MockVoiceBackend"

    @property
    def capabilities(self) -> VoiceCapability:
        return self._capabilities

    async def connect(
        self,
        room_id: str,
        participant_id: str,
        channel_id: str,
        *,
        metadata: dict[str, Any] | None = None,
    ) -> VoiceSession:
        session = VoiceSession(
            id=uuid4().hex,
            room_id=room_id,
            participant_id=participant_id,
            channel_id=channel_id,
            state=VoiceSessionState.ACTIVE,
            metadata=metadata or {},
        )
        self._sessions[session.id] = session
        self.calls.append(
            MockVoiceCall(
                method="connect",
                args={
                    "room_id": room_id,
                    "participant_id": participant_id,
                    "channel_id": channel_id,
                    "metadata": metadata,
                },
            )
        )
        return session

    async def disconnect(self, session: VoiceSession) -> None:
        if session.id in self._sessions:
            self._sessions[session.id] = VoiceSession(
                id=session.id,
                room_id=session.room_id,
                participant_id=session.participant_id,
                channel_id=session.channel_id,
                state=VoiceSessionState.ENDED,
                created_at=session.created_at,
                metadata=session.metadata,
            )
        self.calls.append(
            MockVoiceCall(method="disconnect", args={"session_id": session.id})
        )

    def on_speech_start(self, callback: SpeechStartCallback) -> None:
        self._speech_start_callbacks.append(callback)
        self.calls.append(MockVoiceCall(method="on_speech_start"))

    def on_speech_end(self, callback: SpeechEndCallback) -> None:
        self._speech_end_callbacks.append(callback)
        self.calls.append(MockVoiceCall(method="on_speech_end"))

    async def send_audio(
        self,
        session: VoiceSession,
        audio: bytes | AsyncIterator[AudioChunk],
    ) -> None:
        if isinstance(audio, bytes):
            self.sent_audio.append((session.id, audio))
        else:
            # Collect chunks from async iterator
            chunks: list[bytes] = []
            async for chunk in audio:
                chunks.append(chunk.data)
            combined = b"".join(chunks)
            self.sent_audio.append((session.id, combined))

        self.calls.append(
            MockVoiceCall(method="send_audio", args={"session_id": session.id})
        )

    def get_session(self, session_id: str) -> VoiceSession | None:
        return self._sessions.get(session_id)

    def list_sessions(self, room_id: str) -> list[VoiceSession]:
        return [s for s in self._sessions.values() if s.room_id == room_id]

    async def close(self) -> None:
        self._sessions.clear()
        self._playing_sessions.clear()
        self.calls.append(MockVoiceCall(method="close"))

    async def send_transcription(
        self, session: VoiceSession, text: str, role: str = "user"
    ) -> None:
        self.sent_transcriptions.append((session.id, text, role))
        self.calls.append(
            MockVoiceCall(
                method="send_transcription",
                args={"session_id": session.id, "text": text, "role": role},
            )
        )

    # -------------------------------------------------------------------------
    # Enhanced voice capabilities (RFC §19)
    # -------------------------------------------------------------------------

    def on_partial_transcription(
        self, callback: PartialTranscriptionCallback
    ) -> None:
        self._partial_transcription_callbacks.append(callback)
        self.calls.append(MockVoiceCall(method="on_partial_transcription"))

    def on_vad_silence(self, callback: VADSilenceCallback) -> None:
        self._vad_silence_callbacks.append(callback)
        self.calls.append(MockVoiceCall(method="on_vad_silence"))

    def on_vad_audio_level(self, callback: VADAudioLevelCallback) -> None:
        self._vad_audio_level_callbacks.append(callback)
        self.calls.append(MockVoiceCall(method="on_vad_audio_level"))

    def on_barge_in(self, callback: BargeInCallback) -> None:
        self._barge_in_callbacks.append(callback)
        self.calls.append(MockVoiceCall(method="on_barge_in"))

    async def cancel_audio(self, session: VoiceSession) -> bool:
        was_playing = session.id in self._playing_sessions
        self._playing_sessions.discard(session.id)
        self.calls.append(
            MockVoiceCall(
                method="cancel_audio",
                args={"session_id": session.id, "was_playing": was_playing},
            )
        )
        return was_playing

    def is_playing(self, session: VoiceSession) -> bool:
        return session.id in self._playing_sessions

    # -------------------------------------------------------------------------
    # Test helpers
    # -------------------------------------------------------------------------

    async def simulate_speech_start(self, session: VoiceSession) -> None:
        """Simulate VAD detecting speech start.

        Fires all registered on_speech_start callbacks.
        """
        for callback in self._speech_start_callbacks:
            result = callback(session)
            if hasattr(result, "__await__"):
                await result

    async def simulate_speech_end(self, session: VoiceSession, audio: bytes) -> None:
        """Simulate VAD detecting speech end.

        Fires all registered on_speech_end callbacks with the audio data.
        """
        for callback in self._speech_end_callbacks:
            result = callback(session, audio)
            if hasattr(result, "__await__"):
                await result

    async def simulate_partial_transcription(
        self,
        session: VoiceSession,
        text: str,
        confidence: float = 0.8,
        is_stable: bool = False,
    ) -> None:
        """Simulate streaming STT partial result.

        Fires all registered on_partial_transcription callbacks.
        """
        for callback in self._partial_transcription_callbacks:
            result = callback(session, text, confidence, is_stable)
            if hasattr(result, "__await__"):
                await result

    async def simulate_vad_silence(
        self, session: VoiceSession, silence_duration_ms: int
    ) -> None:
        """Simulate VAD detecting silence.

        Fires all registered on_vad_silence callbacks.
        """
        for callback in self._vad_silence_callbacks:
            result = callback(session, silence_duration_ms)
            if hasattr(result, "__await__"):
                await result

    async def simulate_vad_audio_level(
        self,
        session: VoiceSession,
        level_db: float,
        is_speech: bool = True,
    ) -> None:
        """Simulate audio level update.

        Fires all registered on_vad_audio_level callbacks.
        """
        for callback in self._vad_audio_level_callbacks:
            result = callback(session, level_db, is_speech)
            if hasattr(result, "__await__"):
                await result

    async def simulate_barge_in(self, session: VoiceSession) -> None:
        """Simulate user speaking while TTS is playing (barge-in).

        Fires all registered on_barge_in callbacks.
        """
        for callback in self._barge_in_callbacks:
            result = callback(session)
            if hasattr(result, "__await__"):
                await result

    def start_playing(self, session: VoiceSession) -> None:
        """Mark a session as playing audio (for barge-in testing)."""
        self._playing_sessions.add(session.id)

    def stop_playing(self, session: VoiceSession) -> None:
        """Mark a session as no longer playing audio."""
        self._playing_sessions.discard(session.id)
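Together with start_playing/stop_playing, the simulate_* helpers cover the full barge-in flow in tests. A minimal sketch of exercising it end to end, using only the APIs shown above (the asyncio harness and variable names are illustrative):

import asyncio

from roomkit.voice.backends.mock import MockVoiceBackend
from roomkit.voice.base import VoiceCapability, VoiceSession


async def main() -> None:
    backend = MockVoiceBackend(capabilities=VoiceCapability.BARGE_IN)
    session = await backend.connect("room-1", "user-1", "voice-1")

    interrupted: list[VoiceSession] = []
    backend.on_barge_in(interrupted.append)  # sync callbacks are supported

    # Pretend TTS is playing, then the user talks over it.
    backend.start_playing(session)
    await backend.simulate_barge_in(session)
    assert interrupted == [session]

    # A handler would typically cancel playback in response.
    assert await backend.cancel_audio(session) is True
    assert not backend.is_playing(session)


asyncio.run(main())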
roomkit/voice/base.py
ADDED

@@ -0,0 +1,115 @@
"""Base models for voice support."""

from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import Flag, StrEnum, auto, unique
from typing import Any


@unique
class VoiceSessionState(StrEnum):
    """State of a voice session."""

    CONNECTING = "connecting"
    ACTIVE = "active"
    PAUSED = "paused"
    ENDED = "ended"


class VoiceCapability(Flag):
    """Capabilities a VoiceBackend can support (RFC §19).

    Backends declare their capabilities via the `capabilities` property.
    This allows RoomKit to know which features are available and
    enables integrators to choose backends based on their needs.

    Example:
        class MyBackend(VoiceBackend):
            @property
            def capabilities(self) -> VoiceCapability:
                return (
                    VoiceCapability.INTERRUPTION |
                    VoiceCapability.PARTIAL_STT |
                    VoiceCapability.BARGE_IN
                )
    """

    NONE = 0
    """No optional capabilities (default)."""

    INTERRUPTION = auto()
    """Backend can cancel ongoing audio playback (cancel_audio)."""

    PARTIAL_STT = auto()
    """Backend provides partial/streaming transcription results."""

    VAD_SILENCE = auto()
    """Backend emits silence detection events."""

    VAD_AUDIO_LEVEL = auto()
    """Backend emits periodic audio level events."""

    BARGE_IN = auto()
    """Backend detects and handles barge-in (user interrupts TTS)."""


@dataclass
class AudioChunk:
    """A chunk of audio data for streaming."""

    data: bytes
    sample_rate: int = 16000
    channels: int = 1
    format: str = "pcm_s16le"
    timestamp_ms: int | None = None
    is_final: bool = False


def _utcnow() -> datetime:
    """Get current UTC time (timezone-aware)."""
    return datetime.now(UTC)


@dataclass
class VoiceSession:
    """Active voice connection for a participant."""

    id: str
    room_id: str
    participant_id: str
    channel_id: str
    state: VoiceSessionState = VoiceSessionState.CONNECTING
    created_at: datetime = field(default_factory=_utcnow)
    metadata: dict[str, Any] = field(default_factory=dict)


@dataclass
class TranscriptionResult:
    """Result from speech-to-text transcription."""

    text: str
    is_final: bool = True
    confidence: float | None = None
    language: str | None = None
    words: list[dict[str, Any]] = field(default_factory=list)


# Type aliases for voice callbacks
SpeechStartCallback = Callable[[VoiceSession], Any]
SpeechEndCallback = Callable[[VoiceSession, bytes], Any]

# Enhanced voice callbacks (RFC §19)
PartialTranscriptionCallback = Callable[[VoiceSession, str, float, bool], Any]
"""Callback for partial transcription: (session, text, confidence, is_stable)."""

VADSilenceCallback = Callable[[VoiceSession, int], Any]
"""Callback for silence detection: (session, silence_duration_ms)."""

VADAudioLevelCallback = Callable[[VoiceSession, float, bool], Any]
"""Callback for audio level: (session, level_db, is_speech)."""

BargeInCallback = Callable[[VoiceSession], Any]
"""Callback for barge-in detection: (session)."""
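Since VoiceCapability is a Flag, declared capabilities compose and test with ordinary bitwise operators. A short sketch of gating a feature on a backend's declared capabilities; the helper function is hypothetical, not part of roomkit:

from roomkit.voice.base import VoiceCapability


def supports_live_captions(caps: VoiceCapability) -> bool:
    """Hypothetical check: live captions need streaming STT."""
    return VoiceCapability.PARTIAL_STT in caps


caps = VoiceCapability.PARTIAL_STT | VoiceCapability.BARGE_IN
assert supports_live_captions(caps)
assert VoiceCapability.BARGE_IN in caps
assert VoiceCapability.VAD_SILENCE not in caps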
roomkit/voice/events.py
ADDED

@@ -0,0 +1,140 @@
"""Voice event types for enhanced voice support (RFC §19)."""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import UTC, datetime
from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
    from roomkit.voice.base import VoiceSession


def _utcnow() -> datetime:
    """Get current UTC time (timezone-aware)."""
    return datetime.now(UTC)


@dataclass(frozen=True)
class BargeInEvent:
    """User started speaking while TTS was playing.

    This event is fired when the VAD detects speech starting while
    audio is being sent to the user. This allows the system to:
    - Cancel the current TTS playback
    - Adjust response strategy (e.g., acknowledge interruption)
    - Track conversation dynamics
    """

    session: VoiceSession
    """The voice session where barge-in occurred."""

    interrupted_text: str
    """The text that was being spoken when interrupted."""

    audio_position_ms: int
    """How far into the TTS audio playback (in milliseconds)."""

    timestamp: datetime = field(default_factory=_utcnow)
    """When the barge-in was detected."""


@dataclass(frozen=True)
class TTSCancelledEvent:
    """TTS playback was cancelled.

    This event is fired when TTS synthesis or playback is stopped
    before completion. Reasons include:
    - barge_in: User started speaking
    - explicit: Application called interrupt()
    - disconnect: Session ended
    - error: TTS or playback error
    """

    session: VoiceSession
    """The voice session where TTS was cancelled."""

    reason: Literal["barge_in", "explicit", "disconnect", "error"]
    """Why the TTS was cancelled."""

    text: str
    """The text that was being synthesized."""

    audio_position_ms: int
    """How far into playback (0 if not started)."""

    timestamp: datetime = field(default_factory=_utcnow)
    """When the cancellation occurred."""


@dataclass(frozen=True)
class PartialTranscriptionEvent:
    """Interim transcription result during speech.

    This event is fired by backends that support streaming STT,
    providing real-time transcription updates before the final
    result. Use cases include:
    - Live captions/subtitles
    - Early intent detection
    - Visual feedback during speech
    """

    session: VoiceSession
    """The voice session being transcribed."""

    text: str
    """The current transcription (may change in subsequent events)."""

    confidence: float
    """Confidence score (0.0 to 1.0)."""

    is_stable: bool
    """True if this portion is unlikely to change significantly."""

    timestamp: datetime = field(default_factory=_utcnow)
    """When this transcription was received."""


@dataclass(frozen=True)
class VADSilenceEvent:
    """Silence detected after speech.

    This event is fired when the VAD detects a period of silence
    following speech. It can be used for:
    - Early end-of-utterance detection (before full speech_end)
    - Adaptive silence thresholds
    - Turn-taking management
    """

    session: VoiceSession
    """The voice session where silence was detected."""

    silence_duration_ms: int
    """Duration of silence in milliseconds."""

    timestamp: datetime = field(default_factory=_utcnow)
    """When the silence was detected."""


@dataclass(frozen=True)
class VADAudioLevelEvent:
    """Periodic audio level update for UI feedback.

    This event is fired periodically (typically 10Hz) to provide
    audio level information for UI visualization. Use cases include:
    - Audio level meters
    - Speaking indicators
    - Noise detection
    """

    session: VoiceSession
    """The voice session."""

    level_db: float
    """Audio level in dB (typically -60 to 0, where 0 is max)."""

    is_speech: bool
    """VAD's determination if this audio contains speech."""

    timestamp: datetime = field(default_factory=_utcnow)
    """When this measurement was taken."""
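These event types are plain frozen dataclasses, so handlers can dispatch on their fields directly. A sketch of two illustrative handlers (the function names and the confidence threshold are assumptions, not roomkit APIs):

from roomkit.voice.events import PartialTranscriptionEvent, TTSCancelledEvent


def on_tts_cancelled(event: TTSCancelledEvent) -> None:
    # barge_in usually warrants re-planning the reply; errors warrant logging.
    if event.reason == "barge_in":
        print(f"user interrupted at {event.audio_position_ms}ms: {event.text!r}")
    elif event.reason == "error":
        print(f"TTS failed while speaking: {event.text!r}")


def on_partial(event: PartialTranscriptionEvent) -> None:
    # Stable partials are safe to render as live captions.
    if event.is_stable and event.confidence >= 0.5:  # 0.5 is an arbitrary cutoff
        print(f"[caption] {event.text}")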
roomkit/voice/stt/__init__.py
ADDED

@@ -0,0 +1 @@
"""Speech-to-text providers."""
roomkit/voice/stt/base.py
ADDED

@@ -0,0 +1,58 @@
"""Speech-to-text provider ABC."""

from __future__ import annotations

from abc import ABC, abstractmethod
from collections.abc import AsyncIterator
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from roomkit.models.event import AudioContent
    from roomkit.voice.base import AudioChunk, TranscriptionResult


class STTProvider(ABC):
    """Speech-to-text provider."""

    @property
    def name(self) -> str:
        """Provider name (e.g. 'whisper', 'deepgram')."""
        return self.__class__.__name__

    @abstractmethod
    async def transcribe(self, audio: AudioContent | AudioChunk) -> str:
        """Transcribe complete audio to text.

        Args:
            audio: Audio content (URL) or raw audio chunk.

        Returns:
            Transcribed text.
        """
        ...

    async def transcribe_stream(
        self, audio_stream: AsyncIterator[AudioChunk]
    ) -> AsyncIterator[TranscriptionResult]:
        """Stream transcription with partial results.

        Override for providers that support streaming.
        Default: buffers all audio and returns a single result.
        """
        chunks: list[AudioChunk] = []
        async for chunk in audio_stream:
            chunks.append(chunk)

        # Combine chunks and transcribe
        combined = AudioChunk(
            data=b"".join(c.data for c in chunks),
            sample_rate=chunks[0].sample_rate if chunks else 16000,
        )
        text = await self.transcribe(combined)

        from roomkit.voice.base import TranscriptionResult

        yield TranscriptionResult(text=text, is_final=True)

    async def close(self) -> None:  # noqa: B027
        """Release resources. Override in subclasses if needed."""