roomkit 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- roomkit/__init__.py +45 -0
- roomkit/_version.py +1 -1
- roomkit/channels/voice.py +728 -0
- roomkit/core/_channel_ops.py +7 -0
- roomkit/core/_inbound.py +4 -0
- roomkit/core/framework.py +177 -1
- roomkit/core/hooks.py +32 -6
- roomkit/models/enums.py +12 -0
- roomkit/sources/__init__.py +4 -4
- roomkit/sources/sse.py +226 -0
- roomkit/voice/__init__.py +99 -0
- roomkit/voice/backends/__init__.py +1 -0
- roomkit/voice/backends/base.py +264 -0
- roomkit/voice/backends/fastrtc.py +467 -0
- roomkit/voice/backends/mock.py +302 -0
- roomkit/voice/base.py +115 -0
- roomkit/voice/events.py +140 -0
- roomkit/voice/stt/__init__.py +1 -0
- roomkit/voice/stt/base.py +58 -0
- roomkit/voice/stt/deepgram.py +214 -0
- roomkit/voice/stt/mock.py +40 -0
- roomkit/voice/tts/__init__.py +1 -0
- roomkit/voice/tts/base.py +58 -0
- roomkit/voice/tts/elevenlabs.py +329 -0
- roomkit/voice/tts/mock.py +51 -0
- {roomkit-0.1.0.dist-info → roomkit-0.2.0.dist-info}/METADATA +11 -2
- {roomkit-0.1.0.dist-info → roomkit-0.2.0.dist-info}/RECORD +29 -12
- {roomkit-0.1.0.dist-info → roomkit-0.2.0.dist-info}/WHEEL +1 -1
- {roomkit-0.1.0.dist-info → roomkit-0.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,728 @@
|
|
|
1
|
+
"""Voice channel for real-time audio communication."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
10
|
+
|
|
11
|
+
from roomkit.channels.base import Channel
|
|
12
|
+
from roomkit.models.channel import ChannelCapabilities
|
|
13
|
+
from roomkit.models.enums import (
|
|
14
|
+
ChannelCategory,
|
|
15
|
+
ChannelDirection,
|
|
16
|
+
ChannelMediaType,
|
|
17
|
+
ChannelType,
|
|
18
|
+
HookTrigger,
|
|
19
|
+
)
|
|
20
|
+
from roomkit.voice.base import VoiceCapability
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from roomkit.core.framework import RoomKit
|
|
24
|
+
from roomkit.models.channel import ChannelBinding, ChannelOutput
|
|
25
|
+
from roomkit.models.context import RoomContext
|
|
26
|
+
from roomkit.models.delivery import InboundMessage
|
|
27
|
+
from roomkit.models.event import RoomEvent
|
|
28
|
+
from roomkit.voice.backends.base import VoiceBackend
|
|
29
|
+
from roomkit.voice.base import AudioChunk, VoiceSession
|
|
30
|
+
from roomkit.voice.stt.base import STTProvider
|
|
31
|
+
from roomkit.voice.tts.base import TTSProvider
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger("roomkit.voice")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _utcnow() -> datetime:
|
|
37
|
+
"""Get current UTC time (timezone-aware)."""
|
|
38
|
+
return datetime.now(UTC)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class TTSPlaybackState:
    """Bookkeeping for one in-flight TTS utterance, used for barge-in detection."""

    # Identifier of the voice session the utterance is associated with.
    session_id: str
    # The text being spoken.
    text: str
    # Timezone-aware UTC timestamp; defaults to the moment this object is created.
    started_at: datetime = field(default_factory=_utcnow)
    # Optional total length of the utterance in milliseconds (not read by position_ms).
    total_duration_ms: int | None = None

    @property
    def position_ms(self) -> int:
        """Return the time elapsed since ``started_at`` in whole milliseconds.

        This is a wall-clock estimate of the playback position, not a value
        reported by the audio backend.
        """
        seconds_elapsed = (datetime.now(UTC) - self.started_at).total_seconds()
        return int(seconds_elapsed * 1000)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class VoiceChannel(Channel):
    """Real-time voice communication channel.

    Supports two modes:

    - **Streaming mode** (default): when a VoiceBackend is configured,
      ``deliver()`` synthesizes text events with the TTS provider and streams
      the audio to the bound session(s).
    - **Store-and-forward mode**: audio would be synthesized and stored for
      later retrieval. This requires a MediaStore and is not implemented yet;
      ``deliver()`` raises ``NotImplementedError`` for text events when a TTS
      provider is set and ``streaming=False``.

    When a VoiceBackend is configured, the channel automatically:

    - Registers for VAD (Voice Activity Detection) callbacks
    - Transcribes speech using the STT provider
    - Routes transcriptions through the standard inbound pipeline
    - Synthesizes AI responses using TTS and streams them to the client

    The backend callbacks are synchronous. Each one schedules its real work
    as an asyncio task on the running loop; the channel keeps a strong
    reference to every such task until it finishes.
    """

    channel_type = ChannelType.VOICE
    category = ChannelCategory.TRANSPORT
    direction = ChannelDirection.BIDIRECTIONAL

    def __init__(
        self,
        channel_id: str,
        *,
        stt: STTProvider | None = None,
        tts: TTSProvider | None = None,
        backend: VoiceBackend | None = None,
        streaming: bool = True,
        enable_barge_in: bool = True,
        barge_in_threshold_ms: int = 200,
    ) -> None:
        """Initialize voice channel.

        Args:
            channel_id: Unique channel identifier.
            stt: Speech-to-text provider for transcription.
            tts: Text-to-speech provider for synthesis.
            backend: Voice transport backend for real-time audio.
            streaming: If True (default), deliver() streams audio via backend.
                If False, deliver() requires MediaStore support (not implemented).
            enable_barge_in: If True (default), detect when user speaks during TTS
                and fire ON_BARGE_IN hook. Requires backend with INTERRUPTION capability.
            barge_in_threshold_ms: Minimum TTS playback time before barge-in is
                detected. Helps avoid false triggers from very short interruptions.
        """
        super().__init__(channel_id)
        self._stt = stt
        self._tts = tts
        self._backend = backend
        self._streaming = streaming
        self._enable_barge_in = enable_barge_in
        self._barge_in_threshold_ms = barge_in_threshold_ms
        self._framework: RoomKit | None = None
        # Map session_id -> (room_id, binding) for routing
        self._session_bindings: dict[str, tuple[str, ChannelBinding]] = {}
        # Track TTS playback for barge-in detection
        self._playing_sessions: dict[str, TTSPlaybackState] = {}
        # Strong references to fire-and-forget tasks. The event loop only keeps
        # weak references, so an unreferenced task may be garbage-collected
        # before it completes.
        self._background_tasks: set[asyncio.Task[None]] = set()
        # Session ids whose barge-in is currently being handled (de-duplication).
        self._barge_in_active: set[str] = set()

        # Register VAD callbacks if backend is provided
        if backend:
            backend.on_speech_start(self._on_speech_start)
            backend.on_speech_end(self._on_speech_end)

            # Register for enhanced callbacks based on capabilities
            if VoiceCapability.PARTIAL_STT in backend.capabilities:
                backend.on_partial_transcription(self._on_partial_transcription)
            if VoiceCapability.VAD_SILENCE in backend.capabilities:
                backend.on_vad_silence(self._on_vad_silence)
            if VoiceCapability.VAD_AUDIO_LEVEL in backend.capabilities:
                backend.on_vad_audio_level(self._on_vad_audio_level)
            if VoiceCapability.BARGE_IN in backend.capabilities:
                backend.on_barge_in(self._on_backend_barge_in)

    def set_framework(self, framework: RoomKit) -> None:
        """Set the framework reference for inbound routing.

        Called automatically when the channel is registered with RoomKit.
        """
        self._framework = framework

    def bind_session(
        self, session: VoiceSession, room_id: str, binding: ChannelBinding
    ) -> None:
        """Bind a voice session to a room for message routing.

        Args:
            session: The voice session to bind.
            room_id: The room to route messages to.
            binding: The channel binding for delivery.
        """
        self._session_bindings[session.id] = (room_id, binding)

    def unbind_session(self, session: VoiceSession) -> None:
        """Remove session binding (no-op if the session is not bound)."""
        self._session_bindings.pop(session.id, None)

    @property
    def backend(self) -> VoiceBackend | None:
        """The voice backend (if configured)."""
        return self._backend

    @property
    def info(self) -> dict[str, Any]:
        """Names of the configured providers/backend plus the streaming flag."""
        return {
            "stt": self._stt.name if self._stt else None,
            "tts": self._tts.name if self._tts else None,
            "backend": self._backend.name if self._backend else None,
            "streaming": self._streaming,
        }

    def capabilities(self) -> ChannelCapabilities:
        """Describe the media this channel supports (audio and text)."""
        return ChannelCapabilities(
            media_types=[ChannelMediaType.AUDIO, ChannelMediaType.TEXT],
            supports_audio=True,
            supported_audio_formats=["wav", "mp3", "ogg", "webm"],
            max_audio_duration_seconds=3600,
        )

    # ------------------------------------------------------------------
    # Internal helpers shared by the backend callbacks
    # ------------------------------------------------------------------

    def _dispatch_target(
        self, session: VoiceSession
    ) -> tuple[str, asyncio.AbstractEventLoop] | None:
        """Return ``(room_id, running_loop)`` for a callback, or None to skip it.

        A callback is skipped when the session is not bound, no framework has
        been set, or the callback is invoked without a running event loop.
        """
        binding_info = self._session_bindings.get(session.id)
        if not binding_info or not self._framework:
            return None
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            return None
        room_id, _ = binding_info
        return room_id, loop

    def _track(self, task: asyncio.Task[None]) -> None:
        """Hold a strong reference to ``task`` until it is done."""
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

    async def _fire_async_hook(
        self, room_id: str, trigger: HookTrigger, payload: Any
    ) -> None:
        """Build the room context and run the async hooks for ``trigger``.

        Exceptions propagate to the caller, which decides how to log them.
        """
        framework = self._framework
        if framework is None:
            return
        context = await framework._build_context(room_id)
        # Skip event filtering for voice hooks - they receive a VoiceSession or
        # voice event object instead of a RoomEvent.
        await framework.hook_engine.run_async_hooks(
            room_id,
            trigger,
            payload,
            context,
            skip_event_filter=True,
        )

    # ------------------------------------------------------------------
    # Speech start / barge-in
    # ------------------------------------------------------------------

    def _on_speech_start(self, session: VoiceSession) -> None:
        """Handle VAD speech start event.

        Fires ON_SPEECH_START hooks for the bound room.
        Also checks for barge-in if TTS is playing.
        """
        target = self._dispatch_target(session)
        if target is None:
            return
        room_id, loop = target

        # Check for barge-in (user speaking while TTS is playing)
        playback = self._playing_sessions.get(session.id)
        if (
            self._enable_barge_in
            and playback
            and playback.position_ms >= self._barge_in_threshold_ms
        ):
            self._track(
                loop.create_task(
                    self._handle_barge_in(session, playback, room_id),
                    name=f"barge_in:{session.id}",
                )
            )

        # Fire hook asynchronously (don't block VAD processing)
        self._track(
            loop.create_task(
                self._fire_speech_start_hooks(session, room_id),
                name=f"speech_start:{session.id}",
            )
        )

    async def _fire_speech_start_hooks(
        self, session: VoiceSession, room_id: str
    ) -> None:
        """Fire ON_SPEECH_START hooks; errors are logged, not raised."""
        if not self._framework:
            return

        try:
            await self._fire_async_hook(room_id, HookTrigger.ON_SPEECH_START, session)
        except Exception:
            logger.exception("Error firing ON_SPEECH_START hooks")

    async def _handle_barge_in(
        self, session: VoiceSession, playback: TTSPlaybackState, room_id: str
    ) -> None:
        """Handle barge-in: fire the ON_BARGE_IN hook, then interrupt TTS.

        Errors are logged, not raised. If a barge-in for the same session is
        already being handled (e.g. both VAD and the backend reported it),
        this call returns without firing the hook a second time.
        """
        if not self._framework:
            return
        if session.id in self._barge_in_active:
            return
        self._barge_in_active.add(session.id)

        # Snapshot the position now; the awaits below would otherwise inflate it.
        position_ms = playback.position_ms

        try:
            from roomkit.voice.events import BargeInEvent

            # Fire ON_BARGE_IN hook
            event = BargeInEvent(
                session=session,
                interrupted_text=playback.text,
                audio_position_ms=position_ms,
            )
            await self._fire_async_hook(room_id, HookTrigger.ON_BARGE_IN, event)

            # Interrupt TTS playback
            await self.interrupt(session, reason="barge_in")

        except Exception:
            logger.exception("Error handling barge-in for session %s", session.id)
        finally:
            self._barge_in_active.discard(session.id)

    async def interrupt(
        self, session: VoiceSession, *, reason: str = "explicit"
    ) -> bool:
        """Interrupt ongoing TTS playback for a session.

        Args:
            session: The voice session to interrupt.
            reason: Why the TTS was cancelled ('barge_in', 'explicit', etc.)

        Returns:
            True if TTS was cancelled, False if nothing was playing.
        """
        playback = self._playing_sessions.pop(session.id, None)
        if not playback:
            return False

        # Snapshot once so the hook event and the log line report the same
        # value, measured at the time of interruption rather than after awaits.
        position_ms = playback.position_ms

        # Cancel audio in backend if supported
        if self._backend and VoiceCapability.INTERRUPTION in self._backend.capabilities:
            await self._backend.cancel_audio(session)

        # Fire ON_TTS_CANCELLED hook
        binding_info = self._session_bindings.get(session.id) if self._framework else None
        if binding_info:
            room_id, _ = binding_info
            try:
                from roomkit.voice.events import TTSCancelledEvent

                event = TTSCancelledEvent(
                    session=session,
                    reason=reason,  # type: ignore[arg-type]
                    text=playback.text,
                    audio_position_ms=position_ms,
                )
                await self._fire_async_hook(
                    room_id, HookTrigger.ON_TTS_CANCELLED, event
                )
            except Exception:
                logger.exception("Error firing ON_TTS_CANCELLED hook")

        logger.info(
            "TTS interrupted for session %s: reason=%s, position=%dms",
            session.id,
            reason,
            position_ms,
        )
        return True

    # ------------------------------------------------------------------
    # Partial transcription / VAD telemetry
    # ------------------------------------------------------------------

    def _on_partial_transcription(
        self, session: VoiceSession, text: str, confidence: float, is_stable: bool
    ) -> None:
        """Handle partial transcription from streaming STT."""
        target = self._dispatch_target(session)
        if target is None:
            return
        room_id, loop = target

        self._track(
            loop.create_task(
                self._fire_partial_transcription_hook(
                    session, text, confidence, is_stable, room_id
                ),
                name=f"partial_transcription:{session.id}",
            )
        )

    async def _fire_partial_transcription_hook(
        self,
        session: VoiceSession,
        text: str,
        confidence: float,
        is_stable: bool,
        room_id: str,
    ) -> None:
        """Fire ON_PARTIAL_TRANSCRIPTION hook; errors are logged, not raised."""
        if not self._framework:
            return

        try:
            from roomkit.voice.events import PartialTranscriptionEvent

            event = PartialTranscriptionEvent(
                session=session,
                text=text,
                confidence=confidence,
                is_stable=is_stable,
            )
            await self._fire_async_hook(
                room_id, HookTrigger.ON_PARTIAL_TRANSCRIPTION, event
            )
        except Exception:
            logger.exception("Error firing ON_PARTIAL_TRANSCRIPTION hook")

    def _on_vad_silence(self, session: VoiceSession, silence_duration_ms: int) -> None:
        """Handle VAD silence detection."""
        target = self._dispatch_target(session)
        if target is None:
            return
        room_id, loop = target

        self._track(
            loop.create_task(
                self._fire_vad_silence_hook(session, silence_duration_ms, room_id),
                name=f"vad_silence:{session.id}",
            )
        )

    async def _fire_vad_silence_hook(
        self, session: VoiceSession, silence_duration_ms: int, room_id: str
    ) -> None:
        """Fire ON_VAD_SILENCE hook; errors are logged, not raised."""
        if not self._framework:
            return

        try:
            from roomkit.voice.events import VADSilenceEvent

            event = VADSilenceEvent(
                session=session,
                silence_duration_ms=silence_duration_ms,
            )
            await self._fire_async_hook(room_id, HookTrigger.ON_VAD_SILENCE, event)
        except Exception:
            logger.exception("Error firing ON_VAD_SILENCE hook")

    def _on_vad_audio_level(
        self, session: VoiceSession, level_db: float, is_speech: bool
    ) -> None:
        """Handle VAD audio level update."""
        target = self._dispatch_target(session)
        if target is None:
            return
        room_id, loop = target

        self._track(
            loop.create_task(
                self._fire_vad_audio_level_hook(session, level_db, is_speech, room_id),
                name=f"vad_audio_level:{session.id}",
            )
        )

    async def _fire_vad_audio_level_hook(
        self, session: VoiceSession, level_db: float, is_speech: bool, room_id: str
    ) -> None:
        """Fire ON_VAD_AUDIO_LEVEL hook; errors are logged, not raised."""
        if not self._framework:
            return

        try:
            from roomkit.voice.events import VADAudioLevelEvent

            event = VADAudioLevelEvent(
                session=session,
                level_db=level_db,
                is_speech=is_speech,
            )
            await self._fire_async_hook(room_id, HookTrigger.ON_VAD_AUDIO_LEVEL, event)
        except Exception:
            logger.exception("Error firing ON_VAD_AUDIO_LEVEL hook")

    def _on_backend_barge_in(self, session: VoiceSession) -> None:
        """Handle barge-in detected by backend.

        Ignored when no TTS playback is tracked for the session.
        """
        playback = self._playing_sessions.get(session.id)
        if not playback:
            return

        target = self._dispatch_target(session)
        if target is None:
            return
        room_id, loop = target

        self._track(
            loop.create_task(
                self._handle_barge_in(session, playback, room_id),
                name=f"backend_barge_in:{session.id}",
            )
        )

    # ------------------------------------------------------------------
    # Speech end -> transcription -> inbound pipeline
    # ------------------------------------------------------------------

    def _on_speech_end(self, session: VoiceSession, audio: bytes) -> None:
        """Handle VAD speech end event.

        Fires ON_SPEECH_END hooks, transcribes audio, and routes to framework.
        """
        target = self._dispatch_target(session)
        if target is None:
            return
        room_id, loop = target

        # Process asynchronously
        self._track(
            loop.create_task(
                self._process_speech_end(session, audio, room_id),
                name=f"speech_end:{session.id}",
            )
        )

    async def _process_speech_end(
        self, session: VoiceSession, audio: bytes, room_id: str
    ) -> None:
        """Process speech end: fire hooks, transcribe, route inbound.

        Errors are logged, not raised.
        """
        if not self._framework:
            return

        try:
            context = await self._framework._build_context(room_id)

            # Fire ON_SPEECH_END hooks
            await self._framework.hook_engine.run_async_hooks(
                room_id,
                HookTrigger.ON_SPEECH_END,
                session,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )

            # Transcribe if STT is configured
            if not self._stt:
                logger.warning("Speech ended but no STT provider configured")
                return

            from roomkit.voice.base import AudioChunk

            # Get audio parameters from session metadata (set by backend)
            sample_rate = session.metadata.get("input_sample_rate", 16000)
            audio_chunk = AudioChunk(
                data=audio,
                sample_rate=sample_rate,
                channels=1,
                format="pcm_s16le",
            )
            text = await self._stt.transcribe(audio_chunk)

            if not text.strip():
                logger.debug("Empty transcription, skipping")
                return

            logger.info("Transcription: %s", text)

            # Send transcription to client UI (if backend supports it)
            if self._backend:
                await self._backend.send_transcription(session, text, "user")

            # Fire ON_TRANSCRIPTION hooks (sync, can modify)
            transcription_result = await self._framework.hook_engine.run_sync_hooks(
                room_id,
                HookTrigger.ON_TRANSCRIPTION,
                text,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )

            if not transcription_result.allowed:
                logger.info(
                    "Transcription blocked by hook: %s", transcription_result.reason
                )
                return

            # Use potentially modified text
            final_text = (
                transcription_result.event
                if isinstance(transcription_result.event, str)
                else text
            )

            # Route through inbound pipeline
            from roomkit.models.delivery import InboundMessage
            from roomkit.models.event import TextContent

            inbound = InboundMessage(
                channel_id=self.channel_id,
                sender_id=session.participant_id,
                content=TextContent(body=final_text),
                metadata={"voice_session_id": session.id, "source": "voice"},
            )

            await self._framework.process_inbound(inbound)

        except Exception:
            logger.exception("Error processing speech end")

    async def handle_inbound(
        self, message: InboundMessage, context: RoomContext
    ) -> RoomEvent:
        """Convert an inbound message into a RoomEvent attributed to this channel."""
        from roomkit.models.event import EventSource
        from roomkit.models.event import RoomEvent as RoomEventModel

        return RoomEventModel(
            room_id=context.room.id,
            source=EventSource(
                channel_id=self.channel_id,
                channel_type=self.channel_type,
                participant_id=message.sender_id,
                external_id=message.external_id,
            ),
            content=message.content,
            idempotency_key=message.idempotency_key,
            metadata=message.metadata,
        )

    # ------------------------------------------------------------------
    # Outbound delivery
    # ------------------------------------------------------------------

    async def deliver(
        self, event: RoomEvent, binding: ChannelBinding, context: RoomContext
    ) -> ChannelOutput:
        """Deliver a room event to this channel.

        Text events in streaming mode with a backend are spoken via TTS.
        Every non-raising path returns an empty ChannelOutput.

        Raises:
            NotImplementedError: in store-and-forward mode (``streaming=False``)
                for a text event when a TTS provider is configured.
        """
        from roomkit.models.channel import ChannelOutput as ChannelOutputModel
        from roomkit.models.event import TextContent

        # In streaming mode without backend, delivery is handled externally
        if self._streaming and not self._backend:
            return ChannelOutputModel.empty()

        # In streaming mode with backend, stream TTS audio to the session
        if self._streaming and self._backend and isinstance(event.content, TextContent):
            await self._deliver_voice(event, binding, context)
            return ChannelOutputModel.empty()

        # Store-and-forward mode: synthesize audio for later retrieval.
        if not self._streaming and isinstance(event.content, TextContent) and self._tts:
            raise NotImplementedError(
                "VoiceChannel store-and-forward mode requires MediaStore support. "
                "Use streaming=True (default) for real-time voice, or implement "
                "MediaStore for async audio delivery."
            )

        return ChannelOutputModel.empty()

    async def _deliver_voice(
        self, event: RoomEvent, binding: ChannelBinding, context: RoomContext
    ) -> None:
        """Deliver text event as voice audio via the backend.

        Runs BEFORE_TTS hooks (which may block or rewrite the text), streams
        the synthesized audio to each target session, then runs AFTER_TTS
        hooks. Does nothing unless TTS, backend and framework are all set.
        Errors are logged, not raised.
        """
        tts = self._tts
        backend = self._backend
        framework = self._framework
        if not tts or not backend or not framework:
            return

        from roomkit.models.event import TextContent

        if not isinstance(event.content, TextContent):
            return

        text = event.content.body
        room_id = event.room_id

        try:
            # Fire BEFORE_TTS hooks (sync, can modify)
            before_result = await framework.hook_engine.run_sync_hooks(
                room_id,
                HookTrigger.BEFORE_TTS,
                text,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )

            if not before_result.allowed:
                logger.info("TTS blocked by hook: %s", before_result.reason)
                return

            # Use potentially modified text
            final_text = (
                before_result.event
                if isinstance(before_result.event, str)
                else text
            )

            logger.info("AI response: %s", final_text)

            # Find the session(s) to send audio to: sessions bound to this
            # room through the same channel binding.
            target_sessions: list[VoiceSession] = []
            for session_id, (bound_room_id, bound_binding) in self._session_bindings.items():
                if bound_room_id != room_id:
                    continue
                if bound_binding.channel_id != binding.channel_id:
                    continue
                session = backend.get_session(session_id)
                if session:
                    target_sessions.append(session)

            if not target_sessions:
                # Fallback: get all sessions in the room from backend
                target_sessions = backend.list_sessions(room_id)

            # Stream TTS audio to each session
            for session in target_sessions:
                # Send AI response text to client UI
                await backend.send_transcription(session, final_text, "assistant")

                # Track playback state for barge-in detection
                playback = TTSPlaybackState(
                    session_id=session.id,
                    text=final_text,
                )
                self._playing_sessions[session.id] = playback

                try:
                    # Use streaming if available
                    audio_stream = tts.synthesize_stream(final_text)
                    await backend.send_audio(session, audio_stream)
                except NotImplementedError:
                    # Fallback to non-streaming. The synthesized result is not
                    # forwarded: doing so would require fetching the audio
                    # from a URL, which this channel does not do (known
                    # limitation). The call is kept for its side effects.
                    await tts.synthesize(final_text)
                    logger.warning(
                        "TTS provider %s doesn't support streaming", tts.name
                    )
                finally:
                    # Clear playback state (TTS finished or failed), but only
                    # our own entry: interrupt() may already have removed it,
                    # and an overlapping delivery may have registered a newer
                    # state for the same session.
                    if self._playing_sessions.get(session.id) is playback:
                        del self._playing_sessions[session.id]

            # Fire AFTER_TTS hooks (async)
            await framework.hook_engine.run_async_hooks(
                room_id,
                HookTrigger.AFTER_TTS,
                final_text,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )

        except Exception:
            logger.exception("Error delivering voice audio")

    async def close(self) -> None:
        """Release the STT provider, TTS provider and backend, and reset state.

        Outstanding background tasks are cancelled first. Every configured
        resource is closed even if an earlier one fails, and the session and
        playback maps are always cleared. If any close fails, the first error
        is re-raised at the end and later ones are logged.
        """
        # Stop in-flight callback work before its providers go away. The
        # current task is skipped in case close() is reached from inside one.
        current = asyncio.current_task()
        for task in tuple(self._background_tasks):
            if task is not current:
                task.cancel()

        first_error: Exception | None = None
        resources = (
            ("STT provider", self._stt),
            ("TTS provider", self._tts),
            ("voice backend", self._backend),
        )
        for label, resource in resources:
            if not resource:
                continue
            try:
                await resource.close()
            except Exception as exc:
                if first_error is None:
                    first_error = exc
                else:
                    logger.exception("Error closing %s", label)

        self._session_bindings.clear()
        self._playing_sessions.clear()

        if first_error is not None:
            raise first_error
|