roomkit 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,728 @@
1
+ """Voice channel for real-time audio communication."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from dataclasses import dataclass, field
8
+ from datetime import UTC, datetime
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ from roomkit.channels.base import Channel
12
+ from roomkit.models.channel import ChannelCapabilities
13
+ from roomkit.models.enums import (
14
+ ChannelCategory,
15
+ ChannelDirection,
16
+ ChannelMediaType,
17
+ ChannelType,
18
+ HookTrigger,
19
+ )
20
+ from roomkit.voice.base import VoiceCapability
21
+
22
+ if TYPE_CHECKING:
23
+ from roomkit.core.framework import RoomKit
24
+ from roomkit.models.channel import ChannelBinding, ChannelOutput
25
+ from roomkit.models.context import RoomContext
26
+ from roomkit.models.delivery import InboundMessage
27
+ from roomkit.models.event import RoomEvent
28
+ from roomkit.voice.backends.base import VoiceBackend
29
+ from roomkit.voice.base import AudioChunk, VoiceSession
30
+ from roomkit.voice.stt.base import STTProvider
31
+ from roomkit.voice.tts.base import TTSProvider
32
+
33
+ logger = logging.getLogger("roomkit.voice")
34
+
35
+
36
+ def _utcnow() -> datetime:
37
+ """Get current UTC time (timezone-aware)."""
38
+ return datetime.now(UTC)
39
+
40
+
41
@dataclass
class TTSPlaybackState:
    """Bookkeeping for one in-flight TTS utterance, used for barge-in detection.

    The playback position is not reported by any audio device; it is
    estimated purely from the wall-clock time elapsed since ``started_at``.
    """

    # Identifier of the voice session the utterance is being sent to.
    session_id: str
    # Text that is being spoken.
    text: str
    # Timezone-aware UTC start time; defaults to the moment of construction.
    started_at: datetime = field(default_factory=_utcnow)
    # Presumably the full length of the audio in milliseconds when known;
    # nothing in this class reads it (TODO confirm intended use).
    total_duration_ms: int | None = None

    @property
    def position_ms(self) -> int:
        """Milliseconds elapsed since ``started_at``, truncated to an int."""
        elapsed_seconds = (_utcnow() - self.started_at).total_seconds()
        return int(elapsed_seconds * 1000)
55
+
56
+
57
class VoiceChannel(Channel):
    """Real-time voice communication channel.

    Supports two modes:
    - **Streaming mode** (default): Audio is streamed directly via VoiceBackend.
      When a backend is configured, deliver() streams TTS audio to the session.
    - **Store-and-forward mode**: Audio is synthesized and stored for later retrieval.
      Requires a MediaStore for URL generation (not yet implemented).

    When a VoiceBackend is configured, the channel automatically:
    - Registers for VAD (Voice Activity Detection) callbacks
    - Transcribes speech using the STT provider
    - Routes transcriptions through the standard inbound pipeline
    - Synthesizes AI responses using TTS and streams to the client

    Backend callbacks are synchronous. Each one schedules its real work as an
    asyncio task on the running loop; the channel keeps a strong reference to
    every such task until it completes.
    """

    channel_type = ChannelType.VOICE
    category = ChannelCategory.TRANSPORT
    direction = ChannelDirection.BIDIRECTIONAL

    def __init__(
        self,
        channel_id: str,
        *,
        stt: STTProvider | None = None,
        tts: TTSProvider | None = None,
        backend: VoiceBackend | None = None,
        streaming: bool = True,
        enable_barge_in: bool = True,
        barge_in_threshold_ms: int = 200,
    ) -> None:
        """Initialize voice channel.

        Args:
            channel_id: Unique channel identifier.
            stt: Speech-to-text provider for transcription.
            tts: Text-to-speech provider for synthesis.
            backend: Voice transport backend for real-time audio.
            streaming: If True (default), deliver() streams audio via backend.
                If False, deliver() requires MediaStore support (not implemented).
            enable_barge_in: If True (default), detect when user speaks during TTS
                and fire ON_BARGE_IN hook. Requires backend with INTERRUPTION capability.
            barge_in_threshold_ms: Minimum TTS playback time before barge-in is
                detected. Helps avoid false triggers from very short interruptions.
        """
        super().__init__(channel_id)
        self._stt = stt
        self._tts = tts
        self._backend = backend
        self._streaming = streaming
        self._enable_barge_in = enable_barge_in
        self._barge_in_threshold_ms = barge_in_threshold_ms
        self._framework: RoomKit | None = None
        # Map session_id -> (room_id, binding) for routing
        self._session_bindings: dict[str, tuple[str, ChannelBinding]] = {}
        # Track TTS playback for barge-in detection
        self._playing_sessions: dict[str, TTSPlaybackState] = {}
        # Strong references to fire-and-forget tasks. The event loop only
        # holds weak references, so an otherwise unreferenced task could be
        # garbage-collected before it finishes.
        self._background_tasks: set[asyncio.Task[Any]] = set()

        # Register VAD callbacks if backend is provided
        if backend:
            backend.on_speech_start(self._on_speech_start)
            backend.on_speech_end(self._on_speech_end)

            # Register for enhanced callbacks based on capabilities
            capabilities = backend.capabilities
            if VoiceCapability.PARTIAL_STT in capabilities:
                backend.on_partial_transcription(self._on_partial_transcription)
            if VoiceCapability.VAD_SILENCE in capabilities:
                backend.on_vad_silence(self._on_vad_silence)
            if VoiceCapability.VAD_AUDIO_LEVEL in capabilities:
                backend.on_vad_audio_level(self._on_vad_audio_level)
            if VoiceCapability.BARGE_IN in capabilities:
                backend.on_barge_in(self._on_backend_barge_in)

    @staticmethod
    def _get_loop() -> asyncio.AbstractEventLoop | None:
        """Return the running event loop, or None when called outside one."""
        try:
            return asyncio.get_running_loop()
        except RuntimeError:
            return None

    def _track_task(self, task: asyncio.Task[Any]) -> None:
        """Keep a strong reference to *task* until it finishes.

        The reference is dropped automatically by a done-callback, so the
        set only ever contains tasks that are still running.
        """
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

    def set_framework(self, framework: RoomKit) -> None:
        """Set the framework reference for inbound routing.

        Called automatically when the channel is registered with RoomKit.
        """
        self._framework = framework

    def bind_session(
        self, session: VoiceSession, room_id: str, binding: ChannelBinding
    ) -> None:
        """Bind a voice session to a room for message routing.

        Args:
            session: The voice session to bind.
            room_id: The room to route messages to.
            binding: The channel binding for delivery.
        """
        self._session_bindings[session.id] = (room_id, binding)

    def unbind_session(self, session: VoiceSession) -> None:
        """Remove session binding (no-op if the session was never bound)."""
        self._session_bindings.pop(session.id, None)

    @property
    def backend(self) -> VoiceBackend | None:
        """The voice backend (if configured)."""
        return self._backend

    @property
    def info(self) -> dict[str, Any]:
        """Names of the configured STT/TTS/backend providers and the streaming flag."""
        return {
            "stt": self._stt.name if self._stt else None,
            "tts": self._tts.name if self._tts else None,
            "backend": self._backend.name if self._backend else None,
            "streaming": self._streaming,
        }

    def capabilities(self) -> ChannelCapabilities:
        """Describe the media this channel handles (audio and text)."""
        return ChannelCapabilities(
            media_types=[ChannelMediaType.AUDIO, ChannelMediaType.TEXT],
            supports_audio=True,
            supported_audio_formats=["wav", "mp3", "ogg", "webm"],
            max_audio_duration_seconds=3600,
        )

    def _on_speech_start(self, session: VoiceSession) -> None:
        """Handle VAD speech start event.

        Fires ON_SPEECH_START hooks for the bound room.
        Also checks for barge-in if TTS is playing.
        """
        binding_info = self._session_bindings.get(session.id)
        if not binding_info or not self._framework:
            return

        room_id, _ = binding_info

        # Fire hooks asynchronously (don't block VAD processing). Without a
        # running loop there is nowhere to schedule them, so do nothing.
        loop = self._get_loop()
        if loop is None:
            return

        # Check for barge-in (user speaking while TTS is playing)
        playback = self._playing_sessions.get(session.id)
        if (
            self._enable_barge_in
            and playback
            and playback.position_ms >= self._barge_in_threshold_ms
        ):
            self._track_task(
                loop.create_task(
                    self._handle_barge_in(session, playback, room_id),
                    name=f"barge_in:{session.id}",
                )
            )

        self._track_task(
            loop.create_task(
                self._fire_speech_start_hooks(session, room_id),
                name=f"speech_start:{session.id}",
            )
        )

    async def _fire_speech_start_hooks(
        self, session: VoiceSession, room_id: str
    ) -> None:
        """Fire ON_SPEECH_START hooks; errors are logged, never raised."""
        if not self._framework:
            return

        try:
            context = await self._framework._build_context(room_id)
            # Skip event filtering for voice hooks - they receive VoiceSession
            # instead of RoomEvent
            await self._framework.hook_engine.run_async_hooks(
                room_id,
                HookTrigger.ON_SPEECH_START,
                session,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )
        except Exception:
            logger.exception("Error firing ON_SPEECH_START hooks")

    async def _handle_barge_in(
        self, session: VoiceSession, playback: TTSPlaybackState, room_id: str
    ) -> None:
        """Handle barge-in: fire the ON_BARGE_IN hook, then interrupt TTS.

        Errors are logged, never raised.
        """
        if not self._framework:
            return

        try:
            from roomkit.voice.events import BargeInEvent

            context = await self._framework._build_context(room_id)

            # Fire ON_BARGE_IN hook
            event = BargeInEvent(
                session=session,
                interrupted_text=playback.text,
                audio_position_ms=playback.position_ms,
            )
            await self._framework.hook_engine.run_async_hooks(
                room_id,
                HookTrigger.ON_BARGE_IN,
                event,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )

            # Interrupt TTS playback
            await self.interrupt(session, reason="barge_in")

        except Exception:
            logger.exception("Error handling barge-in for session %s", session.id)

    async def interrupt(
        self, session: VoiceSession, *, reason: str = "explicit"
    ) -> bool:
        """Interrupt ongoing TTS playback for a session.

        Args:
            session: The voice session to interrupt.
            reason: Why the TTS was cancelled ('barge_in', 'explicit', etc.)

        Returns:
            True if TTS was cancelled, False if nothing was playing.
        """
        playback = self._playing_sessions.pop(session.id, None)
        if playback is None:
            return False

        # position_ms is time-based and keeps growing; capture it once so the
        # hook event and the log line report the same interruption point.
        position_ms = playback.position_ms

        # Cancel audio in backend if supported
        if self._backend and VoiceCapability.INTERRUPTION in self._backend.capabilities:
            await self._backend.cancel_audio(session)

        # Fire ON_TTS_CANCELLED hook
        if self._framework:
            binding_info = self._session_bindings.get(session.id)
            if binding_info:
                room_id, _ = binding_info
                try:
                    from roomkit.voice.events import TTSCancelledEvent

                    context = await self._framework._build_context(room_id)
                    event = TTSCancelledEvent(
                        session=session,
                        reason=reason,  # type: ignore[arg-type]
                        text=playback.text,
                        audio_position_ms=position_ms,
                    )
                    await self._framework.hook_engine.run_async_hooks(
                        room_id,
                        HookTrigger.ON_TTS_CANCELLED,
                        event,  # type: ignore[arg-type]
                        context,
                        skip_event_filter=True,
                    )
                except Exception:
                    logger.exception("Error firing ON_TTS_CANCELLED hook")

        logger.info(
            "TTS interrupted for session %s: reason=%s, position=%dms",
            session.id,
            reason,
            position_ms,
        )
        return True

    def _on_partial_transcription(
        self, session: VoiceSession, text: str, confidence: float, is_stable: bool
    ) -> None:
        """Handle partial transcription from streaming STT."""
        binding_info = self._session_bindings.get(session.id)
        if not binding_info or not self._framework:
            return

        room_id, _ = binding_info

        loop = self._get_loop()
        if loop is None:
            return

        self._track_task(
            loop.create_task(
                self._fire_partial_transcription_hook(
                    session, text, confidence, is_stable, room_id
                ),
                name=f"partial_transcription:{session.id}",
            )
        )

    async def _fire_partial_transcription_hook(
        self,
        session: VoiceSession,
        text: str,
        confidence: float,
        is_stable: bool,
        room_id: str,
    ) -> None:
        """Fire ON_PARTIAL_TRANSCRIPTION hook; errors are logged, never raised."""
        if not self._framework:
            return

        try:
            from roomkit.voice.events import PartialTranscriptionEvent

            context = await self._framework._build_context(room_id)
            event = PartialTranscriptionEvent(
                session=session,
                text=text,
                confidence=confidence,
                is_stable=is_stable,
            )
            await self._framework.hook_engine.run_async_hooks(
                room_id,
                HookTrigger.ON_PARTIAL_TRANSCRIPTION,
                event,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )
        except Exception:
            logger.exception("Error firing ON_PARTIAL_TRANSCRIPTION hook")

    def _on_vad_silence(self, session: VoiceSession, silence_duration_ms: int) -> None:
        """Handle VAD silence detection."""
        binding_info = self._session_bindings.get(session.id)
        if not binding_info or not self._framework:
            return

        room_id, _ = binding_info

        loop = self._get_loop()
        if loop is None:
            return

        self._track_task(
            loop.create_task(
                self._fire_vad_silence_hook(session, silence_duration_ms, room_id),
                name=f"vad_silence:{session.id}",
            )
        )

    async def _fire_vad_silence_hook(
        self, session: VoiceSession, silence_duration_ms: int, room_id: str
    ) -> None:
        """Fire ON_VAD_SILENCE hook; errors are logged, never raised."""
        if not self._framework:
            return

        try:
            from roomkit.voice.events import VADSilenceEvent

            context = await self._framework._build_context(room_id)
            event = VADSilenceEvent(
                session=session,
                silence_duration_ms=silence_duration_ms,
            )
            await self._framework.hook_engine.run_async_hooks(
                room_id,
                HookTrigger.ON_VAD_SILENCE,
                event,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )
        except Exception:
            logger.exception("Error firing ON_VAD_SILENCE hook")

    def _on_vad_audio_level(
        self, session: VoiceSession, level_db: float, is_speech: bool
    ) -> None:
        """Handle VAD audio level update."""
        binding_info = self._session_bindings.get(session.id)
        if not binding_info or not self._framework:
            return

        room_id, _ = binding_info

        loop = self._get_loop()
        if loop is None:
            return

        self._track_task(
            loop.create_task(
                self._fire_vad_audio_level_hook(session, level_db, is_speech, room_id),
                name=f"vad_audio_level:{session.id}",
            )
        )

    async def _fire_vad_audio_level_hook(
        self, session: VoiceSession, level_db: float, is_speech: bool, room_id: str
    ) -> None:
        """Fire ON_VAD_AUDIO_LEVEL hook; errors are logged, never raised."""
        if not self._framework:
            return

        try:
            from roomkit.voice.events import VADAudioLevelEvent

            context = await self._framework._build_context(room_id)
            event = VADAudioLevelEvent(
                session=session,
                level_db=level_db,
                is_speech=is_speech,
            )
            await self._framework.hook_engine.run_async_hooks(
                room_id,
                HookTrigger.ON_VAD_AUDIO_LEVEL,
                event,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )
        except Exception:
            logger.exception("Error firing ON_VAD_AUDIO_LEVEL hook")

    def _on_backend_barge_in(self, session: VoiceSession) -> None:
        """Handle barge-in detected by backend.

        Does nothing unless the session is bound and TTS playback is
        currently tracked for it.
        """
        # NOTE(review): unlike _on_speech_start, this path consults neither
        # enable_barge_in nor barge_in_threshold_ms - confirm that is intended.
        binding_info = self._session_bindings.get(session.id)
        if not binding_info or not self._framework:
            return

        room_id, _ = binding_info
        playback = self._playing_sessions.get(session.id)

        if not playback:
            return

        loop = self._get_loop()
        if loop is None:
            return

        self._track_task(
            loop.create_task(
                self._handle_barge_in(session, playback, room_id),
                name=f"backend_barge_in:{session.id}",
            )
        )

    def _on_speech_end(self, session: VoiceSession, audio: bytes) -> None:
        """Handle VAD speech end event.

        Fires ON_SPEECH_END hooks, transcribes audio, and routes to framework.
        """
        binding_info = self._session_bindings.get(session.id)
        if not binding_info or not self._framework:
            return

        room_id, _ = binding_info

        # Process asynchronously
        loop = self._get_loop()
        if loop is None:
            return

        self._track_task(
            loop.create_task(
                self._process_speech_end(session, audio, room_id),
                name=f"speech_end:{session.id}",
            )
        )

    async def _process_speech_end(
        self, session: VoiceSession, audio: bytes, room_id: str
    ) -> None:
        """Process speech end: fire hooks, transcribe, route inbound.

        Stops early when no STT provider is configured, when the
        transcription is blank, or when an ON_TRANSCRIPTION hook blocks it.
        Errors are logged, never raised.
        """
        if not self._framework:
            return

        try:
            context = await self._framework._build_context(room_id)

            # Fire ON_SPEECH_END hooks
            await self._framework.hook_engine.run_async_hooks(
                room_id,
                HookTrigger.ON_SPEECH_END,
                session,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )

            # Transcribe if STT is configured
            if not self._stt:
                logger.warning("Speech ended but no STT provider configured")
                return

            from roomkit.voice.base import AudioChunk

            # Get audio parameters from session metadata (set by backend)
            sample_rate = session.metadata.get("input_sample_rate", 16000)
            audio_chunk = AudioChunk(
                data=audio,
                sample_rate=sample_rate,
                channels=1,
                format="pcm_s16le",
            )
            text = await self._stt.transcribe(audio_chunk)

            if not text.strip():
                logger.debug("Empty transcription, skipping")
                return

            logger.info("Transcription: %s", text)

            # Send transcription to client UI (if backend supports it)
            if self._backend:
                await self._backend.send_transcription(session, text, "user")

            # Fire ON_TRANSCRIPTION hooks (sync, can modify)
            transcription_result = await self._framework.hook_engine.run_sync_hooks(
                room_id,
                HookTrigger.ON_TRANSCRIPTION,
                text,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )

            if not transcription_result.allowed:
                logger.info(
                    "Transcription blocked by hook: %s", transcription_result.reason
                )
                return

            # Use potentially modified text
            final_text = (
                transcription_result.event
                if isinstance(transcription_result.event, str)
                else text
            )

            # Route through inbound pipeline
            from roomkit.models.delivery import InboundMessage
            from roomkit.models.event import TextContent

            inbound = InboundMessage(
                channel_id=self.channel_id,
                sender_id=session.participant_id,
                content=TextContent(body=final_text),
                metadata={"voice_session_id": session.id, "source": "voice"},
            )

            await self._framework.process_inbound(inbound)

        except Exception:
            logger.exception("Error processing speech end")

    async def handle_inbound(
        self, message: InboundMessage, context: RoomContext
    ) -> RoomEvent:
        """Wrap an inbound message in a RoomEvent for the context's room."""
        from roomkit.models.event import EventSource
        from roomkit.models.event import RoomEvent as RoomEventModel

        return RoomEventModel(
            room_id=context.room.id,
            source=EventSource(
                channel_id=self.channel_id,
                channel_type=self.channel_type,
                participant_id=message.sender_id,
                external_id=message.external_id,
            ),
            content=message.content,
            idempotency_key=message.idempotency_key,
            metadata=message.metadata,
        )

    async def deliver(
        self, event: RoomEvent, binding: ChannelBinding, context: RoomContext
    ) -> ChannelOutput:
        """Deliver an event to the voice channel.

        In streaming mode with a backend, text events are spoken via
        ``_deliver_voice``. Every non-raising path returns an empty output.

        Raises:
            NotImplementedError: In store-and-forward mode (``streaming=False``)
                for a text event when a TTS provider is configured.
        """
        from roomkit.models.channel import ChannelOutput as ChannelOutputModel
        from roomkit.models.event import TextContent

        # In streaming mode without backend, delivery is handled externally
        if self._streaming and not self._backend:
            return ChannelOutputModel.empty()

        # In streaming mode with backend, stream TTS audio to the session
        if self._streaming and self._backend and isinstance(event.content, TextContent):
            await self._deliver_voice(event, binding, context)
            return ChannelOutputModel.empty()

        # Store-and-forward mode: synthesize audio for later retrieval.
        if not self._streaming and isinstance(event.content, TextContent) and self._tts:
            raise NotImplementedError(
                "VoiceChannel store-and-forward mode requires MediaStore support. "
                "Use streaming=True (default) for real-time voice, or implement "
                "MediaStore for async audio delivery."
            )

        return ChannelOutputModel.empty()

    async def _deliver_voice(
        self, event: RoomEvent, binding: ChannelBinding, context: RoomContext
    ) -> None:
        """Deliver text event as voice audio via the backend.

        Runs BEFORE_TTS hooks (which may block or rewrite the text), streams
        the synthesized audio to every target session, then fires AFTER_TTS.
        Does nothing without a TTS provider, backend and framework. Errors
        are logged, never raised.
        """
        if not self._tts or not self._backend or not self._framework:
            return

        from roomkit.models.event import TextContent

        if not isinstance(event.content, TextContent):
            return

        text = event.content.body
        room_id = event.room_id

        try:
            # Fire BEFORE_TTS hooks (sync, can modify)
            before_result = await self._framework.hook_engine.run_sync_hooks(
                room_id,
                HookTrigger.BEFORE_TTS,
                text,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )

            if not before_result.allowed:
                logger.info("TTS blocked by hook: %s", before_result.reason)
                return

            # Use potentially modified text
            final_text = (
                before_result.event if isinstance(before_result.event, str) else text
            )

            logger.info("AI response: %s", final_text)

            # Find the session(s) to send audio to: sessions bound to this
            # room through the same channel binding.
            target_sessions: list[VoiceSession] = []
            for session_id, (bound_room_id, bound_binding) in (
                self._session_bindings.items()
            ):
                if (
                    bound_room_id == room_id
                    and bound_binding.channel_id == binding.channel_id
                ):
                    session = self._backend.get_session(session_id)
                    if session:
                        target_sessions.append(session)

            if not target_sessions:
                # Fallback: get all sessions in the room from backend
                target_sessions = self._backend.list_sessions(room_id)

            # Stream TTS audio to each session
            for session in target_sessions:
                # Send AI response text to client UI
                await self._backend.send_transcription(session, final_text, "assistant")

                # Track playback state for barge-in detection
                playback = TTSPlaybackState(session_id=session.id, text=final_text)
                self._playing_sessions[session.id] = playback

                try:
                    # Use streaming if available
                    audio_stream = self._tts.synthesize_stream(final_text)
                    await self._backend.send_audio(session, audio_stream)
                except NotImplementedError:
                    # Fallback to non-streaming. The synthesized result is not
                    # forwarded to the backend here: we'd need to fetch the
                    # audio from its URL. This is a limitation - mock backends
                    # handle it.
                    await self._tts.synthesize(final_text)
                    logger.warning(
                        "TTS provider %s doesn't support streaming", self._tts.name
                    )
                finally:
                    # Clear playback state (TTS finished or failed), but only
                    # if it is still ours: interrupt() may already have removed
                    # it, and an overlapping delivery may have replaced it.
                    if self._playing_sessions.get(session.id) is playback:
                        del self._playing_sessions[session.id]

            # Fire AFTER_TTS hooks (async)
            await self._framework.hook_engine.run_async_hooks(
                room_id,
                HookTrigger.AFTER_TTS,
                final_text,  # type: ignore[arg-type]
                context,
                skip_event_filter=True,
            )

        except Exception:
            logger.exception("Error delivering voice audio")

    async def close(self) -> None:
        """Close the STT, TTS and backend providers and drop session state.

        Every configured provider is closed even if an earlier one fails, and
        session state is always cleared. Failures are logged; the first one is
        re-raised once cleanup has finished.
        """
        first_error: Exception | None = None
        components = (
            ("STT provider", self._stt),
            ("TTS provider", self._tts),
            ("backend", self._backend),
        )
        for label, component in components:
            if not component:
                continue
            try:
                await component.close()
            except Exception as exc:
                logger.exception("Error closing voice %s", label)
                if first_error is None:
                    first_error = exc

        self._session_bindings.clear()
        self._playing_sessions.clear()

        if first_error is not None:
            raise first_error