dv-pipecat-ai 0.0.85.dev818__py3-none-any.whl → 0.0.85.dev858__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (32)
  1. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/METADATA +2 -1
  2. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/RECORD +32 -29
  3. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +5 -1
  4. pipecat/frames/frames.py +34 -0
  5. pipecat/metrics/connection_metrics.py +45 -0
  6. pipecat/processors/aggregators/llm_response.py +25 -4
  7. pipecat/processors/dtmf_aggregator.py +17 -21
  8. pipecat/processors/frame_processor.py +51 -8
  9. pipecat/processors/metrics/frame_processor_metrics.py +108 -0
  10. pipecat/processors/transcript_processor.py +22 -1
  11. pipecat/serializers/__init__.py +2 -0
  12. pipecat/serializers/asterisk.py +16 -2
  13. pipecat/serializers/convox.py +2 -2
  14. pipecat/serializers/custom.py +2 -2
  15. pipecat/serializers/vi.py +326 -0
  16. pipecat/services/cartesia/tts.py +75 -10
  17. pipecat/services/deepgram/stt.py +317 -17
  18. pipecat/services/elevenlabs/stt.py +487 -19
  19. pipecat/services/elevenlabs/tts.py +28 -4
  20. pipecat/services/google/llm.py +26 -11
  21. pipecat/services/openai/base_llm.py +79 -14
  22. pipecat/services/salesforce/llm.py +321 -86
  23. pipecat/services/sarvam/tts.py +0 -1
  24. pipecat/services/soniox/stt.py +45 -10
  25. pipecat/services/vistaar/llm.py +97 -6
  26. pipecat/transcriptions/language.py +50 -0
  27. pipecat/transports/base_input.py +15 -11
  28. pipecat/transports/base_output.py +29 -3
  29. pipecat/utils/redis.py +58 -0
  30. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/WHEEL +0 -0
  31. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/licenses/LICENSE +0 -0
  32. {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/top_level.txt +0 -0
@@ -11,19 +11,43 @@ using segmented audio processing. The service uploads audio files and receives
 transcription results directly.
 """

+import base64
 import io
+import json
+from enum import Enum
 from typing import AsyncGenerator, Optional

 import aiohttp
 from loguru import logger
 from pydantic import BaseModel

-from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
-from pipecat.services.stt_service import SegmentedSTTService
-from pipecat.transcriptions.language import Language
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import SegmentedSTTService, WebsocketSTTService
+from pipecat.transcriptions.language import Language, resolve_language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt

+try:
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use ElevenLabs Realtime STT, you need to `pip install pipecat-ai[elevenlabs]`."
+    )
+    raise Exception(f"Missing module: {e}")
+

 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     """Convert a Language enum to ElevenLabs language code.
@@ -37,7 +61,7 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     Returns:
         The corresponding ElevenLabs language code, or None if not supported.
     """
-    BASE_LANGUAGES = {
+    LANGUAGE_MAP = {
         Language.AF: "afr",  # Afrikaans
         Language.AM: "amh",  # Amharic
         Language.AR: "ara",  # Arabic
@@ -139,15 +163,7 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
         Language.ZU: "zul",  # Zulu
     }

-    result = BASE_LANGUAGES.get(language)
-
-    # If not found in base languages, try to find the base language from a variant
-    if not result:
-        lang_str = str(language.value)
-        base_code = lang_str.split("-")[0].lower()
-        result = base_code if base_code in BASE_LANGUAGES.values() else None
-
-    return result
+    return resolve_language(language, LANGUAGE_MAP, use_base_code=False)


class ElevenLabsSTTService(SegmentedSTTService):
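
The removed variant-fallback logic above now lives in a shared `resolve_language` helper in `pipecat/transcriptions/language.py` (also touched in this release, +50 lines). A minimal sketch of what such a helper plausibly does, reconstructed from the inline code it replaces — the actual implementation may differ:

# Hypothetical reconstruction of the shared helper; the real resolve_language
# in pipecat.transcriptions.language may differ in signature and behavior.
from typing import Dict, Optional


def resolve_language(
    language: "Language",
    language_map: Dict["Language", str],
    use_base_code: bool = True,
) -> Optional[str]:
    """Map a Language enum to a service code, optionally falling back to a base code."""
    result = language_map.get(language)
    if result is None and use_base_code:
        # Fall back from a regional variant (e.g. Language.EN_US) to its base code ("en").
        base_code = str(language.value).split("-")[0].lower()
        result = base_code if base_code in language_map.values() else None
    return result

Since the ElevenLabs call passes use_base_code=False, unmapped variants presumably now resolve to None rather than guessing a base code.
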
@@ -235,7 +251,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         Args:
            language: The language to use for speech-to-text transcription.
         """
-        self.logger.info(f"Switching STT language to: [{language}]")
+        logger.info(f"Switching STT language to: [{language}]")
         self._settings["language"] = self.language_to_service_language(language)

     async def set_model(self, model: str):
@@ -249,7 +265,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         This method is provided for interface compatibility.
         """
         await super().set_model(model)
-        self.logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
+        logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")

     async def _transcribe_audio(self, audio_data: bytes) -> dict:
         """Upload audio data to ElevenLabs and get transcription result.
@@ -283,7 +299,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         async with self._session.post(url, data=data, headers=headers) as response:
             if response.status != 200:
                 error_text = await response.text()
-                self.logger.error(f"ElevenLabs transcription error: {error_text}")
+                logger.error(f"ElevenLabs transcription error: {error_text}")
                 raise Exception(f"Transcription failed with status {response.status}: {error_text}")

             result = await response.json()
@@ -324,7 +340,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
             detected_language = result.get("language_code", "eng")

             await self._handle_transcription(text, True, detected_language)
-            self.logger.debug(f"Transcription: [{text}]")
+            logger.debug(f"Transcription: [{text}]")

             yield TranscriptionFrame(
                 text,
@@ -335,5 +351,457 @@ class ElevenLabsSTTService(SegmentedSTTService):
             )

         except Exception as e:
-            self.logger.error(f"ElevenLabs STT error: {e}")
-            yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
+            logger.error(f"{self} exception: {e}")
+            yield ErrorFrame(error=f"{self} error: {e}")
+
+
+def audio_format_from_sample_rate(sample_rate: int) -> str:
+    """Get the appropriate audio format string for a given sample rate.
+
+    Args:
+        sample_rate: The audio sample rate in Hz.
+
+    Returns:
+        The ElevenLabs audio format string.
+    """
+    match sample_rate:
+        case 8000:
+            return "pcm_8000"
+        case 16000:
+            return "pcm_16000"
+        case 22050:
+            return "pcm_22050"
+        case 24000:
+            return "pcm_24000"
+        case 44100:
+            return "pcm_44100"
+        case 48000:
+            return "pcm_48000"
+    logger.warning(
+        f"ElevenLabsRealtimeSTTService: No audio format available for {sample_rate} sample rate, using pcm_16000"
+    )
+    return "pcm_16000"
+
+
+class CommitStrategy(str, Enum):
+    """Commit strategies for transcript segmentation."""
+
+    MANUAL = "manual"
+    VAD = "vad"
+
+
+class ElevenLabsRealtimeSTTService(WebsocketSTTService):
+    """Speech-to-text service using ElevenLabs' Realtime WebSocket API.
+
+    This service uses ElevenLabs' Realtime Speech-to-Text API to perform transcription
+    with ultra-low latency. It supports both partial (interim) and committed (final)
+    transcripts, and can use either manual commit control or automatic Voice Activity
+    Detection (VAD) for segment boundaries.
+
+    By default, uses manual commit strategy where Pipecat's VAD controls when to
+    commit transcript segments, providing consistency with other STT services.
+    """
+
+    class InputParams(BaseModel):
+        """Configuration parameters for ElevenLabs Realtime STT API.
+
+        Parameters:
+            language_code: ISO-639-1 or ISO-639-3 language code. Leave None for auto-detection.
+            commit_strategy: How to segment speech - manual (Pipecat VAD) or vad (ElevenLabs VAD).
+            vad_silence_threshold_secs: Seconds of silence before VAD commits (0.3-3.0).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+        """
+
+        language_code: Optional[str] = None
+        commit_strategy: CommitStrategy = CommitStrategy.MANUAL
+        vad_silence_threshold_secs: Optional[float] = None
+        vad_threshold: Optional[float] = None
+        min_speech_duration_ms: Optional[int] = None
+        min_silence_duration_ms: Optional[int] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        base_url: str = "api.elevenlabs.io",
+        model: str = "scribe_v2_realtime",
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
+        **kwargs,
+    ):
+        """Initialize the ElevenLabs Realtime STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            base_url: Base URL for ElevenLabs WebSocket API.
+            model: Model ID for transcription. Defaults to "scribe_v2_realtime".
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the STT service.
+            **kwargs: Additional arguments passed to WebsocketSTTService.
+        """
+        super().__init__(
+            sample_rate=sample_rate,
+            **kwargs,
+        )
+
+        params = params or ElevenLabsRealtimeSTTService.InputParams()
+
+        self._api_key = api_key
+        self._base_url = base_url
+        self._model_id = model
+        self._params = params
+        self._audio_format = ""  # initialized in start()
+        self._receive_task = None
+
+    def can_generate_metrics(self) -> bool:
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs Realtime STT service supports metrics generation.
+        """
+        return True
+
+    async def set_language(self, language: Language):
+        """Set the transcription language.
+
+        Args:
+            language: The language to use for speech-to-text transcription.
+
+        Note:
+            Changing language requires reconnecting to the WebSocket.
+        """
+        logger.info(f"Switching STT language to: [{language}]")
+        self._params.language_code = language.value if isinstance(language, Language) else language
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+
+        Note:
+            Changing model requires reconnecting to the WebSocket.
+        """
+        await super().set_model(model)
+        logger.info(f"Switching STT model to: [{model}]")
+        self._model_id = model
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
+    async def start(self, frame: StartFrame):
+        """Start the STT service and establish WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should start.
+        """
+        await super().start(frame)
+        self._audio_format = audio_format_from_sample_rate(self.sample_rate)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should stop.
+        """
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should be cancelled.
+        """
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def start_metrics(self):
+        """Start performance metrics collection for transcription processing."""
+        await self.start_ttfb_metrics()
+        await self.start_processing_metrics()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and handle speech events.
+
+        Args:
+            frame: The frame to process.
+            direction: Direction of frame flow in the pipeline.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStartedSpeakingFrame):
+            # Start metrics when user starts speaking
+            await self.start_metrics()
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            # Send commit when user stops speaking (manual commit mode)
+            if self._params.commit_strategy == CommitStrategy.MANUAL:
+                if self._websocket and self._websocket.state is State.OPEN:
+                    try:
+                        commit_message = {
+                            "message_type": "input_audio_chunk",
+                            "audio_base_64": "",
+                            "commit": True,
+                            "sample_rate": self.sample_rate,
+                        }
+                        await self._websocket.send(json.dumps(commit_message))
+                        logger.trace("Sent manual commit to ElevenLabs")
+                    except Exception as e:
+                        logger.warning(f"Failed to send commit: {e}")
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Process audio data for speech-to-text transcription.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            None - transcription results are handled via WebSocket responses.
+        """
+        # Reconnect if connection is closed
+        if not self._websocket or self._websocket.state is State.CLOSED:
+            await self._connect()
+
+        if self._websocket and self._websocket.state is State.OPEN:
+            try:
+                # Encode audio as base64
+                audio_base64 = base64.b64encode(audio).decode("utf-8")
+
+                # Send audio chunk
+                message = {
+                    "message_type": "input_audio_chunk",
+                    "audio_base_64": audio_base64,
+                    "commit": False,
+                    "sample_rate": self.sample_rate,
+                }
+                await self._websocket.send(json.dumps(message))
+            except Exception as e:
+                logger.error(f"Error sending audio: {e}")
+                yield ErrorFrame(f"ElevenLabs Realtime STT error: {str(e)}")
+
+        yield None
+
+    async def _connect(self):
+        """Establish WebSocket connection to ElevenLabs Realtime STT."""
+        await self._connect_websocket()
+
+        if self._websocket and not self._receive_task:
+            self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
+
+    async def _disconnect(self):
+        """Close WebSocket connection and cleanup tasks."""
+        if self._receive_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+
+        await self._disconnect_websocket()
+
+    async def _connect_websocket(self):
+        """Connect to ElevenLabs Realtime STT WebSocket endpoint."""
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                return
+
+            logger.debug("Connecting to ElevenLabs Realtime STT")
+
+            # Build query parameters
+            params = [f"model_id={self._model_id}"]
+
+            if self._params.language_code:
+                params.append(f"language_code={self._params.language_code}")
+
+            params.append(f"encoding={self._audio_format}")
+            params.append(f"sample_rate={self.sample_rate}")
+            params.append(f"commit_strategy={self._params.commit_strategy.value}")
+
+            # Add VAD parameters if using VAD commit strategy and values are specified
+            if self._params.commit_strategy == CommitStrategy.VAD:
+                if self._params.vad_silence_threshold_secs is not None:
+                    params.append(
+                        f"vad_silence_threshold_secs={self._params.vad_silence_threshold_secs}"
+                    )
+                if self._params.vad_threshold is not None:
+                    params.append(f"vad_threshold={self._params.vad_threshold}")
+                if self._params.min_speech_duration_ms is not None:
+                    params.append(f"min_speech_duration_ms={self._params.min_speech_duration_ms}")
+                if self._params.min_silence_duration_ms is not None:
+                    params.append(f"min_silence_duration_ms={self._params.min_silence_duration_ms}")
+
+            ws_url = f"wss://{self._base_url}/v1/speech-to-text/realtime?{'&'.join(params)}"
+
+            headers = {"xi-api-key": self._api_key}
+
+            self._websocket = await websocket_connect(ws_url, additional_headers=headers)
+            await self._call_event_handler("on_connected")
+            logger.debug("Connected to ElevenLabs Realtime STT")
+        except Exception as e:
+            logger.error(f"{self}: unable to connect to ElevenLabs Realtime STT: {e}")
+            await self.push_error(ErrorFrame(f"Connection error: {str(e)}"))
+
+    async def _disconnect_websocket(self):
+        """Disconnect from ElevenLabs Realtime STT WebSocket."""
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                logger.debug("Disconnecting from ElevenLabs Realtime STT")
+                await self._websocket.close()
+        except Exception as e:
+            logger.error(f"{self} error closing websocket: {e}")
+        finally:
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
+
+    def _get_websocket(self):
+        """Get the current WebSocket connection.
+
+        Returns:
+            The WebSocket connection.
+
+        Raises:
+            Exception: If WebSocket is not connected.
+        """
+        if self._websocket:
+            return self._websocket
+        raise Exception("Websocket not connected")
+
+    async def _process_messages(self):
+        """Process incoming WebSocket messages."""
+        async for message in self._get_websocket():
+            try:
+                data = json.loads(message)
+                await self._process_response(data)
+            except json.JSONDecodeError:
+                logger.warning(f"Received non-JSON message: {message}")
+            except Exception as e:
+                logger.error(f"Error processing message: {e}")
+
+    async def _receive_messages(self):
+        """Continuously receive and process WebSocket messages."""
+        try:
+            await self._process_messages()
+        except Exception as e:
+            logger.warning(f"{self} WebSocket connection closed: {e}")
+            # Connection closed, will reconnect on next audio chunk
+
+    async def _process_response(self, data: dict):
+        """Process a response message from ElevenLabs.
+
+        Args:
+            data: Parsed JSON response data.
+        """
+        message_type = data.get("message_type")
+
+        if message_type == "session_started":
+            logger.debug(f"ElevenLabs session started: {data}")
+
+        elif message_type == "partial_transcript":
+            await self._on_partial_transcript(data)
+
+        elif message_type == "committed_transcript":
+            await self._on_committed_transcript(data)
+
+        elif message_type == "committed_transcript_with_timestamps":
+            await self._on_committed_transcript_with_timestamps(data)
+
+        elif message_type == "input_error":
+            error_msg = data.get("error", "Unknown input error")
+            logger.error(f"ElevenLabs input error: {error_msg}")
+            await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
+
+        elif message_type in ["auth_error", "quota_exceeded", "transcriber_error", "error"]:
+            error_msg = data.get("error", data.get("message", "Unknown error"))
+            logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
+            await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
+
+        else:
+            logger.debug(f"Unknown message type: {message_type}")
+
+    async def _on_partial_transcript(self, data: dict):
+        """Handle partial transcript (interim results).
+
+        Args:
+            data: Partial transcript data.
+        """
+        text = data.get("text", "").strip()
+        if not text:
+            return
+
+        await self.stop_ttfb_metrics()
+
+        # Get language if provided
+        language = data.get("language_code")
+
+        logger.trace(f"Partial transcript: [{text}]")
+
+        await self.push_frame(
+            InterimTranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[str] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
+
+    async def _on_committed_transcript(self, data: dict):
+        """Handle committed transcript (final results).
+
+        Args:
+            data: Committed transcript data.
+        """
+        text = data.get("text", "").strip()
+        if not text:
+            return
+
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()
+
+        # Get language if provided
+        language = data.get("language_code")
+
+        logger.debug(f"Committed transcript: [{text}]")
+
+        await self._handle_transcription(text, True, language)
+
+        await self.push_frame(
+            TranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )
+
+    async def _on_committed_transcript_with_timestamps(self, data: dict):
+        """Handle committed transcript with word-level timestamps.
+
+        Args:
+            data: Committed transcript data with timestamps.
+        """
+        text = data.get("text", "").strip()
+        if not text:
+            return
+
+        logger.debug(f"Committed transcript with timestamps: [{text}]")
+        logger.trace(f"Timestamps: {data.get('words', [])}")
+
+        # This is sent after the committed_transcript, so we don't need to
+        # push another TranscriptionFrame, but we could use the timestamps
+        # for additional processing if needed in the future
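
Taken as a whole, the new class drops into a pipeline like any other Pipecat STT service. A minimal, hypothetical usage sketch — only ElevenLabsRealtimeSTTService, its InputParams, and CommitStrategy come from this diff; the credential and pipeline wiring are placeholders:

import os

# Hypothetical construction; defaults shown match the __init__ signature above.
stt = ElevenLabsRealtimeSTTService(
    api_key=os.environ["ELEVENLABS_API_KEY"],  # placeholder credential
    model="scribe_v2_realtime",
    params=ElevenLabsRealtimeSTTService.InputParams(
        language_code="eng",
        # MANUAL: Pipecat's own VAD decides when a segment is committed,
        # matching the behavior of the framework's other STT services.
        commit_strategy=CommitStrategy.MANUAL,
    ),
)

# The service would then sit between transport input and the LLM stage, e.g.:
# pipeline = Pipeline([transport.input(), stt, context_aggregator.user(), llm, ...])
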
@@ -14,7 +14,17 @@ import asyncio
 import base64
 import json
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Tuple, Union
+from typing import (
+    Any,
+    AsyncGenerator,
+    Dict,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+)

 import aiohttp
 from loguru import logger
@@ -157,7 +167,13 @@ def build_elevenlabs_voice_settings(
     Returns:
         Dictionary of voice settings or None if no valid settings are provided.
     """
-    voice_setting_keys = ["stability", "similarity_boost", "style", "use_speaker_boost", "speed"]
+    voice_setting_keys = [
+        "stability",
+        "similarity_boost",
+        "style",
+        "use_speaker_boost",
+        "speed",
+    ]

     voice_settings = {}
     for key in voice_setting_keys:
@@ -503,6 +519,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             return

         self.logger.debug("Connecting to ElevenLabs")
+        await self.start_connection_metrics()

         voice_id = self._voice_id
         model = self.model_name
@@ -530,17 +547,24 @@ class ElevenLabsTTSService(AudioContextWordTTSService):

            # Set max websocket message size to 16MB for large audio responses
            self._websocket = await websocket_connect(
-                url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
+                url,
+                max_size=16 * 1024 * 1024,
+                additional_headers={"xi-api-key": self._api_key},
            )

+            await self.stop_connection_metrics(success=True, connection_type="websocket")
+            await self.stop_reconnection_metrics(success=True, reason="successful_reconnection")
            await self._call_event_handler("on_connected")
        except Exception as e:
            self.logger.error(f"{self} initialization error: {e}")
+            await self.stop_connection_metrics(success=False, error=str(e), connection_type="websocket")
+            await self.stop_reconnection_metrics(success=False, reason="connection_failed")
            self._websocket = None
            await self._call_event_handler("on_connection_error", f"{e}")

    async def _disconnect_websocket(self):
        try:
+            await self.start_reconnection_metrics()
            await self.stop_all_metrics()

            if self._websocket:
@@ -549,7 +573,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                if self._context_id:
                    await self._websocket.send(json.dumps({"close_socket": True}))
                await self._websocket.close()
-            logger.debug("Disconnected from ElevenLabs")
+            self.logger.debug("Disconnected from ElevenLabs")
        except Exception as e:
            self.logger.error(f"{self} error closing websocket: {e}")
        finally:
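
The remaining hunks here and in pipecat/services/google/llm.py apply one shared pattern from the new pipecat/metrics/connection_metrics.py module: bracket every connection attempt with start_connection_metrics() and a success/failure stop_connection_metrics(). A sketch of that pattern, assuming the helpers are start/stop timers that record success, error, and connection_type (their exact semantics are not shown in this diff):

from typing import Any, Awaitable, Callable


async def with_connection_metrics(
    service: Any,
    connect: Callable[[], Awaitable[Any]],
    connection_type: str = "websocket",
) -> Any:
    """Bracket a connect call with the start/stop connection-metrics helpers."""
    await service.start_connection_metrics()
    try:
        result = await connect()
        await service.stop_connection_metrics(success=True, connection_type=connection_type)
        return result
    except Exception as e:
        # Record the failure with its error message, then let the caller handle it.
        await service.stop_connection_metrics(
            success=False, error=str(e), connection_type=connection_type
        )
        raise
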
@@ -760,12 +760,19 @@ class GoogleLLMService(LLMService):

        generation_config = GenerateContentConfig(system_instruction=system)

-        # Use the new google-genai client's async method
-        response = await self._client.aio.models.generate_content(
-            model=self._model_name,
-            contents=messages,
-            config=generation_config,
-        )
+        await self.start_connection_metrics()
+
+        try:
+            # Use the new google-genai client's async method
+            response = await self._client.aio.models.generate_content(
+                model=self._model_name,
+                contents=messages,
+                config=generation_config,
+            )
+            await self.stop_connection_metrics(success=True, connection_type="grpc")
+        except Exception as e:
+            await self.stop_connection_metrics(success=False, error=str(e), connection_type="grpc")
+            raise

        # Extract text from response
        if response.candidates and response.candidates[0].content:
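
The streaming variant in the next hunk wraps generate_content_stream the same way; note that stop_connection_metrics(success=True, connection_type="grpc") fires as soon as the stream object is returned, before any chunks are consumed, so the metric measures call setup rather than generation time. A hedged sketch of how a caller would then drain the returned stream (the chunk.text attribute follows google-genai's streaming responses and is not part of this diff):

async def drain_stream(stream) -> str:
    """Collect incremental text deltas from a google-genai content stream."""
    parts = []
    async for chunk in stream:
        if chunk.text:  # each response chunk carries an incremental text delta
            parts.append(chunk.text)
    return "".join(parts)
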
@@ -849,11 +856,19 @@ class GoogleLLMService(LLMService):
        )

        await self.start_ttfb_metrics()
-        return await self._client.aio.models.generate_content_stream(
-            model=self._model_name,
-            contents=messages,
-            config=generation_config,
-        )
+        await self.start_connection_metrics()
+
+        try:
+            result = await self._client.aio.models.generate_content_stream(
+                model=self._model_name,
+                contents=messages,
+                config=generation_config,
+            )
+            await self.stop_connection_metrics(success=True, connection_type="grpc")
+            return result
+        except Exception as e:
+            await self.stop_connection_metrics(success=False, error=str(e), connection_type="grpc")
+            raise

    async def _stream_content_specific_context(
        self, context: OpenAILLMContext