dv-pipecat-ai 0.0.85.dev850__py3-none-any.whl → 0.0.85.dev852__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

--- dv_pipecat_ai-0.0.85.dev850.dist-info/METADATA
+++ dv_pipecat_ai-0.0.85.dev852.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dv-pipecat-ai
- Version: 0.0.85.dev850
+ Version: 0.0.85.dev852
  Summary: An open source framework for voice (and multimodal) assistants
  License-Expression: BSD-2-Clause
  Project-URL: Source, https://github.com/pipecat-ai/pipecat
--- dv_pipecat_ai-0.0.85.dev850.dist-info/RECORD
+++ dv_pipecat_ai-0.0.85.dev852.dist-info/RECORD
@@ -1,4 +1,4 @@
- dv_pipecat_ai-0.0.85.dev850.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+ dv_pipecat_ai-0.0.85.dev852.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
  pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
  pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -217,7 +217,7 @@ pipecat/services/deepgram/flux/stt.py,sha256=yCZodrHAOShgYy_GbdviX8iAuh36dBgDL41
  pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
  pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
  pipecat/services/elevenlabs/__init__.py,sha256=cMx5v0HEMh4WetMm5byR9tIjG6_wNVs9UxqWyB3tjlM,313
- pipecat/services/elevenlabs/stt.py,sha256=_RhBKpUYEGKMpcO7y4RLxmEOMK11LZFdZqDFIA-DZXk,27303
+ pipecat/services/elevenlabs/stt.py,sha256=dy88MvQdhUQ-SFA7YTBRykZsIozMnnYQaJ4og1RYlVc,30811
  pipecat/services/elevenlabs/tts.py,sha256=skUndgUatx2F5rjg2tBZLutB8k9B9Cjy-cUeglCDdwc,45314
  pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
  pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
@@ -416,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
  pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
  pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
  pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
- dv_pipecat_ai-0.0.85.dev850.dist-info/METADATA,sha256=rqzfsDkrkClO-BvwwJr5_b2ggADWXFKhgzPgToBwDm0,32955
- dv_pipecat_ai-0.0.85.dev850.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dv_pipecat_ai-0.0.85.dev850.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
- dv_pipecat_ai-0.0.85.dev850.dist-info/RECORD,,
+ dv_pipecat_ai-0.0.85.dev852.dist-info/METADATA,sha256=L_uFM2KLucwhFvtLcQ9dWL_DQicbrpPyiHOlW81e9LM,32955
+ dv_pipecat_ai-0.0.85.dev852.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dv_pipecat_ai-0.0.85.dev852.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+ dv_pipecat_ai-0.0.85.dev852.dist-info/RECORD,,
--- pipecat/services/elevenlabs/stt.py
+++ pipecat/services/elevenlabs/stt.py
@@ -4,14 +4,18 @@
  # SPDX-License-Identifier: BSD 2-Clause License
  #

- """ElevenLabs speech-to-text service implementations."""
+ """ElevenLabs speech-to-text service implementation.
+
+ This module provides integration with ElevenLabs' Speech-to-Text API for transcription
+ using segmented audio processing. The service uploads audio files and receives
+ transcription results directly.
+ """

- import asyncio
  import base64
  import io
  import json
- import urllib.parse
- from typing import Any, AsyncGenerator, Dict, Literal, Optional
+ from enum import Enum
+ from typing import AsyncGenerator, Optional

  import aiohttp
  from loguru import logger
@@ -37,9 +41,12 @@ from pipecat.utils.tracing.service_decorators import traced_stt
  try:
      from websockets.asyncio.client import connect as websocket_connect
      from websockets.protocol import State
- except ModuleNotFoundError:
-     websocket_connect = None  # type: ignore[assignment]
-     State = None  # type: ignore[assignment]
+ except ModuleNotFoundError as e:
+     logger.error(f"Exception: {e}")
+     logger.error(
+         "In order to use ElevenLabs Realtime STT, you need to `pip install pipecat-ai[elevenlabs]`."
+     )
+     raise Exception(f"Missing module: {e}")


  def language_to_elevenlabs_language(language: Language) -> Optional[str]:
@@ -159,27 +166,20 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
      result = BASE_LANGUAGES.get(language)

      # If not found in base languages, try to find the base language from a variant
+     # For example, Language.EN_US (value "en-US") -> Language("en") -> "eng"
      if not result:
          lang_str = str(language.value)
-         base_code = lang_str.split("-")[0].lower()
-         result = base_code if base_code in BASE_LANGUAGES.values() else None
+         base_code = lang_str.split("-")[0]  # Get "en" from "en-US"
+         try:
+             base_language = Language(base_code)
+             result = BASE_LANGUAGES.get(base_language)
+         except (ValueError, KeyError):
+             # If base language not found in Language enum, return None
+             result = None

      return result


- def elevenlabs_language_code_to_language(language_code: Optional[str]) -> Optional[Language]:
-     """Convert an ElevenLabs language code back to a Language enum value."""
-     if not language_code:
-         return None
-
-     normalized = language_code.lower()
-     for language in Language:
-         code = language_to_elevenlabs_language(language)
-         if code and code.lower() == normalized:
-             return language
-     return None
-
-
  class ElevenLabsSTTService(SegmentedSTTService):
      """Speech-to-text service using ElevenLabs' file-based API.

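To make the new fallback concrete, here is a minimal check; this is a sketch assuming Pipecat's `Language` enum lives at `pipecat.transcriptions.language` and that `BASE_LANGUAGES` maps `Language.EN` to "eng", as the added comment indicates:

from pipecat.services.elevenlabs.stt import language_to_elevenlabs_language
from pipecat.transcriptions.language import Language  # assumed import path

# A regional variant now falls back to its base language before mapping
# to the ISO-639-3 style code that ElevenLabs expects.
assert language_to_elevenlabs_language(Language.EN) == "eng"
assert language_to_elevenlabs_language(Language.EN_US) == "eng"  # "en-US" -> "en" -> "eng"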
@@ -265,7 +265,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
          Args:
              language: The language to use for speech-to-text transcription.
          """
-         self.logger.info(f"Switching STT language to: [{language}]")
+         logger.info(f"Switching STT language to: [{language}]")
          self._settings["language"] = self.language_to_service_language(language)

      async def set_model(self, model: str):
@@ -279,7 +279,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
          This method is provided for interface compatibility.
          """
          await super().set_model(model)
-         self.logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
+         logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")

      async def _transcribe_audio(self, audio_data: bytes) -> dict:
          """Upload audio data to ElevenLabs and get transcription result.
@@ -313,7 +313,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
          async with self._session.post(url, data=data, headers=headers) as response:
              if response.status != 200:
                  error_text = await response.text()
-                 self.logger.error(f"ElevenLabs transcription error: {error_text}")
+                 logger.error(f"ElevenLabs transcription error: {error_text}")
                  raise Exception(f"Transcription failed with status {response.status}: {error_text}")

              result = await response.json()
@@ -354,7 +354,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
              detected_language = result.get("language_code", "eng")

              await self._handle_transcription(text, True, detected_language)
-             self.logger.debug(f"Transcription: [{text}]")
+             logger.debug(f"Transcription: [{text}]")

              yield TranscriptionFrame(
                  text,
@@ -365,18 +365,86 @@ class ElevenLabsSTTService(SegmentedSTTService):
              )

          except Exception as e:
-             self.logger.error(f"ElevenLabs STT error: {e}")
+             logger.error(f"ElevenLabs STT error: {e}")
              yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")


+ def audio_format_from_sample_rate(sample_rate: int) -> str:
+     """Get the appropriate audio format string for a given sample rate.
+
+     Args:
+         sample_rate: The audio sample rate in Hz.
+
+     Returns:
+         The ElevenLabs audio format string.
+     """
+     match sample_rate:
+         case 8000:
+             return "pcm_8000"
+         case 16000:
+             return "pcm_16000"
+         case 22050:
+             return "pcm_22050"
+         case 24000:
+             return "pcm_24000"
+         case 44100:
+             return "pcm_44100"
+         case 48000:
+             return "pcm_48000"
+     logger.warning(
+         f"ElevenLabsRealtimeSTTService: No audio format available for {sample_rate} sample rate, using pcm_16000"
+     )
+     return "pcm_16000"
+
+
+ class CommitStrategy(str, Enum):
+     """Commit strategies for transcript segmentation."""
+
+     MANUAL = "manual"
+     VAD = "vad"
+
+
  class ElevenLabsRealtimeSTTService(WebsocketSTTService):
-     """Realtime speech-to-text service using ElevenLabs Scribe v2 WebSocket API."""
+     """Speech-to-text service using ElevenLabs' Realtime WebSocket API.
+
+     This service uses ElevenLabs' Realtime Speech-to-Text API to perform transcription
+     with ultra-low latency. It supports both partial (interim) and committed (final)
+     transcripts, and can use either manual commit control or automatic Voice Activity
+     Detection (VAD) for segment boundaries.
+
+     By default, uses manual commit strategy where Pipecat's VAD controls when to
+     commit transcript segments, providing consistency with other STT services.
+
+     Important:
+         When using manual commit strategy with Pipecat's VAD, it is recommended to set
+         the VAD `stop_secs` parameter to at least 0.5 seconds. Lower values may result
+         in incomplete transcriptions due to a known limitation in the ElevenLabs model
+         where audio sent near the commit boundary may not be fully processed.
+     """

      class InputParams(BaseModel):
-         """Realtime connection parameters derived from ElevenLabs documentation."""
+         """Configuration parameters for ElevenLabs Realtime STT API.

-         language: Optional[Language] = None
-         commit_strategy: Literal["manual", "vad"] = "manual"
+         Parameters:
+             language_code: ISO-639-1 or ISO-639-3 language code. Leave None for auto-detection.
+             commit_strategy: How to segment speech - manual (Pipecat VAD) or vad (ElevenLabs VAD).
+             vad_silence_threshold_secs: Seconds of silence before VAD commits (0.3-3.0).
+                 Only used when commit_strategy is VAD. None uses ElevenLabs default.
+             vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
+                 Only used when commit_strategy is VAD. None uses ElevenLabs default.
+             min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
+                 Only used when commit_strategy is VAD. None uses ElevenLabs default.
+             min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
+                 Only used when commit_strategy is VAD. None uses ElevenLabs default.
+
+         Note:
+             When using manual commit strategy, ensure Pipecat's VAD `stop_secs` is set to
+             at least 0.5 seconds to avoid incomplete transcriptions. This is a known
+             limitation of the ElevenLabs model.
+         """
+
+         language_code: Optional[str] = None
+         commit_strategy: CommitStrategy = CommitStrategy.MANUAL
          vad_silence_threshold_secs: Optional[float] = None
          vad_threshold: Optional[float] = None
          min_speech_duration_ms: Optional[int] = None
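For reference, here is one way the new `InputParams` could be filled in to delegate segmentation to ElevenLabs' server-side VAD; the values are illustrative and only need to fall inside the ranges documented above:

from pipecat.services.elevenlabs.stt import CommitStrategy, ElevenLabsRealtimeSTTService

# Illustrative values; any field left as None keeps ElevenLabs' default.
vad_params = ElevenLabsRealtimeSTTService.InputParams(
    language_code=None,                  # auto-detect the spoken language
    commit_strategy=CommitStrategy.VAD,  # ElevenLabs VAD commits segments
    vad_silence_threshold_secs=0.5,      # within the documented 0.3-3.0 range
    vad_threshold=0.5,                   # within the documented 0.1-0.9 range
    min_speech_duration_ms=100,
    min_silence_duration_ms=100,
)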
@@ -386,210 +454,328 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
          self,
          *,
          api_key: str,
-         sample_rate: Optional[int] = None,
+         base_url: str = "api.elevenlabs.io",
          model: str = "scribe_v2_realtime",
-         url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
-         params: Optional["ElevenLabsRealtimeSTTService.InputParams"] = None,
-         reconnect_on_error: bool = True,
+         sample_rate: Optional[int] = None,
+         params: Optional[InputParams] = None,
          **kwargs,
      ):
-         """Initialize the realtime STT service.
+         """Initialize the ElevenLabs Realtime STT service.

          Args:
              api_key: ElevenLabs API key for authentication.
-             sample_rate: Optional input sample rate. Defaults to pipeline sample rate.
-             model: Scribe realtime model identifier.
-             url: WebSocket endpoint for realtime transcription.
-             params: Optional realtime configuration options.
-             reconnect_on_error: Whether to auto-reconnect on transient failures.
-             **kwargs: Additional arguments forwarded to WebsocketSTTService.
+             base_url: Base URL for ElevenLabs WebSocket API.
+             model: Model ID for transcription. Defaults to "scribe_v2_realtime".
+             sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+             params: Configuration parameters for the STT service.
+             **kwargs: Additional arguments passed to WebsocketSTTService.
+
+         Note:
+             When using manual commit strategy (default), configure Pipecat's VAD with
+             `stop_secs` of at least 0.5 seconds to ensure complete transcriptions.
          """
-         if websocket_connect is None or State is None:
-             logger.error(
-                 "In order to use ElevenLabsRealtimeSTTService, you need to "
-                 "`pip install pipecat-ai[elevenlabs]` (websockets extra)."
-             )
-             raise ModuleNotFoundError("Missing optional dependency: websockets")
+         super().__init__(
+             sample_rate=sample_rate,
+             **kwargs,
+         )

-         super().__init__(sample_rate=sample_rate, reconnect_on_error=reconnect_on_error, **kwargs)
+         params = params or ElevenLabsRealtimeSTTService.InputParams()

          self._api_key = api_key
-         self._url = url
-         self.set_model_name(model)
-         self._model = model
-         self._params = params or ElevenLabsRealtimeSTTService.InputParams()
-         self._language_override = self._params.language
-         self._encoding = None
-         self._receive_task: Optional[asyncio.Task] = None
-         self._pending_final_message: Optional[Dict[str, Any]] = None
-         self._pending_final_task: Optional[asyncio.Task] = None
-         self._timestamp_merge_delay_s = 0.25
-         self._ttfb_started = False
-
-     @property
-     def commit_strategy(self) -> str:
-         """Return the configured commit strategy (manual or vad)."""
-         return (self._params.commit_strategy or "manual").lower()
+         self._base_url = base_url
+         self._model_id = model
+         self._params = params
+         self._audio_format = ""  # initialized in start()
+         self._receive_task = None

      def can_generate_metrics(self) -> bool:
-         """Realtime ElevenLabs service supports latency metrics."""
+         """Check if the service can generate processing metrics.
+
+         Returns:
+             True, as ElevenLabs Realtime STT service supports metrics generation.
+         """
          return True

+     async def set_language(self, language: Language):
+         """Set the transcription language.
+
+         Args:
+             language: The language to use for speech-to-text transcription.
+
+         Note:
+             Changing language requires reconnecting to the WebSocket.
+         """
+         logger.info(f"Switching STT language to: [{language}]")
+         self._params.language_code = language.value if isinstance(language, Language) else language
+         # Reconnect with new settings
+         await self._disconnect()
+         await self._connect()
+
+     async def set_model(self, model: str):
+         """Set the STT model.
+
+         Args:
+             model: The model name to use for transcription.
+
+         Note:
+             Changing model requires reconnecting to the WebSocket.
+         """
+         await super().set_model(model)
+         logger.info(f"Switching STT model to: [{model}]")
+         self._model_id = model
+         # Reconnect with new settings
+         await self._disconnect()
+         await self._connect()
+
      async def start(self, frame: StartFrame):
-         """Start the realtime STT service and establish WebSocket connection."""
+         """Start the STT service and establish WebSocket connection.
+
+         Args:
+             frame: Frame indicating service should start.
+         """
          await super().start(frame)
-         self._encoding = self._determine_encoding(self.sample_rate)
+         self._audio_format = audio_format_from_sample_rate(self.sample_rate)
          await self._connect()

      async def stop(self, frame: EndFrame):
-         """Stop the realtime STT service and close WebSocket connection."""
+         """Stop the STT service and close WebSocket connection.
+
+         Args:
+             frame: Frame indicating service should stop.
+         """
          await super().stop(frame)
          await self._disconnect()

      async def cancel(self, frame: CancelFrame):
-         """Cancel the realtime STT service and close WebSocket connection."""
+         """Cancel the STT service and close WebSocket connection.
+
+         Args:
+             frame: Frame indicating service should be cancelled.
+         """
          await super().cancel(frame)
          await self._disconnect()

-     async def set_language(self, language: Language):
-         """Update preferred transcription language (requires reconnect)."""
-         self._language_override = language
-         self._params.language = language
-         if self._websocket:
-             await self._disconnect()
-             await self._connect()
-
-     async def set_model(self, model: str):
-         """Set the STT model and reconnect the WebSocket."""
-         await super().set_model(model)
-         self._model = model
-         if self._websocket:
-             await self._disconnect()
-             await self._connect()
+     async def start_metrics(self):
+         """Start performance metrics collection for transcription processing."""
+         await self.start_ttfb_metrics()
+         await self.start_processing_metrics()

      async def process_frame(self, frame: Frame, direction: FrameDirection):
-         """Process frames and handle VAD events for commit strategy."""
+         """Process incoming frames and handle speech events.
+
+         Args:
+             frame: The frame to process.
+             direction: Direction of frame flow in the pipeline.
+         """
          await super().process_frame(frame, direction)

          if isinstance(frame, UserStartedSpeakingFrame):
-             if frame.emulated:
-                 return
-             self._ttfb_started = False
-             await self.start_processing_metrics()
+             # Start metrics when user starts speaking
+             await self.start_metrics()
          elif isinstance(frame, UserStoppedSpeakingFrame):
-             if frame.emulated:
-                 return
-             if self.commit_strategy == "manual":
-                 await self._send_commit()
+             # Send commit when user stops speaking (manual commit mode)
+             if self._params.commit_strategy == CommitStrategy.MANUAL:
+                 if self._websocket and self._websocket.state is State.OPEN:
+                     try:
+                         commit_message = {
+                             "message_type": "input_audio_chunk",
+                             "audio_base_64": "",
+                             "commit": True,
+                             "sample_rate": self.sample_rate,
+                         }
+                         await self._websocket.send(json.dumps(commit_message))
+                         logger.trace("Sent manual commit to ElevenLabs")
+                     except Exception as e:
+                         logger.warning(f"Failed to send commit: {e}")

      async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-         """Stream audio chunks over the ElevenLabs realtime WebSocket."""
-         if not audio:
-             yield None
-             return
+         """Process audio data for speech-to-text transcription.

-         await self._ensure_connection()
-         await self._send_audio_chunk(audio)
-         yield None
+         Args:
+             audio: Raw audio bytes to transcribe.

-     async def _ensure_connection(self):
+         Yields:
+             None - transcription results are handled via WebSocket responses.
+         """
+         # Reconnect if connection is closed
          if not self._websocket or self._websocket.state is State.CLOSED:
              await self._connect()

+         if self._websocket and self._websocket.state is State.OPEN:
+             try:
+                 # Encode audio as base64
+                 audio_base64 = base64.b64encode(audio).decode("utf-8")
+
+                 # Send audio chunk
+                 message = {
+                     "message_type": "input_audio_chunk",
+                     "audio_base_64": audio_base64,
+                     "commit": False,
+                     "sample_rate": self.sample_rate,
+                 }
+                 await self._websocket.send(json.dumps(message))
+             except Exception as e:
+                 logger.error(f"Error sending audio: {e}")
+                 yield ErrorFrame(f"ElevenLabs Realtime STT error: {str(e)}")
+
+         yield None
+
      async def _connect(self):
+         """Establish WebSocket connection to ElevenLabs Realtime STT."""
          await self._connect_websocket()
+
          if self._websocket and not self._receive_task:
-             self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
+             self._receive_task = self.create_task(self._receive_task_handler(self._report_error))

      async def _disconnect(self):
+         """Close WebSocket connection and cleanup tasks."""
          if self._receive_task:
              await self.cancel_task(self._receive_task)
              self._receive_task = None

-         await self._clear_pending_final()
          await self._disconnect_websocket()

      async def _connect_websocket(self):
+         """Connect to ElevenLabs Realtime STT WebSocket endpoint."""
          try:
              if self._websocket and self._websocket.state is State.OPEN:
                  return

-             ws_url = self._build_websocket_url()
+             logger.debug("Connecting to ElevenLabs Realtime STT")
+
+             # Build query parameters
+             params = [f"model_id={self._model_id}"]
+
+             if self._params.language_code:
+                 params.append(f"language_code={self._params.language_code}")
+
+             params.append(f"encoding={self._audio_format}")
+             params.append(f"sample_rate={self.sample_rate}")
+             params.append(f"commit_strategy={self._params.commit_strategy.value}")
+
+             # Add VAD parameters if using VAD commit strategy and values are specified
+             if self._params.commit_strategy == CommitStrategy.VAD:
+                 if self._params.vad_silence_threshold_secs is not None:
+                     params.append(
+                         f"vad_silence_threshold_secs={self._params.vad_silence_threshold_secs}"
+                     )
+                 if self._params.vad_threshold is not None:
+                     params.append(f"vad_threshold={self._params.vad_threshold}")
+                 if self._params.min_speech_duration_ms is not None:
+                     params.append(f"min_speech_duration_ms={self._params.min_speech_duration_ms}")
+                 if self._params.min_silence_duration_ms is not None:
+                     params.append(f"min_silence_duration_ms={self._params.min_silence_duration_ms}")
+
+             ws_url = f"wss://{self._base_url}/v1/speech-to-text/realtime?{'&'.join(params)}"
+
              headers = {"xi-api-key": self._api_key}
-             self.logger.debug(f"Connecting to ElevenLabs realtime STT at {ws_url}")
+
              self._websocket = await websocket_connect(ws_url, additional_headers=headers)
              await self._call_event_handler("on_connected")
+             logger.debug("Connected to ElevenLabs Realtime STT")
          except Exception as e:
-             self.logger.error(f"{self} unable to connect to ElevenLabs realtime STT: {e}")
-             self._websocket = None
-             await self._call_event_handler("on_connection_error", f"{e}")
+             logger.error(f"{self}: unable to connect to ElevenLabs Realtime STT: {e}")
+             await self.push_error(ErrorFrame(f"Connection error: {str(e)}"))

      async def _disconnect_websocket(self):
+         """Disconnect from ElevenLabs Realtime STT WebSocket."""
          try:
-             await self.stop_all_metrics()
              if self._websocket and self._websocket.state is State.OPEN:
-                 self.logger.debug("Disconnecting from ElevenLabs realtime STT")
+                 logger.debug("Disconnecting from ElevenLabs Realtime STT")
                  await self._websocket.close()
          except Exception as e:
-             self.logger.error(f"{self} error closing ElevenLabs realtime websocket: {e}")
+             logger.error(f"{self} error closing websocket: {e}")
          finally:
              self._websocket = None
              await self._call_event_handler("on_disconnected")

-     async def _receive_messages(self):
-         async for message in self._get_websocket():
-             await self._process_event(message)
-
      def _get_websocket(self):
-         if not self._websocket:
-             raise RuntimeError("ElevenLabs realtime websocket not connected")
-         return self._websocket
+         """Get the current WebSocket connection.
+
+         Returns:
+             The WebSocket connection.
+
+         Raises:
+             Exception: If WebSocket is not connected.
+         """
+         if self._websocket:
+             return self._websocket
+         raise Exception("Websocket not connected")

-     async def _process_event(self, message: Any):
+     async def _process_messages(self):
+         """Process incoming WebSocket messages."""
+         async for message in self._get_websocket():
+             try:
+                 data = json.loads(message)
+                 await self._process_response(data)
+             except json.JSONDecodeError:
+                 logger.warning(f"Received non-JSON message: {message}")
+             except Exception as e:
+                 logger.error(f"Error processing message: {e}")
+
+     async def _receive_messages(self):
+         """Continuously receive and process WebSocket messages."""
          try:
-             data = json.loads(message)
-         except json.JSONDecodeError:
-             self.logger.warning(f"ElevenLabs realtime STT sent invalid JSON: {message}")
-             return
+             await self._process_messages()
+         except Exception as e:
+             logger.warning(f"{self} WebSocket connection closed: {e}")
+             # Connection closed, will reconnect on next audio chunk
+
+     async def _process_response(self, data: dict):
+         """Process a response message from ElevenLabs.

+         Args:
+             data: Parsed JSON response data.
+         """
          message_type = data.get("message_type")

          if message_type == "session_started":
-             self.logger.debug("ElevenLabs realtime session started")
-             return
+             logger.debug(f"ElevenLabs session started: {data}")
+
+         elif message_type == "partial_transcript":
+             await self._on_partial_transcript(data)

-         if message_type == "partial_transcript":
-             await self._emit_partial_transcript(data)
          elif message_type == "committed_transcript":
-             await self._handle_committed_transcript(data)
+             await self._on_committed_transcript(data)
+
          elif message_type == "committed_transcript_with_timestamps":
-             await self._handle_committed_transcript_with_timestamps(data)
-         elif message_type in {
+             await self._on_committed_transcript_with_timestamps(data)
+
+         elif message_type == "input_error":
+             error_msg = data.get("error", "Unknown input error")
+             logger.error(f"ElevenLabs input error: {error_msg}")
+             await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
+
+         elif message_type in [
              "auth_error",
              "quota_exceeded",
              "transcriber_error",
-             "input_error",
              "error",
-         }:
-             fatal = message_type in {"auth_error", "quota_exceeded", "error"}
-             description = data.get("error", data)
-             await self.push_error(
-                 ErrorFrame(f"ElevenLabs realtime error: {description}", fatal=fatal)
-             )
+         ]:
+             error_msg = data.get("error", data.get("message", "Unknown error"))
+             logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
+             await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
+
          else:
-             self.logger.debug(f"Unhandled ElevenLabs realtime message: {data}")
+             logger.debug(f"Unknown message type: {message_type}")
+
+     async def _on_partial_transcript(self, data: dict):
+         """Handle partial transcript (interim results).

-     async def _emit_partial_transcript(self, data: Dict[str, Any]):
-         text = (data.get("text") or data.get("transcript") or "").strip()
+         Args:
+             data: Partial transcript data.
+         """
+         text = data.get("text", "").strip()
          if not text:
              return

-         language = (
-             elevenlabs_language_code_to_language(data.get("language_code"))
-             or self._language_override
-         )
          await self.stop_ttfb_metrics()

+         # Get language if provided
+         language = data.get("language_code")
+
+         logger.trace(f"Partial transcript: [{text}]")
+
          await self.push_frame(
              InterimTranscriptionFrame(
                  text,
@@ -600,143 +786,56 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
              )
          )

-     async def _handle_committed_transcript(self, data: Dict[str, Any]):
-         if self._pending_final_message:
-             await self._emit_transcription(self._pending_final_message)
-             self._pending_final_message = None
-
-         self._pending_final_message = data
-         await self._schedule_pending_final_emit()
-
-     async def _handle_committed_transcript_with_timestamps(self, data: Dict[str, Any]):
-         if self._pending_final_message:
-             merged = {**self._pending_final_message, **data}
-             await self._emit_transcription(merged)
-             await self._clear_pending_final()
-         else:
-             await self._emit_transcription(data)
-
-     async def _schedule_pending_final_emit(self):
-         await self._clear_pending_final(timer_only=True)
-         self._pending_final_task = asyncio.create_task(self._emit_pending_after_delay())
-
-     async def _emit_pending_after_delay(self):
-         try:
-             await asyncio.sleep(self._timestamp_merge_delay_s)
-             if self._pending_final_message:
-                 await self._emit_transcription(self._pending_final_message)
-                 self._pending_final_message = None
-         except asyncio.CancelledError:
-             pass
-         finally:
-             self._pending_final_task = None
-
-     async def _clear_pending_final(self, timer_only: bool = False):
-         if self._pending_final_task:
-             await self.cancel_task(self._pending_final_task)
-             self._pending_final_task = None
+     @traced_stt
+     async def _handle_transcription(
+         self, transcript: str, is_final: bool, language: Optional[str] = None
+     ):
+         """Handle a transcription result with tracing."""
+         pass

-         if not timer_only:
-             self._pending_final_message = None
+     async def _on_committed_transcript(self, data: dict):
+         """Handle committed transcript (final results).

-     async def _emit_transcription(self, data: Dict[str, Any]):
-         text = (data.get("text") or data.get("transcript") or "").strip()
+         Args:
+             data: Committed transcript data.
+         """
+         text = data.get("text", "").strip()
          if not text:
              return

-         language = (
-             elevenlabs_language_code_to_language(data.get("language_code"))
-             or self._language_override
-         )
          await self.stop_ttfb_metrics()
-
-         frame = TranscriptionFrame(
-             text,
-             self._user_id,
-             time_now_iso8601(),
-             language,
-             result=data,
-         )
-
-         await self.push_frame(frame)
-         await self._handle_transcription(text, True, language)
          await self.stop_processing_metrics()

-     async def _send_audio_chunk(self, audio: bytes):
-         if not audio or not self._websocket:
-             return
+         # Get language if provided
+         language = data.get("language_code")

-         if not self._ttfb_started:
-             await self.start_ttfb_metrics()
-             self._ttfb_started = True
-
-         payload = {
-             "message_type": "input_audio_chunk",
-             "audio_base_64": base64.b64encode(audio).decode("ascii"),
-             "commit": False,
-             "sample_rate": self.sample_rate,
-         }
-         await self._websocket.send(json.dumps(payload))
+         logger.debug(f"Committed transcript: [{text}]")

-     async def _send_commit(self):
-         if not self._websocket:
-             return
-         payload = {
-             "message_type": "input_audio_chunk",
-             "audio_base_64": "",
-             "commit": True,
-             "sample_rate": self.sample_rate,
-         }
-         await self._websocket.send(json.dumps(payload))
+         await self._handle_transcription(text, True, language)

-     def _build_websocket_url(self) -> str:
-         if not self.sample_rate:
-             raise ValueError(
-                 "ElevenLabs realtime STT requires a valid sample rate (start() must run first)."
+         await self.push_frame(
+             TranscriptionFrame(
+                 text,
+                 self._user_id,
+                 time_now_iso8601(),
+                 language,
+                 result=data,
              )
+         )

-         params = {
-             "model_id": self._model,
-             "encoding": self._encoding or "pcm_16000",
-             "sample_rate": str(self.sample_rate),
-             "commit_strategy": self.commit_strategy,
-         }
+     async def _on_committed_transcript_with_timestamps(self, data: dict):
+         """Handle committed transcript with word-level timestamps.

-         language_code = (
-             language_to_elevenlabs_language(self._language_override)
-             if self._language_override
-             else None
-         )
-         if language_code:
-             params["language_code"] = language_code
-
-         if self._params.vad_silence_threshold_secs is not None:
-             params["vad_silence_threshold_secs"] = str(self._params.vad_silence_threshold_secs)
-         if self._params.vad_threshold is not None:
-             params["vad_threshold"] = str(self._params.vad_threshold)
-         if self._params.min_speech_duration_ms is not None:
-             params["min_speech_duration_ms"] = str(self._params.min_speech_duration_ms)
-         if self._params.min_silence_duration_ms is not None:
-             params["min_silence_duration_ms"] = str(self._params.min_silence_duration_ms)
-
-         return f"{self._url}?{urllib.parse.urlencode(params)}"
-
-     def _determine_encoding(self, sample_rate: int) -> str:
-         if not sample_rate:
-             raise ValueError("ElevenLabs realtime STT requires a valid sample rate.")
-
-         supported_rates = {8000, 16000, 22050, 24000, 44100, 48000}
-         if sample_rate not in supported_rates:
-             raise ValueError(
-                 f"ElevenLabs realtime STT supports sample rates {sorted(supported_rates)}. "
-                 f"Received {sample_rate} Hz."
-             )
-         return f"pcm_{sample_rate}"
+         Args:
+             data: Committed transcript data with timestamps.
+         """
+         text = data.get("text", "").strip()
+         if not text:
+             return

-     @traced_stt
-     async def _handle_transcription(
-         self, transcript: str, is_final: bool, language: Optional[Language] = None
-     ):
-         """Handle a transcription result with tracing."""
-         # Metrics are stopped by the caller when needed.
-         return
+         logger.debug(f"Committed transcript with timestamps: [{text}]")
+         logger.trace(f"Timestamps: {data.get('words', [])}")
+
+         # This is sent after the committed_transcript, so we don't need to
+         # push another TranscriptionFrame, but we could use the timestamps
+         # for additional processing if needed in the future
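
Taken together, the new service can be constructed roughly as follows. This is a minimal sketch, assuming the extra is installed via `pip install pipecat-ai[elevenlabs]`; the pipeline wiring and VAD configuration (where `stop_secs` should be at least 0.5 seconds in manual mode) are application-specific:

import os

from pipecat.services.elevenlabs.stt import CommitStrategy, ElevenLabsRealtimeSTTService

# Manual commit strategy (the default): Pipecat's VAD decides when a
# segment ends, so configure its stop_secs >= 0.5 s per the docstring note.
stt = ElevenLabsRealtimeSTTService(
    api_key=os.environ["ELEVENLABS_API_KEY"],
    model="scribe_v2_realtime",
    sample_rate=16000,  # resolved to the "pcm_16000" encoding at start()
    params=ElevenLabsRealtimeSTTService.InputParams(
        language_code="en",  # or None for auto-detection
        commit_strategy=CommitStrategy.MANUAL,
    ),
)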