dv-pipecat-ai 0.0.85.dev851__py3-none-any.whl → 0.0.85.dev852__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of dv-pipecat-ai has been flagged as possibly warranting closer review.

--- dv_pipecat_ai-0.0.85.dev851.dist-info/METADATA
+++ dv_pipecat_ai-0.0.85.dev852.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dv-pipecat-ai
-Version: 0.0.85.dev851
+Version: 0.0.85.dev852
 Summary: An open source framework for voice (and multimodal) assistants
 License-Expression: BSD-2-Clause
 Project-URL: Source, https://github.com/pipecat-ai/pipecat
--- dv_pipecat_ai-0.0.85.dev851.dist-info/RECORD
+++ dv_pipecat_ai-0.0.85.dev852.dist-info/RECORD
@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.dev851.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.85.dev852.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -217,7 +217,7 @@ pipecat/services/deepgram/flux/stt.py,sha256=yCZodrHAOShgYy_GbdviX8iAuh36dBgDL41
 pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
 pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
 pipecat/services/elevenlabs/__init__.py,sha256=cMx5v0HEMh4WetMm5byR9tIjG6_wNVs9UxqWyB3tjlM,313
-pipecat/services/elevenlabs/stt.py,sha256=ZOVDJo3cG-f3ZugBIdxR5jrxJFtbfmDAP8Ps_KLyOgs,30117
+pipecat/services/elevenlabs/stt.py,sha256=dy88MvQdhUQ-SFA7YTBRykZsIozMnnYQaJ4og1RYlVc,30811
 pipecat/services/elevenlabs/tts.py,sha256=skUndgUatx2F5rjg2tBZLutB8k9B9Cjy-cUeglCDdwc,45314
 pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
 pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
@@ -416,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.dev851.dist-info/METADATA,sha256=lmgj2aZSwfm8h9V1nljEVf_41rQpqAp-13HAtCuXiMw,32955
-dv_pipecat_ai-0.0.85.dev851.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.85.dev851.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.85.dev851.dist-info/RECORD,,
+dv_pipecat_ai-0.0.85.dev852.dist-info/METADATA,sha256=L_uFM2KLucwhFvtLcQ9dWL_DQicbrpPyiHOlW81e9LM,32955
+dv_pipecat_ai-0.0.85.dev852.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev852.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev852.dist-info/RECORD,,
--- pipecat/services/elevenlabs/stt.py
+++ pipecat/services/elevenlabs/stt.py
@@ -4,14 +4,18 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""ElevenLabs speech-to-text service implementations."""
+"""ElevenLabs speech-to-text service implementation.
+
+This module provides integration with ElevenLabs' Speech-to-Text API for transcription
+using segmented audio processing. The service uploads audio files and receives
+transcription results directly.
+"""
 
-import asyncio
 import base64
 import io
 import json
-import urllib.parse
-from typing import Any, AsyncGenerator, Dict, Literal, Optional
+from enum import Enum
+from typing import AsyncGenerator, Optional
 
 import aiohttp
 from loguru import logger
@@ -37,9 +41,12 @@ from pipecat.utils.tracing.service_decorators import traced_stt
 try:
     from websockets.asyncio.client import connect as websocket_connect
     from websockets.protocol import State
-except ModuleNotFoundError:
-    websocket_connect = None  # type: ignore[assignment]
-    State = None  # type: ignore[assignment]
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use ElevenLabs Realtime STT, you need to `pip install pipecat-ai[elevenlabs]`."
+    )
+    raise Exception(f"Missing module: {e}")
 
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
@@ -173,19 +180,6 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     return result
 
 
-def elevenlabs_language_code_to_language(language_code: Optional[str]) -> Optional[Language]:
-    """Convert an ElevenLabs language code back to a Language enum value."""
-    if not language_code:
-        return None
-
-    normalized = language_code.lower()
-    for language in Language:
-        code = language_to_elevenlabs_language(language)
-        if code and code.lower() == normalized:
-            return language
-    return None
-
-
 class ElevenLabsSTTService(SegmentedSTTService):
     """Speech-to-text service using ElevenLabs' file-based API.
 
@@ -271,7 +265,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         Args:
             language: The language to use for speech-to-text transcription.
         """
-        self.logger.info(f"Switching STT language to: [{language}]")
+        logger.info(f"Switching STT language to: [{language}]")
         self._settings["language"] = self.language_to_service_language(language)
 
     async def set_model(self, model: str):
@@ -285,7 +279,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         This method is provided for interface compatibility.
         """
         await super().set_model(model)
-        self.logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
+        logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
 
     async def _transcribe_audio(self, audio_data: bytes) -> dict:
         """Upload audio data to ElevenLabs and get transcription result.
@@ -319,7 +313,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         async with self._session.post(url, data=data, headers=headers) as response:
             if response.status != 200:
                 error_text = await response.text()
-                self.logger.error(f"ElevenLabs transcription error: {error_text}")
+                logger.error(f"ElevenLabs transcription error: {error_text}")
                 raise Exception(f"Transcription failed with status {response.status}: {error_text}")
 
             result = await response.json()
@@ -360,7 +354,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
             detected_language = result.get("language_code", "eng")
 
             await self._handle_transcription(text, True, detected_language)
-            self.logger.debug(f"Transcription: [{text}]")
+            logger.debug(f"Transcription: [{text}]")
 
             yield TranscriptionFrame(
                 text,
@@ -371,18 +365,86 @@ class ElevenLabsSTTService(SegmentedSTTService):
             )
 
         except Exception as e:
-            self.logger.error(f"ElevenLabs STT error: {e}")
+            logger.error(f"ElevenLabs STT error: {e}")
             yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
 
 
+def audio_format_from_sample_rate(sample_rate: int) -> str:
+    """Get the appropriate audio format string for a given sample rate.
+
+    Args:
+        sample_rate: The audio sample rate in Hz.
+
+    Returns:
+        The ElevenLabs audio format string.
+    """
+    match sample_rate:
+        case 8000:
+            return "pcm_8000"
+        case 16000:
+            return "pcm_16000"
+        case 22050:
+            return "pcm_22050"
+        case 24000:
+            return "pcm_24000"
+        case 44100:
+            return "pcm_44100"
+        case 48000:
+            return "pcm_48000"
+    logger.warning(
+        f"ElevenLabsRealtimeSTTService: No audio format available for {sample_rate} sample rate, using pcm_16000"
+    )
+    return "pcm_16000"
+
+
+class CommitStrategy(str, Enum):
+    """Commit strategies for transcript segmentation."""
+
+    MANUAL = "manual"
+    VAD = "vad"
+
+
 class ElevenLabsRealtimeSTTService(WebsocketSTTService):
-    """Realtime speech-to-text service using ElevenLabs Scribe v2 WebSocket API."""
+    """Speech-to-text service using ElevenLabs' Realtime WebSocket API.
+
+    This service uses ElevenLabs' Realtime Speech-to-Text API to perform transcription
+    with ultra-low latency. It supports both partial (interim) and committed (final)
+    transcripts, and can use either manual commit control or automatic Voice Activity
+    Detection (VAD) for segment boundaries.
+
+    By default, uses manual commit strategy where Pipecat's VAD controls when to
+    commit transcript segments, providing consistency with other STT services.
+
+    Important:
+        When using manual commit strategy with Pipecat's VAD, it is recommended to set
+        the VAD `stop_secs` parameter to at least 0.5 seconds. Lower values may result
+        in incomplete transcriptions due to a known limitation in the ElevenLabs model
+        where audio sent near the commit boundary may not be fully processed.
+    """
 
     class InputParams(BaseModel):
-        """Realtime connection parameters derived from ElevenLabs documentation."""
+        """Configuration parameters for ElevenLabs Realtime STT API.
 
-        language: Optional[Language] = None
-        commit_strategy: Literal["manual", "vad"] = "manual"
+        Parameters:
+            language_code: ISO-639-1 or ISO-639-3 language code. Leave None for auto-detection.
+            commit_strategy: How to segment speech - manual (Pipecat VAD) or vad (ElevenLabs VAD).
+            vad_silence_threshold_secs: Seconds of silence before VAD commits (0.3-3.0).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+
+        Note:
+            When using manual commit strategy, ensure Pipecat's VAD `stop_secs` is set to
+            at least 0.5 seconds to avoid incomplete transcriptions. This is a known
+            limitation of the ElevenLabs model.
+        """
+
+        language_code: Optional[str] = None
+        commit_strategy: CommitStrategy = CommitStrategy.MANUAL
         vad_silence_threshold_secs: Optional[float] = None
         vad_threshold: Optional[float] = None
        min_speech_duration_ms: Optional[int] = None
@@ -392,237 +454,327 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
         self,
         *,
         api_key: str,
-        sample_rate: Optional[int] = None,
+        base_url: str = "api.elevenlabs.io",
         model: str = "scribe_v2_realtime",
-        url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
-        params: Optional["ElevenLabsRealtimeSTTService.InputParams"] = None,
-        reconnect_on_error: bool = True,
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
         **kwargs,
     ):
-        """Initialize the realtime STT service.
+        """Initialize the ElevenLabs Realtime STT service.
 
         Args:
             api_key: ElevenLabs API key for authentication.
-            sample_rate: Optional input sample rate. Defaults to pipeline sample rate.
-            model: Scribe realtime model identifier.
-            url: WebSocket endpoint for realtime transcription.
-            params: Optional realtime configuration options.
-            reconnect_on_error: Whether to auto-reconnect on transient failures.
-            **kwargs: Additional arguments forwarded to WebsocketSTTService.
+            base_url: Base URL for ElevenLabs WebSocket API.
+            model: Model ID for transcription. Defaults to "scribe_v2_realtime".
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the STT service.
+            **kwargs: Additional arguments passed to WebsocketSTTService.
+
+        Note:
+            When using manual commit strategy (default), configure Pipecat's VAD with
+            `stop_secs` of at least 0.5 seconds to ensure complete transcriptions.
         """
-        if websocket_connect is None or State is None:
-            logger.error(
-                "In order to use ElevenLabsRealtimeSTTService, you need to "
-                "`pip install pipecat-ai[elevenlabs]` (websockets extra)."
-            )
-            raise ModuleNotFoundError("Missing optional dependency: websockets")
+        super().__init__(
+            sample_rate=sample_rate,
+            **kwargs,
+        )
 
-        super().__init__(sample_rate=sample_rate, reconnect_on_error=reconnect_on_error, **kwargs)
+        params = params or ElevenLabsRealtimeSTTService.InputParams()
 
         self._api_key = api_key
-        self._url = url
-        self.set_model_name(model)
-        self._model = model
-        self._params = params or ElevenLabsRealtimeSTTService.InputParams()
-        self._language_override = self._params.language
-        self._encoding = None
-        self._receive_task: Optional[asyncio.Task] = None
-        self._pending_final_message: Optional[Dict[str, Any]] = None
-        self._pending_final_task: Optional[asyncio.Task] = None
-        self._timestamp_merge_delay_s = 0.25
-        self._ttfb_started = False
-        self._waiting_for_timestamps = False
-
-    @property
-    def commit_strategy(self) -> str:
-        """Return the configured commit strategy (manual or vad)."""
-        return (self._params.commit_strategy or "manual").lower()
+        self._base_url = base_url
+        self._model_id = model
+        self._params = params
+        self._audio_format = ""  # initialized in start()
+        self._receive_task = None
 
     def can_generate_metrics(self) -> bool:
-        """Realtime ElevenLabs service supports latency metrics."""
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs Realtime STT service supports metrics generation.
+        """
         return True
 
+    async def set_language(self, language: Language):
+        """Set the transcription language.
+
+        Args:
+            language: The language to use for speech-to-text transcription.
+
+        Note:
+            Changing language requires reconnecting to the WebSocket.
+        """
+        logger.info(f"Switching STT language to: [{language}]")
+        self._params.language_code = language.value if isinstance(language, Language) else language
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+
+        Note:
+            Changing model requires reconnecting to the WebSocket.
+        """
+        await super().set_model(model)
+        logger.info(f"Switching STT model to: [{model}]")
+        self._model_id = model
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
     async def start(self, frame: StartFrame):
-        """Start the realtime STT service and establish WebSocket connection."""
+        """Start the STT service and establish WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should start.
+        """
         await super().start(frame)
-        self._encoding = self._determine_encoding(self.sample_rate)
+        self._audio_format = audio_format_from_sample_rate(self.sample_rate)
         await self._connect()
 
     async def stop(self, frame: EndFrame):
-        """Stop the realtime STT service and close WebSocket connection."""
+        """Stop the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should stop.
+        """
         await super().stop(frame)
         await self._disconnect()
 
     async def cancel(self, frame: CancelFrame):
-        """Cancel the realtime STT service and close WebSocket connection."""
+        """Cancel the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should be cancelled.
+        """
         await super().cancel(frame)
         await self._disconnect()
 
-    async def set_language(self, language: Language):
-        """Update preferred transcription language (requires reconnect)."""
-        self._language_override = language
-        self._params.language = language
-        if self._websocket:
-            await self._disconnect()
-            await self._connect()
-
-    async def set_model(self, model: str):
-        """Set the STT model and reconnect the WebSocket."""
-        await super().set_model(model)
-        self._model = model
-        if self._websocket:
-            await self._disconnect()
-            await self._connect()
+    async def start_metrics(self):
+        """Start performance metrics collection for transcription processing."""
+        await self.start_ttfb_metrics()
+        await self.start_processing_metrics()
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
-        """Process frames and handle VAD events for commit strategy."""
+        """Process incoming frames and handle speech events.
+
+        Args:
+            frame: The frame to process.
+            direction: Direction of frame flow in the pipeline.
+        """
         await super().process_frame(frame, direction)
 
         if isinstance(frame, UserStartedSpeakingFrame):
-            if frame.emulated:
-                return
-            # Start metrics and set flag to True so we can stop them later
-            await self.start_ttfb_metrics()
-            self._ttfb_started = True
-            await self.start_processing_metrics()
+            # Start metrics when user starts speaking
+            await self.start_metrics()
         elif isinstance(frame, UserStoppedSpeakingFrame):
-            if frame.emulated:
-                return
-            if self.commit_strategy == "manual":
-                await self._send_commit()
+            # Send commit when user stops speaking (manual commit mode)
+            if self._params.commit_strategy == CommitStrategy.MANUAL:
+                if self._websocket and self._websocket.state is State.OPEN:
+                    try:
+                        commit_message = {
+                            "message_type": "input_audio_chunk",
+                            "audio_base_64": "",
+                            "commit": True,
+                            "sample_rate": self.sample_rate,
+                        }
+                        await self._websocket.send(json.dumps(commit_message))
+                        logger.trace("Sent manual commit to ElevenLabs")
+                    except Exception as e:
+                        logger.warning(f"Failed to send commit: {e}")
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """Stream audio chunks over the ElevenLabs realtime WebSocket."""
-        if not audio:
-            yield None
-            return
+        """Process audio data for speech-to-text transcription.
 
-        if not await self._ensure_connection():
-            self.logger.error(f"{self} failed to establish connection, dropping audio")
-            yield None
-            return
-
-        await self._send_audio_chunk(audio)
-        yield None
-
-    async def _ensure_connection(self) -> bool:
-        """Ensure WebSocket connection is established and ready.
+        Args:
+            audio: Raw audio bytes to transcribe.
 
-        Returns:
-            bool: True if connection is ready, False otherwise.
+        Yields:
+            None - transcription results are handled via WebSocket responses.
         """
+        # Reconnect if connection is closed
         if not self._websocket or self._websocket.state is State.CLOSED:
             await self._connect()
-        return self._websocket is not None and self._websocket.state is State.OPEN
+
+        if self._websocket and self._websocket.state is State.OPEN:
+            try:
+                # Encode audio as base64
+                audio_base64 = base64.b64encode(audio).decode("utf-8")
+
+                # Send audio chunk
+                message = {
+                    "message_type": "input_audio_chunk",
+                    "audio_base_64": audio_base64,
+                    "commit": False,
+                    "sample_rate": self.sample_rate,
+                }
+                await self._websocket.send(json.dumps(message))
+            except Exception as e:
+                logger.error(f"Error sending audio: {e}")
+                yield ErrorFrame(f"ElevenLabs Realtime STT error: {str(e)}")
+
+        yield None
 
     async def _connect(self):
+        """Establish WebSocket connection to ElevenLabs Realtime STT."""
         await self._connect_websocket()
-        if self._websocket and self._websocket.state is State.OPEN and not self._receive_task:
+
+        if self._websocket and not self._receive_task:
             self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
 
     async def _disconnect(self):
+        """Close WebSocket connection and cleanup tasks."""
         if self._receive_task:
             await self.cancel_task(self._receive_task)
             self._receive_task = None
 
-        await self._clear_pending_final()
         await self._disconnect_websocket()
 
     async def _connect_websocket(self):
+        """Connect to ElevenLabs Realtime STT WebSocket endpoint."""
         try:
             if self._websocket and self._websocket.state is State.OPEN:
-                self.logger.debug(f"{self} already connected, skipping reconnection")
                 return
 
-            ws_url = self._build_websocket_url()
+            logger.debug("Connecting to ElevenLabs Realtime STT")
+
+            # Build query parameters
+            params = [f"model_id={self._model_id}"]
+
+            if self._params.language_code:
+                params.append(f"language_code={self._params.language_code}")
+
+            params.append(f"encoding={self._audio_format}")
+            params.append(f"sample_rate={self.sample_rate}")
+            params.append(f"commit_strategy={self._params.commit_strategy.value}")
+
+            # Add VAD parameters if using VAD commit strategy and values are specified
+            if self._params.commit_strategy == CommitStrategy.VAD:
+                if self._params.vad_silence_threshold_secs is not None:
+                    params.append(
+                        f"vad_silence_threshold_secs={self._params.vad_silence_threshold_secs}"
+                    )
+                if self._params.vad_threshold is not None:
+                    params.append(f"vad_threshold={self._params.vad_threshold}")
+                if self._params.min_speech_duration_ms is not None:
+                    params.append(f"min_speech_duration_ms={self._params.min_speech_duration_ms}")
+                if self._params.min_silence_duration_ms is not None:
+                    params.append(f"min_silence_duration_ms={self._params.min_silence_duration_ms}")
+
+            ws_url = f"wss://{self._base_url}/v1/speech-to-text/realtime?{'&'.join(params)}"
+
             headers = {"xi-api-key": self._api_key}
-            self.logger.info(f"{self} connecting to ElevenLabs realtime STT (WebSocket URL built)")
+
             self._websocket = await websocket_connect(ws_url, additional_headers=headers)
-            self.logger.info(f"{self} successfully connected to ElevenLabs realtime STT")
             await self._call_event_handler("on_connected")
+            logger.debug("Connected to ElevenLabs Realtime STT")
         except Exception as e:
-            self.logger.error(f"{self} unable to connect to ElevenLabs realtime STT: {e}")
-            self._websocket = None
-            if self._receive_task:
-                await self.cancel_task(self._receive_task)
-                self._receive_task = None
-            # Push error to pipeline so callers know the connection failed
-            await self.push_error(ErrorFrame(f"ElevenLabs connection failed: {e}", fatal=False))
-            await self._call_event_handler("on_connection_error", f"{e}")
+            logger.error(f"{self}: unable to connect to ElevenLabs Realtime STT: {e}")
+            await self.push_error(ErrorFrame(f"Connection error: {str(e)}"))
 
     async def _disconnect_websocket(self):
+        """Disconnect from ElevenLabs Realtime STT WebSocket."""
         try:
-            await self.stop_all_metrics()
             if self._websocket and self._websocket.state is State.OPEN:
-                self.logger.debug(f"{self} disconnecting from ElevenLabs realtime STT")
+                logger.debug("Disconnecting from ElevenLabs Realtime STT")
                 await self._websocket.close()
         except Exception as e:
-            self.logger.error(f"{self} error closing ElevenLabs realtime websocket: {e}")
+            logger.error(f"{self} error closing websocket: {e}")
         finally:
             self._websocket = None
             await self._call_event_handler("on_disconnected")
 
-    async def _receive_messages(self):
-        async for message in self._get_websocket():
-            await self._process_event(message)
-
     def _get_websocket(self):
-        if not self._websocket:
-            raise RuntimeError("ElevenLabs realtime websocket not connected")
-        return self._websocket
+        """Get the current WebSocket connection.
+
+        Returns:
+            The WebSocket connection.
+
+        Raises:
+            Exception: If WebSocket is not connected.
+        """
+        if self._websocket:
+            return self._websocket
+        raise Exception("Websocket not connected")
 
-    async def _process_event(self, message: Any):
+    async def _process_messages(self):
+        """Process incoming WebSocket messages."""
+        async for message in self._get_websocket():
+            try:
+                data = json.loads(message)
+                await self._process_response(data)
+            except json.JSONDecodeError:
+                logger.warning(f"Received non-JSON message: {message}")
+            except Exception as e:
+                logger.error(f"Error processing message: {e}")
+
+    async def _receive_messages(self):
+        """Continuously receive and process WebSocket messages."""
         try:
-            data = json.loads(message)
-        except json.JSONDecodeError:
-            self.logger.warning(f"ElevenLabs realtime STT sent invalid JSON: {message}")
-            return
+            await self._process_messages()
+        except Exception as e:
+            logger.warning(f"{self} WebSocket connection closed: {e}")
+            # Connection closed, will reconnect on next audio chunk
+
+    async def _process_response(self, data: dict):
+        """Process a response message from ElevenLabs.
 
+        Args:
+            data: Parsed JSON response data.
+        """
         message_type = data.get("message_type")
 
         if message_type == "session_started":
-            self.logger.debug("ElevenLabs realtime session started")
-            return
+            logger.debug(f"ElevenLabs session started: {data}")
+
+        elif message_type == "partial_transcript":
+            await self._on_partial_transcript(data)
 
-        if message_type == "partial_transcript":
-            await self._emit_partial_transcript(data)
         elif message_type == "committed_transcript":
-            await self._handle_committed_transcript(data)
+            await self._on_committed_transcript(data)
+
         elif message_type == "committed_transcript_with_timestamps":
-            await self._handle_committed_transcript_with_timestamps(data)
-        elif message_type in {
+            await self._on_committed_transcript_with_timestamps(data)
+
+        elif message_type == "input_error":
+            error_msg = data.get("error", "Unknown input error")
+            logger.error(f"ElevenLabs input error: {error_msg}")
+            await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
+
+        elif message_type in [
             "auth_error",
             "quota_exceeded",
             "transcriber_error",
-            "input_error",
             "error",
-        }:
-            fatal = message_type in {"auth_error", "quota_exceeded", "error"}
-            description = data.get("error", data)
-            # Log full error details for debugging
-            self.logger.error(
-                f"{self} ElevenLabs error - Type: {message_type}, Fatal: {fatal}, Full data: {data}"
-            )
-            await self.push_error(
-                ErrorFrame(f"ElevenLabs realtime error: {description}", fatal=fatal)
-            )
+        ]:
+            error_msg = data.get("error", data.get("message", "Unknown error"))
+            logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
+            await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
+
         else:
-            self.logger.debug(f"Unhandled ElevenLabs realtime message: {data}")
+            logger.debug(f"Unknown message type: {message_type}")
 
-    async def _emit_partial_transcript(self, data: Dict[str, Any]):
-        text = (data.get("text") or data.get("transcript") or "").strip()
+    async def _on_partial_transcript(self, data: dict):
+        """Handle partial transcript (interim results).
+
+        Args:
+            data: Partial transcript data.
+        """
+        text = data.get("text", "").strip()
         if not text:
             return
 
-        language = (
-            elevenlabs_language_code_to_language(data.get("language_code"))
-            or self._language_override
-        )
+        await self.stop_ttfb_metrics()
+
+        # Get language if provided
+        language = data.get("language_code")
 
-        # Only stop TTFB metrics on first partial
-        if self._ttfb_started:
-            await self.stop_ttfb_metrics()
-            self._ttfb_started = False
+        logger.trace(f"Partial transcript: [{text}]")
 
         await self.push_frame(
             InterimTranscriptionFrame(
@@ -634,166 +786,56 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
             )
         )
 
-    async def _handle_committed_transcript(self, data: Dict[str, Any]):
-        if self._pending_final_message:
-            await self._emit_transcription(self._pending_final_message)
-            self._pending_final_message = None
-            self._waiting_for_timestamps = False
-
-        self._pending_final_message = data
-        self._waiting_for_timestamps = True
-        await self._schedule_pending_final_emit()
-
-    async def _handle_committed_transcript_with_timestamps(self, data: Dict[str, Any]):
-        if self._pending_final_message:
-            merged = {**self._pending_final_message, **data}
-            await self._emit_transcription(merged)
-            await self._clear_pending_final()
-        elif self._waiting_for_timestamps:
-            # Late arrival after timeout - don't emit duplicate
-            self.logger.warning(f"{self} timestamps arrived after timeout, skipping duplicate")
-            self._waiting_for_timestamps = False
-        else:
-            await self._emit_transcription(data)
-
-    async def _schedule_pending_final_emit(self):
-        await self._clear_pending_final(timer_only=True)
-        self._pending_final_task = self.create_task(self._emit_pending_after_delay())
-
-    async def _emit_pending_after_delay(self):
-        try:
-            await asyncio.sleep(self._timestamp_merge_delay_s)
-            if self._pending_final_message:
-                await self._emit_transcription(self._pending_final_message)
-                self._pending_final_message = None
-                self._waiting_for_timestamps = False
-        except asyncio.CancelledError:
-            pass
-        finally:
-            self._pending_final_task = None
-
-    async def _clear_pending_final(self, timer_only: bool = False):
-        if self._pending_final_task:
-            await self.cancel_task(self._pending_final_task)
-            self._pending_final_task = None
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[str] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
 
-        if not timer_only:
-            self._pending_final_message = None
-            self._waiting_for_timestamps = False
+    async def _on_committed_transcript(self, data: dict):
+        """Handle committed transcript (final results).
 
-    async def _emit_transcription(self, data: Dict[str, Any]):
-        text = (data.get("text") or data.get("transcript") or "").strip()
+        Args:
+            data: Committed transcript data.
+        """
+        text = data.get("text", "").strip()
         if not text:
             return
 
-        language = (
-            elevenlabs_language_code_to_language(data.get("language_code"))
-            or self._language_override
-        )
-
-        # TTFB should already be stopped by partial, but guard just in case
-        if self._ttfb_started:
-            await self.stop_ttfb_metrics()
-            self._ttfb_started = False
-
-        frame = TranscriptionFrame(
-            text,
-            self._user_id,
-            time_now_iso8601(),
-            language,
-            result=data,
-        )
-
-        await self.push_frame(frame)
-        await self._handle_transcription(text, True, language)
+        await self.stop_ttfb_metrics()
         await self.stop_processing_metrics()
 
-    async def _send_audio_chunk(self, audio: bytes):
-        if not audio or not self._websocket:
-            return
+        # Get language if provided
+        language = data.get("language_code")
 
-        try:
-            payload = {
-                "message_type": "input_audio_chunk",
-                "audio_base_64": base64.b64encode(audio).decode("ascii"),
-                "commit": False,
-                "sample_rate": self.sample_rate,
-            }
-            await self._websocket.send(json.dumps(payload))
-        except Exception as e:
-            self.logger.error(f"{self} error sending audio chunk: {e}")
-            await self.push_error(ErrorFrame(f"Failed to send audio: {e}"))
-            # Trigger reconnection
-            await self._disconnect()
-            await self._connect()
-
-    async def _send_commit(self):
-        if not self._websocket:
-            return
+        logger.debug(f"Committed transcript: [{text}]")
 
-        try:
-            payload = {
-                "message_type": "input_audio_chunk",
-                "audio_base_64": "",
-                "commit": True,
-                "sample_rate": self.sample_rate,
-            }
-            await self._websocket.send(json.dumps(payload))
-        except Exception as e:
-            self.logger.error(f"{self} error sending commit: {e}")
-            await self.push_error(ErrorFrame(f"Failed to send commit: {e}"))
-            # Trigger reconnection
-            await self._disconnect()
-            await self._connect()
+        await self._handle_transcription(text, True, language)
 
-    def _build_websocket_url(self) -> str:
-        if not self.sample_rate:
-            raise ValueError(
-                "ElevenLabs realtime STT requires a valid sample rate (start() must run first)."
+        await self.push_frame(
+            TranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
             )
+        )
 
-        params = {
-            "model_id": self._model,
-            "encoding": self._encoding or "pcm_16000",
-            "sample_rate": str(self.sample_rate),
-            "commit_strategy": self.commit_strategy,
-        }
+    async def _on_committed_transcript_with_timestamps(self, data: dict):
+        """Handle committed transcript with word-level timestamps.
 
-        language_code = (
-            language_to_elevenlabs_language(self._language_override)
-            if self._language_override
-            else None
-        )
-        if language_code:
-            params["language_code"] = language_code
-
-        if self._params.vad_silence_threshold_secs is not None:
-            params["vad_silence_threshold_secs"] = str(self._params.vad_silence_threshold_secs)
-        if self._params.vad_threshold is not None:
-            params["vad_threshold"] = str(self._params.vad_threshold)
-        if self._params.min_speech_duration_ms is not None:
-            params["min_speech_duration_ms"] = str(self._params.min_speech_duration_ms)
-        if self._params.min_silence_duration_ms is not None:
-            params["min_silence_duration_ms"] = str(self._params.min_silence_duration_ms)
-
-        return f"{self._url}?{urllib.parse.urlencode(params)}"
-
-    def _determine_encoding(self, sample_rate: int) -> str:
-        if not sample_rate:
-            raise ValueError("ElevenLabs realtime STT requires a valid sample rate.")
-
-        supported_rates = {8000, 16000, 22050, 24000, 44100, 48000}
-        if sample_rate not in supported_rates:
-            raise ValueError(
-                f"ElevenLabs realtime STT supports sample rates {sorted(supported_rates)}. "
-                f"Received {sample_rate} Hz."
-            )
-        return f"pcm_{sample_rate}"
+        Args:
+            data: Committed transcript data with timestamps.
+        """
+        text = data.get("text", "").strip()
+        if not text:
+            return
 
-    @traced_stt
-    async def _handle_transcription(
-        self, transcript: str, is_final: bool, language: Optional[Language] = None
-    ):
-        """Handle a transcription result with tracing."""
-        # Metrics are stopped by the caller when needed.
-        return
+        logger.debug(f"Committed transcript with timestamps: [{text}]")
+        logger.trace(f"Timestamps: {data.get('words', [])}")
+
+        # This is sent after the committed_transcript, so we don't need to
+        # push another TranscriptionFrame, but we could use the timestamps
+        # for additional processing if needed in the future
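
Taken together, the stt.py hunks replace the previous realtime implementation with a simpler one: connection options become query-string parameters, commit behavior is modeled by a CommitStrategy enum, and manual commits are driven by Pipecat's VAD. As a reading aid, here is a minimal, hypothetical construction sketch that uses only names introduced in this diff; the surrounding pipeline and transport wiring is omitted, and per the new docstrings, Pipecat's VAD stop_secs should be at least 0.5 seconds when using manual commits.

# Usage sketch (not part of the diff); values shown are illustrative.
from pipecat.services.elevenlabs.stt import (
    CommitStrategy,
    ElevenLabsRealtimeSTTService,
)

stt = ElevenLabsRealtimeSTTService(
    api_key="XI_API_KEY",  # placeholder: your ElevenLabs API key
    model="scribe_v2_realtime",  # the default model id in this diff
    params=ElevenLabsRealtimeSTTService.InputParams(
        language_code="en",  # ISO-639-1; None enables auto-detection
        commit_strategy=CommitStrategy.MANUAL,  # Pipecat's VAD triggers commits
    ),
)

With CommitStrategy.MANUAL, the service sends an empty input_audio_chunk with "commit": true when a UserStoppedSpeakingFrame arrives; with CommitStrategy.VAD, segmentation is delegated to ElevenLabs' server-side VAD via the vad_* query parameters.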