dv-pipecat-ai 0.0.85.dev850__py3-none-any.whl → 0.0.85.dev852__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.85.dev850.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/METADATA +1 -1
- {dv_pipecat_ai-0.0.85.dev850.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/RECORD +6 -6
- pipecat/services/elevenlabs/stt.py +369 -270
- {dv_pipecat_ai-0.0.85.dev850.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev850.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev850.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/top_level.txt +0 -0
{dv_pipecat_ai-0.0.85.dev850.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.dev850.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.85.dev852.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -217,7 +217,7 @@ pipecat/services/deepgram/flux/stt.py,sha256=yCZodrHAOShgYy_GbdviX8iAuh36dBgDL41
 pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
 pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
 pipecat/services/elevenlabs/__init__.py,sha256=cMx5v0HEMh4WetMm5byR9tIjG6_wNVs9UxqWyB3tjlM,313
-pipecat/services/elevenlabs/stt.py,sha256=…
+pipecat/services/elevenlabs/stt.py,sha256=dy88MvQdhUQ-SFA7YTBRykZsIozMnnYQaJ4og1RYlVc,30811
 pipecat/services/elevenlabs/tts.py,sha256=skUndgUatx2F5rjg2tBZLutB8k9B9Cjy-cUeglCDdwc,45314
 pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
 pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
@@ -416,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.dev850.dist-info/METADATA,sha256=…
-dv_pipecat_ai-0.0.85.dev850.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.85.dev850.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.85.dev850.dist-info/RECORD,,
+dv_pipecat_ai-0.0.85.dev852.dist-info/METADATA,sha256=L_uFM2KLucwhFvtLcQ9dWL_DQicbrpPyiHOlW81e9LM,32955
+dv_pipecat_ai-0.0.85.dev852.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev852.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev852.dist-info/RECORD,,
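Note that every dist-info path in RECORD changes with the rename, while the hashes of untouched files (LICENSE, WHEEL, top_level.txt) stay identical; only stt.py and METADATA carry new digests. For readers verifying entries by hand, below is a minimal sketch of how a wheel RECORD line is built (per the wheel spec, the digest is the unpadded URL-safe base64 of the file's SHA-256); the record_entry helper is illustrative only:

import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    """Build a RECORD line: <path>,sha256=<urlsafe-b64 digest, no padding>,<size>."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    return f"{path},sha256={digest.rstrip(b'=').decode('ascii')},{len(data)}"


# e.g. record_entry("pipecat/services/elevenlabs/stt.py") run against the
# extracted dev852 wheel should reproduce the ...dy88MvQdhUQ-...,30811 entry above.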
pipecat/services/elevenlabs/stt.py CHANGED
@@ -4,14 +4,18 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""ElevenLabs speech-to-text service…
+"""ElevenLabs speech-to-text service implementation.
+
+This module provides integration with ElevenLabs' Speech-to-Text API for transcription
+using segmented audio processing. The service uploads audio files and receives
+transcription results directly.
+"""
 
-import asyncio
 import base64
 import io
 import json
-import …
-from typing import …
+from enum import Enum
+from typing import AsyncGenerator, Optional
 
 import aiohttp
 from loguru import logger
@@ -37,9 +41,12 @@ from pipecat.utils.tracing.service_decorators import traced_stt
 try:
     from websockets.asyncio.client import connect as websocket_connect
     from websockets.protocol import State
-except ModuleNotFoundError:
-    …
-    …
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use ElevenLabs Realtime STT, you need to `pip install pipecat-ai[elevenlabs]`."
+    )
+    raise Exception(f"Missing module: {e}")
 
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
@@ -159,27 +166,20 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     result = BASE_LANGUAGES.get(language)
 
     # If not found in base languages, try to find the base language from a variant
+    # For example, Language.EN_US (value "en-US") -> Language("en") -> "eng"
     if not result:
         lang_str = str(language.value)
-        base_code = lang_str.split("-")[0]
-        …
+        base_code = lang_str.split("-")[0]  # Get "en" from "en-US"
+        try:
+            base_language = Language(base_code)
+            result = BASE_LANGUAGES.get(base_language)
+        except (ValueError, KeyError):
+            # If base language not found in Language enum, return None
+            result = None
 
     return result
 
 
-def elevenlabs_language_code_to_language(language_code: Optional[str]) -> Optional[Language]:
-    """Convert an ElevenLabs language code back to a Language enum value."""
-    if not language_code:
-        return None
-
-    normalized = language_code.lower()
-    for language in Language:
-        code = language_to_elevenlabs_language(language)
-        if code and code.lower() == normalized:
-            return language
-    return None
-
-
 class ElevenLabsSTTService(SegmentedSTTService):
     """Speech-to-text service using ElevenLabs' file-based API.
 
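The fallback added above resolves regional variants to a base language before giving up. A self-contained sketch of the same pattern, using a hypothetical two-entry subset of pipecat's real Language and BASE_LANGUAGES tables:

from enum import Enum
from typing import Optional


class Language(str, Enum):
    EN = "en"
    EN_US = "en-US"
    FR = "fr"


BASE_LANGUAGES = {Language.EN: "eng", Language.FR: "fra"}  # hypothetical subset


def to_elevenlabs(language: Language) -> Optional[str]:
    result = BASE_LANGUAGES.get(language)
    if not result:
        base_code = str(language.value).split("-")[0]  # "en-US" -> "en"
        try:
            result = BASE_LANGUAGES.get(Language(base_code))
        except (ValueError, KeyError):  # base code not in the enum
            result = None
    return result


assert to_elevenlabs(Language.EN_US) == "eng"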
@@ -265,7 +265,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         Args:
             language: The language to use for speech-to-text transcription.
         """
-        …
+        logger.info(f"Switching STT language to: [{language}]")
         self._settings["language"] = self.language_to_service_language(language)
 
     async def set_model(self, model: str):
@@ -279,7 +279,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         This method is provided for interface compatibility.
         """
         await super().set_model(model)
-        …
+        logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
 
     async def _transcribe_audio(self, audio_data: bytes) -> dict:
         """Upload audio data to ElevenLabs and get transcription result.
@@ -313,7 +313,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         async with self._session.post(url, data=data, headers=headers) as response:
             if response.status != 200:
                 error_text = await response.text()
-                …
+                logger.error(f"ElevenLabs transcription error: {error_text}")
                 raise Exception(f"Transcription failed with status {response.status}: {error_text}")
 
             result = await response.json()
@@ -354,7 +354,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
             detected_language = result.get("language_code", "eng")
 
             await self._handle_transcription(text, True, detected_language)
-            …
+            logger.debug(f"Transcription: [{text}]")
 
             yield TranscriptionFrame(
                 text,
@@ -365,18 +365,86 @@ class ElevenLabsSTTService(SegmentedSTTService):
             )
 
         except Exception as e:
-            …
+            logger.error(f"ElevenLabs STT error: {e}")
             yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
 
 
+def audio_format_from_sample_rate(sample_rate: int) -> str:
+    """Get the appropriate audio format string for a given sample rate.
+
+    Args:
+        sample_rate: The audio sample rate in Hz.
+
+    Returns:
+        The ElevenLabs audio format string.
+    """
+    match sample_rate:
+        case 8000:
+            return "pcm_8000"
+        case 16000:
+            return "pcm_16000"
+        case 22050:
+            return "pcm_22050"
+        case 24000:
+            return "pcm_24000"
+        case 44100:
+            return "pcm_44100"
+        case 48000:
+            return "pcm_48000"
+    logger.warning(
+        f"ElevenLabsRealtimeSTTService: No audio format available for {sample_rate} sample rate, using pcm_16000"
+    )
+    return "pcm_16000"
+
+
+class CommitStrategy(str, Enum):
+    """Commit strategies for transcript segmentation."""
+
+    MANUAL = "manual"
+    VAD = "vad"
+
+
 class ElevenLabsRealtimeSTTService(WebsocketSTTService):
-    """…
+    """Speech-to-text service using ElevenLabs' Realtime WebSocket API.
+
+    This service uses ElevenLabs' Realtime Speech-to-Text API to perform transcription
+    with ultra-low latency. It supports both partial (interim) and committed (final)
+    transcripts, and can use either manual commit control or automatic Voice Activity
+    Detection (VAD) for segment boundaries.
+
+    By default, uses manual commit strategy where Pipecat's VAD controls when to
+    commit transcript segments, providing consistency with other STT services.
+
+    Important:
+        When using manual commit strategy with Pipecat's VAD, it is recommended to set
+        the VAD `stop_secs` parameter to at least 0.5 seconds. Lower values may result
+        in incomplete transcriptions due to a known limitation in the ElevenLabs model
+        where audio sent near the commit boundary may not be fully processed.
+    """
 
     class InputParams(BaseModel):
-        """…
+        """Configuration parameters for ElevenLabs Realtime STT API.
 
-        …
-        …
+        Parameters:
+            language_code: ISO-639-1 or ISO-639-3 language code. Leave None for auto-detection.
+            commit_strategy: How to segment speech - manual (Pipecat VAD) or vad (ElevenLabs VAD).
+            vad_silence_threshold_secs: Seconds of silence before VAD commits (0.3-3.0).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+
+        Note:
+            When using manual commit strategy, ensure Pipecat's VAD `stop_secs` is set to
+            at least 0.5 seconds to avoid incomplete transcriptions. This is a known
+            limitation of the ElevenLabs model.
+        """
+
+        language_code: Optional[str] = None
+        commit_strategy: CommitStrategy = CommitStrategy.MANUAL
         vad_silence_threshold_secs: Optional[float] = None
         vad_threshold: Optional[float] = None
         min_speech_duration_ms: Optional[int] = None
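As a hedged usage sketch of the new configuration surface (field names, defaults, and ranges are from the docstring above; the values themselves are arbitrary), switching to ElevenLabs-side VAD segmentation might look like:

from pipecat.services.elevenlabs.stt import CommitStrategy, ElevenLabsRealtimeSTTService

params = ElevenLabsRealtimeSTTService.InputParams(
    language_code="en",                  # None enables auto-detection
    commit_strategy=CommitStrategy.VAD,  # default is CommitStrategy.MANUAL
    vad_silence_threshold_secs=0.6,      # allowed range 0.3-3.0
    vad_threshold=0.5,                   # 0.1-0.9, lower is more sensitive
    min_speech_duration_ms=100,          # 50-2000 ms
    min_silence_duration_ms=300,         # 50-2000 ms
)

# audio_format_from_sample_rate maps common rates to ElevenLabs format strings,
# e.g. audio_format_from_sample_rate(44100) -> "pcm_44100"; unsupported rates
# fall back to "pcm_16000" with a warning.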
@@ -386,210 +454,328 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
         self,
         *,
         api_key: str,
+        base_url: str = "api.elevenlabs.io",
         model: str = "scribe_v2_realtime",
-        …
-        params: Optional[…
-        reconnect_on_error: bool = True,
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
         **kwargs,
     ):
-        """Initialize the …
+        """Initialize the ElevenLabs Realtime STT service.
 
         Args:
             api_key: ElevenLabs API key for authentication.
-            …
-            model: …
-            …
-            params: …
-            …
-            …
+            base_url: Base URL for ElevenLabs WebSocket API.
+            model: Model ID for transcription. Defaults to "scribe_v2_realtime".
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the STT service.
+            **kwargs: Additional arguments passed to WebsocketSTTService.
+
+        Note:
+            When using manual commit strategy (default), configure Pipecat's VAD with
+            `stop_secs` of at least 0.5 seconds to ensure complete transcriptions.
         """
-        …
-        …
-        …
-        …
-        )
-        raise ModuleNotFoundError("Missing optional dependency: websockets")
+        super().__init__(
+            sample_rate=sample_rate,
+            **kwargs,
+        )
 
-        …
+        params = params or ElevenLabsRealtimeSTTService.InputParams()
 
         self._api_key = api_key
-        self.…
-        self.…
-        self.…
-        self.…
-        self.…
-        self._encoding = None
-        self._receive_task: Optional[asyncio.Task] = None
-        self._pending_final_message: Optional[Dict[str, Any]] = None
-        self._pending_final_task: Optional[asyncio.Task] = None
-        self._timestamp_merge_delay_s = 0.25
-        self._ttfb_started = False
-
-    @property
-    def commit_strategy(self) -> str:
-        """Return the configured commit strategy (manual or vad)."""
-        return (self._params.commit_strategy or "manual").lower()
+        self._base_url = base_url
+        self._model_id = model
+        self._params = params
+        self._audio_format = ""  # initialized in start()
+        self._receive_task = None
 
     def can_generate_metrics(self) -> bool:
-        """…
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs Realtime STT service supports metrics generation.
+        """
         return True
 
+    async def set_language(self, language: Language):
+        """Set the transcription language.
+
+        Args:
+            language: The language to use for speech-to-text transcription.
+
+        Note:
+            Changing language requires reconnecting to the WebSocket.
+        """
+        logger.info(f"Switching STT language to: [{language}]")
+        self._params.language_code = language.value if isinstance(language, Language) else language
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+
+        Note:
+            Changing model requires reconnecting to the WebSocket.
+        """
+        await super().set_model(model)
+        logger.info(f"Switching STT model to: [{model}]")
+        self._model_id = model
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
     async def start(self, frame: StartFrame):
-        """Start the …
+        """Start the STT service and establish WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should start.
+        """
         await super().start(frame)
-        self.…
+        self._audio_format = audio_format_from_sample_rate(self.sample_rate)
         await self._connect()
 
     async def stop(self, frame: EndFrame):
-        """Stop the …
+        """Stop the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should stop.
+        """
         await super().stop(frame)
         await self._disconnect()
 
     async def cancel(self, frame: CancelFrame):
-        """Cancel the …
+        """Cancel the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should be cancelled.
+        """
         await super().cancel(frame)
         await self._disconnect()
 
-    async def …
-        """…
-        self.…
-        self.…
-        if self._websocket:
-            await self._disconnect()
-            await self._connect()
-
-    async def set_model(self, model: str):
-        """Set the STT model and reconnect the WebSocket."""
-        await super().set_model(model)
-        self._model = model
-        if self._websocket:
-            await self._disconnect()
-            await self._connect()
+    async def start_metrics(self):
+        """Start performance metrics collection for transcription processing."""
+        await self.start_ttfb_metrics()
+        await self.start_processing_metrics()
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
-        """Process frames and handle …
+        """Process incoming frames and handle speech events.
+
+        Args:
+            frame: The frame to process.
+            direction: Direction of frame flow in the pipeline.
+        """
         await super().process_frame(frame, direction)
 
         if isinstance(frame, UserStartedSpeakingFrame):
-            …
-            …
-            self._ttfb_started = False
-            await self.start_processing_metrics()
+            # Start metrics when user starts speaking
+            await self.start_metrics()
         elif isinstance(frame, UserStoppedSpeakingFrame):
-            …
-            …
-            …
-            …
+            # Send commit when user stops speaking (manual commit mode)
+            if self._params.commit_strategy == CommitStrategy.MANUAL:
+                if self._websocket and self._websocket.state is State.OPEN:
+                    try:
+                        commit_message = {
+                            "message_type": "input_audio_chunk",
+                            "audio_base_64": "",
+                            "commit": True,
+                            "sample_rate": self.sample_rate,
+                        }
+                        await self._websocket.send(json.dumps(commit_message))
+                        logger.trace("Sent manual commit to ElevenLabs")
+                    except Exception as e:
+                        logger.warning(f"Failed to send commit: {e}")
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """…
-        if not audio:
-            yield None
-            return
+        """Process audio data for speech-to-text transcription.
 
-        …
-        …
-        yield None
+        Args:
+            audio: Raw audio bytes to transcribe.
 
-        …
+        Yields:
+            None - transcription results are handled via WebSocket responses.
+        """
+        # Reconnect if connection is closed
         if not self._websocket or self._websocket.state is State.CLOSED:
             await self._connect()
 
+        if self._websocket and self._websocket.state is State.OPEN:
+            try:
+                # Encode audio as base64
+                audio_base64 = base64.b64encode(audio).decode("utf-8")
+
+                # Send audio chunk
+                message = {
+                    "message_type": "input_audio_chunk",
+                    "audio_base_64": audio_base64,
+                    "commit": False,
+                    "sample_rate": self.sample_rate,
+                }
+                await self._websocket.send(json.dumps(message))
+            except Exception as e:
+                logger.error(f"Error sending audio: {e}")
+                yield ErrorFrame(f"ElevenLabs Realtime STT error: {str(e)}")
+
+        yield None
+
     async def _connect(self):
+        """Establish WebSocket connection to ElevenLabs Realtime STT."""
         await self._connect_websocket()
+
         if self._websocket and not self._receive_task:
-            self._receive_task = …
+            self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
 
     async def _disconnect(self):
+        """Close WebSocket connection and cleanup tasks."""
         if self._receive_task:
            await self.cancel_task(self._receive_task)
            self._receive_task = None
 
-        await self._clear_pending_final()
         await self._disconnect_websocket()
 
     async def _connect_websocket(self):
+        """Connect to ElevenLabs Realtime STT WebSocket endpoint."""
         try:
             if self._websocket and self._websocket.state is State.OPEN:
                 return
 
-            …
+            logger.debug("Connecting to ElevenLabs Realtime STT")
+
+            # Build query parameters
+            params = [f"model_id={self._model_id}"]
+
+            if self._params.language_code:
+                params.append(f"language_code={self._params.language_code}")
+
+            params.append(f"encoding={self._audio_format}")
+            params.append(f"sample_rate={self.sample_rate}")
+            params.append(f"commit_strategy={self._params.commit_strategy.value}")
+
+            # Add VAD parameters if using VAD commit strategy and values are specified
+            if self._params.commit_strategy == CommitStrategy.VAD:
+                if self._params.vad_silence_threshold_secs is not None:
+                    params.append(
+                        f"vad_silence_threshold_secs={self._params.vad_silence_threshold_secs}"
+                    )
+                if self._params.vad_threshold is not None:
+                    params.append(f"vad_threshold={self._params.vad_threshold}")
+                if self._params.min_speech_duration_ms is not None:
+                    params.append(f"min_speech_duration_ms={self._params.min_speech_duration_ms}")
+                if self._params.min_silence_duration_ms is not None:
+                    params.append(f"min_silence_duration_ms={self._params.min_silence_duration_ms}")
+
+            ws_url = f"wss://{self._base_url}/v1/speech-to-text/realtime?{'&'.join(params)}"
+
             headers = {"xi-api-key": self._api_key}
-            …
+
             self._websocket = await websocket_connect(ws_url, additional_headers=headers)
             await self._call_event_handler("on_connected")
+            logger.debug("Connected to ElevenLabs Realtime STT")
         except Exception as e:
-            …
-            self.…
-            await self._call_event_handler("on_connection_error", f"{e}")
+            logger.error(f"{self}: unable to connect to ElevenLabs Realtime STT: {e}")
+            await self.push_error(ErrorFrame(f"Connection error: {str(e)}"))
 
     async def _disconnect_websocket(self):
+        """Disconnect from ElevenLabs Realtime STT WebSocket."""
         try:
-            await self.stop_all_metrics()
             if self._websocket and self._websocket.state is State.OPEN:
-                …
+                logger.debug("Disconnecting from ElevenLabs Realtime STT")
                 await self._websocket.close()
         except Exception as e:
-            …
+            logger.error(f"{self} error closing websocket: {e}")
         finally:
             self._websocket = None
             await self._call_event_handler("on_disconnected")
 
-    async def _receive_messages(self):
-        async for message in self._get_websocket():
-            await self._process_event(message)
-
     def _get_websocket(self):
-        …
-        …
-        …
+        """Get the current WebSocket connection.
+
+        Returns:
+            The WebSocket connection.
+
+        Raises:
+            Exception: If WebSocket is not connected.
+        """
+        if self._websocket:
+            return self._websocket
+        raise Exception("Websocket not connected")
 
-    async def …
+    async def _process_messages(self):
+        """Process incoming WebSocket messages."""
+        async for message in self._get_websocket():
+            try:
+                data = json.loads(message)
+                await self._process_response(data)
+            except json.JSONDecodeError:
+                logger.warning(f"Received non-JSON message: {message}")
+            except Exception as e:
+                logger.error(f"Error processing message: {e}")
+
+    async def _receive_messages(self):
+        """Continuously receive and process WebSocket messages."""
         try:
-            …
-        except …
-            …
-            …
+            await self._process_messages()
+        except Exception as e:
+            logger.warning(f"{self} WebSocket connection closed: {e}")
+            # Connection closed, will reconnect on next audio chunk
+
+    async def _process_response(self, data: dict):
+        """Process a response message from ElevenLabs.
 
+        Args:
+            data: Parsed JSON response data.
+        """
         message_type = data.get("message_type")
 
         if message_type == "session_started":
-            …
-            …
+            logger.debug(f"ElevenLabs session started: {data}")
+
+        elif message_type == "partial_transcript":
+            await self._on_partial_transcript(data)
 
-        if message_type == "partial_transcript":
-            await self._emit_partial_transcript(data)
         elif message_type == "committed_transcript":
-            await self.…
+            await self._on_committed_transcript(data)
+
         elif message_type == "committed_transcript_with_timestamps":
-            await self.…
-            …
+            await self._on_committed_transcript_with_timestamps(data)
+
+        elif message_type == "input_error":
+            error_msg = data.get("error", "Unknown input error")
+            logger.error(f"ElevenLabs input error: {error_msg}")
+            await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
+
+        elif message_type in [
             "auth_error",
             "quota_exceeded",
             "transcriber_error",
-            "input_error",
             "error",
-            …
-            …
-            …
-            await self.push_error(…
-            …
-            )
+        ]:
+            error_msg = data.get("error", data.get("message", "Unknown error"))
+            logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
+            await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
+
         else:
-            …
+            logger.debug(f"Unknown message type: {message_type}")
+
+    async def _on_partial_transcript(self, data: dict):
+        """Handle partial transcript (interim results).
 
-        …
-        …
+        Args:
+            data: Partial transcript data.
+        """
+        text = data.get("text", "").strip()
         if not text:
             return
 
-        language = (
-            elevenlabs_language_code_to_language(data.get("language_code"))
-            or self._language_override
-        )
         await self.stop_ttfb_metrics()
 
+        # Get language if provided
+        language = data.get("language_code")
+
+        logger.trace(f"Partial transcript: [{text}]")
+
         await self.push_frame(
             InterimTranscriptionFrame(
                 text,
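For reference, the wire format produced by the new _connect_websocket, run_stt, and manual-commit paths looks roughly like the sketch below (illustrative values; the JSON keys and query parameters are taken from the diff above):

import base64
import json

sample_rate = 16000
ws_url = (
    "wss://api.elevenlabs.io/v1/speech-to-text/realtime"
    "?model_id=scribe_v2_realtime"
    "&encoding=pcm_16000"
    f"&sample_rate={sample_rate}"
    "&commit_strategy=manual"
)

# Each audio frame goes out as an "input_audio_chunk" message:
chunk = {
    "message_type": "input_audio_chunk",
    "audio_base_64": base64.b64encode(b"\x00\x00" * 160).decode("utf-8"),
    "commit": False,
    "sample_rate": sample_rate,
}

# In manual mode, UserStoppedSpeakingFrame triggers an empty committing chunk:
commit = {**chunk, "audio_base_64": "", "commit": True}
payload = json.dumps(commit)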
@@ -600,143 +786,56 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
             )
         )
 
-        …
-        …
-        …
-        …
-        …
-        …
-        await self._schedule_pending_final_emit()
-
-    async def _handle_committed_transcript_with_timestamps(self, data: Dict[str, Any]):
-        if self._pending_final_message:
-            merged = {**self._pending_final_message, **data}
-            await self._emit_transcription(merged)
-            await self._clear_pending_final()
-        else:
-            await self._emit_transcription(data)
-
-    async def _schedule_pending_final_emit(self):
-        await self._clear_pending_final(timer_only=True)
-        self._pending_final_task = asyncio.create_task(self._emit_pending_after_delay())
-
-    async def _emit_pending_after_delay(self):
-        try:
-            await asyncio.sleep(self._timestamp_merge_delay_s)
-            if self._pending_final_message:
-                await self._emit_transcription(self._pending_final_message)
-                self._pending_final_message = None
-        except asyncio.CancelledError:
-            pass
-        finally:
-            self._pending_final_task = None
-
-    async def _clear_pending_final(self, timer_only: bool = False):
-        if self._pending_final_task:
-            await self.cancel_task(self._pending_final_task)
-            self._pending_final_task = None
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[str] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
 
-        …
-        …
+    async def _on_committed_transcript(self, data: dict):
+        """Handle committed transcript (final results).
 
-        …
-        …
+        Args:
+            data: Committed transcript data.
+        """
+        text = data.get("text", "").strip()
         if not text:
             return
 
-        language = (
-            elevenlabs_language_code_to_language(data.get("language_code"))
-            or self._language_override
-        )
         await self.stop_ttfb_metrics()
-
-        frame = TranscriptionFrame(
-            text,
-            self._user_id,
-            time_now_iso8601(),
-            language,
-            result=data,
-        )
-
-        await self.push_frame(frame)
-        await self._handle_transcription(text, True, language)
         await self.stop_processing_metrics()
 
-        …
-        …
-        return
+        # Get language if provided
+        language = data.get("language_code")
 
-        …
-        await self.start_ttfb_metrics()
-        self._ttfb_started = True
-
-        payload = {
-            "message_type": "input_audio_chunk",
-            "audio_base_64": base64.b64encode(audio).decode("ascii"),
-            "commit": False,
-            "sample_rate": self.sample_rate,
-        }
-        await self._websocket.send(json.dumps(payload))
+        logger.debug(f"Committed transcript: [{text}]")
 
-        …
-        if not self._websocket:
-            return
-        payload = {
-            "message_type": "input_audio_chunk",
-            "audio_base_64": "",
-            "commit": True,
-            "sample_rate": self.sample_rate,
-        }
-        await self._websocket.send(json.dumps(payload))
+        await self._handle_transcription(text, True, language)
 
-        …
-        …
-        …
-        …
+        await self.push_frame(
+            TranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
             )
+        )
 
-        …
-        …
-            "encoding": self._encoding or "pcm_16000",
-            "sample_rate": str(self.sample_rate),
-            "commit_strategy": self.commit_strategy,
-        }
+    async def _on_committed_transcript_with_timestamps(self, data: dict):
+        """Handle committed transcript with word-level timestamps.
 
-        …
-        …
-        …
-        …
-        …
-        …
-        params["language_code"] = language_code
-
-        if self._params.vad_silence_threshold_secs is not None:
-            params["vad_silence_threshold_secs"] = str(self._params.vad_silence_threshold_secs)
-        if self._params.vad_threshold is not None:
-            params["vad_threshold"] = str(self._params.vad_threshold)
-        if self._params.min_speech_duration_ms is not None:
-            params["min_speech_duration_ms"] = str(self._params.min_speech_duration_ms)
-        if self._params.min_silence_duration_ms is not None:
-            params["min_silence_duration_ms"] = str(self._params.min_silence_duration_ms)
-
-        return f"{self._url}?{urllib.parse.urlencode(params)}"
-
-    def _determine_encoding(self, sample_rate: int) -> str:
-        if not sample_rate:
-            raise ValueError("ElevenLabs realtime STT requires a valid sample rate.")
-
-        supported_rates = {8000, 16000, 22050, 24000, 44100, 48000}
-        if sample_rate not in supported_rates:
-            raise ValueError(
-                f"ElevenLabs realtime STT supports sample rates {sorted(supported_rates)}. "
-                f"Received {sample_rate} Hz."
-            )
-        return f"pcm_{sample_rate}"
+        Args:
+            data: Committed transcript data with timestamps.
+        """
+        text = data.get("text", "").strip()
+        if not text:
+            return
 
-        …
-        …
-        …
-        …
-        …
-        #…
-        return
+        logger.debug(f"Committed transcript with timestamps: [{text}]")
+        logger.trace(f"Timestamps: {data.get('words', [])}")
+
+        # This is sent after the committed_transcript, so we don't need to
+        # push another TranscriptionFrame, but we could use the timestamps
+        # for additional processing if needed in the future
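To close, a minimal wiring sketch for the new realtime service. Everything outside the ElevenLabsRealtimeSTTService arguments is assumed typical pipecat usage (the Silero VAD import path and VADParams come from pipecat's public API, not this diff); stop_secs=0.5 follows the recommendation in the new docstrings:

import os

from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.services.elevenlabs.stt import ElevenLabsRealtimeSTTService

stt = ElevenLabsRealtimeSTTService(
    api_key=os.environ["ELEVENLABS_API_KEY"],
    model="scribe_v2_realtime",  # default model per this diff
    sample_rate=16000,           # optional; otherwise the pipeline rate is used
)

# Manual commit strategy (the default) relies on Pipecat's VAD; keep
# stop_secs >= 0.5 so trailing audio is fully transcribed.
vad = SileroVADAnalyzer(params=VADParams(stop_secs=0.5))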
{dv_pipecat_ai-0.0.85.dev850.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/WHEEL RENAMED
File without changes

{dv_pipecat_ai-0.0.85.dev850.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/licenses/LICENSE RENAMED
File without changes

{dv_pipecat_ai-0.0.85.dev850.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/top_level.txt RENAMED
File without changes