dv-pipecat-ai 0.0.85.dev851__py3-none-any.whl → 0.0.85.dev852__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.85.dev851.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/METADATA +1 -1
- {dv_pipecat_ai-0.0.85.dev851.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/RECORD +6 -6
- pipecat/services/elevenlabs/stt.py +361 -319
- {dv_pipecat_ai-0.0.85.dev851.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev851.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev851.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/top_level.txt +0 -0
{dv_pipecat_ai-0.0.85.dev851.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/RECORD
RENAMED

```diff
@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.
+dv_pipecat_ai-0.0.85.dev852.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
```
```diff
@@ -217,7 +217,7 @@ pipecat/services/deepgram/flux/stt.py,sha256=yCZodrHAOShgYy_GbdviX8iAuh36dBgDL41
 pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
 pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
 pipecat/services/elevenlabs/__init__.py,sha256=cMx5v0HEMh4WetMm5byR9tIjG6_wNVs9UxqWyB3tjlM,313
-pipecat/services/elevenlabs/stt.py,sha256=
+pipecat/services/elevenlabs/stt.py,sha256=dy88MvQdhUQ-SFA7YTBRykZsIozMnnYQaJ4og1RYlVc,30811
 pipecat/services/elevenlabs/tts.py,sha256=skUndgUatx2F5rjg2tBZLutB8k9B9Cjy-cUeglCDdwc,45314
 pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
 pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
```
```diff
@@ -416,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.
-dv_pipecat_ai-0.0.85.
-dv_pipecat_ai-0.0.85.
-dv_pipecat_ai-0.0.85.
+dv_pipecat_ai-0.0.85.dev852.dist-info/METADATA,sha256=L_uFM2KLucwhFvtLcQ9dWL_DQicbrpPyiHOlW81e9LM,32955
+dv_pipecat_ai-0.0.85.dev852.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev852.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev852.dist-info/RECORD,,
```
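Each RECORD entry has the form `path,sha256=<digest>,size`, where the digest is an unpadded URL-safe base64 encoding of the file's SHA-256, per the wheel spec. A minimal sketch of recomputing such a digest; the assertion reuses the zero-byte `pipecat/py.typed` entry above as a sanity check:

```python
import base64
import hashlib


def record_digest(data: bytes) -> str:
    """Compute a wheel RECORD digest: unpadded URL-safe base64 of SHA-256."""
    digest = hashlib.sha256(data).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# pipecat/py.typed has size 0, so its digest is the empty-input hash.
assert record_digest(b"") == "47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU"
```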
pipecat/services/elevenlabs/stt.py
CHANGED

```diff
@@ -4,14 +4,18 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""ElevenLabs speech-to-text service
+"""ElevenLabs speech-to-text service implementation.
+
+This module provides integration with ElevenLabs' Speech-to-Text API for transcription
+using segmented audio processing. The service uploads audio files and receives
+transcription results directly.
+"""
 
-import asyncio
 import base64
 import io
 import json
-import
-from typing import
+from enum import Enum
+from typing import AsyncGenerator, Optional
 
 import aiohttp
 from loguru import logger
```
```diff
@@ -37,9 +41,12 @@ from pipecat.utils.tracing.service_decorators import traced_stt
 try:
     from websockets.asyncio.client import connect as websocket_connect
     from websockets.protocol import State
-except ModuleNotFoundError:
-
-
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use ElevenLabs Realtime STT, you need to `pip install pipecat-ai[elevenlabs]`."
+    )
+    raise Exception(f"Missing module: {e}")
 
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
```
```diff
@@ -173,19 +180,6 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     return result
 
 
-def elevenlabs_language_code_to_language(language_code: Optional[str]) -> Optional[Language]:
-    """Convert an ElevenLabs language code back to a Language enum value."""
-    if not language_code:
-        return None
-
-    normalized = language_code.lower()
-    for language in Language:
-        code = language_to_elevenlabs_language(language)
-        if code and code.lower() == normalized:
-            return language
-    return None
-
-
 class ElevenLabsSTTService(SegmentedSTTService):
     """Speech-to-text service using ElevenLabs' file-based API.
 
```
```diff
@@ -271,7 +265,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         Args:
             language: The language to use for speech-to-text transcription.
         """
-
+        logger.info(f"Switching STT language to: [{language}]")
         self._settings["language"] = self.language_to_service_language(language)
 
     async def set_model(self, model: str):
```
```diff
@@ -285,7 +279,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         This method is provided for interface compatibility.
         """
         await super().set_model(model)
-
+        logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
 
     async def _transcribe_audio(self, audio_data: bytes) -> dict:
         """Upload audio data to ElevenLabs and get transcription result.
```
```diff
@@ -319,7 +313,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         async with self._session.post(url, data=data, headers=headers) as response:
             if response.status != 200:
                 error_text = await response.text()
-
+                logger.error(f"ElevenLabs transcription error: {error_text}")
                 raise Exception(f"Transcription failed with status {response.status}: {error_text}")
 
             result = await response.json()
```
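For context, `_transcribe_audio` wraps a plain multipart upload against ElevenLabs' file-based STT endpoint. A standalone sketch of the same call pattern follows; the endpoint path and form-field names are assumptions based on ElevenLabs' public speech-to-text API, since this hunk only shows the `session.post(...)` call and the error/JSON handling around it:

```python
import aiohttp


async def transcribe(api_key: str, wav_bytes: bytes) -> dict:
    # Hypothetical endpoint and field names, not taken from this diff.
    url = "https://api.elevenlabs.io/v1/speech-to-text"
    data = aiohttp.FormData()
    data.add_field("model_id", "scribe_v1")
    data.add_field("file", wav_bytes, filename="audio.wav", content_type="audio/wav")
    headers = {"xi-api-key": api_key}
    async with aiohttp.ClientSession() as session:
        async with session.post(url, data=data, headers=headers) as response:
            if response.status != 200:
                error_text = await response.text()
                raise Exception(f"Transcription failed with status {response.status}: {error_text}")
            return await response.json()
```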
```diff
@@ -360,7 +354,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
             detected_language = result.get("language_code", "eng")
 
             await self._handle_transcription(text, True, detected_language)
-
+            logger.debug(f"Transcription: [{text}]")
 
             yield TranscriptionFrame(
                 text,
```
```diff
@@ -371,18 +365,86 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
             )
 
         except Exception as e:
-
+            logger.error(f"ElevenLabs STT error: {e}")
             yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
 
 
+def audio_format_from_sample_rate(sample_rate: int) -> str:
+    """Get the appropriate audio format string for a given sample rate.
+
+    Args:
+        sample_rate: The audio sample rate in Hz.
+
+    Returns:
+        The ElevenLabs audio format string.
+    """
+    match sample_rate:
+        case 8000:
+            return "pcm_8000"
+        case 16000:
+            return "pcm_16000"
+        case 22050:
+            return "pcm_22050"
+        case 24000:
+            return "pcm_24000"
+        case 44100:
+            return "pcm_44100"
+        case 48000:
+            return "pcm_48000"
+    logger.warning(
+        f"ElevenLabsRealtimeSTTService: No audio format available for {sample_rate} sample rate, using pcm_16000"
+    )
+    return "pcm_16000"
+
+
+class CommitStrategy(str, Enum):
+    """Commit strategies for transcript segmentation."""
+
+    MANUAL = "manual"
+    VAD = "vad"
+
+
 class ElevenLabsRealtimeSTTService(WebsocketSTTService):
-    """
+    """Speech-to-text service using ElevenLabs' Realtime WebSocket API.
+
+    This service uses ElevenLabs' Realtime Speech-to-Text API to perform transcription
+    with ultra-low latency. It supports both partial (interim) and committed (final)
+    transcripts, and can use either manual commit control or automatic Voice Activity
+    Detection (VAD) for segment boundaries.
+
+    By default, uses manual commit strategy where Pipecat's VAD controls when to
+    commit transcript segments, providing consistency with other STT services.
+
+    Important:
+        When using manual commit strategy with Pipecat's VAD, it is recommended to set
+        the VAD `stop_secs` parameter to at least 0.5 seconds. Lower values may result
+        in incomplete transcriptions due to a known limitation in the ElevenLabs model
+        where audio sent near the commit boundary may not be fully processed.
+    """
 
     class InputParams(BaseModel):
-        """
+        """Configuration parameters for ElevenLabs Realtime STT API.
 
-
-
+        Parameters:
+            language_code: ISO-639-1 or ISO-639-3 language code. Leave None for auto-detection.
+            commit_strategy: How to segment speech - manual (Pipecat VAD) or vad (ElevenLabs VAD).
+            vad_silence_threshold_secs: Seconds of silence before VAD commits (0.3-3.0).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+
+        Note:
+            When using manual commit strategy, ensure Pipecat's VAD `stop_secs` is set to
+            at least 0.5 seconds to avoid incomplete transcriptions. This is a known
+            limitation of the ElevenLabs model.
+        """
+
+        language_code: Optional[str] = None
+        commit_strategy: CommitStrategy = CommitStrategy.MANUAL
         vad_silence_threshold_secs: Optional[float] = None
         vad_threshold: Optional[float] = None
         min_speech_duration_ms: Optional[int] = None
```
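The docstrings above define the two segmentation modes. A minimal construction sketch for each; the VAD analyzer imports come from elsewhere in pipecat and are assumptions here, as is the placeholder API key:

```python
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.services.elevenlabs.stt import CommitStrategy, ElevenLabsRealtimeSTTService

# Manual commits (default): Pipecat's VAD decides the segment boundaries, so
# per the docstring the transport's analyzer should use stop_secs >= 0.5.
vad = SileroVADAnalyzer(params=VADParams(stop_secs=0.5))
stt = ElevenLabsRealtimeSTTService(api_key="YOUR_ELEVENLABS_API_KEY")

# Server-side commits: let ElevenLabs' VAD segment, tuned via InputParams.
stt_server_vad = ElevenLabsRealtimeSTTService(
    api_key="YOUR_ELEVENLABS_API_KEY",
    params=ElevenLabsRealtimeSTTService.InputParams(
        commit_strategy=CommitStrategy.VAD,
        vad_silence_threshold_secs=0.5,  # 0.3-3.0 per the docstring
    ),
)
```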
```diff
@@ -392,237 +454,327 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
         self,
         *,
         api_key: str,
-
+        base_url: str = "api.elevenlabs.io",
         model: str = "scribe_v2_realtime",
-
-        params: Optional[
-        reconnect_on_error: bool = True,
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
         **kwargs,
     ):
-        """Initialize the
+        """Initialize the ElevenLabs Realtime STT service.
 
         Args:
             api_key: ElevenLabs API key for authentication.
-
-            model:
-
-            params:
-
-
+            base_url: Base URL for ElevenLabs WebSocket API.
+            model: Model ID for transcription. Defaults to "scribe_v2_realtime".
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the STT service.
+            **kwargs: Additional arguments passed to WebsocketSTTService.
+
+        Note:
+            When using manual commit strategy (default), configure Pipecat's VAD with
+            `stop_secs` of at least 0.5 seconds to ensure complete transcriptions.
         """
-
-
-
-
-        )
-            raise ModuleNotFoundError("Missing optional dependency: websockets")
+        super().__init__(
+            sample_rate=sample_rate,
+            **kwargs,
+        )
 
-
+        params = params or ElevenLabsRealtimeSTTService.InputParams()
 
         self._api_key = api_key
-        self.
-        self.
-        self.
-        self.
-        self.
-        self._encoding = None
-        self._receive_task: Optional[asyncio.Task] = None
-        self._pending_final_message: Optional[Dict[str, Any]] = None
-        self._pending_final_task: Optional[asyncio.Task] = None
-        self._timestamp_merge_delay_s = 0.25
-        self._ttfb_started = False
-        self._waiting_for_timestamps = False
-
-    @property
-    def commit_strategy(self) -> str:
-        """Return the configured commit strategy (manual or vad)."""
-        return (self._params.commit_strategy or "manual").lower()
+        self._base_url = base_url
+        self._model_id = model
+        self._params = params
+        self._audio_format = ""  # initialized in start()
+        self._receive_task = None
 
     def can_generate_metrics(self) -> bool:
-        """
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs Realtime STT service supports metrics generation.
+        """
         return True
 
+    async def set_language(self, language: Language):
+        """Set the transcription language.
+
+        Args:
+            language: The language to use for speech-to-text transcription.
+
+        Note:
+            Changing language requires reconnecting to the WebSocket.
+        """
+        logger.info(f"Switching STT language to: [{language}]")
+        self._params.language_code = language.value if isinstance(language, Language) else language
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+
+        Note:
+            Changing model requires reconnecting to the WebSocket.
+        """
+        await super().set_model(model)
+        logger.info(f"Switching STT model to: [{model}]")
+        self._model_id = model
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
     async def start(self, frame: StartFrame):
-        """Start the
+        """Start the STT service and establish WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should start.
+        """
         await super().start(frame)
-        self.
+        self._audio_format = audio_format_from_sample_rate(self.sample_rate)
         await self._connect()
 
     async def stop(self, frame: EndFrame):
-        """Stop the
+        """Stop the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should stop.
+        """
         await super().stop(frame)
         await self._disconnect()
 
     async def cancel(self, frame: CancelFrame):
-        """Cancel the
+        """Cancel the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should be cancelled.
+        """
         await super().cancel(frame)
         await self._disconnect()
 
-    async def
-        """
-        self.
-        self.
-        if self._websocket:
-            await self._disconnect()
-            await self._connect()
-
-    async def set_model(self, model: str):
-        """Set the STT model and reconnect the WebSocket."""
-        await super().set_model(model)
-        self._model = model
-        if self._websocket:
-            await self._disconnect()
-            await self._connect()
+    async def start_metrics(self):
+        """Start performance metrics collection for transcription processing."""
+        await self.start_ttfb_metrics()
+        await self.start_processing_metrics()
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
-        """Process frames and handle
+        """Process incoming frames and handle speech events.
+
+        Args:
+            frame: The frame to process.
+            direction: Direction of frame flow in the pipeline.
+        """
         await super().process_frame(frame, direction)
 
         if isinstance(frame, UserStartedSpeakingFrame):
-
-
-            # Start metrics and set flag to True so we can stop them later
-            await self.start_ttfb_metrics()
-            self._ttfb_started = True
-            await self.start_processing_metrics()
+            # Start metrics when user starts speaking
+            await self.start_metrics()
         elif isinstance(frame, UserStoppedSpeakingFrame):
-
-
-
-
+            # Send commit when user stops speaking (manual commit mode)
+            if self._params.commit_strategy == CommitStrategy.MANUAL:
+                if self._websocket and self._websocket.state is State.OPEN:
+                    try:
+                        commit_message = {
+                            "message_type": "input_audio_chunk",
+                            "audio_base_64": "",
+                            "commit": True,
+                            "sample_rate": self.sample_rate,
+                        }
+                        await self._websocket.send(json.dumps(commit_message))
+                        logger.trace("Sent manual commit to ElevenLabs")
+                    except Exception as e:
+                        logger.warning(f"Failed to send commit: {e}")
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """
-        if not audio:
-            yield None
-            return
+        """Process audio data for speech-to-text transcription.
 
-
-
-            yield None
-            return
-
-        await self._send_audio_chunk(audio)
-        yield None
-
-    async def _ensure_connection(self) -> bool:
-        """Ensure WebSocket connection is established and ready.
+        Args:
+            audio: Raw audio bytes to transcribe.
 
-
-
+        Yields:
+            None - transcription results are handled via WebSocket responses.
         """
+        # Reconnect if connection is closed
        if not self._websocket or self._websocket.state is State.CLOSED:
             await self._connect()
-
+
+        if self._websocket and self._websocket.state is State.OPEN:
+            try:
+                # Encode audio as base64
+                audio_base64 = base64.b64encode(audio).decode("utf-8")
+
+                # Send audio chunk
+                message = {
+                    "message_type": "input_audio_chunk",
+                    "audio_base_64": audio_base64,
+                    "commit": False,
+                    "sample_rate": self.sample_rate,
+                }
+                await self._websocket.send(json.dumps(message))
+            except Exception as e:
+                logger.error(f"Error sending audio: {e}")
+                yield ErrorFrame(f"ElevenLabs Realtime STT error: {str(e)}")
+
+        yield None
 
     async def _connect(self):
+        """Establish WebSocket connection to ElevenLabs Realtime STT."""
         await self._connect_websocket()
-
+
+        if self._websocket and not self._receive_task:
             self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
 
     async def _disconnect(self):
+        """Close WebSocket connection and cleanup tasks."""
         if self._receive_task:
             await self.cancel_task(self._receive_task)
             self._receive_task = None
 
-        await self._clear_pending_final()
         await self._disconnect_websocket()
 
     async def _connect_websocket(self):
+        """Connect to ElevenLabs Realtime STT WebSocket endpoint."""
         try:
             if self._websocket and self._websocket.state is State.OPEN:
-                self.logger.debug(f"{self} already connected, skipping reconnection")
                 return
 
-
+            logger.debug("Connecting to ElevenLabs Realtime STT")
+
+            # Build query parameters
+            params = [f"model_id={self._model_id}"]
+
+            if self._params.language_code:
+                params.append(f"language_code={self._params.language_code}")
+
+            params.append(f"encoding={self._audio_format}")
+            params.append(f"sample_rate={self.sample_rate}")
+            params.append(f"commit_strategy={self._params.commit_strategy.value}")
+
+            # Add VAD parameters if using VAD commit strategy and values are specified
+            if self._params.commit_strategy == CommitStrategy.VAD:
+                if self._params.vad_silence_threshold_secs is not None:
+                    params.append(
+                        f"vad_silence_threshold_secs={self._params.vad_silence_threshold_secs}"
+                    )
+                if self._params.vad_threshold is not None:
+                    params.append(f"vad_threshold={self._params.vad_threshold}")
+                if self._params.min_speech_duration_ms is not None:
+                    params.append(f"min_speech_duration_ms={self._params.min_speech_duration_ms}")
+                if self._params.min_silence_duration_ms is not None:
+                    params.append(f"min_silence_duration_ms={self._params.min_silence_duration_ms}")
+
+            ws_url = f"wss://{self._base_url}/v1/speech-to-text/realtime?{'&'.join(params)}"
+
             headers = {"xi-api-key": self._api_key}
-
+
             self._websocket = await websocket_connect(ws_url, additional_headers=headers)
-            self.logger.info(f"{self} successfully connected to ElevenLabs realtime STT")
             await self._call_event_handler("on_connected")
+            logger.debug("Connected to ElevenLabs Realtime STT")
         except Exception as e:
-
-            self.
-            if self._receive_task:
-                await self.cancel_task(self._receive_task)
-                self._receive_task = None
-            # Push error to pipeline so callers know the connection failed
-            await self.push_error(ErrorFrame(f"ElevenLabs connection failed: {e}", fatal=False))
-            await self._call_event_handler("on_connection_error", f"{e}")
+            logger.error(f"{self}: unable to connect to ElevenLabs Realtime STT: {e}")
+            await self.push_error(ErrorFrame(f"Connection error: {str(e)}"))
 
     async def _disconnect_websocket(self):
+        """Disconnect from ElevenLabs Realtime STT WebSocket."""
         try:
-            await self.stop_all_metrics()
             if self._websocket and self._websocket.state is State.OPEN:
-
+                logger.debug("Disconnecting from ElevenLabs Realtime STT")
                 await self._websocket.close()
         except Exception as e:
-
+            logger.error(f"{self} error closing websocket: {e}")
         finally:
             self._websocket = None
             await self._call_event_handler("on_disconnected")
 
-    async def _receive_messages(self):
-        async for message in self._get_websocket():
-            await self._process_event(message)
-
     def _get_websocket(self):
-
-
-
+        """Get the current WebSocket connection.
+
+        Returns:
+            The WebSocket connection.
+
+        Raises:
+            Exception: If WebSocket is not connected.
+        """
+        if self._websocket:
+            return self._websocket
+        raise Exception("Websocket not connected")
 
-    async def
+    async def _process_messages(self):
+        """Process incoming WebSocket messages."""
+        async for message in self._get_websocket():
+            try:
+                data = json.loads(message)
+                await self._process_response(data)
+            except json.JSONDecodeError:
+                logger.warning(f"Received non-JSON message: {message}")
+            except Exception as e:
+                logger.error(f"Error processing message: {e}")
+
+    async def _receive_messages(self):
+        """Continuously receive and process WebSocket messages."""
         try:
-
-        except
-
-
+            await self._process_messages()
+        except Exception as e:
+            logger.warning(f"{self} WebSocket connection closed: {e}")
+            # Connection closed, will reconnect on next audio chunk
+
+    async def _process_response(self, data: dict):
+        """Process a response message from ElevenLabs.
 
+        Args:
+            data: Parsed JSON response data.
+        """
         message_type = data.get("message_type")
 
         if message_type == "session_started":
-
-
+            logger.debug(f"ElevenLabs session started: {data}")
+
+        elif message_type == "partial_transcript":
+            await self._on_partial_transcript(data)
 
-        if message_type == "partial_transcript":
-            await self._emit_partial_transcript(data)
         elif message_type == "committed_transcript":
-            await self.
+            await self._on_committed_transcript(data)
+
         elif message_type == "committed_transcript_with_timestamps":
-            await self.
-
+            await self._on_committed_transcript_with_timestamps(data)
+
+        elif message_type == "input_error":
+            error_msg = data.get("error", "Unknown input error")
+            logger.error(f"ElevenLabs input error: {error_msg}")
+            await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
+
+        elif message_type in [
             "auth_error",
             "quota_exceeded",
             "transcriber_error",
-            "input_error",
             "error",
-
-
-
-
-
-                f"{self} ElevenLabs error - Type: {message_type}, Fatal: {fatal}, Full data: {data}"
-            )
-            await self.push_error(
-                ErrorFrame(f"ElevenLabs realtime error: {description}", fatal=fatal)
-            )
+        ]:
+            error_msg = data.get("error", data.get("message", "Unknown error"))
+            logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
+            await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
+
         else:
-
+            logger.debug(f"Unknown message type: {message_type}")
 
-    async def
-
+    async def _on_partial_transcript(self, data: dict):
+        """Handle partial transcript (interim results).
+
+        Args:
+            data: Partial transcript data.
+        """
+        text = data.get("text", "").strip()
         if not text:
             return
 
-
-
-
-        )
+        await self.stop_ttfb_metrics()
+
+        # Get language if provided
+        language = data.get("language_code")
 
-
-        if self._ttfb_started:
-            await self.stop_ttfb_metrics()
-            self._ttfb_started = False
+        logger.trace(f"Partial transcript: [{text}]")
 
         await self.push_frame(
             InterimTranscriptionFrame(
```
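Put together, the query-string assembly in `_connect_websocket` yields a URL of this shape (sample values assumed, for a 16 kHz stream with the default manual strategy):

```python
# Reproduces the URL-building logic above with illustrative values.
base_url = "api.elevenlabs.io"
params = [
    "model_id=scribe_v2_realtime",
    "encoding=pcm_16000",
    "sample_rate=16000",
    "commit_strategy=manual",
]
ws_url = f"wss://{base_url}/v1/speech-to-text/realtime?{'&'.join(params)}"
# -> wss://api.elevenlabs.io/v1/speech-to-text/realtime?model_id=scribe_v2_realtime&encoding=pcm_16000&sample_rate=16000&commit_strategy=manual
```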
```diff
@@ -634,166 +786,56 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
             )
         )
 
-
-
-
-
-
-
-        self._pending_final_message = data
-        self._waiting_for_timestamps = True
-        await self._schedule_pending_final_emit()
-
-    async def _handle_committed_transcript_with_timestamps(self, data: Dict[str, Any]):
-        if self._pending_final_message:
-            merged = {**self._pending_final_message, **data}
-            await self._emit_transcription(merged)
-            await self._clear_pending_final()
-        elif self._waiting_for_timestamps:
-            # Late arrival after timeout - don't emit duplicate
-            self.logger.warning(f"{self} timestamps arrived after timeout, skipping duplicate")
-            self._waiting_for_timestamps = False
-        else:
-            await self._emit_transcription(data)
-
-    async def _schedule_pending_final_emit(self):
-        await self._clear_pending_final(timer_only=True)
-        self._pending_final_task = self.create_task(self._emit_pending_after_delay())
-
-    async def _emit_pending_after_delay(self):
-        try:
-            await asyncio.sleep(self._timestamp_merge_delay_s)
-            if self._pending_final_message:
-                await self._emit_transcription(self._pending_final_message)
-                self._pending_final_message = None
-                self._waiting_for_timestamps = False
-        except asyncio.CancelledError:
-            pass
-        finally:
-            self._pending_final_task = None
-
-    async def _clear_pending_final(self, timer_only: bool = False):
-        if self._pending_final_task:
-            await self.cancel_task(self._pending_final_task)
-            self._pending_final_task = None
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[str] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
 
-
-
-        self._waiting_for_timestamps = False
+    async def _on_committed_transcript(self, data: dict):
+        """Handle committed transcript (final results).
 
-
-
+        Args:
+            data: Committed transcript data.
+        """
+        text = data.get("text", "").strip()
         if not text:
             return
 
-
-            elevenlabs_language_code_to_language(data.get("language_code"))
-            or self._language_override
-        )
-
-        # TTFB should already be stopped by partial, but guard just in case
-        if self._ttfb_started:
-            await self.stop_ttfb_metrics()
-            self._ttfb_started = False
-
-        frame = TranscriptionFrame(
-            text,
-            self._user_id,
-            time_now_iso8601(),
-            language,
-            result=data,
-        )
-
-        await self.push_frame(frame)
-        await self._handle_transcription(text, True, language)
+        await self.stop_ttfb_metrics()
         await self.stop_processing_metrics()
 
-
-
-            return
+        # Get language if provided
+        language = data.get("language_code")
 
-
-        payload = {
-            "message_type": "input_audio_chunk",
-            "audio_base_64": base64.b64encode(audio).decode("ascii"),
-            "commit": False,
-            "sample_rate": self.sample_rate,
-        }
-        await self._websocket.send(json.dumps(payload))
-    except Exception as e:
-        self.logger.error(f"{self} error sending audio chunk: {e}")
-        await self.push_error(ErrorFrame(f"Failed to send audio: {e}"))
-        # Trigger reconnection
-        await self._disconnect()
-        await self._connect()
-
-    async def _send_commit(self):
-        if not self._websocket:
-            return
+        logger.debug(f"Committed transcript: [{text}]")
 
-
-        payload = {
-            "message_type": "input_audio_chunk",
-            "audio_base_64": "",
-            "commit": True,
-            "sample_rate": self.sample_rate,
-        }
-        await self._websocket.send(json.dumps(payload))
-    except Exception as e:
-        self.logger.error(f"{self} error sending commit: {e}")
-        await self.push_error(ErrorFrame(f"Failed to send commit: {e}"))
-        # Trigger reconnection
-        await self._disconnect()
-        await self._connect()
+        await self._handle_transcription(text, True, language)
 
-
-
-
-
+        await self.push_frame(
+            TranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
             )
+        )
 
-
-
-            "encoding": self._encoding or "pcm_16000",
-            "sample_rate": str(self.sample_rate),
-            "commit_strategy": self.commit_strategy,
-        }
+    async def _on_committed_transcript_with_timestamps(self, data: dict):
+        """Handle committed transcript with word-level timestamps.
 
-
-
-
-
-
-
-            params["language_code"] = language_code
-
-        if self._params.vad_silence_threshold_secs is not None:
-            params["vad_silence_threshold_secs"] = str(self._params.vad_silence_threshold_secs)
-        if self._params.vad_threshold is not None:
-            params["vad_threshold"] = str(self._params.vad_threshold)
-        if self._params.min_speech_duration_ms is not None:
-            params["min_speech_duration_ms"] = str(self._params.min_speech_duration_ms)
-        if self._params.min_silence_duration_ms is not None:
-            params["min_silence_duration_ms"] = str(self._params.min_silence_duration_ms)
-
-        return f"{self._url}?{urllib.parse.urlencode(params)}"
-
-    def _determine_encoding(self, sample_rate: int) -> str:
-        if not sample_rate:
-            raise ValueError("ElevenLabs realtime STT requires a valid sample rate.")
-
-        supported_rates = {8000, 16000, 22050, 24000, 44100, 48000}
-        if sample_rate not in supported_rates:
-            raise ValueError(
-                f"ElevenLabs realtime STT supports sample rates {sorted(supported_rates)}. "
-                f"Received {sample_rate} Hz."
-            )
-        return f"pcm_{sample_rate}"
+        Args:
+            data: Committed transcript data with timestamps.
+        """
+        text = data.get("text", "").strip()
+        if not text:
+            return
 
-
-
-
-
-
-        #
-            return
+        logger.debug(f"Committed transcript with timestamps: [{text}]")
+        logger.trace(f"Timestamps: {data.get('words', [])}")
+
+        # This is sent after the committed_transcript, so we don't need to
+        # push another TranscriptionFrame, but we could use the timestamps
+        # for additional processing if needed in the future
```
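Both directions of commit control reuse one message shape: streaming chunks carry base64 audio with `commit: false`, while the manual commit is an empty chunk with `commit: true`. Illustrative payloads with assumed sample values:

```python
import base64
import json

# A streaming chunk, as built in run_stt (320 bytes ~= 10 ms of 16 kHz 16-bit PCM).
audio_chunk = {
    "message_type": "input_audio_chunk",
    "audio_base_64": base64.b64encode(b"\x00\x00" * 160).decode("utf-8"),
    "commit": False,
    "sample_rate": 16000,
}

# The manual commit sent on UserStoppedSpeakingFrame in process_frame.
commit_message = {
    "message_type": "input_audio_chunk",
    "audio_base_64": "",
    "commit": True,
    "sample_rate": 16000,
}

wire_frame = json.dumps(commit_message)
```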
{dv_pipecat_ai-0.0.85.dev851.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/WHEEL
RENAMED
File without changes

{dv_pipecat_ai-0.0.85.dev851.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/licenses/LICENSE
RENAMED
File without changes

{dv_pipecat_ai-0.0.85.dev851.dist-info → dv_pipecat_ai-0.0.85.dev852.dist-info}/top_level.txt
RENAMED
File without changes