dv-pipecat-ai 0.0.85.dev818__py3-none-any.whl → 0.0.85.dev858__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/METADATA +2 -1
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/RECORD +32 -29
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +5 -1
- pipecat/frames/frames.py +34 -0
- pipecat/metrics/connection_metrics.py +45 -0
- pipecat/processors/aggregators/llm_response.py +25 -4
- pipecat/processors/dtmf_aggregator.py +17 -21
- pipecat/processors/frame_processor.py +51 -8
- pipecat/processors/metrics/frame_processor_metrics.py +108 -0
- pipecat/processors/transcript_processor.py +22 -1
- pipecat/serializers/__init__.py +2 -0
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +2 -2
- pipecat/serializers/custom.py +2 -2
- pipecat/serializers/vi.py +326 -0
- pipecat/services/cartesia/tts.py +75 -10
- pipecat/services/deepgram/stt.py +317 -17
- pipecat/services/elevenlabs/stt.py +487 -19
- pipecat/services/elevenlabs/tts.py +28 -4
- pipecat/services/google/llm.py +26 -11
- pipecat/services/openai/base_llm.py +79 -14
- pipecat/services/salesforce/llm.py +321 -86
- pipecat/services/sarvam/tts.py +0 -1
- pipecat/services/soniox/stt.py +45 -10
- pipecat/services/vistaar/llm.py +97 -6
- pipecat/transcriptions/language.py +50 -0
- pipecat/transports/base_input.py +15 -11
- pipecat/transports/base_output.py +29 -3
- pipecat/utils/redis.py +58 -0
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev818.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/top_level.txt +0 -0
pipecat/services/elevenlabs/stt.py
CHANGED

@@ -11,19 +11,43 @@ using segmented audio processing. The service uploads audio files and receives
 transcription results directly.
 """

+import base64
 import io
+import json
+from enum import Enum
 from typing import AsyncGenerator, Optional

 import aiohttp
 from loguru import logger
 from pydantic import BaseModel

-from pipecat.frames.frames import
-
-
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import SegmentedSTTService, WebsocketSTTService
+from pipecat.transcriptions.language import Language, resolve_language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt

+try:
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use ElevenLabs Realtime STT, you need to `pip install pipecat-ai[elevenlabs]`."
+    )
+    raise Exception(f"Missing module: {e}")
+

 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     """Convert a Language enum to ElevenLabs language code.
@@ -37,7 +61,7 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     Returns:
         The corresponding ElevenLabs language code, or None if not supported.
     """
-
+    LANGUAGE_MAP = {
         Language.AF: "afr",  # Afrikaans
         Language.AM: "amh",  # Amharic
         Language.AR: "ara",  # Arabic
@@ -139,15 +163,7 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
         Language.ZU: "zul",  # Zulu
     }

-
-
-    # If not found in base languages, try to find the base language from a variant
-    if not result:
-        lang_str = str(language.value)
-        base_code = lang_str.split("-")[0].lower()
-        result = base_code if base_code in BASE_LANGUAGES.values() else None
-
-    return result
+    return resolve_language(language, LANGUAGE_MAP, use_base_code=False)


 class ElevenLabsSTTService(SegmentedSTTService):
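The region-variant fallback removed above now lives in the shared resolve_language helper added to pipecat/transcriptions/language.py in this release (+50 lines). A minimal sketch of what such a helper could look like, reconstructed from the removed code and the call site; the real implementation and the exact meaning of use_base_code may differ:

# Illustrative sketch only; not the actual pipecat implementation.
from typing import Dict, Optional


def resolve_language(language, language_map: Dict, use_base_code: bool = True) -> Optional[str]:
    """Look up a service-specific language code, optionally falling back to the base code."""
    result = language_map.get(language)
    if result is None and use_base_code:
        # Mirror the removed per-service logic: strip a regional suffix
        # ("en-US" -> "en") and accept it only if the map already uses that code.
        base_code = str(language.value).split("-")[0].lower()
        result = base_code if base_code in language_map.values() else None
    return result

The ElevenLabs call passes use_base_code=False, which is consistent with its map values being three-letter ISO 639-3 codes that a simple prefix split would never match.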
@@ -235,7 +251,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
         Args:
             language: The language to use for speech-to-text transcription.
         """
-
+        logger.info(f"Switching STT language to: [{language}]")
         self._settings["language"] = self.language_to_service_language(language)

     async def set_model(self, model: str):
@@ -249,7 +265,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
        This method is provided for interface compatibility.
        """
        await super().set_model(model)
-
+        logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")

     async def _transcribe_audio(self, audio_data: bytes) -> dict:
         """Upload audio data to ElevenLabs and get transcription result.
@@ -283,7 +299,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
            async with self._session.post(url, data=data, headers=headers) as response:
                if response.status != 200:
                    error_text = await response.text()
-
+                    logger.error(f"ElevenLabs transcription error: {error_text}")
                    raise Exception(f"Transcription failed with status {response.status}: {error_text}")

                result = await response.json()
@@ -324,7 +340,7 @@ class ElevenLabsSTTService(SegmentedSTTService):
            detected_language = result.get("language_code", "eng")

            await self._handle_transcription(text, True, detected_language)
-
+            logger.debug(f"Transcription: [{text}]")

            yield TranscriptionFrame(
                text,
@@ -335,5 +351,457 @@ class ElevenLabsSTTService(SegmentedSTTService):
            )

        except Exception as e:
-
-            yield ErrorFrame(f"
+            logger.error(f"{self} exception: {e}")
+            yield ErrorFrame(error=f"{self} error: {e}")
+
+
+def audio_format_from_sample_rate(sample_rate: int) -> str:
+    """Get the appropriate audio format string for a given sample rate.
+
+    Args:
+        sample_rate: The audio sample rate in Hz.
+
+    Returns:
+        The ElevenLabs audio format string.
+    """
+    match sample_rate:
+        case 8000:
+            return "pcm_8000"
+        case 16000:
+            return "pcm_16000"
+        case 22050:
+            return "pcm_22050"
+        case 24000:
+            return "pcm_24000"
+        case 44100:
+            return "pcm_44100"
+        case 48000:
+            return "pcm_48000"
+    logger.warning(
+        f"ElevenLabsRealtimeSTTService: No audio format available for {sample_rate} sample rate, using pcm_16000"
+    )
+    return "pcm_16000"
+
+
+class CommitStrategy(str, Enum):
+    """Commit strategies for transcript segmentation."""
+
+    MANUAL = "manual"
+    VAD = "vad"
+
+
+class ElevenLabsRealtimeSTTService(WebsocketSTTService):
+    """Speech-to-text service using ElevenLabs' Realtime WebSocket API.
+
+    This service uses ElevenLabs' Realtime Speech-to-Text API to perform transcription
+    with ultra-low latency. It supports both partial (interim) and committed (final)
+    transcripts, and can use either manual commit control or automatic Voice Activity
+    Detection (VAD) for segment boundaries.
+
+    By default, uses manual commit strategy where Pipecat's VAD controls when to
+    commit transcript segments, providing consistency with other STT services.
+    """
+
+    class InputParams(BaseModel):
+        """Configuration parameters for ElevenLabs Realtime STT API.
+
+        Parameters:
+            language_code: ISO-639-1 or ISO-639-3 language code. Leave None for auto-detection.
+            commit_strategy: How to segment speech - manual (Pipecat VAD) or vad (ElevenLabs VAD).
+            vad_silence_threshold_secs: Seconds of silence before VAD commits (0.3-3.0).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            vad_threshold: VAD sensitivity (0.1-0.9, lower is more sensitive).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_speech_duration_ms: Minimum speech duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
+                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+        """
+
+        language_code: Optional[str] = None
+        commit_strategy: CommitStrategy = CommitStrategy.MANUAL
+        vad_silence_threshold_secs: Optional[float] = None
+        vad_threshold: Optional[float] = None
+        min_speech_duration_ms: Optional[int] = None
+        min_silence_duration_ms: Optional[int] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        base_url: str = "api.elevenlabs.io",
+        model: str = "scribe_v2_realtime",
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
+        **kwargs,
+    ):
+        """Initialize the ElevenLabs Realtime STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            base_url: Base URL for ElevenLabs WebSocket API.
+            model: Model ID for transcription. Defaults to "scribe_v2_realtime".
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the STT service.
+            **kwargs: Additional arguments passed to WebsocketSTTService.
+        """
+        super().__init__(
+            sample_rate=sample_rate,
+            **kwargs,
+        )
+
+        params = params or ElevenLabsRealtimeSTTService.InputParams()
+
+        self._api_key = api_key
+        self._base_url = base_url
+        self._model_id = model
+        self._params = params
+        self._audio_format = ""  # initialized in start()
+        self._receive_task = None
+
+    def can_generate_metrics(self) -> bool:
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs Realtime STT service supports metrics generation.
+        """
+        return True
+
+    async def set_language(self, language: Language):
+        """Set the transcription language.
+
+        Args:
+            language: The language to use for speech-to-text transcription.
+
+        Note:
+            Changing language requires reconnecting to the WebSocket.
+        """
+        logger.info(f"Switching STT language to: [{language}]")
+        self._params.language_code = language.value if isinstance(language, Language) else language
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+
+        Note:
+            Changing model requires reconnecting to the WebSocket.
+        """
+        await super().set_model(model)
+        logger.info(f"Switching STT model to: [{model}]")
+        self._model_id = model
+        # Reconnect with new settings
+        await self._disconnect()
+        await self._connect()
+
+    async def start(self, frame: StartFrame):
+        """Start the STT service and establish WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should start.
+        """
+        await super().start(frame)
+        self._audio_format = audio_format_from_sample_rate(self.sample_rate)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should stop.
+        """
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the STT service and close WebSocket connection.
+
+        Args:
+            frame: Frame indicating service should be cancelled.
+        """
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def start_metrics(self):
+        """Start performance metrics collection for transcription processing."""
+        await self.start_ttfb_metrics()
+        await self.start_processing_metrics()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and handle speech events.
+
+        Args:
+            frame: The frame to process.
+            direction: Direction of frame flow in the pipeline.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStartedSpeakingFrame):
+            # Start metrics when user starts speaking
+            await self.start_metrics()
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            # Send commit when user stops speaking (manual commit mode)
+            if self._params.commit_strategy == CommitStrategy.MANUAL:
+                if self._websocket and self._websocket.state is State.OPEN:
+                    try:
+                        commit_message = {
+                            "message_type": "input_audio_chunk",
+                            "audio_base_64": "",
+                            "commit": True,
+                            "sample_rate": self.sample_rate,
+                        }
+                        await self._websocket.send(json.dumps(commit_message))
+                        logger.trace("Sent manual commit to ElevenLabs")
+                    except Exception as e:
+                        logger.warning(f"Failed to send commit: {e}")
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Process audio data for speech-to-text transcription.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            None - transcription results are handled via WebSocket responses.
+        """
+        # Reconnect if connection is closed
+        if not self._websocket or self._websocket.state is State.CLOSED:
+            await self._connect()
+
+        if self._websocket and self._websocket.state is State.OPEN:
+            try:
+                # Encode audio as base64
+                audio_base64 = base64.b64encode(audio).decode("utf-8")
+
+                # Send audio chunk
+                message = {
+                    "message_type": "input_audio_chunk",
+                    "audio_base_64": audio_base64,
+                    "commit": False,
+                    "sample_rate": self.sample_rate,
+                }
+                await self._websocket.send(json.dumps(message))
+            except Exception as e:
+                logger.error(f"Error sending audio: {e}")
+                yield ErrorFrame(f"ElevenLabs Realtime STT error: {str(e)}")

+        yield None
+
+    async def _connect(self):
+        """Establish WebSocket connection to ElevenLabs Realtime STT."""
+        await self._connect_websocket()
+
+        if self._websocket and not self._receive_task:
+            self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
+
+    async def _disconnect(self):
+        """Close WebSocket connection and cleanup tasks."""
+        if self._receive_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+
+        await self._disconnect_websocket()
+
+    async def _connect_websocket(self):
+        """Connect to ElevenLabs Realtime STT WebSocket endpoint."""
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                return
+
+            logger.debug("Connecting to ElevenLabs Realtime STT")
+
+            # Build query parameters
+            params = [f"model_id={self._model_id}"]
+
+            if self._params.language_code:
+                params.append(f"language_code={self._params.language_code}")
+
+            params.append(f"encoding={self._audio_format}")
+            params.append(f"sample_rate={self.sample_rate}")
+            params.append(f"commit_strategy={self._params.commit_strategy.value}")
+
+            # Add VAD parameters if using VAD commit strategy and values are specified
+            if self._params.commit_strategy == CommitStrategy.VAD:
+                if self._params.vad_silence_threshold_secs is not None:
+                    params.append(
+                        f"vad_silence_threshold_secs={self._params.vad_silence_threshold_secs}"
+                    )
+                if self._params.vad_threshold is not None:
+                    params.append(f"vad_threshold={self._params.vad_threshold}")
+                if self._params.min_speech_duration_ms is not None:
+                    params.append(f"min_speech_duration_ms={self._params.min_speech_duration_ms}")
+                if self._params.min_silence_duration_ms is not None:
+                    params.append(f"min_silence_duration_ms={self._params.min_silence_duration_ms}")
+
+            ws_url = f"wss://{self._base_url}/v1/speech-to-text/realtime?{'&'.join(params)}"
+
+            headers = {"xi-api-key": self._api_key}
+
+            self._websocket = await websocket_connect(ws_url, additional_headers=headers)
+            await self._call_event_handler("on_connected")
+            logger.debug("Connected to ElevenLabs Realtime STT")
+        except Exception as e:
+            logger.error(f"{self}: unable to connect to ElevenLabs Realtime STT: {e}")
+            await self.push_error(ErrorFrame(f"Connection error: {str(e)}"))
+
+    async def _disconnect_websocket(self):
+        """Disconnect from ElevenLabs Realtime STT WebSocket."""
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                logger.debug("Disconnecting from ElevenLabs Realtime STT")
+                await self._websocket.close()
+        except Exception as e:
+            logger.error(f"{self} error closing websocket: {e}")
+        finally:
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
+
+    def _get_websocket(self):
+        """Get the current WebSocket connection.
+
+        Returns:
+            The WebSocket connection.
+
+        Raises:
+            Exception: If WebSocket is not connected.
+        """
+        if self._websocket:
+            return self._websocket
+        raise Exception("Websocket not connected")
+
+    async def _process_messages(self):
+        """Process incoming WebSocket messages."""
+        async for message in self._get_websocket():
+            try:
+                data = json.loads(message)
+                await self._process_response(data)
+            except json.JSONDecodeError:
+                logger.warning(f"Received non-JSON message: {message}")
+            except Exception as e:
+                logger.error(f"Error processing message: {e}")
+
+    async def _receive_messages(self):
+        """Continuously receive and process WebSocket messages."""
+        try:
+            await self._process_messages()
+        except Exception as e:
+            logger.warning(f"{self} WebSocket connection closed: {e}")
+            # Connection closed, will reconnect on next audio chunk
+
+    async def _process_response(self, data: dict):
+        """Process a response message from ElevenLabs.
+
+        Args:
+            data: Parsed JSON response data.
+        """
+        message_type = data.get("message_type")
+
+        if message_type == "session_started":
+            logger.debug(f"ElevenLabs session started: {data}")
+
+        elif message_type == "partial_transcript":
+            await self._on_partial_transcript(data)
+
+        elif message_type == "committed_transcript":
+            await self._on_committed_transcript(data)
+
+        elif message_type == "committed_transcript_with_timestamps":
+            await self._on_committed_transcript_with_timestamps(data)
+
+        elif message_type == "input_error":
+            error_msg = data.get("error", "Unknown input error")
+            logger.error(f"ElevenLabs input error: {error_msg}")
+            await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
+
+        elif message_type in ["auth_error", "quota_exceeded", "transcriber_error", "error"]:
+            error_msg = data.get("error", data.get("message", "Unknown error"))
+            logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
+            await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
+
+        else:
+            logger.debug(f"Unknown message type: {message_type}")
+
+    async def _on_partial_transcript(self, data: dict):
+        """Handle partial transcript (interim results).
+
+        Args:
+            data: Partial transcript data.
+        """
+        text = data.get("text", "").strip()
+        if not text:
+            return
+
+        await self.stop_ttfb_metrics()
+
+        # Get language if provided
+        language = data.get("language_code")
+
+        logger.trace(f"Partial transcript: [{text}]")
+
+        await self.push_frame(
+            InterimTranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[str] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
+
+    async def _on_committed_transcript(self, data: dict):
+        """Handle committed transcript (final results).
+
+        Args:
+            data: Committed transcript data.
+        """
+        text = data.get("text", "").strip()
+        if not text:
+            return
+
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()
+
+        # Get language if provided
+        language = data.get("language_code")
+
+        logger.debug(f"Committed transcript: [{text}]")
+
+        await self._handle_transcription(text, True, language)
+
+        await self.push_frame(
+            TranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )
+
+    async def _on_committed_transcript_with_timestamps(self, data: dict):
+        """Handle committed transcript with word-level timestamps.
+
+        Args:
+            data: Committed transcript data with timestamps.
+        """
+        text = data.get("text", "").strip()
+        if not text:
+            return
+
+        logger.debug(f"Committed transcript with timestamps: [{text}]")
+        logger.trace(f"Timestamps: {data.get('words', [])}")
+
+        # This is sent after the committed_transcript, so we don't need to
+        # push another TranscriptionFrame, but we could use the timestamps
+        # for additional processing if needed in the future
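For orientation, a minimal sketch of how the new ElevenLabsRealtimeSTTService could be instantiated, using only the constructor and InputParams fields shown in this hunk; the environment variable and the pipeline wiring are assumptions for illustration, not part of the diff:

# Hypothetical usage sketch; the service class and InputParams come from this diff,
# everything else (env var, pipeline placement) is assumed.
import os

from pipecat.services.elevenlabs.stt import CommitStrategy, ElevenLabsRealtimeSTTService

stt = ElevenLabsRealtimeSTTService(
    api_key=os.environ["ELEVENLABS_API_KEY"],
    model="scribe_v2_realtime",  # default model per this diff
    params=ElevenLabsRealtimeSTTService.InputParams(
        language_code="eng",                    # None enables auto-detection
        commit_strategy=CommitStrategy.MANUAL,  # Pipecat's VAD decides when to commit
    ),
)
# The instance is then placed between the transport input and the context
# aggregator in a Pipeline, like any other Pipecat STT service.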
pipecat/services/elevenlabs/tts.py
CHANGED

@@ -14,7 +14,17 @@ import asyncio
 import base64
 import json
 import uuid
-from typing import
+from typing import (
+    Any,
+    AsyncGenerator,
+    Dict,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+)

 import aiohttp
 from loguru import logger
@@ -157,7 +167,13 @@ def build_elevenlabs_voice_settings(
     Returns:
         Dictionary of voice settings or None if no valid settings are provided.
     """
-    voice_setting_keys = [
+    voice_setting_keys = [
+        "stability",
+        "similarity_boost",
+        "style",
+        "use_speaker_boost",
+        "speed",
+    ]

     voice_settings = {}
     for key in voice_setting_keys:
@@ -503,6 +519,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                return

            self.logger.debug("Connecting to ElevenLabs")
+            await self.start_connection_metrics()

            voice_id = self._voice_id
            model = self.model_name
@@ -530,17 +547,24 @@ class ElevenLabsTTSService(AudioContextWordTTSService):

            # Set max websocket message size to 16MB for large audio responses
            self._websocket = await websocket_connect(
-                url,
+                url,
+                max_size=16 * 1024 * 1024,
+                additional_headers={"xi-api-key": self._api_key},
            )

+            await self.stop_connection_metrics(success=True, connection_type="websocket")
+            await self.stop_reconnection_metrics(success=True, reason="successful_reconnection")
            await self._call_event_handler("on_connected")
        except Exception as e:
            self.logger.error(f"{self} initialization error: {e}")
+            await self.stop_connection_metrics(success=False, error=str(e), connection_type="websocket")
+            await self.stop_reconnection_metrics(success=False, reason="connection_failed")
            self._websocket = None
            await self._call_event_handler("on_connection_error", f"{e}")

    async def _disconnect_websocket(self):
        try:
+            await self.start_reconnection_metrics()
            await self.stop_all_metrics()

            if self._websocket:
@@ -549,7 +573,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                if self._context_id:
                    await self._websocket.send(json.dumps({"close_socket": True}))
                await self._websocket.close()
-                logger.debug("Disconnected from ElevenLabs")
+                self.logger.debug("Disconnected from ElevenLabs")
        except Exception as e:
            self.logger.error(f"{self} error closing websocket: {e}")
        finally:
pipecat/services/google/llm.py
CHANGED
@@ -760,12 +760,19 @@ class GoogleLLMService(LLMService):

        generation_config = GenerateContentConfig(system_instruction=system)

-
-
-
-
-
-
+        await self.start_connection_metrics()
+
+        try:
+            # Use the new google-genai client's async method
+            response = await self._client.aio.models.generate_content(
+                model=self._model_name,
+                contents=messages,
+                config=generation_config,
+            )
+            await self.stop_connection_metrics(success=True, connection_type="grpc")
+        except Exception as e:
+            await self.stop_connection_metrics(success=False, error=str(e), connection_type="grpc")
+            raise

        # Extract text from response
        if response.candidates and response.candidates[0].content:
@@ -849,11 +856,19 @@ class GoogleLLMService(LLMService):
        )

        await self.start_ttfb_metrics()
-
-
-
-
-
+        await self.start_connection_metrics()
+
+        try:
+            result = await self._client.aio.models.generate_content_stream(
+                model=self._model_name,
+                contents=messages,
+                config=generation_config,
+            )
+            await self.stop_connection_metrics(success=True, connection_type="grpc")
+            return result
+        except Exception as e:
+            await self.stop_connection_metrics(success=False, error=str(e), connection_type="grpc")
+            raise

    async def _stream_content_specific_context(
        self, context: OpenAILLMContext