dv-pipecat-ai 0.0.82.dev815__py3-none-any.whl → 0.0.82.dev857__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/METADATA +8 -3
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/RECORD +106 -79
- pipecat/adapters/base_llm_adapter.py +44 -6
- pipecat/adapters/services/anthropic_adapter.py +302 -2
- pipecat/adapters/services/aws_nova_sonic_adapter.py +40 -2
- pipecat/adapters/services/bedrock_adapter.py +40 -2
- pipecat/adapters/services/gemini_adapter.py +276 -6
- pipecat/adapters/services/open_ai_adapter.py +88 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +39 -1
- pipecat/audio/dtmf/__init__.py +0 -0
- pipecat/audio/dtmf/types.py +47 -0
- pipecat/audio/dtmf/utils.py +70 -0
- pipecat/audio/filters/aic_filter.py +199 -0
- pipecat/audio/utils.py +9 -7
- pipecat/extensions/ivr/__init__.py +0 -0
- pipecat/extensions/ivr/ivr_navigator.py +452 -0
- pipecat/frames/frames.py +156 -43
- pipecat/pipeline/llm_switcher.py +76 -0
- pipecat/pipeline/parallel_pipeline.py +3 -3
- pipecat/pipeline/service_switcher.py +144 -0
- pipecat/pipeline/task.py +68 -28
- pipecat/pipeline/task_observer.py +10 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -2
- pipecat/processors/aggregators/llm_context.py +277 -0
- pipecat/processors/aggregators/llm_response.py +48 -15
- pipecat/processors/aggregators/llm_response_universal.py +840 -0
- pipecat/processors/aggregators/openai_llm_context.py +3 -3
- pipecat/processors/dtmf_aggregator.py +0 -2
- pipecat/processors/filters/stt_mute_filter.py +0 -2
- pipecat/processors/frame_processor.py +18 -11
- pipecat/processors/frameworks/rtvi.py +17 -10
- pipecat/processors/metrics/sentry.py +2 -0
- pipecat/runner/daily.py +137 -36
- pipecat/runner/run.py +1 -1
- pipecat/runner/utils.py +7 -7
- pipecat/serializers/asterisk.py +20 -4
- pipecat/serializers/exotel.py +1 -1
- pipecat/serializers/plivo.py +1 -1
- pipecat/serializers/telnyx.py +1 -1
- pipecat/serializers/twilio.py +1 -1
- pipecat/services/__init__.py +2 -2
- pipecat/services/anthropic/llm.py +113 -28
- pipecat/services/asyncai/tts.py +4 -0
- pipecat/services/aws/llm.py +82 -8
- pipecat/services/aws/tts.py +0 -10
- pipecat/services/aws_nova_sonic/aws.py +5 -0
- pipecat/services/cartesia/tts.py +28 -16
- pipecat/services/cerebras/llm.py +15 -10
- pipecat/services/deepgram/stt.py +8 -0
- pipecat/services/deepseek/llm.py +13 -8
- pipecat/services/fireworks/llm.py +13 -8
- pipecat/services/fish/tts.py +8 -6
- pipecat/services/gemini_multimodal_live/gemini.py +5 -0
- pipecat/services/gladia/config.py +7 -1
- pipecat/services/gladia/stt.py +23 -15
- pipecat/services/google/llm.py +159 -59
- pipecat/services/google/llm_openai.py +18 -3
- pipecat/services/grok/llm.py +2 -1
- pipecat/services/llm_service.py +38 -3
- pipecat/services/mem0/memory.py +2 -1
- pipecat/services/mistral/llm.py +5 -6
- pipecat/services/nim/llm.py +2 -1
- pipecat/services/openai/base_llm.py +88 -26
- pipecat/services/openai/image.py +6 -1
- pipecat/services/openai_realtime_beta/openai.py +5 -2
- pipecat/services/openpipe/llm.py +6 -8
- pipecat/services/perplexity/llm.py +13 -8
- pipecat/services/playht/tts.py +9 -6
- pipecat/services/rime/tts.py +1 -1
- pipecat/services/sambanova/llm.py +18 -13
- pipecat/services/sarvam/tts.py +415 -10
- pipecat/services/speechmatics/stt.py +2 -2
- pipecat/services/tavus/video.py +1 -1
- pipecat/services/tts_service.py +15 -5
- pipecat/services/vistaar/llm.py +2 -5
- pipecat/transports/base_input.py +32 -19
- pipecat/transports/base_output.py +39 -5
- pipecat/transports/daily/__init__.py +0 -0
- pipecat/transports/daily/transport.py +2371 -0
- pipecat/transports/daily/utils.py +410 -0
- pipecat/transports/livekit/__init__.py +0 -0
- pipecat/transports/livekit/transport.py +1042 -0
- pipecat/transports/network/fastapi_websocket.py +12 -546
- pipecat/transports/network/small_webrtc.py +12 -922
- pipecat/transports/network/webrtc_connection.py +9 -595
- pipecat/transports/network/websocket_client.py +12 -481
- pipecat/transports/network/websocket_server.py +12 -487
- pipecat/transports/services/daily.py +9 -2334
- pipecat/transports/services/helpers/daily_rest.py +12 -396
- pipecat/transports/services/livekit.py +12 -975
- pipecat/transports/services/tavus.py +12 -757
- pipecat/transports/smallwebrtc/__init__.py +0 -0
- pipecat/transports/smallwebrtc/connection.py +612 -0
- pipecat/transports/smallwebrtc/transport.py +936 -0
- pipecat/transports/tavus/__init__.py +0 -0
- pipecat/transports/tavus/transport.py +770 -0
- pipecat/transports/websocket/__init__.py +0 -0
- pipecat/transports/websocket/client.py +494 -0
- pipecat/transports/websocket/fastapi.py +559 -0
- pipecat/transports/websocket/server.py +500 -0
- pipecat/transports/whatsapp/__init__.py +0 -0
- pipecat/transports/whatsapp/api.py +345 -0
- pipecat/transports/whatsapp/client.py +364 -0
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev815.dist-info → dv_pipecat_ai-0.0.82.dev857.dist-info}/top_level.txt +0 -0
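The biggest structural change in this release is the transports tree: the old monolithic modules under pipecat/transports/network/ and pipecat/transports/services/ shrink to a dozen lines each (daily.py alone drops 2,334 lines), while new per-transport packages (daily/, livekit/, smallwebrtc/, tavus/, websocket/, whatsapp/) carry the implementations. The old paths most likely survive only as thin compatibility shims. A hedged sketch of the likely import migration; the class names are assumed from pipecat's existing public API and are not confirmed by this diff:

# Before (old module layout; these paths appear to remain only as shims):
# from pipecat.transports.services.daily import DailyTransport
# from pipecat.transports.network.websocket_server import WebsocketServerTransport

# After (new per-transport packages added in dev857):
from pipecat.transports.daily.transport import DailyTransport
from pipecat.transports.websocket.server import WebsocketServerTransport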
pipecat/services/sarvam/tts.py
CHANGED
@@ -6,25 +6,40 @@
 
 """Sarvam AI text-to-speech service implementation."""
 
+import asyncio
 import base64
-
+import json
+from typing import Any, AsyncGenerator, Mapping, Optional
 
 import aiohttp
 from loguru import logger
 from pydantic import BaseModel, Field
 
 from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
     ErrorFrame,
     Frame,
+    LLMFullResponseEndFrame,
     StartFrame,
+    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
 )
-from pipecat.
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.tts_service import InterruptibleTTSService, TTSService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.tracing.service_decorators import traced_tts
 
+try:
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Sarvam, you need to `pip install pipecat-ai[sarvam]`.")
+    raise Exception(f"Missing module: {e}")
+
@@ -52,7 +67,7 @@ def language_to_sarvam_language(language: Language) -> Optional[str]:
     return LANGUAGE_MAP.get(language)
 
 
-class
+class SarvamHttpTTSService(TTSService):
     """Text-to-Speech service using Sarvam AI's API.
 
     Converts text to speech using Sarvam AI's TTS models with support for multiple
@@ -95,9 +110,9 @@ class SarvamTTSService(TTSService):
         self,
         *,
         api_key: str,
+        aiohttp_session: aiohttp.ClientSession,
         voice_id: str = "anushka",
         model: str = "bulbul:v2",
-        aiohttp_session: aiohttp.ClientSession,
         base_url: str = "https://api.sarvam.ai",
         sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
@@ -107,9 +122,9 @@ class SarvamTTSService(TTSService):
 
         Args:
             api_key: Sarvam AI API subscription key.
+            aiohttp_session: Shared aiohttp session for making requests.
             voice_id: Speaker voice ID (e.g., "anushka", "meera"). Defaults to "anushka".
             model: TTS model to use ("bulbul:v1" or "bulbul:v2"). Defaults to "bulbul:v2".
-            aiohttp_session: Shared aiohttp session for making requests.
             base_url: Sarvam AI API base URL. Defaults to "https://api.sarvam.ai".
             sample_rate: Audio sample rate in Hz (8000, 16000, 22050, 24000). If None, uses default.
             params: Additional voice and preprocessing parameters. If None, uses defaults.
@@ -117,16 +132,16 @@ class SarvamTTSService(TTSService):
         """
         super().__init__(sample_rate=sample_rate, **kwargs)
 
-        params = params or
+        params = params or SarvamHttpTTSService.InputParams()
 
         self._api_key = api_key
         self._base_url = base_url
         self._session = aiohttp_session
 
         self._settings = {
-            "language":
-
-
+            "language": (
+                self.language_to_service_language(params.language) if params.language else "en-IN"
+            ),
             "pitch": params.pitch,
             "pace": params.pace,
             "loudness": params.loudness,
@@ -186,7 +201,7 @@ class SarvamTTSService(TTSService):
             "pitch": self._settings["pitch"],
             "pace": self._settings["pace"],
             "loudness": self._settings["loudness"],
-            "
+            "sample_rate": self.sample_rate,
             "enable_preprocessing": self._settings["enable_preprocessing"],
             "model": self._model_name,
         }
@@ -240,3 +255,393 @@ class SarvamTTSService(TTSService):
         finally:
             await self.stop_ttfb_metrics()
             yield TTSStoppedFrame()
+
+
+class SarvamTTSService(InterruptibleTTSService):
+    """WebSocket-based text-to-speech service using Sarvam AI.
+
+    Provides streaming TTS with real-time audio generation for multiple Indian languages.
+    Supports voice control parameters like pitch, pace, and loudness adjustment.
+
+    Example::
+
+        tts = SarvamTTSService(
+            api_key="your-api-key",
+            voice_id="anushka",
+            model="bulbul:v2",
+            params=SarvamTTSService.InputParams(
+                language=Language.HI,
+                pitch=0.1,
+                pace=1.2
+            )
+        )
+    """
+
+    class InputParams(BaseModel):
+        """Configuration parameters for Sarvam TTS.
+
+        Parameters:
+            pitch: Voice pitch adjustment (-0.75 to 0.75). Defaults to 0.0.
+            pace: Speech pace multiplier (0.3 to 3.0). Defaults to 1.0.
+            loudness: Volume multiplier (0.1 to 3.0). Defaults to 1.0.
+            enable_preprocessing: Enable text preprocessing. Defaults to False.
+            min_buffer_size: Minimum number of characters to buffer before generating audio.
+                Lower values reduce latency but may affect quality. Defaults to 50.
+            max_chunk_length: Maximum number of characters processed in a single chunk.
+                Controls memory usage and processing efficiency. Defaults to 200.
+            output_audio_codec: Audio codec format. Defaults to "linear16".
+            output_audio_bitrate: Audio bitrate. Defaults to "128k".
+            language: Target language for synthesis. Supports Bengali (bn-IN), English (en-IN),
+                Gujarati (gu-IN), Hindi (hi-IN), Kannada (kn-IN), Malayalam (ml-IN),
+                Marathi (mr-IN), Odia (od-IN), Punjabi (pa-IN), Tamil (ta-IN),
+                Telugu (te-IN). Defaults to en-IN.
+
+        Available Speakers:
+            Female: anushka, manisha, vidya, arya
+            Male: abhilash, karun, hitesh
+        """
+
+        pitch: Optional[float] = Field(default=0.0, ge=-0.75, le=0.75)
+        pace: Optional[float] = Field(default=1.0, ge=0.3, le=3.0)
+        loudness: Optional[float] = Field(default=1.0, ge=0.1, le=3.0)
+        enable_preprocessing: Optional[bool] = False
+        min_buffer_size: Optional[int] = 50
+        max_chunk_length: Optional[int] = 200
+        output_audio_codec: Optional[str] = "linear16"
+        output_audio_bitrate: Optional[str] = "128k"
+        language: Optional[Language] = Language.EN
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "bulbul:v2",
+        voice_id: str = "anushka",
+        url: str = "wss://api.sarvam.ai/text-to-speech/ws",
+        aiohttp_session: Optional[aiohttp.ClientSession] = None,
+        aggregate_sentences: Optional[bool] = True,
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
+        **kwargs,
+    ):
+        """Initialize the Sarvam TTS service with voice and transport configuration.
+
+        Args:
+            api_key: Sarvam API key for authenticating TTS requests.
+            model: Identifier of the Sarvam speech model (default "bulbul:v2").
+            voice_id: Voice identifier for synthesis (default "anushka").
+            url: WebSocket URL for connecting to the TTS backend (default production URL).
+            aiohttp_session: Optional shared aiohttp session, kept for backward compatibility.
+
+                .. deprecated:: 0.0.81
+                    aiohttp_session is no longer used. This parameter will be removed in a future version.
+
+            aggregate_sentences: Whether to merge multiple sentences into one audio chunk (default True).
+            sample_rate: Desired sample rate for the output audio in Hz (overrides default if set).
+            params: Optional input parameters to override global configuration.
+            **kwargs: Optional keyword arguments forwarded to InterruptibleTTSService (such as
+                `push_stop_frames`, `sample_rate`, task manager parameters, event hooks, etc.)
+                to customize transport behavior or enable metrics support.
+
+        This method sets up the internal TTS configuration mapping, constructs the WebSocket
+        URL based on the chosen model, and initializes state flags before connecting.
+        """
+        # Initialize parent class first
+        super().__init__(
+            aggregate_sentences=aggregate_sentences,
+            push_text_frames=True,
+            pause_frame_processing=True,
+            push_stop_frames=True,
+            sample_rate=sample_rate,
+            **kwargs,
+        )
+        params = params or SarvamTTSService.InputParams()
+        if aiohttp_session is not None:
+            import warnings
+
+            with warnings.catch_warnings():
+                warnings.simplefilter("always")
+                warnings.warn(
+                    "The 'aiohttp_session' parameter is deprecated and will be removed in a future version.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+        # WebSocket endpoint URL
+        self._websocket_url = f"{url}?model={model}"
+        self._api_key = api_key
+        self.set_model_name(model)
+        self.set_voice(voice_id)
+        # Configuration parameters
+        self._settings = {
+            "target_language_code": (
+                self.language_to_service_language(params.language) if params.language else "en-IN"
+            ),
+            "pitch": params.pitch,
+            "pace": params.pace,
+            "speaker": voice_id,
+            "loudness": params.loudness,
+            "speech_sample_rate": 0,
+            "enable_preprocessing": params.enable_preprocessing,
+            "min_buffer_size": params.min_buffer_size,
+            "max_chunk_length": params.max_chunk_length,
+            "output_audio_codec": params.output_audio_codec,
+            "output_audio_bitrate": params.output_audio_bitrate,
+        }
+        self._started = False
+
+        self._receive_task = None
+        self._keepalive_task = None
+        self._disconnecting = False
+
+    def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Sarvam service supports metrics generation.
+        """
+        return True
+
+    def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Sarvam AI language format.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The Sarvam AI-specific language code, or None if not supported.
+        """
+        return language_to_sarvam_language(language)
+
+    async def start(self, frame: StartFrame):
+        """Start the Sarvam TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+
+        self._settings["speech_sample_rate"] = self.sample_rate
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the Sarvam TTS service.
+
+        Args:
+            frame: The end frame.
+        """
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the Sarvam TTS service.
+
+        Args:
+            frame: The cancel frame.
+        """
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def flush_audio(self):
+        """Flush any pending audio synthesis by sending stop command."""
+        if self._websocket:
+            msg = {"type": "flush"}
+            await self._websocket.send(json.dumps(msg))
+
+    async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
+        """Push a frame downstream with special handling for stop conditions.
+
+        Args:
+            frame: The frame to push.
+            direction: The direction to push the frame.
+        """
+        await super().push_frame(frame, direction)
+        if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+            self._started = False
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process a frame and flush audio if it's the end of a full response."""
+        if isinstance(frame, LLMFullResponseEndFrame):
+            await self.flush_audio()
+        return await super().process_frame(frame, direction)
+
+    async def _update_settings(self, settings: Mapping[str, Any]):
+        """Update service settings and reconnect if voice changed."""
+        prev_voice = self._voice_id
+        await super()._update_settings(settings)
+        if not prev_voice == self._voice_id:
+            logger.info(f"Switching TTS voice to: [{self._voice_id}]")
+            await self._send_config()
+
+    async def _connect(self):
+        """Connect to Sarvam WebSocket and start background tasks."""
+        await self._connect_websocket()
+
+        if self._websocket and not self._receive_task:
+            self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
+
+        if self._websocket and not self._keepalive_task:
+            self._keepalive_task = self.create_task(
+                self._keepalive_task_handler(),
+            )
+
+    async def _disconnect(self):
+        """Disconnect from Sarvam WebSocket and clean up tasks."""
+        try:
+            # First, set a flag to prevent new operations
+            self._disconnecting = True
+
+            # Cancel background tasks BEFORE closing websocket
+            if self._receive_task:
+                await self.cancel_task(self._receive_task, timeout=2.0)
+                self._receive_task = None
+
+            if self._keepalive_task:
+                await self.cancel_task(self._keepalive_task, timeout=2.0)
+                self._keepalive_task = None
+
+            # Now close the websocket
+            await self._disconnect_websocket()
+
+        except Exception as e:
+            logger.error(f"Error during disconnect: {e}")
+        finally:
+            # Reset state only after everything is cleaned up
+            self._started = False
+            self._websocket = None
+            self._disconnecting = False
+
+    async def _connect_websocket(self):
+        """Establish WebSocket connection to Sarvam API."""
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                return
+
+            self._websocket = await websocket_connect(
+                self._websocket_url,
+                additional_headers={
+                    "api-subscription-key": self._api_key,
+                },
+            )
+            logger.debug("Connected to Sarvam TTS Websocket")
+            await self._send_config()
+
+        except Exception as e:
+            logger.error(f"{self} initialization error: {e}")
+            self._websocket = None
+            await self._call_event_handler("on_connection_error", f"{e}")
+
+    async def _send_config(self):
+        """Send initial configuration message."""
+        if not self._websocket:
+            raise Exception("WebSocket not connected")
+        self._settings["speaker"] = self._voice_id
+        logger.debug(f"Config being sent is {self._settings}")
+        config_message = {"type": "config", "data": self._settings}
+
+        try:
+            await self._websocket.send(json.dumps(config_message))
+            logger.debug("Configuration sent successfully")
+        except Exception as e:
+            logger.error(f"Failed to send config: {str(e)}")
+            await self.push_frame(ErrorFrame(f"Failed to send config: {str(e)}"))
+            raise
+
+    async def _disconnect_websocket(self):
+        """Close WebSocket connection and clean up state."""
+        try:
+            await self.stop_all_metrics()
+
+            if self._websocket:
+                logger.debug("Disconnecting from Sarvam")
+                await self._websocket.close()
+        except Exception as e:
+            logger.error(f"{self} error closing websocket: {e}")
+
+    def _get_websocket(self):
+        if self._websocket:
+            return self._websocket
+        raise Exception("Websocket not connected")
+
+    async def _receive_messages(self):
+        """Receive and process messages from Sarvam WebSocket."""
+        async for message in self._get_websocket():
+            if isinstance(message, str):
+                msg = json.loads(message)
+                if msg.get("type") == "audio":
+                    # Check for interruption before processing audio
+                    await self.stop_ttfb_metrics()
+                    audio = base64.b64decode(msg["data"]["audio"])
+                    frame = TTSAudioRawFrame(audio, self.sample_rate, 1)
+                    await self.push_frame(frame)
+                elif msg.get("type") == "error":
+                    error_msg = msg["data"]["message"]
+                    logger.error(f"TTS Error: {error_msg}")
+
+                    # If it's a timeout error, the connection might need to be reset
+                    if "too long" in error_msg.lower() or "timeout" in error_msg.lower():
+                        logger.warning("Connection timeout detected, service may need restart")
+
+                    await self.push_frame(ErrorFrame(f"TTS Error: {error_msg}"))
+
+    async def _keepalive_task_handler(self):
+        """Handle keepalive messages to maintain WebSocket connection."""
+        KEEPALIVE_SLEEP = 20
+        while True:
+            await asyncio.sleep(KEEPALIVE_SLEEP)
+            await self._send_keepalive()
+
+    async def _send_keepalive(self):
+        """Send keepalive message to maintain connection."""
+        if self._disconnecting:
+            return
+
+        if self._websocket and self._websocket.state == State.OPEN:
+            msg = {"type": "ping"}
+            await self._websocket.send(json.dumps(msg))
+
+    async def _send_text(self, text: str):
+        """Send text to Sarvam WebSocket for synthesis."""
+        if self._disconnecting:
+            logger.warning("Service is disconnecting, ignoring text send")
+            return
+
+        if self._websocket and self._websocket.state == State.OPEN:
+            msg = {"type": "text", "data": {"text": text}}
+            await self._websocket.send(json.dumps(msg))
+        else:
+            logger.warning("WebSocket not ready, cannot send text")
+
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech audio frames from input text using Sarvam TTS.
+
+        Sends text over WebSocket for synthesis and yields corresponding audio or status frames.
+
+        Args:
+            text: The text input to synthesize.
+
+        Yields:
+            Frame objects including TTSStartedFrame, TTSAudioRawFrame(s), or TTSStoppedFrame.
+        """
+        logger.debug(f"Generating TTS: [{text}]")
+
+        try:
+            if not self._websocket or self._websocket.state is State.CLOSED:
+                await self._connect()
+
+            try:
+                if not self._started:
+                    await self.start_ttfb_metrics()
+                    yield TTSStartedFrame()
+                    self._started = True
+                await self._send_text(text)
+                await self.start_tts_usage_metrics(text)
+            except Exception as e:
+                logger.error(f"{self} error sending message: {e}")
+                yield TTSStoppedFrame()
+                await self._disconnect()
+                await self._connect()
+                return
+            yield None
+        except Exception as e:
+            logger.error(f"{self} exception: {e}")
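The docstring example above maps directly onto real usage. A minimal sketch, assuming the module path pipecat.services.sarvam.tts (per this file's location) and omitting the surrounding pipeline, transport, and LLM setup:

from pipecat.services.sarvam.tts import SarvamHttpTTSService, SarvamTTSService
from pipecat.transcriptions.language import Language

# Streaming WebSocket TTS (new in this release), per the docstring example.
tts = SarvamTTSService(
    api_key="your-api-key",
    voice_id="anushka",
    model="bulbul:v2",
    params=SarvamTTSService.InputParams(
        language=Language.HI,  # maps to "hi-IN"
        pitch=0.1,
        pace=1.2,
    ),
)

# The renamed HTTP variant keeps the request/response flow; note that
# aiohttp_session now comes directly after api_key in the signature:
# http_tts = SarvamHttpTTSService(api_key="your-api-key", aiohttp_session=session)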
pipecat/services/speechmatics/stt.py
CHANGED

@@ -10,7 +10,6 @@ import asyncio
 import datetime
 import os
 import re
-import warnings
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, AsyncGenerator
@@ -581,7 +580,6 @@ class SpeechmaticsSTTService(STTService):
             logger.debug(f"{self} Connected to Speechmatics STT service")
         except Exception as e:
             logger.error(f"{self} Error connecting to Speechmatics: {e}")
-        finally:
             self._client = None
 
     async def _disconnect(self) -> None:
@@ -1108,6 +1106,8 @@ def _check_deprecated_args(kwargs: dict, params: SpeechmaticsSTTService.InputPar
 
     # Show deprecation warnings
     def _deprecation_warning(old: str, new: str | None = None):
+        import warnings
+
         with warnings.catch_warnings():
             warnings.simplefilter("always")
             if new:
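This file and tts_service.py below converge on the same localized deprecation idiom: import warnings inside the emitting function and force the warning past any caller-side filters. A minimal sketch of the pattern; the message text is illustrative, since the hunk cuts off after `if new:`:

def _deprecation_warning(old: str, new: str | None = None):
    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("always")  # surface even if DeprecationWarning is filtered
        if new:
            warnings.warn(
                f"`{old}` is deprecated, use `{new}` instead.",  # assumed wording
                DeprecationWarning,
                stacklevel=2,
            )
        else:
            warnings.warn(f"`{old}` is deprecated.", DeprecationWarning, stacklevel=2)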
pipecat/services/tavus/video.py
CHANGED
@@ -34,7 +34,7 @@ from pipecat.frames.frames import (
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessorSetup
 from pipecat.services.ai_service import AIService
-from pipecat.transports.
+from pipecat.transports.tavus.transport import TavusCallbacks, TavusParams, TavusTransportClient
 
 
 class TavusVideoService(AIService):
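Code that imported the Tavus client from the old location updates the same way this file does; the "before" path is inferred from the file list, where pipecat/transports/services/tavus.py is cut down to a stub:

# Before (inferred old path, now apparently a shim):
# from pipecat.transports.services.tavus import TavusCallbacks, TavusParams, TavusTransportClient

# After (as imported by tavus/video.py in this diff):
from pipecat.transports.tavus.transport import TavusCallbacks, TavusParams, TavusTransportClient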
pipecat/services/tts_service.py
CHANGED
@@ -122,6 +122,9 @@ class TTSService(AIService):
         self._voice = None
         self._voice_clone_params = None
 
+        self._tracing_enabled: bool = False
+
+
         if text_filter:
             import warnings
 
@@ -283,11 +286,13 @@
         """
         import warnings
 
-        warnings.
-
-
-
-
+        with warnings.catch_warnings():
+            warnings.simplefilter("always")
+            warnings.warn(
+                "`TTSService.say()` is deprecated. Push a `TTSSpeakFrame` instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
 
         await self.queue_frame(TTSSpeakFrame(text))
 
@@ -304,6 +309,11 @@
         await super().process_frame(frame, direction)
 
         if (
+            isinstance(frame, (TextFrame, LLMFullResponseStartFrame, LLMFullResponseEndFrame))
+            and frame.skip_tts
+        ):
+            await self.push_frame(frame, direction)
+        elif (
             isinstance(frame, TextFrame)
             and not isinstance(frame, InterimTranscriptionFrame)
             and not isinstance(frame, TranscriptionFrame)
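The new first branch above means any text or LLM-response-boundary frame flagged with skip_tts is forwarded downstream untouched instead of being synthesized. A hedged sketch of an upstream processor using the flag; SilentTextTagger is hypothetical, and it assumes skip_tts is a mutable frame field, as the branch implies:

from pipecat.frames.frames import LLMTextFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class SilentTextTagger(FrameProcessor):
    """Hypothetical processor: tag text frames so TTSService forwards them unspoken."""

    async def process_frame(self, frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        if isinstance(frame, LLMTextFrame):
            frame.skip_tts = True  # assumed mutable flag, per the new branch in TTSService
        await self.push_frame(frame, direction)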
pipecat/services/vistaar/llm.py
CHANGED
@@ -13,8 +13,6 @@ from loguru import logger
 from pydantic import BaseModel, Field
 
 from pipecat.frames.frames import (
-    CancelFrame,
-    EndFrame,
     Frame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
@@ -22,7 +20,6 @@ from pipecat.frames.frames import (
     LLMTextFrame,
     LLMUpdateSettingsFrame,
     StartInterruptionFrame,
-    StopInterruptionFrame,
 )
 from pipecat.processors.aggregators.llm_response import (
     LLMAssistantAggregatorParams,
@@ -32,13 +29,13 @@ from pipecat.processors.aggregators.openai_llm_context import (
     OpenAILLMContext,
     OpenAILLMContextFrame,
 )
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.llm_service import LLMService
 from pipecat.services.openai.llm import (
     OpenAIAssistantContextAggregator,
     OpenAIContextAggregatorPair,
     OpenAIUserContextAggregator,
 )
-from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.llm_service import LLMService
 
 
 class VistaarLLMService(LLMService):
|