dv-pipecat-ai 0.0.85.dev698__py3-none-any.whl → 0.0.85.dev814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/METADATA +23 -18
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/RECORD +45 -43
- pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
- pipecat/pipeline/runner.py +6 -2
- pipecat/pipeline/task.py +40 -55
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/frameworks/rtvi.py +1 -0
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +149 -67
- pipecat/runner/types.py +5 -5
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +3 -0
- pipecat/services/aws/llm.py +33 -16
- pipecat/services/aws/nova_sonic/context.py +69 -0
- pipecat/services/aws/nova_sonic/llm.py +199 -89
- pipecat/services/aws/stt.py +2 -0
- pipecat/services/aws_nova_sonic/context.py +8 -12
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +3 -1
- pipecat/services/deepgram/flux/stt.py +4 -0
- pipecat/services/elevenlabs/tts.py +82 -41
- pipecat/services/fish/tts.py +3 -0
- pipecat/services/google/stt.py +4 -0
- pipecat/services/lmnt/tts.py +2 -0
- pipecat/services/neuphonic/tts.py +3 -0
- pipecat/services/openai/tts.py +37 -6
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +3 -0
- pipecat/services/rime/tts.py +9 -8
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +465 -0
- pipecat/services/sarvam/tts.py +87 -10
- pipecat/services/speechmatics/stt.py +3 -1
- pipecat/services/stt_service.py +23 -10
- pipecat/services/tts_service.py +64 -13
- pipecat/transports/base_input.py +3 -0
- pipecat/transports/base_output.py +71 -77
- pipecat/transports/smallwebrtc/connection.py +5 -0
- pipecat/transports/smallwebrtc/request_handler.py +42 -0
- pipecat/utils/string.py +1 -0
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev698.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt +0 -0
pipecat/services/sarvam/tts.py
CHANGED
@@ -77,17 +77,29 @@ class SarvamHttpTTSService(TTSService):
 
     Example::
 
-        tts =
+        tts = SarvamHttpTTSService(
             api_key="your-api-key",
             voice_id="anushka",
             model="bulbul:v2",
             aiohttp_session=session,
-            params=
+            params=SarvamHttpTTSService.InputParams(
                 language=Language.HI,
                 pitch=0.1,
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker:
+        tts_v3 = SarvamHttpTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            aiohttp_session=session,
+            params=SarvamHttpTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -106,6 +118,14 @@ class SarvamHttpTTSService(TTSService):
         pace: Optional[float] = Field(default=1.0, ge=0.3, le=3.0)
         loudness: Optional[float] = Field(default=1.0, ge=0.1, le=3.0)
         enable_preprocessing: Optional[bool] = False
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
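
The new temperature field leans on Pydantic's ge/le bounds for validation. A minimal standalone sketch (not taken from the package) of how those constraints behave:

    from typing import Optional

    from pydantic import BaseModel, Field, ValidationError


    class Params(BaseModel):
        # Same constraints as the InputParams.temperature field above.
        temperature: Optional[float] = Field(default=0.6, ge=0.01, le=1.0)


    print(Params().temperature)                 # 0.6 (default)
    print(Params(temperature=0.8).temperature)  # 0.8
    try:
        Params(temperature=1.5)                 # rejected: greater than le=1.0
    except ValidationError as exc:
        print(exc.errors()[0]["loc"])           # ('temperature',)
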
@@ -125,7 +145,7 @@ class SarvamHttpTTSService(TTSService):
             api_key: Sarvam AI API subscription key.
             aiohttp_session: Shared aiohttp session for making requests.
             voice_id: Speaker voice ID (e.g., "anushka", "meera"). Defaults to "anushka".
-            model: TTS model to use ("bulbul:
+            model: TTS model to use ("bulbul:v2" or "bulbul:v3-beta" or "bulbul:v3"). Defaults to "bulbul:v2".
             base_url: Sarvam AI API base URL. Defaults to "https://api.sarvam.ai".
             sample_rate: Audio sample rate in Hz (8000, 16000, 22050, 24000). If None, uses default.
             params: Additional voice and preprocessing parameters. If None, uses defaults.
@@ -139,16 +159,32 @@ class SarvamHttpTTSService(TTSService):
         self._base_url = base_url
         self._session = aiohttp_session
 
+        # Build base settings common to all models
         self._settings = {
             "language": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
-            "loudness": params.loudness,
             "enable_preprocessing": params.enable_preprocessing,
         }
 
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
+
         self.set_model_name(model)
         self.set_voice(voice_id)
 
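
The constructor now builds the settings dict in two steps: base options shared by every model, then either the v3 temperature/model pair or the legacy pitch/pace/loudness knobs. A rough standalone sketch of that branching (the helper name and the plain-namespace stand-in for InputParams are illustrative, not part of the package):

    from types import SimpleNamespace


    def build_settings(model: str, params) -> dict:
        # Base settings common to all models.
        settings = {"enable_preprocessing": params.enable_preprocessing}
        if model in ("bulbul:v3-beta", "bulbul:v3"):
            # v3 models take a temperature instead of the prosody knobs.
            settings.update({"temperature": getattr(params, "temperature", 0.6), "model": model})
        else:
            settings.update(
                {"pitch": params.pitch, "pace": params.pace, "loudness": params.loudness, "model": model}
            )
        return settings


    params = SimpleNamespace(
        enable_preprocessing=False, pitch=0.1, pace=1.2, loudness=1.0, temperature=0.8
    )
    print(build_settings("bulbul:v2", params))  # pitch/pace/loudness included
    print(build_settings("bulbul:v3", params))  # temperature included instead
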
@@ -276,6 +312,18 @@ class SarvamTTSService(InterruptibleTTSService):
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker and temperature:
+        # Note: pace and loudness are not supported for bulbul v3 and bulbul v3 beta
+        tts_v3 = SarvamTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            params=SarvamTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -311,6 +359,14 @@ class SarvamTTSService(InterruptibleTTSService):
         output_audio_codec: Optional[str] = "linear16"
         output_audio_bitrate: Optional[str] = "128k"
         language: Optional[Language] = Language.EN
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
@@ -330,6 +386,7 @@ class SarvamTTSService(InterruptibleTTSService):
         Args:
             api_key: Sarvam API key for authenticating TTS requests.
             model: Identifier of the Sarvam speech model (default "bulbul:v2").
+                Supports "bulbul:v2", "bulbul:v3-beta" and "bulbul:v3".
             voice_id: Voice identifier for synthesis (default "anushka").
             url: WebSocket URL for connecting to the TTS backend (default production URL).
             aiohttp_session: Optional shared aiohttp session. To maintain backward compatibility.
@@ -372,15 +429,12 @@ class SarvamTTSService(InterruptibleTTSService):
         self._api_key = api_key
         self.set_model_name(model)
         self.set_voice(voice_id)
-        #
+        # Build base settings common to all models
         self._settings = {
             "target_language_code": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
             "speaker": voice_id,
-            "loudness": params.loudness,
             "speech_sample_rate": 0,
             "enable_preprocessing": params.enable_preprocessing,
             "min_buffer_size": params.min_buffer_size,
@@ -388,6 +442,24 @@ class SarvamTTSService(InterruptibleTTSService):
             "output_audio_codec": params.output_audio_codec,
             "output_audio_bitrate": params.output_audio_bitrate,
         }
+
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
         self._started = False
 
         self._receive_task = None
@@ -526,6 +598,7 @@ class SarvamTTSService(InterruptibleTTSService):
                 logger.debug("Connected to Sarvam TTS Websocket")
                 await self._send_config()
 
+                await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._websocket = None
@@ -557,6 +630,10 @@ class SarvamTTSService(InterruptibleTTSService):
                 await self._websocket.close()
             except Exception as e:
                 logger.error(f"{self} error closing websocket: {e}")
+            finally:
+                self._started = False
+                self._websocket = None
+                await self._call_event_handler("on_disconnected")
 
     def _get_websocket(self):
         if self._websocket:
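
With the connect and close paths above now emitting events, callers can observe the Sarvam websocket lifecycle. A hedged usage sketch (the minimal construction with only api_key and the handler names are assumptions for illustration):

    from loguru import logger

    from pipecat.services.sarvam.tts import SarvamTTSService

    tts = SarvamTTSService(api_key="your-api-key")


    @tts.event_handler("on_connected")
    async def on_tts_connected(service):
        logger.debug("Sarvam TTS websocket connected")


    @tts.event_handler("on_disconnected")
    async def on_tts_disconnected(service):
        logger.debug("Sarvam TTS websocket disconnected")
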
pipecat/services/speechmatics/stt.py
CHANGED

@@ -577,6 +577,7 @@ class SpeechmaticsSTTService(STTService):
                 ),
             )
             logger.debug(f"{self} Connected to Speechmatics STT service")
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} Error connecting to Speechmatics: {e}")
             self._client = None
@@ -595,6 +596,7 @@ class SpeechmaticsSTTService(STTService):
             logger.error(f"{self} Error closing Speechmatics client: {e}")
         finally:
             self._client = None
+            await self._call_event_handler("on_disconnected")
 
     def _process_config(self) -> None:
         """Create a formatted STT transcription config.
@@ -618,7 +620,7 @@ class SpeechmaticsSTTService(STTService):
         transcription_config.additional_vocab = [
             {
                 "content": e.content,
-                "sounds_like": e.sounds_like,
+                **({"sounds_like": e.sounds_like} if e.sounds_like else {}),
             }
             for e in self._params.additional_vocab
         ]
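
The sounds_like change uses conditional dict unpacking so the key is only present when the entry actually defines pronunciations. The idiom in isolation:

    def vocab_entry(content, sounds_like=None):
        # "sounds_like" is only included when it is non-empty, mirroring the hunk above.
        return {"content": content, **({"sounds_like": sounds_like} if sounds_like else {})}


    print(vocab_entry("Pipecat", ["pipe cat"]))  # {'content': 'Pipecat', 'sounds_like': ['pipe cat']}
    print(vocab_entry("Pipecat"))                # {'content': 'Pipecat'}
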
pipecat/services/stt_service.py
CHANGED
@@ -36,6 +36,25 @@ class STTService(AIService):
     Provides common functionality for STT services including audio passthrough,
     muting, settings management, and audio processing. Subclasses must implement
     the run_stt method to provide actual speech recognition.
+
+    Event handlers:
+        on_connected: Called when connected to the STT service.
+        on_disconnected: Called when disconnected from the STT service.
+        on_connection_error: Called when an STT service connection error occurs.
+
+    Example::
+
+        @stt.event_handler("on_connected")
+        async def on_connected(stt: STTService):
+            logger.debug(f"STT connected")
+
+        @stt.event_handler("on_disconnected")
+        async def on_disconnected(stt: STTService):
+            logger.debug(f"STT disconnected")
+
+        @stt.event_handler("on_connection_error")
+        async def on_connection_error(stt: STTService, error: str):
+            logger.error(f"STT connection error: {error}")
     """
 
     def __init__(
@@ -66,6 +85,10 @@ class STTService(AIService):
         self._voicemail_detect: bool = False
         self._user_id: str = ""
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def is_muted(self) -> bool:
         """Check if the STT service is currently muted.
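
The three _register_event_handler calls above expose the connection events through the same decorator-based API shown in the class docstring. A minimal sketch of that register/dispatch pattern (an illustration, not Pipecat's actual implementation):

    import asyncio


    class EventSource:
        def __init__(self):
            self._handlers = {}

        def _register_event_handler(self, name):
            # Declare an event name so handlers can attach to it.
            self._handlers.setdefault(name, [])

        def event_handler(self, name):
            def wrapper(func):
                self._handlers[name].append(func)
                return func
            return wrapper

        async def _call_event_handler(self, name, *args):
            for handler in self._handlers.get(name, []):
                await handler(self, *args)


    stt = EventSource()
    stt._register_event_handler("on_connected")


    @stt.event_handler("on_connected")
    async def on_connected(service):
        print("connected")


    asyncio.run(stt._call_event_handler("on_connected"))
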
@@ -307,15 +330,6 @@ class WebsocketSTTService(STTService, WebsocketService):
 
     Combines STT functionality with websocket connectivity, providing automatic
     error handling and reconnection capabilities.
-
-    Event handlers:
-        on_connection_error: Called when a websocket connection error occurs.
-
-    Example::
-
-        @stt.event_handler("on_connection_error")
-        async def on_connection_error(stt: STTService, error: str):
-            logger.error(f"STT connection error: {error}")
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
@@ -327,7 +341,6 @@ class WebsocketSTTService(STTService, WebsocketService):
         """
         STTService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
pipecat/services/tts_service.py
CHANGED
@@ -8,7 +8,17 @@
 
 import asyncio
 from abc import abstractmethod
-from typing import
+from typing import (
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+)
 
 from loguru import logger
 
@@ -49,6 +59,25 @@ class TTSService(AIService):
     Provides common functionality for TTS services including text aggregation,
     filtering, audio generation, and frame management. Supports configurable
     sentence aggregation, silence insertion, and frame processing control.
+
+    Event handlers:
+        on_connected: Called when connected to the TTS service.
+        on_disconnected: Called when disconnected from the TTS service.
+        on_connection_error: Called when a TTS service connection error occurs.
+
+    Example::
+
+        @tts.event_handler("on_connected")
+        async def on_connected(tts: TTSService):
+            logger.debug(f"TTS connected")
+
+        @tts.event_handler("on_disconnected")
+        async def on_disconnected(tts: TTSService):
+            logger.debug(f"TTS disconnected")
+
+        @tts.event_handler("on_connection_error")
+        async def on_connection_error(tts: TTSService, error: str):
+            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(
@@ -124,7 +153,6 @@ class TTSService(AIService):
 
         self._tracing_enabled: bool = False
 
-
         if text_filter:
             import warnings
 
@@ -143,6 +171,10 @@ class TTSService(AIService):
 
         self._processing_text: bool = False
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def sample_rate(self) -> int:
         """Get the current sample rate for audio output.
@@ -384,6 +416,36 @@ class TTSService(AIService):
         ):
             await self._stop_frame_queue.put(frame)
 
+    async def _stream_audio_frames_from_iterator(
+        self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
+    ) -> AsyncGenerator[Frame, None]:
+        buffer = bytearray()
+        need_to_strip_wav_header = strip_wav_header
+        async for chunk in iterator:
+            if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
+                chunk = chunk[44:]
+                need_to_strip_wav_header = False
+
+            # Append to current buffer.
+            buffer.extend(chunk)
+
+            # Round to nearest even number.
+            aligned_length = len(buffer) & ~1  # 111111111...11110
+            if aligned_length > 0:
+                aligned_chunk = buffer[:aligned_length]
+                buffer = buffer[aligned_length:]  # keep any leftover byte
+
+                if len(aligned_chunk) > 0:
+                    frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1)
+                    yield frame
+
+        if len(buffer) > 0:
+            # Make sure we don't need an extra padding byte.
+            if len(buffer) % 2 == 1:
+                buffer.extend(b"\x00")
+            frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
+            yield frame
+
     async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         self._processing_text = False
         await self._text_aggregator.handle_interruption()
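
The new _stream_audio_frames_from_iterator helper keeps 16-bit PCM frames on even byte boundaries: odd leftovers are carried into the next chunk and the very last byte is zero-padded. A self-contained sketch of the same alignment logic (names and the simulated chunks are illustrative):

    import asyncio


    async def chunks():
        # Simulated streamed audio; the first chunk carries a 44-byte WAV header.
        for piece in (b"RIFF" + b"\x00" * 40 + b"abc", b"defgh", b"i"):
            yield piece


    async def aligned_frames(iterator, strip_wav_header=True):
        buffer = bytearray()
        async for chunk in iterator:
            if strip_wav_header and chunk.startswith(b"RIFF"):
                chunk = chunk[44:]            # drop the WAV header once
                strip_wav_header = False
            buffer.extend(chunk)
            aligned = len(buffer) & ~1        # largest even prefix
            if aligned:
                yield bytes(buffer[:aligned])
                buffer = buffer[aligned:]     # keep any leftover odd byte
        if buffer:
            if len(buffer) % 2:
                buffer.extend(b"\x00")        # pad the trailing odd byte
            yield bytes(buffer)


    async def main():
        async for frame in aligned_frames(chunks()):
            print(frame)  # b'ab', b'cdefgh', b'i\x00'


    asyncio.run(main())
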
@@ -613,7 +675,6 @@ class WebsocketTTSService(TTSService, WebsocketService):
         """
         TTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -665,15 +726,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
     """Base class for websocket-based TTS services that support word timestamps.
 
     Combines word timestamp functionality with websocket connectivity.
-
-    Event handlers:
-        on_connection_error: Called when a websocket connection error occurs.
-
-    Example::
-
-        @tts.event_handler("on_connection_error")
-        async def on_connection_error(tts: TTSService, error: str):
-            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
@@ -685,7 +737,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
         """
         WordTTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
pipecat/transports/base_input.py
CHANGED
@@ -232,6 +232,9 @@ class BaseInputTransport(FrameProcessor):
         """
         # Cancel and wait for the audio input task to finish.
         await self._cancel_audio_task()
+        # Stop audio filter.
+        if self._params.audio_in_filter:
+            await self._params.audio_in_filter.stop()
 
     async def set_transport_ready(self, frame: StartFrame):
         """Called when the transport is ready to stream.
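
The input transport now tears down the configured audio_in_filter when it stops. A hypothetical filter sketch showing why an explicit async stop() hook matters (the class name, method set, and file handling are assumptions, not Pipecat's filter interface):

    class RecordingAudioFilter:
        """Writes incoming audio to disk; stop() releases the file handle."""

        def __init__(self, path="input.raw"):
            self._path = path
            self._file = None

        async def start(self, sample_rate: int):
            self._file = open(self._path, "wb")

        async def filter(self, audio: bytes) -> bytes:
            if self._file:
                self._file.write(audio)
            return audio

        async def stop(self):
            # Called by the input transport during shutdown, as in the hunk above.
            if self._file:
                self._file.close()
                self._file = None
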
pipecat/transports/base_output.py
CHANGED

@@ -294,15 +294,15 @@ class BaseOutputTransport(FrameProcessor):
         """
         await super().process_frame(frame, direction)
 
-        #
-        # System frames (like InterruptionFrame) are pushed immediately. Other
-        # frames require order so they are put in the sink queue.
-        #
         if isinstance(frame, StartFrame):
             # Push StartFrame before start(), because we want StartFrame to be
             # processed by every processor before any other frame is processed.
             await self.push_frame(frame, direction)
             await self.start(frame)
+        elif isinstance(frame, EndFrame):
+            await self.stop(frame)
+            # Keep pushing EndFrame down so all the pipeline stops nicely.
+            await self.push_frame(frame, direction)
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
             await self.push_frame(frame, direction)
@@ -315,21 +315,6 @@ class BaseOutputTransport(FrameProcessor):
             await self.write_dtmf(frame)
         elif isinstance(frame, SystemFrame):
             await self.push_frame(frame, direction)
-        # Control frames.
-        elif isinstance(frame, EndFrame):
-            await self.stop(frame)
-            # Keep pushing EndFrame down so all the pipeline stops nicely.
-            await self.push_frame(frame, direction)
-        elif isinstance(frame, MixerControlFrame):
-            await self._handle_frame(frame)
-        # Other frames.
-        elif isinstance(frame, OutputAudioRawFrame):
-            await self._handle_frame(frame)
-        elif isinstance(frame, (OutputImageRawFrame, SpriteFrame)):
-            await self._handle_frame(frame)
-        # TODO(aleix): Images and audio should support presentation timestamps.
-        elif frame.pts:
-            await self._handle_frame(frame)
         elif direction == FrameDirection.UPSTREAM:
             await self.push_frame(frame, direction)
         else:
@@ -411,6 +396,13 @@ class BaseOutputTransport(FrameProcessor):
 
         # Indicates if the bot is currently speaking.
         self._bot_speaking = False
+        # Last time a BotSpeakingFrame was pushed.
+        self._bot_speaking_frame_time = 0
+        # How often a BotSpeakingFrame should be pushed (value should be
+        # lower than the audio chunks).
+        self._bot_speaking_frame_period = 0.2
+        # Last time the bot actually spoke.
+        self._bot_speech_last_time = 0
 
         self._audio_task: Optional[asyncio.Task] = None
         self._video_task: Optional[asyncio.Task] = None
@@ -602,39 +594,71 @@ class BaseOutputTransport(FrameProcessor):
 
     async def _bot_started_speaking(self):
         """Handle bot started speaking event."""
-        if
-
-            f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
-        )
+        if self._bot_speaking:
+            return
 
-
-
-
-
-
-
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
+        )
+
+        downstream_frame = BotStartedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStartedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
 
-
+        self._bot_speaking = True
 
     async def _bot_stopped_speaking(self):
         """Handle bot stopped speaking event."""
-        if self._bot_speaking:
-
-            f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
-        )
+        if not self._bot_speaking:
+            return
 
-
-
-
-
-
-
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
+        )
+
+        downstream_frame = BotStoppedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStoppedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
 
-
+        self._bot_speaking = False
+
+        # Clean audio buffer (there could be tiny left overs if not multiple
+        # to our output chunk size).
+        self._audio_buffer = bytearray()
 
-
-
-
+    async def _bot_currently_speaking(self):
+        """Handle bot speaking event."""
+        await self._bot_started_speaking()
+
+        diff_time = time.time() - self._bot_speaking_frame_time
+        if diff_time >= self._bot_speaking_frame_period:
+            await self._transport.push_frame(BotSpeakingFrame())
+            await self._transport.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
+            self._bot_speaking_frame_time = time.time()
+
+        self._bot_speech_last_time = time.time()
+
+    async def _maybe_bot_currently_speaking(self, frame: SpeechOutputAudioRawFrame):
+        if not is_silence(frame.audio):
+            await self._bot_currently_speaking()
+        else:
+            silence_duration = time.time() - self._bot_speech_last_time
+            if silence_duration > BOT_VAD_STOP_SECS:
+                await self._bot_stopped_speaking()
+
+    async def _handle_bot_speech(self, frame: Frame):
+        # TTS case.
+        if isinstance(frame, TTSAudioRawFrame):
+            await self._bot_currently_speaking()
+        # Speech stream case.
+        elif isinstance(frame, SpeechOutputAudioRawFrame):
+            await self._maybe_bot_currently_speaking(frame)
 
     async def _handle_frame(self, frame: Frame):
         """Handle various frame types with appropriate processing.
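
The refactor above replaces the old chunk-counter with wall-clock throttling: _bot_currently_speaking only pushes a BotSpeakingFrame once at least _bot_speaking_frame_period seconds (0.2 by default) have elapsed. A standalone sketch of that throttle (illustrative, not the transport code):

    import time


    class SpeakingFrameThrottle:
        def __init__(self, period: float = 0.2):
            self._period = period
            self._last_push = 0.0

        def should_push(self) -> bool:
            # Push only when the period has elapsed since the previous push.
            now = time.time()
            if now - self._last_push >= self._period:
                self._last_push = now
                return True
            return False


    throttle = SpeakingFrameThrottle()
    pushes = sum(throttle.should_push() for _ in range(1000))  # burst of audio chunks
    print(pushes)  # 1: only the first chunk in the burst triggers a push
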
@@ -642,7 +666,9 @@ class BaseOutputTransport(FrameProcessor):
         Args:
             frame: The frame to handle.
         """
-        if isinstance(frame,
+        if isinstance(frame, OutputAudioRawFrame):
+            await self._handle_bot_speech(frame)
+        elif isinstance(frame, OutputImageRawFrame):
             await self._set_video_image(frame)
         elif isinstance(frame, SpriteFrame):
             await self._set_video_images(frame.images)
@@ -706,39 +732,7 @@ class BaseOutputTransport(FrameProcessor):
 
     async def _audio_task_handler(self):
         """Main audio processing task handler."""
-        # Push a BotSpeakingFrame every 200ms, we don't really need to push it
-        # at every audio chunk. If the audio chunk is bigger than 200ms, push at
-        # every audio chunk.
-        TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
-        BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
-        bot_speaking_counter = 0
-        speech_last_speaking_time = 0
-
         async for frame in self._next_frame():
-            # Notify the bot started speaking upstream if necessary and that
-            # it's actually speaking.
-            is_speaking = False
-            if isinstance(frame, TTSAudioRawFrame):
-                is_speaking = True
-            elif isinstance(frame, SpeechOutputAudioRawFrame):
-                if not is_silence(frame.audio):
-                    is_speaking = True
-                    speech_last_speaking_time = time.time()
-                else:
-                    silence_duration = time.time() - speech_last_speaking_time
-                    if silence_duration > BOT_VAD_STOP_SECS:
-                        await self._bot_stopped_speaking()
-
-            if is_speaking:
-                await self._bot_started_speaking()
-                if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
-                    await self._transport.push_frame(BotSpeakingFrame())
-                    await self._transport.push_frame(
-                        BotSpeakingFrame(), FrameDirection.UPSTREAM
-                    )
-                    bot_speaking_counter = 0
-                bot_speaking_counter += 1
-
             # No need to push EndFrame, it's pushed from process_frame().
             if isinstance(frame, EndFrame):
                 break
pipecat/transports/smallwebrtc/connection.py
CHANGED

@@ -689,3 +689,8 @@ class SmallWebRTCConnection(BaseObject):
             )()
             if track:
                 track.set_enabled(signalling_message.enabled)
+
+    async def add_ice_candidate(self, candidate):
+        """Handle incoming ICE candidates."""
+        logger.debug(f"Adding remote candidate: {candidate}")
+        await self.pc.addIceCandidate(candidate)
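
The new add_ice_candidate method lets a signaling layer forward trickle ICE candidates to the underlying peer connection. A hedged usage sketch (the message shape and handler name are assumptions about the surrounding signaling code):

    async def on_signaling_message(connection, message: dict):
        # `connection` is a SmallWebRTCConnection; forward remote candidates as they arrive.
        if message.get("type") == "ice-candidate":
            await connection.add_ice_candidate(message["candidate"])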