dv-pipecat-ai 0.0.85.dev699__py3-none-any.whl → 0.0.85.dev814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/METADATA +23 -18
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/RECORD +43 -43
- pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
- pipecat/pipeline/runner.py +6 -2
- pipecat/pipeline/task.py +40 -55
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/frameworks/rtvi.py +1 -0
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +149 -67
- pipecat/runner/types.py +5 -5
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +3 -0
- pipecat/services/aws/llm.py +33 -16
- pipecat/services/aws/nova_sonic/context.py +69 -0
- pipecat/services/aws/nova_sonic/llm.py +199 -89
- pipecat/services/aws/stt.py +2 -0
- pipecat/services/aws_nova_sonic/context.py +8 -12
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +3 -1
- pipecat/services/deepgram/flux/stt.py +4 -0
- pipecat/services/elevenlabs/tts.py +82 -41
- pipecat/services/fish/tts.py +3 -0
- pipecat/services/google/stt.py +4 -0
- pipecat/services/lmnt/tts.py +2 -0
- pipecat/services/neuphonic/tts.py +3 -0
- pipecat/services/openai/tts.py +37 -6
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +3 -0
- pipecat/services/rime/tts.py +9 -8
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/sarvam/tts.py +87 -10
- pipecat/services/speechmatics/stt.py +3 -1
- pipecat/services/stt_service.py +23 -10
- pipecat/services/tts_service.py +64 -13
- pipecat/transports/base_input.py +3 -0
- pipecat/transports/base_output.py +71 -77
- pipecat/transports/smallwebrtc/connection.py +5 -0
- pipecat/transports/smallwebrtc/request_handler.py +42 -0
- pipecat/utils/string.py +1 -0
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt +0 -0
pipecat/services/openai/tts.py
CHANGED
|
@@ -14,6 +14,7 @@ from typing import AsyncGenerator, Dict, Literal, Optional
|
|
|
14
14
|
|
|
15
15
|
from loguru import logger
|
|
16
16
|
from openai import AsyncOpenAI, BadRequestError
|
|
17
|
+
from pydantic import BaseModel
|
|
17
18
|
|
|
18
19
|
from pipecat.frames.frames import (
|
|
19
20
|
ErrorFrame,
|
|
@@ -55,6 +56,17 @@ class OpenAITTSService(TTSService):
|
|
|
55
56
|
|
|
56
57
|
OPENAI_SAMPLE_RATE = 24000 # OpenAI TTS always outputs at 24kHz
|
|
57
58
|
|
|
59
|
+
class InputParams(BaseModel):
|
|
60
|
+
"""Input parameters for OpenAI TTS configuration.
|
|
61
|
+
|
|
62
|
+
Parameters:
|
|
63
|
+
instructions: Instructions to guide voice synthesis behavior.
|
|
64
|
+
speed: Voice speed control (0.25 to 4.0, default 1.0).
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
instructions: Optional[str] = None
|
|
68
|
+
speed: Optional[float] = None
|
|
69
|
+
|
|
58
70
|
def __init__(
|
|
59
71
|
self,
|
|
60
72
|
*,
|
|
@@ -65,6 +77,7 @@ class OpenAITTSService(TTSService):
|
|
|
65
77
|
sample_rate: Optional[int] = None,
|
|
66
78
|
instructions: Optional[str] = None,
|
|
67
79
|
speed: Optional[float] = None,
|
|
80
|
+
params: Optional[InputParams] = None,
|
|
68
81
|
**kwargs,
|
|
69
82
|
):
|
|
70
83
|
"""Initialize OpenAI TTS service.
|
|
@@ -77,7 +90,11 @@ class OpenAITTSService(TTSService):
|
|
|
77
90
|
sample_rate: Output audio sample rate in Hz. If None, uses OpenAI's default 24kHz.
|
|
78
91
|
instructions: Optional instructions to guide voice synthesis behavior.
|
|
79
92
|
speed: Voice speed control (0.25 to 4.0, default 1.0).
|
|
93
|
+
params: Optional synthesis controls (acting instructions, speed, ...).
|
|
80
94
|
**kwargs: Additional keyword arguments passed to TTSService.
|
|
95
|
+
|
|
96
|
+
.. deprecated:: 0.0.91
|
|
97
|
+
The `instructions` and `speed` parameters are deprecated, use `InputParams` instead.
|
|
81
98
|
"""
|
|
82
99
|
if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
|
|
83
100
|
logger.warning(
|
|
@@ -86,12 +103,26 @@ class OpenAITTSService(TTSService):
|
|
|
86
103
|
)
|
|
87
104
|
super().__init__(sample_rate=sample_rate, **kwargs)
|
|
88
105
|
|
|
89
|
-
self._speed = speed
|
|
90
106
|
self.set_model_name(model)
|
|
91
107
|
self.set_voice(voice)
|
|
92
|
-
self._instructions = instructions
|
|
93
108
|
self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)
|
|
94
109
|
|
|
110
|
+
if instructions or speed:
|
|
111
|
+
import warnings
|
|
112
|
+
|
|
113
|
+
with warnings.catch_warnings():
|
|
114
|
+
warnings.simplefilter("always")
|
|
115
|
+
warnings.warn(
|
|
116
|
+
"The `instructions` and `speed` parameters are deprecated, use `InputParams` instead.",
|
|
117
|
+
DeprecationWarning,
|
|
118
|
+
stacklevel=2,
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
self._settings = {
|
|
122
|
+
"instructions": params.instructions if params else instructions,
|
|
123
|
+
"speed": params.speed if params else speed,
|
|
124
|
+
}
|
|
125
|
+
|
|
95
126
|
def can_generate_metrics(self) -> bool:
|
|
96
127
|
"""Check if this service can generate processing metrics.
|
|
97
128
|
|
|
@@ -144,11 +175,11 @@ class OpenAITTSService(TTSService):
|
|
|
144
175
|
"response_format": "pcm",
|
|
145
176
|
}
|
|
146
177
|
|
|
147
|
-
if self._instructions:
|
|
148
|
-
create_params["instructions"] = self._instructions
|
|
178
|
+
if self._settings["instructions"]:
|
|
179
|
+
create_params["instructions"] = self._settings["instructions"]
|
|
149
180
|
|
|
150
|
-
if self._speed:
|
|
151
|
-
create_params["speed"] = self._speed
|
|
181
|
+
if self._settings["speed"]:
|
|
182
|
+
create_params["speed"] = self._settings["speed"]
|
|
152
183
|
|
|
153
184
|
async with self._client.audio.speech.with_streaming_response.create(
|
|
154
185
|
**create_params
|
pipecat/services/piper/tts.py
CHANGED
|
@@ -14,7 +14,6 @@ from loguru import logger
|
|
|
14
14
|
from pipecat.frames.frames import (
|
|
15
15
|
ErrorFrame,
|
|
16
16
|
Frame,
|
|
17
|
-
TTSAudioRawFrame,
|
|
18
17
|
TTSStartedFrame,
|
|
19
18
|
TTSStoppedFrame,
|
|
20
19
|
)
|
|
@@ -99,16 +98,15 @@ class PiperTTSService(TTSService):
|
|
|
99
98
|
|
|
100
99
|
await self.start_tts_usage_metrics(text)
|
|
101
100
|
|
|
101
|
+
yield TTSStartedFrame()
|
|
102
|
+
|
|
102
103
|
CHUNK_SIZE = self.chunk_size
|
|
103
104
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
if len(chunk) > 0:
|
|
110
|
-
await self.stop_ttfb_metrics()
|
|
111
|
-
yield TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
|
105
|
+
async for frame in self._stream_audio_frames_from_iterator(
|
|
106
|
+
response.content.iter_chunked(CHUNK_SIZE), strip_wav_header=True
|
|
107
|
+
):
|
|
108
|
+
await self.stop_ttfb_metrics()
|
|
109
|
+
yield frame
|
|
112
110
|
except Exception as e:
|
|
113
111
|
logger.error(f"Error in run_tts: {e}")
|
|
114
112
|
yield ErrorFrame(error=str(e))
|
pipecat/services/playht/tts.py
CHANGED
|
@@ -269,6 +269,8 @@ class PlayHTTTSService(InterruptibleTTSService):
|
|
|
269
269
|
raise ValueError("WebSocket URL is not a string")
|
|
270
270
|
|
|
271
271
|
self._websocket = await websocket_connect(self._websocket_url)
|
|
272
|
+
|
|
273
|
+
await self._call_event_handler("on_connected")
|
|
272
274
|
except ValueError as e:
|
|
273
275
|
logger.error(f"{self} initialization error: {e}")
|
|
274
276
|
self._websocket = None
|
|
@@ -291,6 +293,7 @@ class PlayHTTTSService(InterruptibleTTSService):
|
|
|
291
293
|
finally:
|
|
292
294
|
self._request_id = None
|
|
293
295
|
self._websocket = None
|
|
296
|
+
await self._call_event_handler("on_disconnected")
|
|
294
297
|
|
|
295
298
|
async def _get_websocket_url(self):
|
|
296
299
|
"""Retrieve WebSocket URL from PlayHT API."""
|
pipecat/services/rime/tts.py
CHANGED
|
@@ -255,6 +255,8 @@ class RimeTTSService(AudioContextWordTTSService):
|
|
|
255
255
|
url = f"{self._url}?{params}"
|
|
256
256
|
headers = {"Authorization": f"Bearer {self._api_key}"}
|
|
257
257
|
self._websocket = await websocket_connect(url, additional_headers=headers)
|
|
258
|
+
|
|
259
|
+
await self._call_event_handler("on_connected")
|
|
258
260
|
except Exception as e:
|
|
259
261
|
logger.error(f"{self} initialization error: {e}")
|
|
260
262
|
self._websocket = None
|
|
@@ -272,6 +274,7 @@ class RimeTTSService(AudioContextWordTTSService):
|
|
|
272
274
|
finally:
|
|
273
275
|
self._context_id = None
|
|
274
276
|
self._websocket = None
|
|
277
|
+
await self._call_event_handler("on_disconnected")
|
|
275
278
|
|
|
276
279
|
def _get_websocket(self):
|
|
277
280
|
"""Get active websocket connection or raise exception."""
|
|
@@ -553,15 +556,13 @@ class RimeHttpTTSService(TTSService):
|
|
|
553
556
|
|
|
554
557
|
CHUNK_SIZE = self.chunk_size
|
|
555
558
|
|
|
556
|
-
async for chunk in response.content.iter_chunked(CHUNK_SIZE):
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
559
|
+
async for frame in self._stream_audio_frames_from_iterator(
|
|
560
|
+
response.content.iter_chunked(CHUNK_SIZE),
|
|
561
|
+
strip_wav_header=need_to_strip_wav_header,
|
|
562
|
+
):
|
|
563
|
+
await self.stop_ttfb_metrics()
|
|
564
|
+
yield frame
|
|
560
565
|
|
|
561
|
-
if len(chunk) > 0:
|
|
562
|
-
await self.stop_ttfb_metrics()
|
|
563
|
-
frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
|
|
564
|
-
yield frame
|
|
565
566
|
except Exception as e:
|
|
566
567
|
logger.exception(f"Error generating TTS: {e}")
|
|
567
568
|
yield ErrorFrame(error=f"Rime TTS error: {str(e)}")
|
pipecat/services/riva/stt.py
CHANGED
|
@@ -583,7 +583,9 @@ class RivaSegmentedSTTService(SegmentedSTTService):
|
|
|
583
583
|
self._config.language_code = self._language
|
|
584
584
|
|
|
585
585
|
@traced_stt
|
|
586
|
-
async def _handle_transcription(
|
|
586
|
+
async def _handle_transcription(
|
|
587
|
+
self, transcript: str, is_final: bool, language: Optional[Language] = None
|
|
588
|
+
):
|
|
587
589
|
"""Handle a transcription result with tracing."""
|
|
588
590
|
pass
|
|
589
591
|
|
pipecat/services/sarvam/tts.py
CHANGED
|
@@ -77,17 +77,29 @@ class SarvamHttpTTSService(TTSService):
|
|
|
77
77
|
|
|
78
78
|
Example::
|
|
79
79
|
|
|
80
|
-
tts =
|
|
80
|
+
tts = SarvamHttpTTSService(
|
|
81
81
|
api_key="your-api-key",
|
|
82
82
|
voice_id="anushka",
|
|
83
83
|
model="bulbul:v2",
|
|
84
84
|
aiohttp_session=session,
|
|
85
|
-
params=
|
|
85
|
+
params=SarvamHttpTTSService.InputParams(
|
|
86
86
|
language=Language.HI,
|
|
87
87
|
pitch=0.1,
|
|
88
88
|
pace=1.2
|
|
89
89
|
)
|
|
90
90
|
)
|
|
91
|
+
|
|
92
|
+
# For bulbul v3 beta with any speaker:
|
|
93
|
+
tts_v3 = SarvamHttpTTSService(
|
|
94
|
+
api_key="your-api-key",
|
|
95
|
+
voice_id="speaker_name",
|
|
96
|
+
model="bulbul:v3",
|
|
97
|
+
aiohttp_session=session,
|
|
98
|
+
params=SarvamHttpTTSService.InputParams(
|
|
99
|
+
language=Language.HI,
|
|
100
|
+
temperature=0.8
|
|
101
|
+
)
|
|
102
|
+
)
|
|
91
103
|
"""
|
|
92
104
|
|
|
93
105
|
class InputParams(BaseModel):
|
|
@@ -106,6 +118,14 @@ class SarvamHttpTTSService(TTSService):
|
|
|
106
118
|
pace: Optional[float] = Field(default=1.0, ge=0.3, le=3.0)
|
|
107
119
|
loudness: Optional[float] = Field(default=1.0, ge=0.1, le=3.0)
|
|
108
120
|
enable_preprocessing: Optional[bool] = False
|
|
121
|
+
temperature: Optional[float] = Field(
|
|
122
|
+
default=0.6,
|
|
123
|
+
ge=0.01,
|
|
124
|
+
le=1.0,
|
|
125
|
+
description="Controls the randomness of the output for bulbul v3 beta. "
|
|
126
|
+
"Lower values make the output more focused and deterministic, while "
|
|
127
|
+
"higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
|
|
128
|
+
)
|
|
109
129
|
|
|
110
130
|
def __init__(
|
|
111
131
|
self,
|
|
@@ -125,7 +145,7 @@ class SarvamHttpTTSService(TTSService):
|
|
|
125
145
|
api_key: Sarvam AI API subscription key.
|
|
126
146
|
aiohttp_session: Shared aiohttp session for making requests.
|
|
127
147
|
voice_id: Speaker voice ID (e.g., "anushka", "meera"). Defaults to "anushka".
|
|
128
|
-
model: TTS model to use ("bulbul:
|
|
148
|
+
model: TTS model to use ("bulbul:v2" or "bulbul:v3-beta" or "bulbul:v3"). Defaults to "bulbul:v2".
|
|
129
149
|
base_url: Sarvam AI API base URL. Defaults to "https://api.sarvam.ai".
|
|
130
150
|
sample_rate: Audio sample rate in Hz (8000, 16000, 22050, 24000). If None, uses default.
|
|
131
151
|
params: Additional voice and preprocessing parameters. If None, uses defaults.
|
|
@@ -139,16 +159,32 @@ class SarvamHttpTTSService(TTSService):
|
|
|
139
159
|
self._base_url = base_url
|
|
140
160
|
self._session = aiohttp_session
|
|
141
161
|
|
|
162
|
+
# Build base settings common to all models
|
|
142
163
|
self._settings = {
|
|
143
164
|
"language": (
|
|
144
165
|
self.language_to_service_language(params.language) if params.language else "en-IN"
|
|
145
166
|
),
|
|
146
|
-
"pitch": params.pitch,
|
|
147
|
-
"pace": params.pace,
|
|
148
|
-
"loudness": params.loudness,
|
|
149
167
|
"enable_preprocessing": params.enable_preprocessing,
|
|
150
168
|
}
|
|
151
169
|
|
|
170
|
+
# Add model-specific parameters
|
|
171
|
+
if model in ("bulbul:v3-beta", "bulbul:v3"):
|
|
172
|
+
self._settings.update(
|
|
173
|
+
{
|
|
174
|
+
"temperature": getattr(params, "temperature", 0.6),
|
|
175
|
+
"model": model,
|
|
176
|
+
}
|
|
177
|
+
)
|
|
178
|
+
else:
|
|
179
|
+
self._settings.update(
|
|
180
|
+
{
|
|
181
|
+
"pitch": params.pitch,
|
|
182
|
+
"pace": params.pace,
|
|
183
|
+
"loudness": params.loudness,
|
|
184
|
+
"model": model,
|
|
185
|
+
}
|
|
186
|
+
)
|
|
187
|
+
|
|
152
188
|
self.set_model_name(model)
|
|
153
189
|
self.set_voice(voice_id)
|
|
154
190
|
|
|
@@ -276,6 +312,18 @@ class SarvamTTSService(InterruptibleTTSService):
|
|
|
276
312
|
pace=1.2
|
|
277
313
|
)
|
|
278
314
|
)
|
|
315
|
+
|
|
316
|
+
# For bulbul v3 beta with any speaker and temperature:
|
|
317
|
+
# Note: pace and loudness are not supported for bulbul v3 and bulbul v3 beta
|
|
318
|
+
tts_v3 = SarvamTTSService(
|
|
319
|
+
api_key="your-api-key",
|
|
320
|
+
voice_id="speaker_name",
|
|
321
|
+
model="bulbul:v3",
|
|
322
|
+
params=SarvamTTSService.InputParams(
|
|
323
|
+
language=Language.HI,
|
|
324
|
+
temperature=0.8
|
|
325
|
+
)
|
|
326
|
+
)
|
|
279
327
|
"""
|
|
280
328
|
|
|
281
329
|
class InputParams(BaseModel):
|
|
@@ -311,6 +359,14 @@ class SarvamTTSService(InterruptibleTTSService):
|
|
|
311
359
|
output_audio_codec: Optional[str] = "linear16"
|
|
312
360
|
output_audio_bitrate: Optional[str] = "128k"
|
|
313
361
|
language: Optional[Language] = Language.EN
|
|
362
|
+
temperature: Optional[float] = Field(
|
|
363
|
+
default=0.6,
|
|
364
|
+
ge=0.01,
|
|
365
|
+
le=1.0,
|
|
366
|
+
description="Controls the randomness of the output for bulbul v3 beta. "
|
|
367
|
+
"Lower values make the output more focused and deterministic, while "
|
|
368
|
+
"higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
|
|
369
|
+
)
|
|
314
370
|
|
|
315
371
|
def __init__(
|
|
316
372
|
self,
|
|
@@ -330,6 +386,7 @@ class SarvamTTSService(InterruptibleTTSService):
|
|
|
330
386
|
Args:
|
|
331
387
|
api_key: Sarvam API key for authenticating TTS requests.
|
|
332
388
|
model: Identifier of the Sarvam speech model (default "bulbul:v2").
|
|
389
|
+
Supports "bulbul:v2", "bulbul:v3-beta" and "bulbul:v3".
|
|
333
390
|
voice_id: Voice identifier for synthesis (default "anushka").
|
|
334
391
|
url: WebSocket URL for connecting to the TTS backend (default production URL).
|
|
335
392
|
aiohttp_session: Optional shared aiohttp session. To maintain backward compatibility.
|
|
@@ -372,15 +429,12 @@ class SarvamTTSService(InterruptibleTTSService):
|
|
|
372
429
|
self._api_key = api_key
|
|
373
430
|
self.set_model_name(model)
|
|
374
431
|
self.set_voice(voice_id)
|
|
375
|
-
#
|
|
432
|
+
# Build base settings common to all models
|
|
376
433
|
self._settings = {
|
|
377
434
|
"target_language_code": (
|
|
378
435
|
self.language_to_service_language(params.language) if params.language else "en-IN"
|
|
379
436
|
),
|
|
380
|
-
"pitch": params.pitch,
|
|
381
|
-
"pace": params.pace,
|
|
382
437
|
"speaker": voice_id,
|
|
383
|
-
"loudness": params.loudness,
|
|
384
438
|
"speech_sample_rate": 0,
|
|
385
439
|
"enable_preprocessing": params.enable_preprocessing,
|
|
386
440
|
"min_buffer_size": params.min_buffer_size,
|
|
@@ -388,6 +442,24 @@ class SarvamTTSService(InterruptibleTTSService):
|
|
|
388
442
|
"output_audio_codec": params.output_audio_codec,
|
|
389
443
|
"output_audio_bitrate": params.output_audio_bitrate,
|
|
390
444
|
}
|
|
445
|
+
|
|
446
|
+
# Add model-specific parameters
|
|
447
|
+
if model in ("bulbul:v3-beta", "bulbul:v3"):
|
|
448
|
+
self._settings.update(
|
|
449
|
+
{
|
|
450
|
+
"temperature": getattr(params, "temperature", 0.6),
|
|
451
|
+
"model": model,
|
|
452
|
+
}
|
|
453
|
+
)
|
|
454
|
+
else:
|
|
455
|
+
self._settings.update(
|
|
456
|
+
{
|
|
457
|
+
"pitch": params.pitch,
|
|
458
|
+
"pace": params.pace,
|
|
459
|
+
"loudness": params.loudness,
|
|
460
|
+
"model": model,
|
|
461
|
+
}
|
|
462
|
+
)
|
|
391
463
|
self._started = False
|
|
392
464
|
|
|
393
465
|
self._receive_task = None
|
|
@@ -526,6 +598,7 @@ class SarvamTTSService(InterruptibleTTSService):
|
|
|
526
598
|
logger.debug("Connected to Sarvam TTS Websocket")
|
|
527
599
|
await self._send_config()
|
|
528
600
|
|
|
601
|
+
await self._call_event_handler("on_connected")
|
|
529
602
|
except Exception as e:
|
|
530
603
|
logger.error(f"{self} initialization error: {e}")
|
|
531
604
|
self._websocket = None
|
|
@@ -557,6 +630,10 @@ class SarvamTTSService(InterruptibleTTSService):
|
|
|
557
630
|
await self._websocket.close()
|
|
558
631
|
except Exception as e:
|
|
559
632
|
logger.error(f"{self} error closing websocket: {e}")
|
|
633
|
+
finally:
|
|
634
|
+
self._started = False
|
|
635
|
+
self._websocket = None
|
|
636
|
+
await self._call_event_handler("on_disconnected")
|
|
560
637
|
|
|
561
638
|
def _get_websocket(self):
|
|
562
639
|
if self._websocket:
|
|
@@ -577,6 +577,7 @@ class SpeechmaticsSTTService(STTService):
|
|
|
577
577
|
),
|
|
578
578
|
)
|
|
579
579
|
logger.debug(f"{self} Connected to Speechmatics STT service")
|
|
580
|
+
await self._call_event_handler("on_connected")
|
|
580
581
|
except Exception as e:
|
|
581
582
|
logger.error(f"{self} Error connecting to Speechmatics: {e}")
|
|
582
583
|
self._client = None
|
|
@@ -595,6 +596,7 @@ class SpeechmaticsSTTService(STTService):
|
|
|
595
596
|
logger.error(f"{self} Error closing Speechmatics client: {e}")
|
|
596
597
|
finally:
|
|
597
598
|
self._client = None
|
|
599
|
+
await self._call_event_handler("on_disconnected")
|
|
598
600
|
|
|
599
601
|
def _process_config(self) -> None:
|
|
600
602
|
"""Create a formatted STT transcription config.
|
|
@@ -618,7 +620,7 @@ class SpeechmaticsSTTService(STTService):
|
|
|
618
620
|
transcription_config.additional_vocab = [
|
|
619
621
|
{
|
|
620
622
|
"content": e.content,
|
|
621
|
-
"sounds_like": e.sounds_like,
|
|
623
|
+
**({"sounds_like": e.sounds_like} if e.sounds_like else {}),
|
|
622
624
|
}
|
|
623
625
|
for e in self._params.additional_vocab
|
|
624
626
|
]
|
pipecat/services/stt_service.py
CHANGED
|
@@ -36,6 +36,25 @@ class STTService(AIService):
|
|
|
36
36
|
Provides common functionality for STT services including audio passthrough,
|
|
37
37
|
muting, settings management, and audio processing. Subclasses must implement
|
|
38
38
|
the run_stt method to provide actual speech recognition.
|
|
39
|
+
|
|
40
|
+
Event handlers:
|
|
41
|
+
on_connected: Called when connected to the STT service.
|
|
42
|
+
on_disconnected: Called when disconnected from the STT service.
|
|
43
|
+
on_connection_error: Called when a connection error to the STT service occurs.
|
|
44
|
+
|
|
45
|
+
Example::
|
|
46
|
+
|
|
47
|
+
@stt.event_handler("on_connected")
|
|
48
|
+
async def on_connected(stt: STTService):
|
|
49
|
+
logger.debug(f"STT connected")
|
|
50
|
+
|
|
51
|
+
@stt.event_handler("on_disconnected")
|
|
52
|
+
async def on_disconnected(stt: STTService):
|
|
53
|
+
logger.debug(f"STT disconnected")
|
|
54
|
+
|
|
55
|
+
@stt.event_handler("on_connection_error")
|
|
56
|
+
async def on_connection_error(stt: STTService, error: str):
|
|
57
|
+
logger.error(f"STT connection error: {error}")
|
|
39
58
|
"""
|
|
40
59
|
|
|
41
60
|
def __init__(
|
|
@@ -66,6 +85,10 @@ class STTService(AIService):
|
|
|
66
85
|
self._voicemail_detect: bool = False
|
|
67
86
|
self._user_id: str = ""
|
|
68
87
|
|
|
88
|
+
self._register_event_handler("on_connected")
|
|
89
|
+
self._register_event_handler("on_disconnected")
|
|
90
|
+
self._register_event_handler("on_connection_error")
|
|
91
|
+
|
|
69
92
|
@property
|
|
70
93
|
def is_muted(self) -> bool:
|
|
71
94
|
"""Check if the STT service is currently muted.
|
|
@@ -307,15 +330,6 @@ class WebsocketSTTService(STTService, WebsocketService):
|
|
|
307
330
|
|
|
308
331
|
Combines STT functionality with websocket connectivity, providing automatic
|
|
309
332
|
error handling and reconnection capabilities.
|
|
310
|
-
|
|
311
|
-
Event handlers:
|
|
312
|
-
on_connection_error: Called when a websocket connection error occurs.
|
|
313
|
-
|
|
314
|
-
Example::
|
|
315
|
-
|
|
316
|
-
@stt.event_handler("on_connection_error")
|
|
317
|
-
async def on_connection_error(stt: STTService, error: str):
|
|
318
|
-
logger.error(f"STT connection error: {error}")
|
|
319
333
|
"""
|
|
320
334
|
|
|
321
335
|
def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
|
|
@@ -327,7 +341,6 @@ class WebsocketSTTService(STTService, WebsocketService):
|
|
|
327
341
|
"""
|
|
328
342
|
STTService.__init__(self, **kwargs)
|
|
329
343
|
WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
|
|
330
|
-
self._register_event_handler("on_connection_error")
|
|
331
344
|
|
|
332
345
|
async def _report_error(self, error: ErrorFrame):
|
|
333
346
|
await self._call_event_handler("on_connection_error", error.error)
|
pipecat/services/tts_service.py
CHANGED
|
@@ -8,7 +8,17 @@
|
|
|
8
8
|
|
|
9
9
|
import asyncio
|
|
10
10
|
from abc import abstractmethod
|
|
11
|
-
from typing import
|
|
11
|
+
from typing import (
|
|
12
|
+
Any,
|
|
13
|
+
AsyncGenerator,
|
|
14
|
+
AsyncIterator,
|
|
15
|
+
Dict,
|
|
16
|
+
List,
|
|
17
|
+
Mapping,
|
|
18
|
+
Optional,
|
|
19
|
+
Sequence,
|
|
20
|
+
Tuple,
|
|
21
|
+
)
|
|
12
22
|
|
|
13
23
|
from loguru import logger
|
|
14
24
|
|
|
@@ -49,6 +59,25 @@ class TTSService(AIService):
|
|
|
49
59
|
Provides common functionality for TTS services including text aggregation,
|
|
50
60
|
filtering, audio generation, and frame management. Supports configurable
|
|
51
61
|
sentence aggregation, silence insertion, and frame processing control.
|
|
62
|
+
|
|
63
|
+
Event handlers:
|
|
64
|
+
on_connected: Called when connected to the TTS service.
|
|
65
|
+
on_disconnected: Called when disconnected from the TTS service.
|
|
66
|
+
on_connection_error: Called when a connection error to the TTS service occurs.
|
|
67
|
+
|
|
68
|
+
Example::
|
|
69
|
+
|
|
70
|
+
@tts.event_handler("on_connected")
|
|
71
|
+
async def on_connected(tts: TTSService):
|
|
72
|
+
logger.debug(f"TTS connected")
|
|
73
|
+
|
|
74
|
+
@tts.event_handler("on_disconnected")
|
|
75
|
+
async def on_disconnected(tts: TTSService):
|
|
76
|
+
logger.debug(f"TTS disconnected")
|
|
77
|
+
|
|
78
|
+
@tts.event_handler("on_connection_error")
|
|
79
|
+
async def on_connection_error(tts: TTSService, error: str):
|
|
80
|
+
logger.error(f"TTS connection error: {error}")
|
|
52
81
|
"""
|
|
53
82
|
|
|
54
83
|
def __init__(
|
|
@@ -124,7 +153,6 @@ class TTSService(AIService):
|
|
|
124
153
|
|
|
125
154
|
self._tracing_enabled: bool = False
|
|
126
155
|
|
|
127
|
-
|
|
128
156
|
if text_filter:
|
|
129
157
|
import warnings
|
|
130
158
|
|
|
@@ -143,6 +171,10 @@ class TTSService(AIService):
|
|
|
143
171
|
|
|
144
172
|
self._processing_text: bool = False
|
|
145
173
|
|
|
174
|
+
self._register_event_handler("on_connected")
|
|
175
|
+
self._register_event_handler("on_disconnected")
|
|
176
|
+
self._register_event_handler("on_connection_error")
|
|
177
|
+
|
|
146
178
|
@property
|
|
147
179
|
def sample_rate(self) -> int:
|
|
148
180
|
"""Get the current sample rate for audio output.
|
|
@@ -384,6 +416,36 @@ class TTSService(AIService):
|
|
|
384
416
|
):
|
|
385
417
|
await self._stop_frame_queue.put(frame)
|
|
386
418
|
|
|
419
|
+
async def _stream_audio_frames_from_iterator(
|
|
420
|
+
self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
|
|
421
|
+
) -> AsyncGenerator[Frame, None]:
|
|
422
|
+
buffer = bytearray()
|
|
423
|
+
need_to_strip_wav_header = strip_wav_header
|
|
424
|
+
async for chunk in iterator:
|
|
425
|
+
if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
|
|
426
|
+
chunk = chunk[44:]
|
|
427
|
+
need_to_strip_wav_header = False
|
|
428
|
+
|
|
429
|
+
# Append to current buffer.
|
|
430
|
+
buffer.extend(chunk)
|
|
431
|
+
|
|
432
|
+
# Round to nearest even number.
|
|
433
|
+
aligned_length = len(buffer) & ~1 # 111111111...11110
|
|
434
|
+
if aligned_length > 0:
|
|
435
|
+
aligned_chunk = buffer[:aligned_length]
|
|
436
|
+
buffer = buffer[aligned_length:] # keep any leftover byte
|
|
437
|
+
|
|
438
|
+
if len(aligned_chunk) > 0:
|
|
439
|
+
frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1)
|
|
440
|
+
yield frame
|
|
441
|
+
|
|
442
|
+
if len(buffer) > 0:
|
|
443
|
+
# Make sure we don't need an extra padding byte.
|
|
444
|
+
if len(buffer) % 2 == 1:
|
|
445
|
+
buffer.extend(b"\x00")
|
|
446
|
+
frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
|
|
447
|
+
yield frame
|
|
448
|
+
|
|
387
449
|
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
388
450
|
self._processing_text = False
|
|
389
451
|
await self._text_aggregator.handle_interruption()
|
|
@@ -613,7 +675,6 @@ class WebsocketTTSService(TTSService, WebsocketService):
|
|
|
613
675
|
"""
|
|
614
676
|
TTSService.__init__(self, **kwargs)
|
|
615
677
|
WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
|
|
616
|
-
self._register_event_handler("on_connection_error")
|
|
617
678
|
|
|
618
679
|
async def _report_error(self, error: ErrorFrame):
|
|
619
680
|
await self._call_event_handler("on_connection_error", error.error)
|
|
@@ -665,15 +726,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
|
|
|
665
726
|
"""Base class for websocket-based TTS services that support word timestamps.
|
|
666
727
|
|
|
667
728
|
Combines word timestamp functionality with websocket connectivity.
|
|
668
|
-
|
|
669
|
-
Event handlers:
|
|
670
|
-
on_connection_error: Called when a websocket connection error occurs.
|
|
671
|
-
|
|
672
|
-
Example::
|
|
673
|
-
|
|
674
|
-
@tts.event_handler("on_connection_error")
|
|
675
|
-
async def on_connection_error(tts: TTSService, error: str):
|
|
676
|
-
logger.error(f"TTS connection error: {error}")
|
|
677
729
|
"""
|
|
678
730
|
|
|
679
731
|
def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
|
|
@@ -685,7 +737,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
|
|
|
685
737
|
"""
|
|
686
738
|
WordTTSService.__init__(self, **kwargs)
|
|
687
739
|
WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
|
|
688
|
-
self._register_event_handler("on_connection_error")
|
|
689
740
|
|
|
690
741
|
async def _report_error(self, error: ErrorFrame):
|
|
691
742
|
await self._call_event_handler("on_connection_error", error.error)
|
pipecat/transports/base_input.py
CHANGED
|
@@ -232,6 +232,9 @@ class BaseInputTransport(FrameProcessor):
|
|
|
232
232
|
"""
|
|
233
233
|
# Cancel and wait for the audio input task to finish.
|
|
234
234
|
await self._cancel_audio_task()
|
|
235
|
+
# Stop audio filter.
|
|
236
|
+
if self._params.audio_in_filter:
|
|
237
|
+
await self._params.audio_in_filter.stop()
|
|
235
238
|
|
|
236
239
|
async def set_transport_ready(self, frame: StartFrame):
|
|
237
240
|
"""Called when the transport is ready to stream.
|