dv-pipecat-ai 0.0.85.dev5__py3-none-any.whl → 0.0.85.dev698__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/METADATA +78 -117
- {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/RECORD +157 -123
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +5 -0
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +120 -87
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +12 -4
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +85 -24
- pipecat/processors/aggregators/dtmf_aggregator.py +28 -22
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_response.py +6 -7
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/filters/stt_mute_filter.py +2 -0
- pipecat/processors/frame_processor.py +103 -17
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +209 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +4 -4
- pipecat/processors/user_idle_processor.py +3 -6
- pipecat/runner/run.py +270 -50
- pipecat/runner/types.py +2 -0
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +6 -9
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/asyncai/tts.py +2 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +122 -97
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +367 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1155 -0
- pipecat/services/aws/stt.py +1 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +13 -355
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/tts.py +2 -2
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +636 -0
- pipecat/services/elevenlabs/__init__.py +2 -1
- pipecat/services/elevenlabs/stt.py +254 -276
- pipecat/services/elevenlabs/tts.py +5 -5
- pipecat/services/fish/tts.py +2 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +2 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +2 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +16 -8
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/playht/tts.py +31 -4
- pipecat/services/rime/tts.py +3 -4
- pipecat/services/sarvam/tts.py +2 -6
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +1 -7
- pipecat/services/stt_service.py +34 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +9 -9
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +4 -0
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +17 -42
- pipecat/transports/base_output.py +42 -26
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +98 -19
- pipecat/transports/smallwebrtc/request_handler.py +204 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/string.py +12 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/openai/tts.py
CHANGED
|
@@ -64,6 +64,7 @@ class OpenAITTSService(TTSService):
|
|
|
64
64
|
model: str = "gpt-4o-mini-tts",
|
|
65
65
|
sample_rate: Optional[int] = None,
|
|
66
66
|
instructions: Optional[str] = None,
|
|
67
|
+
speed: Optional[float] = None,
|
|
67
68
|
**kwargs,
|
|
68
69
|
):
|
|
69
70
|
"""Initialize OpenAI TTS service.
|
|
@@ -75,6 +76,7 @@ class OpenAITTSService(TTSService):
|
|
|
75
76
|
model: TTS model to use. Defaults to "gpt-4o-mini-tts".
|
|
76
77
|
sample_rate: Output audio sample rate in Hz. If None, uses OpenAI's default 24kHz.
|
|
77
78
|
instructions: Optional instructions to guide voice synthesis behavior.
|
|
79
|
+
speed: Voice speed control (0.25 to 4.0, default 1.0).
|
|
78
80
|
**kwargs: Additional keyword arguments passed to TTSService.
|
|
79
81
|
"""
|
|
80
82
|
if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
|
|
@@ -84,6 +86,7 @@ class OpenAITTSService(TTSService):
|
|
|
84
86
|
)
|
|
85
87
|
super().__init__(sample_rate=sample_rate, **kwargs)
|
|
86
88
|
|
|
89
|
+
self._speed = speed
|
|
87
90
|
self.set_model_name(model)
|
|
88
91
|
self.set_voice(voice)
|
|
89
92
|
self._instructions = instructions
|
|
@@ -133,17 +136,22 @@ class OpenAITTSService(TTSService):
|
|
|
133
136
|
try:
|
|
134
137
|
await self.start_ttfb_metrics()
|
|
135
138
|
|
|
136
|
-
# Setup
|
|
137
|
-
|
|
139
|
+
# Setup API parameters
|
|
140
|
+
create_params = {
|
|
141
|
+
"input": text,
|
|
142
|
+
"model": self.model_name,
|
|
143
|
+
"voice": VALID_VOICES[self._voice_id],
|
|
144
|
+
"response_format": "pcm",
|
|
145
|
+
}
|
|
146
|
+
|
|
138
147
|
if self._instructions:
|
|
139
|
-
|
|
148
|
+
create_params["instructions"] = self._instructions
|
|
149
|
+
|
|
150
|
+
if self._speed:
|
|
151
|
+
create_params["speed"] = self._speed
|
|
140
152
|
|
|
141
153
|
async with self._client.audio.speech.with_streaming_response.create(
|
|
142
|
-
|
|
143
|
-
model=self.model_name,
|
|
144
|
-
voice=VALID_VOICES[self._voice_id],
|
|
145
|
-
response_format="pcm",
|
|
146
|
-
extra_body=extra_body,
|
|
154
|
+
**create_params
|
|
147
155
|
) as r:
|
|
148
156
|
if r.status_code != 200:
|
|
149
157
|
error = await r.text()
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
import warnings
|
|
8
|
+
|
|
9
|
+
from pipecat.services.azure.realtime.llm import AzureRealtimeLLMService
|
|
10
|
+
from pipecat.services.openai.realtime.events import (
|
|
11
|
+
InputAudioNoiseReduction,
|
|
12
|
+
InputAudioTranscription,
|
|
13
|
+
SemanticTurnDetection,
|
|
14
|
+
SessionProperties,
|
|
15
|
+
TurnDetection,
|
|
16
|
+
)
|
|
17
|
+
from pipecat.services.openai.realtime.llm import OpenAIRealtimeLLMService
|
|
18
|
+
|
|
19
|
+
with warnings.catch_warnings():
|
|
20
|
+
warnings.simplefilter("always")
|
|
21
|
+
warnings.warn(
|
|
22
|
+
"Types in pipecat.services.openai_realtime are deprecated. "
|
|
23
|
+
"Please use the equivalent types from "
|
|
24
|
+
"pipecat.services.openai.realtime instead.",
|
|
25
|
+
DeprecationWarning,
|
|
26
|
+
stacklevel=2,
|
|
27
|
+
)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Azure OpenAI Realtime LLM service implementation."""
|
|
8
|
+
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
11
|
+
from pipecat.services.azure.realtime.llm import *
|
|
12
|
+
|
|
13
|
+
with warnings.catch_warnings():
|
|
14
|
+
warnings.simplefilter("always")
|
|
15
|
+
warnings.warn(
|
|
16
|
+
"Types in pipecat.services.openai_realtime.azure are deprecated. "
|
|
17
|
+
"Please use the equivalent types from "
|
|
18
|
+
"pipecat.services.azure.realtime.llm instead.",
|
|
19
|
+
DeprecationWarning,
|
|
20
|
+
stacklevel=2,
|
|
21
|
+
)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""OpenAI Realtime LLM context and aggregator implementations."""
|
|
8
|
+
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
11
|
+
from pipecat.services.openai.realtime.context import *
|
|
12
|
+
|
|
13
|
+
with warnings.catch_warnings():
|
|
14
|
+
warnings.simplefilter("always")
|
|
15
|
+
warnings.warn(
|
|
16
|
+
"Types in pipecat.services.openai_realtime.context are deprecated. "
|
|
17
|
+
"Please use the equivalent types from "
|
|
18
|
+
"pipecat.services.openai.realtime.context instead.",
|
|
19
|
+
DeprecationWarning,
|
|
20
|
+
stacklevel=2,
|
|
21
|
+
)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Event models and data structures for OpenAI Realtime API communication."""
|
|
8
|
+
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
11
|
+
from pipecat.services.openai.realtime.events import *
|
|
12
|
+
|
|
13
|
+
with warnings.catch_warnings():
|
|
14
|
+
warnings.simplefilter("always")
|
|
15
|
+
warnings.warn(
|
|
16
|
+
"Types in pipecat.services.openai_realtime.events are deprecated. "
|
|
17
|
+
"Please use the equivalent types from "
|
|
18
|
+
"pipecat.services.openai.realtime.events instead.",
|
|
19
|
+
DeprecationWarning,
|
|
20
|
+
stacklevel=2,
|
|
21
|
+
)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Custom frame types for OpenAI Realtime API integration."""
|
|
8
|
+
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
11
|
+
from pipecat.services.openai.realtime.frames import *
|
|
12
|
+
|
|
13
|
+
with warnings.catch_warnings():
|
|
14
|
+
warnings.simplefilter("always")
|
|
15
|
+
warnings.warn(
|
|
16
|
+
"Types in pipecat.services.openai_realtime.frames are deprecated. "
|
|
17
|
+
"Please use the equivalent types from "
|
|
18
|
+
"pipecat.services.openai.realtime.frames instead.",
|
|
19
|
+
DeprecationWarning,
|
|
20
|
+
stacklevel=2,
|
|
21
|
+
)
|
|
@@ -6,6 +6,8 @@
|
|
|
6
6
|
|
|
7
7
|
"""Azure OpenAI Realtime Beta LLM service implementation."""
|
|
8
8
|
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
9
11
|
from loguru import logger
|
|
10
12
|
|
|
11
13
|
from .openai import OpenAIRealtimeBetaLLMService
|
|
@@ -23,6 +25,10 @@ except ModuleNotFoundError as e:
|
|
|
23
25
|
class AzureRealtimeBetaLLMService(OpenAIRealtimeBetaLLMService):
|
|
24
26
|
"""Azure OpenAI Realtime Beta LLM service with Azure-specific authentication.
|
|
25
27
|
|
|
28
|
+
.. deprecated:: 0.0.84
|
|
29
|
+
`AzureRealtimeBetaLLMService` is deprecated, use `AzureRealtimeLLMService` instead.
|
|
30
|
+
This class will be removed in version 1.0.0.
|
|
31
|
+
|
|
26
32
|
Extends the OpenAI Realtime service to work with Azure OpenAI endpoints,
|
|
27
33
|
using Azure's authentication headers and endpoint format. Provides the same
|
|
28
34
|
real-time audio and text communication capabilities as the base OpenAI service.
|
|
@@ -44,6 +50,16 @@ class AzureRealtimeBetaLLMService(OpenAIRealtimeBetaLLMService):
|
|
|
44
50
|
**kwargs: Additional arguments passed to parent OpenAIRealtimeBetaLLMService.
|
|
45
51
|
"""
|
|
46
52
|
super().__init__(base_url=base_url, api_key=api_key, **kwargs)
|
|
53
|
+
|
|
54
|
+
with warnings.catch_warnings():
|
|
55
|
+
warnings.simplefilter("always")
|
|
56
|
+
warnings.warn(
|
|
57
|
+
"AzureRealtimeBetaLLMService is deprecated and will be removed in version 1.0.0. "
|
|
58
|
+
"Use AzureRealtimeLLMService instead.",
|
|
59
|
+
DeprecationWarning,
|
|
60
|
+
stacklevel=2,
|
|
61
|
+
)
|
|
62
|
+
|
|
47
63
|
self.api_key = api_key
|
|
48
64
|
self.base_url = base_url
|
|
49
65
|
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
import base64
|
|
10
10
|
import json
|
|
11
11
|
import time
|
|
12
|
+
import warnings
|
|
12
13
|
from dataclasses import dataclass
|
|
13
14
|
from typing import Optional
|
|
14
15
|
|
|
@@ -23,6 +24,7 @@ from pipecat.frames.frames import (
|
|
|
23
24
|
Frame,
|
|
24
25
|
InputAudioRawFrame,
|
|
25
26
|
InterimTranscriptionFrame,
|
|
27
|
+
InterruptionFrame,
|
|
26
28
|
LLMContextFrame,
|
|
27
29
|
LLMFullResponseEndFrame,
|
|
28
30
|
LLMFullResponseStartFrame,
|
|
@@ -31,7 +33,6 @@ from pipecat.frames.frames import (
|
|
|
31
33
|
LLMTextFrame,
|
|
32
34
|
LLMUpdateSettingsFrame,
|
|
33
35
|
StartFrame,
|
|
34
|
-
StartInterruptionFrame,
|
|
35
36
|
TranscriptionFrame,
|
|
36
37
|
TTSAudioRawFrame,
|
|
37
38
|
TTSStartedFrame,
|
|
@@ -92,6 +93,10 @@ class CurrentAudioResponse:
|
|
|
92
93
|
class OpenAIRealtimeBetaLLMService(LLMService):
|
|
93
94
|
"""OpenAI Realtime Beta LLM service providing real-time audio and text communication.
|
|
94
95
|
|
|
96
|
+
.. deprecated:: 0.0.84
|
|
97
|
+
`OpenAIRealtimeBetaLLMService` is deprecated, use `OpenAIRealtimeLLMService` instead.
|
|
98
|
+
This class will be removed in version 1.0.0.
|
|
99
|
+
|
|
95
100
|
Implements the OpenAI Realtime API Beta with WebSocket communication for low-latency
|
|
96
101
|
bidirectional audio and text interactions. Supports function calling, conversation
|
|
97
102
|
management, and real-time transcription.
|
|
@@ -124,6 +129,15 @@ class OpenAIRealtimeBetaLLMService(LLMService):
|
|
|
124
129
|
send_transcription_frames: Whether to emit transcription frames. Defaults to True.
|
|
125
130
|
**kwargs: Additional arguments passed to parent LLMService.
|
|
126
131
|
"""
|
|
132
|
+
with warnings.catch_warnings():
|
|
133
|
+
warnings.simplefilter("always")
|
|
134
|
+
warnings.warn(
|
|
135
|
+
"OpenAIRealtimeBetaLLMService is deprecated and will be removed in version 1.0.0. "
|
|
136
|
+
"Use OpenAIRealtimeLLMService instead.",
|
|
137
|
+
DeprecationWarning,
|
|
138
|
+
stacklevel=2,
|
|
139
|
+
)
|
|
140
|
+
|
|
127
141
|
full_url = f"{base_url}?model={model}"
|
|
128
142
|
super().__init__(base_url=full_url, **kwargs)
|
|
129
143
|
|
|
@@ -350,7 +364,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
|
|
|
350
364
|
elif isinstance(frame, InputAudioRawFrame):
|
|
351
365
|
if not self._audio_input_paused:
|
|
352
366
|
await self._send_user_audio(frame)
|
|
353
|
-
elif isinstance(frame, StartInterruptionFrame):
|
|
367
|
+
elif isinstance(frame, InterruptionFrame):
|
|
354
368
|
await self._handle_interruption()
|
|
355
369
|
elif isinstance(frame, UserStartedSpeakingFrame):
|
|
356
370
|
await self._handle_user_started_speaking(frame)
|
|
@@ -644,14 +658,12 @@ class OpenAIRealtimeBetaLLMService(LLMService):
|
|
|
644
658
|
|
|
645
659
|
async def _handle_evt_speech_started(self, evt):
|
|
646
660
|
await self._truncate_current_audio_response()
|
|
647
|
-
await self._start_interruption()
|
|
648
|
-
await self.push_frame(StartInterruptionFrame()) # cancels downstream tasks
|
|
661
|
+
await self.push_interruption_task_frame_and_wait()
|
|
649
662
|
await self.push_frame(UserStartedSpeakingFrame())
|
|
650
663
|
|
|
651
664
|
async def _handle_evt_speech_stopped(self, evt):
|
|
652
665
|
await self.start_ttfb_metrics()
|
|
653
666
|
await self.start_processing_metrics()
|
|
654
|
-
await self._stop_interruption()
|
|
655
667
|
await self.push_frame(UserStoppedSpeakingFrame())
|
|
656
668
|
|
|
657
669
|
async def _maybe_handle_evt_retrieve_conversation_item_error(self, evt: events.ErrorEvent):
|
pipecat/services/playht/tts.py
CHANGED
|
@@ -14,6 +14,7 @@ import io
|
|
|
14
14
|
import json
|
|
15
15
|
import struct
|
|
16
16
|
import uuid
|
|
17
|
+
import warnings
|
|
17
18
|
from typing import AsyncGenerator, Optional
|
|
18
19
|
|
|
19
20
|
import aiohttp
|
|
@@ -25,8 +26,8 @@ from pipecat.frames.frames import (
|
|
|
25
26
|
EndFrame,
|
|
26
27
|
ErrorFrame,
|
|
27
28
|
Frame,
|
|
29
|
+
InterruptionFrame,
|
|
28
30
|
StartFrame,
|
|
29
|
-
StartInterruptionFrame,
|
|
30
31
|
TTSAudioRawFrame,
|
|
31
32
|
TTSStartedFrame,
|
|
32
33
|
TTSStoppedFrame,
|
|
@@ -110,6 +111,11 @@ def language_to_playht_language(language: Language) -> Optional[str]:
|
|
|
110
111
|
class PlayHTTTSService(InterruptibleTTSService):
|
|
111
112
|
"""PlayHT WebSocket-based text-to-speech service.
|
|
112
113
|
|
|
114
|
+
.. deprecated:: 0.0.88
|
|
115
|
+
|
|
116
|
+
This class is deprecated and will be removed in a future version.
|
|
117
|
+
PlayHT is shutting down their API on December 31st, 2025.
|
|
118
|
+
|
|
113
119
|
Provides real-time text-to-speech synthesis using PlayHT's WebSocket API.
|
|
114
120
|
Supports streaming audio generation with configurable voice engines and
|
|
115
121
|
language settings.
|
|
@@ -158,6 +164,15 @@ class PlayHTTTSService(InterruptibleTTSService):
|
|
|
158
164
|
**kwargs,
|
|
159
165
|
)
|
|
160
166
|
|
|
167
|
+
with warnings.catch_warnings():
|
|
168
|
+
warnings.simplefilter("always")
|
|
169
|
+
warnings.warn(
|
|
170
|
+
"PlayHT is shutting down their API on December 31st, 2025. "
|
|
171
|
+
"'PlayHTTTSService' is deprecated and will be removed in a future version.",
|
|
172
|
+
DeprecationWarning,
|
|
173
|
+
stacklevel=2,
|
|
174
|
+
)
|
|
175
|
+
|
|
161
176
|
params = params or PlayHTTTSService.InputParams()
|
|
162
177
|
|
|
163
178
|
self._api_key = api_key
|
|
@@ -312,7 +327,7 @@ class PlayHTTTSService(InterruptibleTTSService):
|
|
|
312
327
|
return self._websocket
|
|
313
328
|
raise Exception("Websocket not connected")
|
|
314
329
|
|
|
315
|
-
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
|
|
330
|
+
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
316
331
|
"""Handle interruption by stopping metrics and clearing request ID."""
|
|
317
332
|
await super()._handle_interruption(frame, direction)
|
|
318
333
|
await self.stop_all_metrics()
|
|
@@ -401,6 +416,11 @@ class PlayHTTTSService(InterruptibleTTSService):
|
|
|
401
416
|
class PlayHTHttpTTSService(TTSService):
|
|
402
417
|
"""PlayHT HTTP-based text-to-speech service.
|
|
403
418
|
|
|
419
|
+
.. deprecated:: 0.0.88
|
|
420
|
+
|
|
421
|
+
This class is deprecated and will be removed in a future version.
|
|
422
|
+
PlayHT is shutting down their API on December 31st, 2025.
|
|
423
|
+
|
|
404
424
|
Provides text-to-speech synthesis using PlayHT's HTTP API for simpler,
|
|
405
425
|
non-streaming synthesis. Suitable for use cases where streaming is not
|
|
406
426
|
required and simpler integration is preferred.
|
|
@@ -454,8 +474,6 @@ class PlayHTHttpTTSService(TTSService):
|
|
|
454
474
|
|
|
455
475
|
# Warn about deprecated protocol parameter if explicitly provided
|
|
456
476
|
if protocol:
|
|
457
|
-
import warnings
|
|
458
|
-
|
|
459
477
|
with warnings.catch_warnings():
|
|
460
478
|
warnings.simplefilter("always")
|
|
461
479
|
warnings.warn(
|
|
@@ -464,6 +482,15 @@ class PlayHTHttpTTSService(TTSService):
|
|
|
464
482
|
stacklevel=2,
|
|
465
483
|
)
|
|
466
484
|
|
|
485
|
+
with warnings.catch_warnings():
|
|
486
|
+
warnings.simplefilter("always")
|
|
487
|
+
warnings.warn(
|
|
488
|
+
"PlayHT is shutting down their API on December 31st, 2025. "
|
|
489
|
+
"'PlayHTHttpTTSService' is deprecated and will be removed in a future version.",
|
|
490
|
+
DeprecationWarning,
|
|
491
|
+
stacklevel=2,
|
|
492
|
+
)
|
|
493
|
+
|
|
467
494
|
params = params or PlayHTHttpTTSService.InputParams()
|
|
468
495
|
|
|
469
496
|
self._user_id = user_id
|
pipecat/services/rime/tts.py
CHANGED
|
@@ -24,15 +24,14 @@ from pipecat.frames.frames import (
|
|
|
24
24
|
EndFrame,
|
|
25
25
|
ErrorFrame,
|
|
26
26
|
Frame,
|
|
27
|
+
InterruptionFrame,
|
|
27
28
|
StartFrame,
|
|
28
|
-
StartInterruptionFrame,
|
|
29
29
|
TTSAudioRawFrame,
|
|
30
30
|
TTSStartedFrame,
|
|
31
31
|
TTSStoppedFrame,
|
|
32
32
|
)
|
|
33
33
|
from pipecat.processors.frame_processor import FrameDirection
|
|
34
34
|
from pipecat.services.tts_service import AudioContextWordTTSService, TTSService
|
|
35
|
-
from pipecat.transcriptions import language
|
|
36
35
|
from pipecat.transcriptions.language import Language
|
|
37
36
|
from pipecat.utils.text.base_text_aggregator import BaseTextAggregator
|
|
38
37
|
from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
|
|
@@ -280,7 +279,7 @@ class RimeTTSService(AudioContextWordTTSService):
|
|
|
280
279
|
return self._websocket
|
|
281
280
|
raise Exception("Websocket not connected")
|
|
282
281
|
|
|
283
|
-
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
|
|
282
|
+
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
284
283
|
"""Handle interruption by clearing current context."""
|
|
285
284
|
await super()._handle_interruption(frame, direction)
|
|
286
285
|
await self.stop_all_metrics()
|
|
@@ -375,7 +374,7 @@ class RimeTTSService(AudioContextWordTTSService):
|
|
|
375
374
|
direction: The direction to push the frame.
|
|
376
375
|
"""
|
|
377
376
|
await super().push_frame(frame, direction)
|
|
378
|
-
if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
|
|
377
|
+
if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
|
|
379
378
|
if isinstance(frame, TTSStoppedFrame):
|
|
380
379
|
await self.add_word_timestamps([("Reset", 0)])
|
|
381
380
|
|
pipecat/services/sarvam/tts.py
CHANGED
|
@@ -20,6 +20,7 @@ from pipecat.frames.frames import (
|
|
|
20
20
|
EndFrame,
|
|
21
21
|
ErrorFrame,
|
|
22
22
|
Frame,
|
|
23
|
+
InterruptionFrame,
|
|
23
24
|
LLMFullResponseEndFrame,
|
|
24
25
|
StartFrame,
|
|
25
26
|
StartInterruptionFrame,
|
|
@@ -455,7 +456,7 @@ class SarvamTTSService(InterruptibleTTSService):
|
|
|
455
456
|
direction: The direction to push the frame.
|
|
456
457
|
"""
|
|
457
458
|
await super().push_frame(frame, direction)
|
|
458
|
-
if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
|
|
459
|
+
if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
|
|
459
460
|
self._started = False
|
|
460
461
|
|
|
461
462
|
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
|
@@ -632,11 +633,6 @@ class SarvamTTSService(InterruptibleTTSService):
|
|
|
632
633
|
"""
|
|
633
634
|
logger.debug(f"Generating TTS: [{text}]")
|
|
634
635
|
|
|
635
|
-
# Validate text input
|
|
636
|
-
if not text or not isinstance(text, str) or not text.strip():
|
|
637
|
-
logger.warning(f"Invalid text input for Sarvam TTS run_tts: {repr(text)}")
|
|
638
|
-
return
|
|
639
|
-
|
|
640
636
|
try:
|
|
641
637
|
if not self._websocket or self._websocket.state is State.CLOSED:
|
|
642
638
|
await self._connect()
|
pipecat/services/simli/video.py
CHANGED
|
@@ -15,8 +15,8 @@ from pipecat.frames.frames import (
|
|
|
15
15
|
CancelFrame,
|
|
16
16
|
EndFrame,
|
|
17
17
|
Frame,
|
|
18
|
+
InterruptionFrame,
|
|
18
19
|
OutputImageRawFrame,
|
|
19
|
-
StartInterruptionFrame,
|
|
20
20
|
TTSAudioRawFrame,
|
|
21
21
|
TTSStoppedFrame,
|
|
22
22
|
UserStartedSpeakingFrame,
|
|
@@ -179,7 +179,7 @@ class SimliVideoService(FrameProcessor):
|
|
|
179
179
|
return
|
|
180
180
|
elif isinstance(frame, (EndFrame, CancelFrame)):
|
|
181
181
|
await self._stop()
|
|
182
|
-
elif isinstance(frame, (StartInterruptionFrame, UserStartedSpeakingFrame)):
|
|
182
|
+
elif isinstance(frame, (InterruptionFrame, UserStartedSpeakingFrame)):
|
|
183
183
|
if not self._previously_interrupted:
|
|
184
184
|
await self._simli_client.clearBuffer()
|
|
185
185
|
self._previously_interrupted = self._is_trinity_avatar
|
|
@@ -19,7 +19,6 @@ from loguru import logger
|
|
|
19
19
|
from pydantic import BaseModel
|
|
20
20
|
|
|
21
21
|
from pipecat.frames.frames import (
|
|
22
|
-
BotInterruptionFrame,
|
|
23
22
|
CancelFrame,
|
|
24
23
|
EndFrame,
|
|
25
24
|
ErrorFrame,
|
|
@@ -749,14 +748,13 @@ class SpeechmaticsSTTService(STTService):
|
|
|
749
748
|
return
|
|
750
749
|
|
|
751
750
|
# Frames to send
|
|
752
|
-
upstream_frames: list[Frame] = []
|
|
753
751
|
downstream_frames: list[Frame] = []
|
|
754
752
|
|
|
755
753
|
# If VAD is enabled, then send a speaking frame
|
|
756
754
|
if self._params.enable_vad and not self._is_speaking:
|
|
757
755
|
logger.debug("User started speaking")
|
|
758
756
|
self._is_speaking = True
|
|
759
|
-
|
|
757
|
+
await self.push_interruption_task_frame_and_wait()
|
|
760
758
|
downstream_frames += [UserStartedSpeakingFrame()]
|
|
761
759
|
|
|
762
760
|
# If final, then re-parse into TranscriptionFrame
|
|
@@ -794,10 +792,6 @@ class SpeechmaticsSTTService(STTService):
|
|
|
794
792
|
self._is_speaking = False
|
|
795
793
|
downstream_frames += [UserStoppedSpeakingFrame()]
|
|
796
794
|
|
|
797
|
-
# Send UPSTREAM frames
|
|
798
|
-
for frame in upstream_frames:
|
|
799
|
-
await self.push_frame(frame, FrameDirection.UPSTREAM)
|
|
800
|
-
|
|
801
795
|
# Send the DOWNSTREAM frames
|
|
802
796
|
for frame in downstream_frames:
|
|
803
797
|
await self.push_frame(frame, FrameDirection.DOWNSTREAM)
|
pipecat/services/stt_service.py
CHANGED
|
@@ -16,6 +16,7 @@ from loguru import logger
|
|
|
16
16
|
from pipecat.frames.frames import (
|
|
17
17
|
AudioRawFrame,
|
|
18
18
|
BotStoppedSpeakingFrame,
|
|
19
|
+
ErrorFrame,
|
|
19
20
|
Frame,
|
|
20
21
|
StartFrame,
|
|
21
22
|
STTMuteFrame,
|
|
@@ -25,6 +26,7 @@ from pipecat.frames.frames import (
|
|
|
25
26
|
)
|
|
26
27
|
from pipecat.processors.frame_processor import FrameDirection
|
|
27
28
|
from pipecat.services.ai_service import AIService
|
|
29
|
+
from pipecat.services.websocket_service import WebsocketService
|
|
28
30
|
from pipecat.transcriptions.language import Language
|
|
29
31
|
|
|
30
32
|
|
|
@@ -298,3 +300,35 @@ class SegmentedSTTService(STTService):
|
|
|
298
300
|
if not self._user_speaking and len(self._audio_buffer) > self._audio_buffer_size_1s:
|
|
299
301
|
discarded = len(self._audio_buffer) - self._audio_buffer_size_1s
|
|
300
302
|
self._audio_buffer = self._audio_buffer[discarded:]
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
class WebsocketSTTService(STTService, WebsocketService):
|
|
306
|
+
"""Base class for websocket-based STT services.
|
|
307
|
+
|
|
308
|
+
Combines STT functionality with websocket connectivity, providing automatic
|
|
309
|
+
error handling and reconnection capabilities.
|
|
310
|
+
|
|
311
|
+
Event handlers:
|
|
312
|
+
on_connection_error: Called when a websocket connection error occurs.
|
|
313
|
+
|
|
314
|
+
Example::
|
|
315
|
+
|
|
316
|
+
@stt.event_handler("on_connection_error")
|
|
317
|
+
async def on_connection_error(stt: STTService, error: str):
|
|
318
|
+
logger.error(f"STT connection error: {error}")
|
|
319
|
+
"""
|
|
320
|
+
|
|
321
|
+
def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
|
|
322
|
+
"""Initialize the Websocket STT service.
|
|
323
|
+
|
|
324
|
+
Args:
|
|
325
|
+
reconnect_on_error: Whether to automatically reconnect on websocket errors.
|
|
326
|
+
**kwargs: Additional arguments passed to parent classes.
|
|
327
|
+
"""
|
|
328
|
+
STTService.__init__(self, **kwargs)
|
|
329
|
+
WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
|
|
330
|
+
self._register_event_handler("on_connection_error")
|
|
331
|
+
|
|
332
|
+
async def _report_error(self, error: ErrorFrame):
|
|
333
|
+
await self._call_event_handler("on_connection_error", error.error)
|
|
334
|
+
await self.push_error(error)
|
pipecat/services/tavus/video.py
CHANGED
|
@@ -23,12 +23,12 @@ from pipecat.frames.frames import (
|
|
|
23
23
|
CancelFrame,
|
|
24
24
|
EndFrame,
|
|
25
25
|
Frame,
|
|
26
|
+
InterruptionFrame,
|
|
26
27
|
OutputAudioRawFrame,
|
|
27
28
|
OutputImageRawFrame,
|
|
28
29
|
OutputTransportReadyFrame,
|
|
29
30
|
SpeechOutputAudioRawFrame,
|
|
30
31
|
StartFrame,
|
|
31
|
-
StartInterruptionFrame,
|
|
32
32
|
TTSAudioRawFrame,
|
|
33
33
|
TTSStartedFrame,
|
|
34
34
|
)
|
|
@@ -222,7 +222,7 @@ class TavusVideoService(AIService):
|
|
|
222
222
|
"""
|
|
223
223
|
await super().process_frame(frame, direction)
|
|
224
224
|
|
|
225
|
-
if isinstance(frame, StartInterruptionFrame):
|
|
225
|
+
if isinstance(frame, InterruptionFrame):
|
|
226
226
|
await self._handle_interruptions()
|
|
227
227
|
await self.push_frame(frame, direction)
|
|
228
228
|
elif isinstance(frame, TTSAudioRawFrame):
|
pipecat/services/tts_service.py
CHANGED
|
@@ -20,10 +20,10 @@ from pipecat.frames.frames import (
|
|
|
20
20
|
ErrorFrame,
|
|
21
21
|
Frame,
|
|
22
22
|
InterimTranscriptionFrame,
|
|
23
|
+
InterruptionFrame,
|
|
23
24
|
LLMFullResponseEndFrame,
|
|
24
25
|
LLMFullResponseStartFrame,
|
|
25
26
|
StartFrame,
|
|
26
|
-
StartInterruptionFrame,
|
|
27
27
|
TextFrame,
|
|
28
28
|
TranscriptionFrame,
|
|
29
29
|
TTSAudioRawFrame,
|
|
@@ -319,7 +319,7 @@ class TTSService(AIService):
|
|
|
319
319
|
and not isinstance(frame, TranscriptionFrame)
|
|
320
320
|
):
|
|
321
321
|
await self._process_text_frame(frame)
|
|
322
|
-
elif isinstance(frame, StartInterruptionFrame):
|
|
322
|
+
elif isinstance(frame, InterruptionFrame):
|
|
323
323
|
await self._handle_interruption(frame, direction)
|
|
324
324
|
await self.push_frame(frame, direction)
|
|
325
325
|
elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
|
|
@@ -377,14 +377,14 @@ class TTSService(AIService):
|
|
|
377
377
|
await super().push_frame(frame, direction)
|
|
378
378
|
|
|
379
379
|
if self._push_stop_frames and (
|
|
380
|
-
isinstance(frame, StartInterruptionFrame)
|
|
380
|
+
isinstance(frame, InterruptionFrame)
|
|
381
381
|
or isinstance(frame, TTSStartedFrame)
|
|
382
382
|
or isinstance(frame, TTSAudioRawFrame)
|
|
383
383
|
or isinstance(frame, TTSStoppedFrame)
|
|
384
384
|
):
|
|
385
385
|
await self._stop_frame_queue.put(frame)
|
|
386
386
|
|
|
387
|
-
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
|
|
387
|
+
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
388
388
|
self._processing_text = False
|
|
389
389
|
await self._text_aggregator.handle_interruption()
|
|
390
390
|
for filter in self._text_filters:
|
|
@@ -465,7 +465,7 @@ class TTSService(AIService):
|
|
|
465
465
|
)
|
|
466
466
|
if isinstance(frame, TTSStartedFrame):
|
|
467
467
|
has_started = True
|
|
468
|
-
elif isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
|
|
468
|
+
elif isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
|
|
469
469
|
has_started = False
|
|
470
470
|
except asyncio.TimeoutError:
|
|
471
471
|
if has_started:
|
|
@@ -550,7 +550,7 @@ class WordTTSService(TTSService):
|
|
|
550
550
|
elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
|
|
551
551
|
await self.flush_audio()
|
|
552
552
|
|
|
553
|
-
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
|
|
553
|
+
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
554
554
|
await super()._handle_interruption(frame, direction)
|
|
555
555
|
self._llm_response_started = False
|
|
556
556
|
self.reset_word_timestamps()
|
|
@@ -640,7 +640,7 @@ class InterruptibleTTSService(WebsocketTTSService):
|
|
|
640
640
|
# user interrupts we need to reconnect.
|
|
641
641
|
self._bot_speaking = False
|
|
642
642
|
|
|
643
|
-
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
|
|
643
|
+
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
644
644
|
await super()._handle_interruption(frame, direction)
|
|
645
645
|
if self._bot_speaking:
|
|
646
646
|
await self._disconnect()
|
|
@@ -712,7 +712,7 @@ class InterruptibleWordTTSService(WebsocketWordTTSService):
|
|
|
712
712
|
# user interrupts we need to reconnect.
|
|
713
713
|
self._bot_speaking = False
|
|
714
714
|
|
|
715
|
-
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
|
|
715
|
+
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
716
716
|
await super()._handle_interruption(frame, direction)
|
|
717
717
|
if self._bot_speaking:
|
|
718
718
|
await self._disconnect()
|
|
@@ -840,7 +840,7 @@ class AudioContextWordTTSService(WebsocketWordTTSService):
|
|
|
840
840
|
await super().cancel(frame)
|
|
841
841
|
await self._stop_audio_context_task()
|
|
842
842
|
|
|
843
|
-
async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
|
|
843
|
+
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
844
844
|
await super()._handle_interruption(frame, direction)
|
|
845
845
|
await self._stop_audio_context_task()
|
|
846
846
|
self._create_audio_context_task()
|