dv-pipecat-ai 0.0.82.dev884__py3-none-any.whl → 0.0.85.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dv_pipecat_ai-0.0.82.dev884.dist-info → dv_pipecat_ai-0.0.85.dev5.dist-info}/METADATA +2 -1
- {dv_pipecat_ai-0.0.82.dev884.dist-info → dv_pipecat_ai-0.0.85.dev5.dist-info}/RECORD +23 -22
- pipecat/audio/vad/silero.py +1 -1
- pipecat/frames/frames.py +49 -0
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +22 -29
- pipecat/processors/aggregators/llm_response.py +1 -4
- pipecat/processors/dtmf_aggregator.py +175 -74
- pipecat/processors/filters/stt_mute_filter.py +15 -0
- pipecat/processors/user_idle_processor.py +32 -5
- pipecat/serializers/__init__.py +3 -1
- pipecat/serializers/convox.py +40 -3
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/plivo.py +4 -1
- pipecat/services/elevenlabs/stt.py +18 -8
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +13 -1
- pipecat/services/speechmatics/stt.py +16 -0
- pipecat/services/vistaar/llm.py +45 -7
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- {dv_pipecat_ai-0.0.82.dev884.dist-info → dv_pipecat_ai-0.0.85.dev5.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev884.dist-info → dv_pipecat_ai-0.0.85.dev5.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev884.dist-info → dv_pipecat_ai-0.0.85.dev5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024–2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Custom/External telephony serializer for Pipecat with Ringg AI WebSocket API. Customers will directly connect to Ringg AI WebSocket API."""
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import json
|
|
11
|
+
import uuid
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from loguru import logger
|
|
15
|
+
from pydantic import BaseModel
|
|
16
|
+
|
|
17
|
+
from pipecat.audio.utils import (
|
|
18
|
+
alaw_to_pcm,
|
|
19
|
+
create_stream_resampler,
|
|
20
|
+
pcm_to_alaw,
|
|
21
|
+
pcm_to_ulaw,
|
|
22
|
+
ulaw_to_pcm,
|
|
23
|
+
)
|
|
24
|
+
from pipecat.frames.frames import (
|
|
25
|
+
AudioRawFrame,
|
|
26
|
+
CallTransferFrame,
|
|
27
|
+
CancelFrame,
|
|
28
|
+
EndFrame,
|
|
29
|
+
Frame,
|
|
30
|
+
InputAudioRawFrame,
|
|
31
|
+
StartFrame,
|
|
32
|
+
StartInterruptionFrame,
|
|
33
|
+
TransportMessageFrame,
|
|
34
|
+
TransportMessageUrgentFrame,
|
|
35
|
+
)
|
|
36
|
+
from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class CustomFrameSerializer(FrameSerializer):
    """Serializer for Custom/External telephony WebSocket protocol (Ringg AI API).

    This serializer handles converting between Pipecat frames and the Ringg AI
    WebSocket protocol for external/custom telephony providers. It supports
    PCMU (μ-law), PCMA (A-law), and PCM codecs with automatic conversion.

    Supported events:
    - start: Initialize call with agent configuration
    - media: Bidirectional audio streaming
    - clear: Clear audio buffers (interruption)
    - call_transfer: Transfer call to another number
    - hang_up: End call notification

    Audio format:
    - Sample Rate: Configurable (default 8kHz)
    - Channels: Mono (1 channel)
    - Bit Depth: 16-bit
    - Encoding: Little-endian
    - Payload Encoding: Base64
    - Supported Codecs: PCMU (μ-law), PCMA (A-law), PCM (raw)
    """

    class InputParams(BaseModel):
        """Configuration parameters for CustomFrameSerializer.

        Parameters:
            custom_sample_rate: Sample rate used by external client, defaults to 8000 Hz.
            sample_rate: Optional override for pipeline input sample rate.
            codec: Audio codec - "pcmu" (μ-law), "pcma" (A-law), or "pcm" (raw PCM).
        """

        custom_sample_rate: int = 8000
        sample_rate: Optional[int] = None
        codec: str = "pcmu"  # "pcmu", "pcma" or "pcm" (compared case-insensitively)

    # Codec names with an explicit meaning. Any other value is handled exactly
    # like "pcm" (resample only), but a warning is logged at construction time.
    _KNOWN_CODECS = ("pcmu", "pcma", "pcm")

    def __init__(
        self, stream_sid: str, call_sid: Optional[str] = None, params: Optional[InputParams] = None
    ):
        """Initialize the CustomFrameSerializer.

        Args:
            stream_sid: The stream identifier from external client.
            call_sid: The call identifier from external client.
            params: Configuration parameters.
        """
        self._stream_sid = stream_sid
        self._call_sid = call_sid
        self._params = params or CustomFrameSerializer.InputParams()

        self._custom_sample_rate = self._params.custom_sample_rate
        self._sample_rate = 0  # Pipeline input rate, resolved in setup()
        self._codec = self._params.codec.lower()

        if self._codec not in self._KNOWN_CODECS:
            # Keep the historical fallback (raw PCM) but make the
            # misconfiguration visible instead of failing silently.
            logger.warning(
                f"Unknown codec '{self._params.codec}' for stream {self._stream_sid}; "
                "audio will be treated as raw PCM"
            )

        self._input_resampler = create_stream_resampler()
        self._output_resampler = create_stream_resampler()

    @property
    def type(self) -> FrameSerializerType:
        """Gets the serializer type.

        Returns:
            The serializer type, TEXT for JSON-based protocol.
        """
        return FrameSerializerType.TEXT

    async def setup(self, frame: StartFrame):
        """Sets up the serializer with pipeline configuration.

        Args:
            frame: The StartFrame containing pipeline configuration.
        """
        # An explicit override in the params wins over the pipeline's input rate.
        self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate

    async def _encode_audio(self, pcm: bytes, in_rate: int):
        """Convert pipeline PCM audio into the configured wire codec.

        Args:
            pcm: Raw PCM audio taken from the outgoing frame.
            in_rate: Sample rate of ``pcm``.

        Returns:
            Audio at the client sample rate in the configured codec.
        """
        if self._codec == "pcmu":
            return await pcm_to_ulaw(
                pcm, in_rate, self._custom_sample_rate, self._output_resampler
            )
        if self._codec == "pcma":
            return await pcm_to_alaw(
                pcm, in_rate, self._custom_sample_rate, self._output_resampler
            )
        # "pcm" (and any unrecognized codec): resample only.
        return await self._output_resampler.resample(pcm, in_rate, self._custom_sample_rate)

    async def _decode_audio(self, payload: bytes):
        """Convert wire audio in the configured codec into pipeline PCM.

        Args:
            payload: Base64-decoded audio bytes received from the client.

        Returns:
            PCM audio at the pipeline sample rate.
        """
        if self._codec == "pcmu":
            return await ulaw_to_pcm(
                payload, self._custom_sample_rate, self._sample_rate, self._input_resampler
            )
        if self._codec == "pcma":
            return await alaw_to_pcm(
                payload, self._custom_sample_rate, self._sample_rate, self._input_resampler
            )
        # "pcm" (and any unrecognized codec): resample only.
        return await self._input_resampler.resample(
            payload, self._custom_sample_rate, self._sample_rate
        )

    async def serialize(self, frame: Frame) -> str | bytes | None:
        """Serializes a Pipecat frame to Custom telephony WebSocket format.

        Handles conversion of various frame types to Ringg AI WebSocket messages.

        Args:
            frame: The Pipecat frame to serialize.

        Returns:
            Serialized data as JSON string, or None if the frame isn't handled.
        """
        if isinstance(frame, StartInterruptionFrame):
            # Send clear event to instruct client to discard buffered audio
            return json.dumps({"event": "clear", "stream_sid": self._stream_sid})

        if isinstance(frame, CallTransferFrame):
            # Send call_transfer event to transfer the call to another number.
            # Falls back to the stream id when no call id was supplied.
            return json.dumps(
                {
                    "event": "call_transfer",
                    "call_sid": self._call_sid or self._stream_sid,
                    "to": frame.target,
                }
            )

        if isinstance(frame, (EndFrame, CancelFrame)):
            # Send hang_up event to end the call
            return json.dumps({"event": "hang_up", "stream_sid": self._stream_sid})

        if isinstance(frame, AudioRawFrame):
            serialized_data = await self._encode_audio(frame.audio, frame.sample_rate)

            if serialized_data is None or len(serialized_data) == 0:
                # Skip if no audio data
                return None

            payload = base64.b64encode(serialized_data).decode("ascii")
            return json.dumps(
                {
                    "event": "media",
                    "stream_sid": self._stream_sid,
                    "media": {"payload": payload},
                }
            )

        if isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
            return json.dumps(frame.message)

        return None

    async def _deserialize_media(self, message: dict) -> Frame | None:
        """Build an input audio frame from a ``media`` event.

        Args:
            message: The parsed JSON object of the event.

        Returns:
            An InputAudioRawFrame, or None if the payload is missing, not valid
            base64, or yields no audio after conversion.
        """
        media = message.get("media")
        # A malformed "media" value (not an object) is treated as a missing payload.
        payload_base64 = media.get("payload") if isinstance(media, dict) else None

        if not payload_base64:
            logger.warning("Media event missing payload")
            return None

        try:
            payload = base64.b64decode(payload_base64)
        except (ValueError, TypeError) as e:
            # binascii.Error is a ValueError subclass; TypeError covers
            # payloads that are not str/bytes.
            logger.error(f"Failed to decode base64 payload: {e}")
            return None

        deserialized_data = await self._decode_audio(payload)

        if deserialized_data is None or len(deserialized_data) == 0:
            # Skip if no audio data
            return None

        return InputAudioRawFrame(
            audio=deserialized_data,
            num_channels=1,  # Mono audio
            sample_rate=self._sample_rate,
        )

    async def deserialize(self, data: str | bytes) -> Frame | None:
        """Deserializes Custom telephony WebSocket data to Pipecat frames.

        Handles conversion of Ringg AI WebSocket events to appropriate Pipecat frames.

        Args:
            data: The raw WebSocket data from external client.

        Returns:
            A Pipecat frame corresponding to the event, or None if unhandled.
        """
        try:
            message = json.loads(data)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            logger.error(f"Failed to parse JSON message: {e}")
            return None

        if not isinstance(message, dict):
            # Valid JSON that is not an object (list, string, number, ...)
            # cannot carry an event; drop it instead of raising AttributeError.
            logger.warning(
                f"Ignoring non-object JSON message of type {type(message).__name__} "
                f"for stream {self._stream_sid}"
            )
            return None

        event = message.get("event")

        if event == "media":
            return await self._deserialize_media(message)

        if event == "start":
            # Log start event but don't generate a frame (handled by WebSocketService)
            logger.debug(f"Received start event for stream {self._stream_sid}")
            return None

        if event == "clear":
            # External client requesting to clear our audio buffers
            logger.debug(f"Received clear event for stream {self._stream_sid}")
            return None

        logger.debug(f"Unhandled event type: {event} for stream {self._stream_sid}")
        return None
|
pipecat/serializers/plivo.py
CHANGED
|
@@ -178,7 +178,10 @@ class PlivoFrameSerializer(FrameSerializer):
|
|
|
178
178
|
return
|
|
179
179
|
|
|
180
180
|
# Plivo API endpoint for hanging up calls
|
|
181
|
-
|
|
181
|
+
if self._stream_id:
|
|
182
|
+
endpoint = f"https://api.plivo.com/v1/Account/{auth_id}/Call/{call_id}/Stream/{self._stream_id}/"
|
|
183
|
+
else:
|
|
184
|
+
endpoint = f"https://api.plivo.com/v1/Account/{auth_id}/Call/{call_id}/"
|
|
182
185
|
|
|
183
186
|
# Create basic auth from auth_id and auth_token
|
|
184
187
|
auth = aiohttp.BasicAuth(auth_id, auth_token)
|
|
@@ -199,6 +199,16 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
|
|
|
199
199
|
# Lithuanian
|
|
200
200
|
Language.LT: "lit",
|
|
201
201
|
Language.LT_LT: "lit",
|
|
202
|
+
Language.TA: "tam", # Tamil
|
|
203
|
+
Language.TA_IN: "tam", # Tamil
|
|
204
|
+
Language.TE: "tel", # Telugu
|
|
205
|
+
Language.TE_IN: "tel", # Telugu
|
|
206
|
+
Language.KN: "kan", # Kannada
|
|
207
|
+
Language.KN_IN: "kan", # Kannada
|
|
208
|
+
Language.ML: "mal", # Malayalam
|
|
209
|
+
Language.ML_IN: "mal", # Malayalam
|
|
210
|
+
Language.MR: "mar", # Marathi
|
|
211
|
+
Language.MR_IN: "mar", # Marathi
|
|
202
212
|
}
|
|
203
213
|
return language_map.get(language)
|
|
204
214
|
|
|
@@ -223,7 +233,7 @@ class ElevenlabsSTTService(SegmentedSTTService):
|
|
|
223
233
|
*,
|
|
224
234
|
api_key: str,
|
|
225
235
|
model_id: str = "scribe_v1",
|
|
226
|
-
language: Language =
|
|
236
|
+
language: Optional[Language] = None,
|
|
227
237
|
tag_audio_events: bool = False,
|
|
228
238
|
sample_rate: Optional[int] = None,
|
|
229
239
|
diarize: bool = False,
|
|
@@ -293,10 +303,6 @@ class ElevenlabsSTTService(SegmentedSTTService):
|
|
|
293
303
|
await self.start_ttfb_metrics()
|
|
294
304
|
|
|
295
305
|
# Get language code for ElevenLabs API
|
|
296
|
-
language = self._settings["language"]
|
|
297
|
-
elevenlabs_lang = self.language_to_service_language(language)
|
|
298
|
-
|
|
299
|
-
# Prepare API parameters
|
|
300
306
|
params = {
|
|
301
307
|
"file": audio,
|
|
302
308
|
"model_id": self._model_id,
|
|
@@ -304,9 +310,13 @@ class ElevenlabsSTTService(SegmentedSTTService):
|
|
|
304
310
|
"diarize": self._diarize,
|
|
305
311
|
}
|
|
306
312
|
|
|
307
|
-
|
|
308
|
-
if
|
|
309
|
-
|
|
313
|
+
language = self._settings["language"]
|
|
314
|
+
if language is not None:
|
|
315
|
+
elevenlabs_lang = self.language_to_service_language(language)
|
|
316
|
+
if elevenlabs_lang:
|
|
317
|
+
params["language_code"] = elevenlabs_lang
|
|
318
|
+
else:
|
|
319
|
+
params["language_code"] = None
|
|
310
320
|
|
|
311
321
|
# Call ElevenLabs STT API in thread pool to avoid blocking
|
|
312
322
|
transcription = await asyncio.to_thread(self._client.speech_to_text.convert, **params)
|
|
@@ -4,5 +4,12 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
import sys
|
|
7
8
|
|
|
9
|
+
from pipecat.services import DeprecatedModuleProxy
|
|
10
|
+
|
|
11
|
+
from .stt import *
|
|
8
12
|
from .tts import *
|
|
13
|
+
|
|
14
|
+
# Old
|
|
15
|
+
# The package re-exports both .stt and .tts, so the deprecation hint must name both.
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "sarvam", "sarvam.[stt,tts]")
|