dv-pipecat-ai 0.0.82.dev884__py3-none-any.whl → 0.0.85.dev5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,257 @@
+ #
+ # Copyright (c) 2024–2025, Daily
+ #
+ # SPDX-License-Identifier: BSD 2-Clause License
+ #
+
+ """Custom/External telephony serializer for Pipecat using the Ringg AI WebSocket API. Customers connect directly to the Ringg AI WebSocket API."""
+
+ import base64
+ import json
+ import uuid
+ from typing import Optional
+
+ from loguru import logger
+ from pydantic import BaseModel
+
+ from pipecat.audio.utils import (
+     alaw_to_pcm,
+     create_stream_resampler,
+     pcm_to_alaw,
+     pcm_to_ulaw,
+     ulaw_to_pcm,
+ )
+ from pipecat.frames.frames import (
+     AudioRawFrame,
+     CallTransferFrame,
+     CancelFrame,
+     EndFrame,
+     Frame,
+     InputAudioRawFrame,
+     StartFrame,
+     StartInterruptionFrame,
+     TransportMessageFrame,
+     TransportMessageUrgentFrame,
+ )
+ from pipecat.serializers.base_serializer import FrameSerializer, FrameSerializerType
+
+
+ class CustomFrameSerializer(FrameSerializer):
+     """Serializer for Custom/External telephony WebSocket protocol (Ringg AI API).
+
+     This serializer handles converting between Pipecat frames and the Ringg AI
+     WebSocket protocol for external/custom telephony providers. It supports
+     PCMU (μ-law), PCMA (A-law), and PCM codecs with automatic conversion.
+
+     Supported events:
+     - start: Initialize call with agent configuration
+     - media: Bidirectional audio streaming
+     - clear: Clear audio buffers (interruption)
+     - call_transfer: Transfer call to another number
+     - hang_up: End call notification
+
+     Audio format:
+     - Sample Rate: Configurable (default 8kHz)
+     - Channels: Mono (1 channel)
+     - Bit Depth: 16-bit
+     - Encoding: Little-endian
+     - Payload Encoding: Base64
+     - Supported Codecs: PCMU (μ-law), PCMA (A-law), PCM (raw)
+     """
+
+     class InputParams(BaseModel):
+         """Configuration parameters for CustomFrameSerializer.
+
+         Parameters:
+             custom_sample_rate: Sample rate used by external client, defaults to 8000 Hz.
+             sample_rate: Optional override for pipeline input sample rate.
+             codec: Audio codec - "pcmu" (μ-law), "pcma" (A-law), or "pcm" (raw PCM).
+         """
+
+         custom_sample_rate: int = 8000
+         sample_rate: Optional[int] = None
+         codec: str = "pcmu"  # "pcmu", "pcma", or "pcm"
+
+     def __init__(
+         self, stream_sid: str, call_sid: Optional[str] = None, params: Optional[InputParams] = None
+     ):
+         """Initialize the CustomFrameSerializer.
+
+         Args:
+             stream_sid: The stream identifier from external client.
+             call_sid: The call identifier from external client.
+             params: Configuration parameters.
+         """
+         self._stream_sid = stream_sid
+         self._call_sid = call_sid
+         self._params = params or CustomFrameSerializer.InputParams()
+
+         self._custom_sample_rate = self._params.custom_sample_rate
+         self._sample_rate = 0  # Pipeline input rate
+         self._codec = self._params.codec.lower()
+
+         self._input_resampler = create_stream_resampler()
+         self._output_resampler = create_stream_resampler()
+
+     @property
+     def type(self) -> FrameSerializerType:
+         """Gets the serializer type.
+
+         Returns:
+             The serializer type, TEXT for JSON-based protocol.
+         """
+         return FrameSerializerType.TEXT
+
+     async def setup(self, frame: StartFrame):
+         """Sets up the serializer with pipeline configuration.
+
+         Args:
+             frame: The StartFrame containing pipeline configuration.
+         """
+         self._sample_rate = self._params.sample_rate or frame.audio_in_sample_rate
+
+     async def serialize(self, frame: Frame) -> str | bytes | None:
+         """Serializes a Pipecat frame to Custom telephony WebSocket format.
+
+         Handles conversion of various frame types to Ringg AI WebSocket messages.
+
+         Args:
+             frame: The Pipecat frame to serialize.
+
+         Returns:
+             Serialized data as JSON string, or None if the frame isn't handled.
+         """
+         if isinstance(frame, StartInterruptionFrame):
+             # Send clear event to instruct client to discard buffered audio
+             answer = {"event": "clear", "stream_sid": self._stream_sid}
+             return json.dumps(answer)
+
+         elif isinstance(frame, CallTransferFrame):
+             # Send call_transfer event to transfer the call to another number
+             answer = {
+                 "event": "call_transfer",
+                 "call_sid": self._call_sid or self._stream_sid,
+                 "to": frame.target,
+             }
+             return json.dumps(answer)
+
+         elif isinstance(frame, (EndFrame, CancelFrame)):
+             # Send hang_up event to end the call
+             answer = {"event": "hang_up", "stream_sid": self._stream_sid}
+             return json.dumps(answer)
+
+         elif isinstance(frame, AudioRawFrame):
+             data = frame.audio
+
+             # Convert audio based on codec
+             if self._codec == "pcmu":
+                 # Convert PCM to μ-law for PCMU codec
+                 serialized_data = await pcm_to_ulaw(
+                     data, frame.sample_rate, self._custom_sample_rate, self._output_resampler
+                 )
+             elif self._codec == "pcma":
+                 # Convert PCM to A-law for PCMA codec
+                 serialized_data = await pcm_to_alaw(
+                     data, frame.sample_rate, self._custom_sample_rate, self._output_resampler
+                 )
+             else:  # pcm
+                 # Resample PCM to target sample rate
+                 serialized_data = await self._output_resampler.resample(
+                     data, frame.sample_rate, self._custom_sample_rate
+                 )
+
+             if serialized_data is None or len(serialized_data) == 0:
+                 # Skip if no audio data
+                 return None
+
+             payload = base64.b64encode(serialized_data).decode("ascii")
+             answer = {
+                 "event": "media",
+                 "stream_sid": self._stream_sid,
+                 "media": {"payload": payload},
+             }
+
+             return json.dumps(answer)
+
+         elif isinstance(frame, (TransportMessageFrame, TransportMessageUrgentFrame)):
+             return json.dumps(frame.message)
+
+         return None
+
+     async def deserialize(self, data: str | bytes) -> Frame | None:
+         """Deserializes Custom telephony WebSocket data to Pipecat frames.
+
+         Handles conversion of Ringg AI WebSocket events to appropriate Pipecat frames.
+
+         Args:
+             data: The raw WebSocket data from external client.
+
+         Returns:
+             A Pipecat frame corresponding to the event, or None if unhandled.
+         """
+         try:
+             message = json.loads(data)
+         except json.JSONDecodeError as e:
+             logger.error(f"Failed to parse JSON message: {e}")
+             return None
+
+         event = message.get("event")
+
+         if event == "media":
+             media = message.get("media", {})
+             payload_base64 = media.get("payload")
+             uuid = message.get("uuid")
+
+             if not payload_base64:
+                 logger.warning("Media event missing payload")
+                 return None
+
+             try:
+                 payload = base64.b64decode(payload_base64)
+             except Exception as e:
+                 logger.error(f"Failed to decode base64 payload: {e}")
+                 return None
+
+             # Convert audio based on codec
+             if self._codec == "pcmu":
+                 # Convert μ-law to PCM
+                 deserialized_data = await ulaw_to_pcm(
+                     payload, self._custom_sample_rate, self._sample_rate, self._input_resampler
+                 )
+             elif self._codec == "pcma":
+                 # Convert A-law to PCM
+                 deserialized_data = await alaw_to_pcm(
+                     payload, self._custom_sample_rate, self._sample_rate, self._input_resampler
+                 )
+             else:  # pcm
+                 # Resample PCM to pipeline sample rate
+                 deserialized_data = await self._input_resampler.resample(
+                     payload,
+                     self._custom_sample_rate,
+                     self._sample_rate,
+                 )
+
+             if deserialized_data is None or len(deserialized_data) == 0:
+                 # Skip if no audio data
+                 return None
+
+             audio_frame = InputAudioRawFrame(
+                 audio=deserialized_data,
+                 num_channels=1,  # Mono audio
+                 sample_rate=self._sample_rate,
+             )
+             return audio_frame
+
+         elif event == "start":
+             # Log start event but don't generate a frame (handled by WebSocketService)
+             logger.debug(f"Received start event for stream {self._stream_sid}")
+             return None
+
+         elif event == "clear":
+             # External client requesting to clear our audio buffers
+             logger.debug(f"Received clear event for stream {self._stream_sid}")
+             return None
+
+         else:
+             logger.debug(f"Unhandled event type: {event} for stream {self._stream_sid}")
+             return None
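The serializer above maps Pipecat control frames onto Ringg AI events ("clear", "call_transfer", "hang_up") and audio frames onto base64 "media" payloads. As a rough illustration of the control-event side, the sketch below constructs the serializer and serializes an interruption and an end-of-call frame. The import path, stream/call identifiers, and codec values are assumptions for illustration only; they are not taken from the package.

    import asyncio
    import json

    from pipecat.frames.frames import EndFrame, StartInterruptionFrame

    # Import path is assumed; adjust to wherever this package exposes CustomFrameSerializer.
    from pipecat.serializers.custom import CustomFrameSerializer


    async def main():
        serializer = CustomFrameSerializer(
            stream_sid="stream-123",  # placeholder identifiers
            call_sid="call-456",
            params=CustomFrameSerializer.InputParams(codec="pcmu", custom_sample_rate=8000),
        )

        # An interruption is serialized as a "clear" event for the external client.
        print(json.loads(await serializer.serialize(StartInterruptionFrame())))
        # -> {'event': 'clear', 'stream_sid': 'stream-123'}

        # EndFrame/CancelFrame become a "hang_up" event.
        print(json.loads(await serializer.serialize(EndFrame())))
        # -> {'event': 'hang_up', 'stream_sid': 'stream-123'}


    asyncio.run(main())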
@@ -178,7 +178,10 @@ class PlivoFrameSerializer(FrameSerializer):
                  return

              # Plivo API endpoint for hanging up calls
-             endpoint = f"https://api.plivo.com/v1/Account/{auth_id}/Call/{call_id}/"
+             if self._stream_id:
+                 endpoint = f"https://api.plivo.com/v1/Account/{auth_id}/Call/{call_id}/Stream/{self._stream_id}/"
+             else:
+                 endpoint = f"https://api.plivo.com/v1/Account/{auth_id}/Call/{call_id}/"

              # Create basic auth from auth_id and auth_token
              auth = aiohttp.BasicAuth(auth_id, auth_token)
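With this change, hang-up requests target the specific audio stream when a stream_id is known and fall back to the call-level endpoint otherwise. Below is a minimal sketch of how such a request might be issued with aiohttp; only the endpoint selection and the basic auth come from the diff, while the DELETE verb and the response handling are assumptions based on Plivo's REST API conventions.

    from typing import Optional

    import aiohttp


    async def hang_up(auth_id: str, auth_token: str, call_id: str, stream_id: Optional[str] = None) -> int:
        # Prefer the stream-scoped endpoint when a stream id is available,
        # mirroring the conditional added in PlivoFrameSerializer above.
        if stream_id:
            endpoint = f"https://api.plivo.com/v1/Account/{auth_id}/Call/{call_id}/Stream/{stream_id}/"
        else:
            endpoint = f"https://api.plivo.com/v1/Account/{auth_id}/Call/{call_id}/"

        auth = aiohttp.BasicAuth(auth_id, auth_token)
        async with aiohttp.ClientSession() as session:
            # DELETE is assumed here; Plivo uses it for hanging up calls and stopping streams.
            async with session.delete(endpoint, auth=auth) as response:
                return response.status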
@@ -199,6 +199,16 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
          # Lithuanian
          Language.LT: "lit",
          Language.LT_LT: "lit",
+         Language.TA: "tam",  # Tamil
+         Language.TA_IN: "tam",  # Tamil
+         Language.TE: "tel",  # Telugu
+         Language.TE_IN: "tel",  # Telugu
+         Language.KN: "kan",  # Kannada
+         Language.KN_IN: "kan",  # Kannada
+         Language.ML: "mal",  # Malayalam
+         Language.ML_IN: "mal",  # Malayalam
+         Language.MR: "mar",  # Marathi
+         Language.MR_IN: "mar",  # Marathi
      }
      return language_map.get(language)
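The mapping gains five Indian languages (Tamil, Telugu, Kannada, Malayalam, Marathi) in both their generic and *_IN regional variants, each resolving to an ISO 639-3-style code. A quick sanity-check sketch, with the helper's module path assumed:

    from pipecat.services.elevenlabs.stt import language_to_elevenlabs_language  # path assumed
    from pipecat.transcriptions.language import Language

    assert language_to_elevenlabs_language(Language.TA_IN) == "tam"
    assert language_to_elevenlabs_language(Language.KN) == "kan"
    assert language_to_elevenlabs_language(Language.MR_IN) == "mar"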
@@ -223,7 +233,7 @@ class ElevenlabsSTTService(SegmentedSTTService):
          *,
          api_key: str,
          model_id: str = "scribe_v1",
-         language: Language = Language.EN,
+         language: Optional[Language] = None,
          tag_audio_events: bool = False,
          sample_rate: Optional[int] = None,
          diarize: bool = False,
@@ -293,10 +303,6 @@ class ElevenlabsSTTService(SegmentedSTTService):
          await self.start_ttfb_metrics()

          # Get language code for ElevenLabs API
-         language = self._settings["language"]
-         elevenlabs_lang = self.language_to_service_language(language)
-
-         # Prepare API parameters
          params = {
              "file": audio,
              "model_id": self._model_id,
@@ -304,9 +310,13 @@ class ElevenlabsSTTService(SegmentedSTTService):
              "diarize": self._diarize,
          }

-         # Add language if specified
-         if elevenlabs_lang:
-             params["language_code"] = elevenlabs_lang
+         language = self._settings["language"]
+         if language is not None:
+             elevenlabs_lang = self.language_to_service_language(language)
+             if elevenlabs_lang:
+                 params["language_code"] = elevenlabs_lang
+         else:
+             params["language_code"] = None

          # Call ElevenLabs STT API in thread pool to avoid blocking
          transcription = await asyncio.to_thread(self._client.speech_to_text.convert, **params)
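Taken together, these three hunks make the language optional: the constructor now defaults to None instead of Language.EN, and the transcription path only maps a configured language to a language_code, otherwise it explicitly sends language_code=None so ElevenLabs can detect the language itself. A construction sketch follows; the import path is assumed, and a real ElevenLabs API key plus the elevenlabs SDK are required to actually run it.

    from pipecat.services.elevenlabs.stt import ElevenlabsSTTService  # path assumed
    from pipecat.transcriptions.language import Language

    # No language: language_code is sent as None and ElevenLabs auto-detects.
    stt_auto = ElevenlabsSTTService(api_key="YOUR_ELEVENLABS_API_KEY")

    # Pinned language: mapped through the service's language table, e.g. "tam" for Tamil.
    stt_tamil = ElevenlabsSTTService(api_key="YOUR_ELEVENLABS_API_KEY", language=Language.TA_IN)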
@@ -4,5 +4,12 @@
  # SPDX-License-Identifier: BSD 2-Clause License
  #

+ import sys

+ from pipecat.services import DeprecatedModuleProxy
+
+ from .stt import *
  from .tts import *
+
+ # Old
+ sys.modules[__name__] = DeprecatedModuleProxy(globals(), "sarvam", "sarvam.tts")
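This turns the Sarvam package's __init__ into a compatibility shim: symbols from the stt and tts submodules are re-exported, and the module object is replaced with a DeprecatedModuleProxy so that old-style imports keep working while pointing users at the explicit submodules. A usage sketch, with the service class name assumed for illustration:

    # Legacy import path: still resolves, but goes through DeprecatedModuleProxy,
    # which flags the flat module layout as deprecated.
    from pipecat.services.sarvam import SarvamTTSService  # class name assumed

    # Preferred going forward: import from the concrete submodule.
    from pipecat.services.sarvam.tts import SarvamTTSService  # class name assumed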