dv-pipecat-ai 0.0.82.dev857-py3-none-any.whl → 0.0.85.dev837-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.
Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
--- a/pipecat/services/sarvam/__init__.py
+++ b/pipecat/services/sarvam/__init__.py
@@ -4,5 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+import sys
 
+from pipecat.services import DeprecatedModuleProxy
+
+from .stt import *
 from .tts import *
+
+# Old
+sys.modules[__name__] = DeprecatedModuleProxy(globals(), "sarvam", "sarvam.tts")
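
The __init__.py hunk above is a backward-compatibility shim: the package now re-exports both stt and tts, and the module object is swapped in sys.modules for a DeprecatedModuleProxy wrapping the package globals, so older import paths keep resolving while nudging callers toward the explicit submodules. A minimal caller-side sketch of the intended effect (assuming SarvamTTSService is what tts exports, as in upstream pipecat; the exact deprecation message comes from DeprecatedModuleProxy):

# Preferred, explicit submodule imports after this change:
from pipecat.services.sarvam.stt import SarvamSTTService
from pipecat.services.sarvam.tts import SarvamTTSService

# Legacy package-level imports still resolve through the proxy's
# wrapped globals(), and may log a deprecation notice:
from pipecat.services.sarvam import SarvamTTSService
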
--- /dev/null
+++ b/pipecat/services/sarvam/stt.py
@@ -0,0 +1,540 @@
+"""Sarvam AI Speech-to-Text service implementation.
+
+This module provides a streaming Speech-to-Text service using Sarvam AI's WebSocket-based
+API. It supports real-time transcription with Voice Activity Detection (VAD) and
+can handle multiple audio formats for Indian language speech recognition.
+"""
+
+import asyncio
+import base64
+import json
+from enum import StrEnum
+from typing import Literal, Optional
+from urllib.parse import urlencode
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.audio.resamplers.resampy_resampler import ResampyResampler
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    StartFrame,
+    TranscriptionFrame,
+)
+from pipecat.services.stt_service import STTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_stt
+
+try:
+    import websockets
+    from sarvamai import AsyncSarvamAI
+    from sarvamai.speech_to_text_streaming.socket_client import (
+        AsyncSpeechToTextStreamingSocketClient,
+    )
+    from sarvamai.speech_to_text_translate_streaming.socket_client import (
+        AsyncSpeechToTextTranslateStreamingSocketClient,
+    )
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Sarvam, you need to `pip install pipecat-ai[sarvam]`.")
+    raise Exception(f"Missing module: {e}")
+
+
+def language_to_sarvam_language(language) -> str:
+    """Convert Language enum or string to Sarvam language code.
+
+    Args:
+        language: The Language enum or language code string to convert.
+
+    Returns:
+        The corresponding Sarvam language code string.
+
+    Raises:
+        ValueError: If the language is not supported by Sarvam.
+    """
+    # If already a string in the right format, return it
+    if isinstance(language, str):
+        if "-" in language:  # Already in format like "hi-IN"
+            return language
+        # Convert short codes to full format
+        lang_map = {
+            "hi": "hi-IN",
+            "bn": "bn-IN",
+            "gu": "gu-IN",
+            "kn": "kn-IN",
+            "ml": "ml-IN",
+            "mr": "mr-IN",
+            "ta": "ta-IN",
+            "te": "te-IN",
+            "pa": "pa-IN",
+            "or": "od-IN",
+            "as": "as-IN",
+            "en": "en-IN",
+        }
+        if language.lower() in lang_map:
+            return lang_map[language.lower()]
+        raise ValueError(f"Unsupported language string: {language}")
+
+    # Handle Language enum
+    match language:
+        case Language.BN_IN:
+            return "bn-IN"
+        case Language.GU_IN:
+            return "gu-IN"
+        case Language.HI_IN:
+            return "hi-IN"
+        case Language.KN_IN:
+            return "kn-IN"
+        case Language.ML_IN:
+            return "ml-IN"
+        case Language.MR_IN:
+            return "mr-IN"
+        case Language.TA_IN:
+            return "ta-IN"
+        case Language.TE_IN:
+            return "te-IN"
+        case Language.PA_IN:
+            return "pa-IN"
+        case Language.OR_IN:
+            return "od-IN"
+        case Language.EN_US:
+            return "en-US"
+        case Language.EN_IN:
+            return "en-IN"
+        case Language.AS_IN:
+            return "as-IN"
+        case _:
+            raise ValueError(f"Unsupported language: {language}")
+
+
+class TranscriptionMetrics(BaseModel):
+    """Metrics for transcription performance."""
+
+    audio_duration: float
+    processing_latency: float
+
+
+class TranscriptionData(BaseModel):
+    """Data structure for transcription results."""
+
+    request_id: str
+    transcript: str
+    language_code: Optional[str]
+    metrics: Optional[TranscriptionMetrics] = None
+    is_final: Optional[bool] = None
+
+
+class TranscriptionResponse(BaseModel):
+    """Response structure for transcription data."""
+
+    type: Literal["data"]
+    data: TranscriptionData
+
+
+class VADSignal(StrEnum):
+    """Voice Activity Detection signal types."""
+
+    START = "START_SPEECH"
+    END = "END_SPEECH"
+
+
+class EventData(BaseModel):
+    """Data structure for VAD events."""
+
+    signal_type: VADSignal
+    occured_at: float
+
+
+class EventResponse(BaseModel):
+    """Response structure for VAD events."""
+
+    type: Literal["events"]
+    data: EventData
+
+
+class SarvamSTTService(STTService):
+    """Sarvam speech-to-text service.
+
+    Provides real-time speech recognition using Sarvam's WebSocket API.
+    Supports both Saarika (transcription) and Saaras (translation) models.
+
+    Models:
+        - Saarika (saarika:v2.5): Transcription in a single language
+        - Saaras (saaras:v2.5): Translation from source language to target language
+
+    The service automatically selects the correct endpoint based on the model name.
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "saaras:v2.5",
+        language: Language = Language.HI_IN,
+        **kwargs,
+    ):
+        """Initialize the Sarvam STT service.
+
+        Args:
+            api_key: Sarvam API key for authentication.
+            model: Sarvam model to use for transcription.
+            language: Language for transcription. Defaults to Hindi (India).
+            **kwargs: Additional arguments passed to the parent STTService.
+                Note: Sarvam requires 16kHz audio. If your input is a different
+                sample rate, it will be automatically resampled to 16kHz.
+        """
+        super().__init__(**kwargs)
+
+        self.set_model_name(model)
+        self._api_key = api_key
+        self._model = model
+        self._language = language
+        self._target_sample_rate = 16000  # Sarvam requires 16kHz
+
+        self._client = AsyncSarvamAI(api_subscription_key=api_key)
+        self._websocket = None
+        self._websocket_connection = None
+        self._listening_task = None
+        self._resampler = ResampyResampler()
+
+        # Register VAD event handlers
+        self._register_event_handler("on_speech_started")
+        self._register_event_handler("on_speech_ended")
+
+    def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Sarvam service supports metrics generation.
+        """
+        return True
+
+    async def set_model(self, model: str):
+        """Set the Sarvam model and reconnect.
+
+        Args:
+            model: The Sarvam model name to use.
+        """
+        await super().set_model(model)
+        logger.info(f"Switching STT model to: [{model}]")
+        self._model = model
+        await self._disconnect()
+        await self._connect()
+
+    async def set_language(self, language: Language):
+        """Set the language and reconnect.
+
+        Args:
+            language: The Language enum to use.
+        """
+        logger.info(f"Switching STT language to: [{language}]")
+        self._language = language
+        await self._disconnect()
+        await self._connect()
+
+    async def start(self, frame: StartFrame):
+        """Start the Sarvam STT service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the Sarvam STT service.
+
+        Args:
+            frame: The end frame.
+        """
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the Sarvam STT service.
+
+        Args:
+            frame: The cancel frame.
+        """
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def run_stt(self, audio: bytes):
+        """Send audio data to Sarvam for transcription.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            Frame: None (transcription results come via WebSocket callbacks).
+        """
+        if not self._websocket_connection or self._websocket_connection.state != State.OPEN:
+            logger.warning("WebSocket not connected, cannot process audio")
+            yield None
+            return
+
+        try:
+            # Resample audio to 16kHz if needed
+            if self.sample_rate != self._target_sample_rate:
+                audio = await self._resampler.resample(
+                    audio, self.sample_rate, self._target_sample_rate
+                )
+
+            # Convert audio bytes to base64 for Sarvam API
+            audio_base64 = base64.b64encode(audio).decode("utf-8")
+
+            # Sarvam requires 'audio/wav' encoding (even for raw PCM data)
+            message = {
+                "audio": {
+                    "data": audio_base64,
+                    "encoding": "audio/wav",
+                    "sample_rate": self._target_sample_rate,
+                }
+            }
+            await self._websocket_connection.send(json.dumps(message))
+
+        except websockets.exceptions.ConnectionClosed:
+            logger.error("WebSocket connection closed")
+            await self.push_error(ErrorFrame("WebSocket connection closed"))
+        except Exception as e:
+            logger.error(f"Error sending audio to Sarvam: {e}")
+            await self.push_error(ErrorFrame(f"Failed to send audio: {e}"))
+
+        yield None
+
+    async def _connect(self):
+        """Connect to Sarvam WebSocket API directly."""
+        logger.debug(f"Connecting to Sarvam with model: {self._model}")
+
+        try:
+            base_url = self._client._client_wrapper.get_environment().production
+
+            # Choose endpoint and socket class based on model
+            if self._model.startswith("saarika"):
+                # Saarika = Transcription endpoint
+                path = "/speech-to-text/ws"
+                query_params = {
+                    "language-code": language_to_sarvam_language(self._language),
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextStreamingSocketClient
+                logger.debug(
+                    f"Using Saarika transcription endpoint with language: {self._language}"
+                )
+            else:
+                # Saaras = Translation endpoint
+                path = "/speech-to-text-translate/ws"
+                query_params = {
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextTranslateStreamingSocketClient
+                logger.debug("Using Saaras translation endpoint")
+
+            ws_url = f"{base_url}{path}?{urlencode(query_params)}"
+
+            # Get headers
+            headers = self._client._client_wrapper.get_headers()
+            headers["Api-Subscription-Key"] = self._api_key
+
+            # Connect to WebSocket
+            self._websocket_connection = await websockets.connect(
+                ws_url, additional_headers=headers
+            )
+
+            # Create the socket client wrapper
+            self._websocket = socket_cls(websocket=self._websocket_connection)
+
+            # Start listening for messages
+            self._listening_task = asyncio.create_task(self._listen_for_messages())
+
+            logger.info(f"Connected to Sarvam successfully with model: {self._model}")
+
+        except websockets.exceptions.InvalidStatusCode as e:
+            error_msg = f"Failed to connect to Sarvam: HTTP {e.status_code}"
+            if e.status_code == 403:
+                if self._model.startswith("saarika"):
+                    error_msg += f" - Access denied. Check: 1) API key has Saarika access, 2) Model '{self._model}' exists (try saarika:v2.5), 3) Using correct endpoint (transcription)"
+                else:
+                    error_msg += f" - Access denied. Check: 1) API key has Saaras access, 2) Model '{self._model}' exists (try saaras:v2.5), 3) Using correct endpoint (translation)"
+            elif e.status_code == 401:
+                error_msg += " - Invalid API key"
+            logger.error(error_msg)
+            self._websocket = None
+            self._websocket_connection = None
+            await self.push_error(ErrorFrame(error_msg))
+        except Exception as e:
+            logger.error(f"Failed to connect to Sarvam: {e}")
+            self._websocket = None
+            self._websocket_connection = None
+            await self.push_error(ErrorFrame(f"Failed to connect to Sarvam: {e}"))
+
+    async def _disconnect(self):
+        """Disconnect from Sarvam WebSocket API."""
+        if self._listening_task:
+            self._listening_task.cancel()
+            try:
+                await self._listening_task
+            except asyncio.CancelledError:
+                pass
+            self._listening_task = None
+
+        if self._websocket_connection and self._websocket_connection.state == State.OPEN:
+            try:
+                await self._websocket_connection.close()
+            except Exception as e:
+                logger.error(f"Error closing WebSocket connection: {e}")
+            finally:
+                logger.debug("Disconnected from Sarvam WebSocket")
+                self._websocket_connection = None
+                self._websocket = None
+
+    async def _listen_for_messages(self):
+        """Listen for messages from Sarvam WebSocket."""
+        try:
+            while self._websocket_connection and self._websocket_connection.state == State.OPEN:
+                try:
+                    message = await self._websocket_connection.recv()
+                    response = json.loads(message)
+                    await self._handle_response(response)
+
+                except websockets.exceptions.ConnectionClosed:
+                    logger.warning("WebSocket connection closed")
+                    break
+                except json.JSONDecodeError as e:
+                    logger.error(f"Failed to parse JSON response: {e}")
+                    continue
+                except Exception as e:
+                    logger.error(f"Error receiving message from Sarvam: {e}")
+                    break
+
+        except asyncio.CancelledError:
+            logger.debug("Message listening cancelled")
+        except Exception as e:
+            logger.error(f"Unexpected error in message listener: {e}")
+            await self.push_error(ErrorFrame(f"Message listener error: {e}"))
+
+    async def _handle_response(self, response):
+        """Handle transcription response from Sarvam.
+
+        Handles both Saarika (transcription) and Saaras (translation) message formats.
+
+        Args:
+            response: The response object from Sarvam WebSocket.
+        """
+        logger.debug(f"Received response: {response}")
+
+        try:
+            msg_type = response.get("type")
+
+            # Error handling
+            if msg_type == "error":
+                error_msg = response.get("data", {}).get("message", "Unknown error")
+                logger.error(f"Sarvam API error: {error_msg}")
+                await self.push_error(ErrorFrame(f"Sarvam API error: {error_msg}"))
+                await self._disconnect()
+                return
+
+            # Modern Saarika/Saaras message format
+            if msg_type == "speech_start":
+                await self.start_metrics()
+                logger.debug("User started speaking")
+                await self._call_event_handler("on_speech_started")
+                return
+
+            if msg_type == "speech_end":
+                logger.debug("User stopped speaking")
+                await self._call_event_handler("on_speech_ended")
+                return
+
+            if msg_type == "transcript":
+                await self.stop_ttfb_metrics()
+                # Handle both Saarika (text) and Saaras (text + text_translated)
+                transcript = response.get("text") or response.get("text_translated") or ""
+                language_code = (
+                    response.get("source_language_code") or response.get("language_code") or "hi-IN"
+                )
+                language = self._map_language_code_to_enum(language_code)
+
+                if transcript.strip():
+                    await self.push_frame(
+                        TranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                            result=response,
+                        )
+                    )
+                await self.stop_processing_metrics()
+                return
+
+            # Legacy format (backward compatibility)
+            if msg_type == "events":
+                parsed = EventResponse(**response)
+                signal = parsed.data.signal_type
+                timestamp = parsed.data.occured_at
+                logger.debug(f"VAD Signal: {signal}, Occurred at: {timestamp}")
+
+                if signal == VADSignal.START:
+                    await self.start_metrics()
+                    logger.debug("User started speaking")
+                    await self._call_event_handler("on_speech_started")
+                elif signal == VADSignal.END:
+                    logger.debug("User stopped speaking")
+                    await self._call_event_handler("on_speech_ended")
+                return
+
+            if msg_type == "data":
+                await self.stop_ttfb_metrics()
+                parsed = TranscriptionResponse(**response)
+                transcript = parsed.data.transcript
+                language_code = parsed.data.language_code or "hi-IN"
+                language = self._map_language_code_to_enum(language_code)
+
+                if transcript and transcript.strip():
+                    await self.push_frame(
+                        TranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                            result=response,
+                        )
+                    )
+                await self.stop_processing_metrics()
+                return
+
+        except Exception as e:
+            logger.error(f"Error handling Sarvam response: {e}")
+            await self.push_error(ErrorFrame(f"Failed to handle response: {e}"))
+
+    def _map_language_code_to_enum(self, language_code: str) -> Language:
+        """Map Sarvam language code (e.g., "hi-IN") to pipecat Language enum."""
+        logger.debug(f"Audio language detected as: {language_code}")
+        mapping = {
+            "bn-IN": Language.BN_IN,
+            "gu-IN": Language.GU_IN,
+            "hi-IN": Language.HI_IN,
+            "kn-IN": Language.KN_IN,
+            "ml-IN": Language.ML_IN,
+            "mr-IN": Language.MR_IN,
+            "ta-IN": Language.TA_IN,
+            "te-IN": Language.TE_IN,
+            "pa-IN": Language.PA_IN,
+            "od-IN": Language.OR_IN,
+            "en-US": Language.EN_US,
+            "en-IN": Language.EN_IN,
+            "as-IN": Language.AS_IN,
+        }
+        return mapping.get(language_code, Language.HI_IN)
+
+    async def start_metrics(self):
+        """Start TTFB and processing metrics collection."""
+        await self.start_ttfb_metrics()
+        await self.start_processing_metrics()
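
For orientation, a minimal wiring sketch of the new service follows (hypothetical and untested; the environment variable name and handler bodies are assumptions for illustration, while SarvamSTTService, its constructor arguments, and the on_speech_started/on_speech_ended events all come from the code above):

import os

from pipecat.services.sarvam.stt import SarvamSTTService
from pipecat.transcriptions.language import Language

# Saaras models hit the translation endpoint; the language argument is
# only sent for Saarika (transcription) models, per _connect() above.
stt = SarvamSTTService(
    api_key=os.environ["SARVAM_API_KEY"],  # assumed env var name
    model="saaras:v2.5",
    language=Language.HI_IN,
)

# The VAD events registered in __init__ can be subscribed to with the
# standard pipecat event-handler decorator.
@stt.event_handler("on_speech_started")
async def on_speech_started(service):
    print("caller started speaking")

@stt.event_handler("on_speech_ended")
async def on_speech_ended(service):
    print("caller stopped speaking")

The processor then sits between a transport's audio input and the rest of the pipeline like any other pipecat STT service; results arrive as TranscriptionFrame objects pushed from the WebSocket listener task rather than as values yielded by run_stt.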