dv-pipecat-ai 0.0.82.dev68__py3-none-any.whl → 0.0.82.dev69__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dv-pipecat-ai
3
- Version: 0.0.82.dev68
3
+ Version: 0.0.82.dev69
4
4
  Summary: An open source framework for voice (and multimodal) assistants
5
5
  License-Expression: BSD-2-Clause
6
6
  Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -1,4 +1,4 @@
1
- dv_pipecat_ai-0.0.82.dev68.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
1
+ dv_pipecat_ai-0.0.82.dev69.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
2
2
  pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
3
3
  pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -290,7 +290,7 @@ pipecat/services/sambanova/__init__.py,sha256=oTXExLic-qTcsfsiWmssf3Elclf3IIWoN4
290
290
  pipecat/services/sambanova/llm.py,sha256=5XVfPLEk__W8ykFqLdV95ZUhlGGkAaJwmbciLdZYtTc,8976
291
291
  pipecat/services/sambanova/stt.py,sha256=ZZgEZ7WQjLFHbCko-3LNTtVajjtfUvbtVLtFcaNadVQ,2536
292
292
  pipecat/services/sarvam/__init__.py,sha256=B4TN_tTHV9fWg0aSoPvfQlXISA0nJaQ9-u08I9UWvH4,280
293
- pipecat/services/sarvam/stt.py,sha256=cSrQaDpixNQh4tl8r2xRNREHjKKcyLmrFDLa-Lp4Hl4,15465
293
+ pipecat/services/sarvam/stt.py,sha256=p9Iq4loMwnftNZ_S0WoFSoX7iBbRKyja6RsVWbpj508,19314
294
294
  pipecat/services/sarvam/tts.py,sha256=K-AtWE1Q0ZZwshLP-7sCDmOSIWhuKOj91BCCE4N9XAk,25010
295
295
  pipecat/services/simli/__init__.py,sha256=cbDcqOaGsEgKbGYKpJ1Vv7LN4ZjOWA04sE84WW5vgQI,257
296
296
  pipecat/services/simli/video.py,sha256=fVMYsCE5epH9rTdhN_tyPPJw7W6TCMHCOe2akKHWduw,8330
@@ -378,7 +378,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=HwDCqLGijhYD3F8nxDuQmEw-YkRw0
378
378
  pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
379
379
  pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
380
380
  pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
381
- dv_pipecat_ai-0.0.82.dev68.dist-info/METADATA,sha256=tRV7JwvNl-emWJwrua577U-gfTxxMtB2RY_ZeI4Qpro,32692
382
- dv_pipecat_ai-0.0.82.dev68.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
383
- dv_pipecat_ai-0.0.82.dev68.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
384
- dv_pipecat_ai-0.0.82.dev68.dist-info/RECORD,,
381
+ dv_pipecat_ai-0.0.82.dev69.dist-info/METADATA,sha256=2Zcf_ZuOSm039KmMpmr76DGUK20UdkgRKaw4dp6y8xA,32692
382
+ dv_pipecat_ai-0.0.82.dev69.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
383
+ dv_pipecat_ai-0.0.82.dev69.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
384
+ dv_pipecat_ai-0.0.82.dev69.dist-info/RECORD,,
@@ -31,6 +31,9 @@ from pipecat.utils.tracing.service_decorators import traced_stt
31
31
  try:
32
32
  import websockets
33
33
  from sarvamai import AsyncSarvamAI
34
+ from sarvamai.speech_to_text_streaming.socket_client import (
35
+ AsyncSpeechToTextStreamingSocketClient,
36
+ )
34
37
  from sarvamai.speech_to_text_translate_streaming.socket_client import (
35
38
  AsyncSpeechToTextTranslateStreamingSocketClient,
36
39
  )
@@ -41,11 +44,11 @@ except ModuleNotFoundError as e:
41
44
  raise Exception(f"Missing module: {e}")
42
45
 
43
46
 
44
- def language_to_sarvam_language(language: Language) -> str:
45
- """Convert Language enum to Sarvam language code.
47
+ def language_to_sarvam_language(language) -> str:
48
+ """Convert Language enum or string to Sarvam language code.
46
49
 
47
50
  Args:
48
- language: The Language enum to convert.
51
+ language: The Language enum or language code string to convert.
49
52
 
50
53
  Returns:
51
54
  The corresponding Sarvam language code string.
@@ -53,6 +56,30 @@ def language_to_sarvam_language(language: Language) -> str:
53
56
  Raises:
54
57
  ValueError: If the language is not supported by Sarvam.
55
58
  """
59
+ # If already a string in the right format, return it
60
+ if isinstance(language, str):
61
+ if "-" in language: # Already in format like "hi-IN"
62
+ return language
63
+ # Convert short codes to full format
64
+ lang_map = {
65
+ "hi": "hi-IN",
66
+ "bn": "bn-IN",
67
+ "gu": "gu-IN",
68
+ "kn": "kn-IN",
69
+ "ml": "ml-IN",
70
+ "mr": "mr-IN",
71
+ "ta": "ta-IN",
72
+ "te": "te-IN",
73
+ "pa": "pa-IN",
74
+ "or": "od-IN",
75
+ "as": "as-IN",
76
+ "en": "en-IN",
77
+ }
78
+ if language.lower() in lang_map:
79
+ return lang_map[language.lower()]
80
+ raise ValueError(f"Unsupported language string: {language}")
81
+
82
+ # Handle Language enum
56
83
  match language:
57
84
  case Language.BN_IN:
58
85
  return "bn-IN"
@@ -133,6 +160,13 @@ class SarvamSTTService(STTService):
133
160
  """Sarvam speech-to-text service.
134
161
 
135
162
  Provides real-time speech recognition using Sarvam's WebSocket API.
163
+ Supports both Saarika (transcription) and Saaras (translation) models.
164
+
165
+ Models:
166
+ - Saarika (saarika:v2.5): Transcription in a single language
167
+ - Saaras (saaras:v2.5): Translation from source language to target language
168
+
169
+ The service automatically selects the correct endpoint based on the model name.
136
170
  """
137
171
 
138
172
  def __init__(
@@ -253,6 +287,7 @@ class SarvamSTTService(STTService):
253
287
  # Convert audio bytes to base64 for Sarvam API
254
288
  audio_base64 = base64.b64encode(audio).decode("utf-8")
255
289
 
290
+ # Sarvam requires 'audio/wav' encoding (even for raw PCM data)
256
291
  message = {
257
292
  "audio": {
258
293
  "data": audio_base64,
@@ -273,33 +308,47 @@ class SarvamSTTService(STTService):
273
308
 
274
309
  async def _connect(self):
275
310
  """Connect to Sarvam WebSocket API directly."""
276
- logger.debug("Connecting to Sarvam")
311
+ logger.debug(f"Connecting to Sarvam with model: {self._model}")
277
312
 
278
313
  try:
279
- # Build WebSocket URL and headers manually
280
- ws_url = (
281
- self._client._client_wrapper.get_environment().production
282
- + "/speech-to-text-translate/ws"
283
- )
314
+ base_url = self._client._client_wrapper.get_environment().production
315
+
316
+ # Choose endpoint and socket class based on model
317
+ if self._model.startswith("saarika"):
318
+ # Saarika = Transcription endpoint
319
+ path = "/speech-to-text/ws"
320
+ query_params = {
321
+ "language-code": language_to_sarvam_language(self._language),
322
+ "model": self._model,
323
+ "vad_signals": "true",
324
+ }
325
+ socket_cls = AsyncSpeechToTextStreamingSocketClient
326
+ logger.debug(
327
+ f"Using Saarika transcription endpoint with language: {self._language}"
328
+ )
329
+ else:
330
+ # Saaras = Translation endpoint
331
+ path = "/speech-to-text-translate/ws"
332
+ query_params = {
333
+ "model": self._model,
334
+ "vad_signals": "true",
335
+ }
336
+ socket_cls = AsyncSpeechToTextTranslateStreamingSocketClient
337
+ logger.debug("Using Saaras translation endpoint")
284
338
 
285
- # Add query parameters
286
- query_params = {"model": self._model, "vad_signals": "true"}
287
- query_string = urlencode(query_params)
288
- ws_url = ws_url + f"?{query_string}"
339
+ ws_url = f"{base_url}{path}?{urlencode(query_params)}"
289
340
 
290
341
  # Get headers
291
342
  headers = self._client._client_wrapper.get_headers()
292
343
  headers["Api-Subscription-Key"] = self._api_key
293
344
 
294
- # Connect to WebSocket directly
345
+ # Connect to WebSocket
295
346
  self._websocket_connection = await websockets.connect(
296
347
  ws_url, additional_headers=headers
297
348
  )
298
349
 
299
350
  # Create the socket client wrapper
300
- self._websocket = AsyncSpeechToTextTranslateStreamingSocketClient(
301
- websocket=self._websocket_connection
302
- )
351
+ self._websocket = socket_cls(websocket=self._websocket_connection)
303
352
 
304
353
  # Start listening for messages
305
354
  self._listening_task = asyncio.create_task(self._listen_for_messages())
@@ -309,7 +358,10 @@ class SarvamSTTService(STTService):
309
358
  except websockets.exceptions.InvalidStatusCode as e:
310
359
  error_msg = f"Failed to connect to Sarvam: HTTP {e.status_code}"
311
360
  if e.status_code == 403:
312
- error_msg += f" - Access denied. Your API key may not have access to model '{self._model}'. Available models: saaras:v2, saaras:v2.5"
361
+ if self._model.startswith("saarika"):
362
+ error_msg += f" - Access denied. Check: 1) API key has Saarika access, 2) Model '{self._model}' exists (try saarika:v2.5), 3) Using correct endpoint (transcription)"
363
+ else:
364
+ error_msg += f" - Access denied. Check: 1) API key has Saaras access, 2) Model '{self._model}' exists (try saaras:v2.5), 3) Using correct endpoint (translation)"
313
365
  elif e.status_code == 401:
314
366
  error_msg += " - Invalid API key"
315
367
  logger.error(error_msg)
@@ -370,21 +422,60 @@ class SarvamSTTService(STTService):
370
422
  async def _handle_response(self, response):
371
423
  """Handle transcription response from Sarvam.
372
424
 
425
+ Handles both Saarika (transcription) and Saaras (translation) message formats.
426
+
373
427
  Args:
374
428
  response: The response object from Sarvam WebSocket.
375
429
  """
376
430
  logger.debug(f"Received response: {response}")
377
431
 
378
432
  try:
379
- if response["type"] == "error":
433
+ msg_type = response.get("type")
434
+
435
+ # Error handling
436
+ if msg_type == "error":
380
437
  error_msg = response.get("data", {}).get("message", "Unknown error")
381
438
  logger.error(f"Sarvam API error: {error_msg}")
382
439
  await self.push_error(ErrorFrame(f"Sarvam API error: {error_msg}"))
383
- # Close connection on error
384
440
  await self._disconnect()
385
441
  return
386
442
 
387
- if response["type"] == "events":
443
+ # Modern Saarika/Saaras message format
444
+ if msg_type == "speech_start":
445
+ await self.start_metrics()
446
+ logger.debug("User started speaking")
447
+ await self._call_event_handler("on_speech_started")
448
+ return
449
+
450
+ if msg_type == "speech_end":
451
+ logger.debug("User stopped speaking")
452
+ await self._call_event_handler("on_speech_ended")
453
+ return
454
+
455
+ if msg_type == "transcript":
456
+ await self.stop_ttfb_metrics()
457
+ # Handle both Saarika (text) and Saaras (text + text_translated)
458
+ transcript = response.get("text") or response.get("text_translated") or ""
459
+ language_code = (
460
+ response.get("source_language_code") or response.get("language_code") or "hi-IN"
461
+ )
462
+ language = self._map_language_code_to_enum(language_code)
463
+
464
+ if transcript.strip():
465
+ await self.push_frame(
466
+ TranscriptionFrame(
467
+ transcript,
468
+ self._user_id,
469
+ time_now_iso8601(),
470
+ language,
471
+ result=response,
472
+ )
473
+ )
474
+ await self.stop_processing_metrics()
475
+ return
476
+
477
+ # Legacy format (backward compatibility)
478
+ if msg_type == "events":
388
479
  parsed = EventResponse(**response)
389
480
  signal = parsed.data.signal_type
390
481
  timestamp = parsed.data.occured_at
@@ -397,14 +488,13 @@ class SarvamSTTService(STTService):
397
488
  elif signal == VADSignal.END:
398
489
  logger.debug("User stopped speaking")
399
490
  await self._call_event_handler("on_speech_ended")
491
+ return
400
492
 
401
- elif response["type"] == "data":
493
+ if msg_type == "data":
402
494
  await self.stop_ttfb_metrics()
403
495
  parsed = TranscriptionResponse(**response)
404
496
  transcript = parsed.data.transcript
405
- language_code = parsed.data.language_code
406
- if language_code is None:
407
- language_code = "hi-IN"
497
+ language_code = parsed.data.language_code or "hi-IN"
408
498
  language = self._map_language_code_to_enum(language_code)
409
499
 
410
500
  if transcript and transcript.strip():
@@ -417,8 +507,8 @@ class SarvamSTTService(STTService):
417
507
  result=response,
418
508
  )
419
509
  )
420
-
421
510
  await self.stop_processing_metrics()
511
+ return
422
512
 
423
513
  except Exception as e:
424
514
  logger.error(f"Error handling Sarvam response: {e}")