dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -20,9 +20,9 @@ from pipecat.frames.frames import (
     EndFrame,
     ErrorFrame,
     Frame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
@@ -76,17 +76,29 @@ class SarvamHttpTTSService(TTSService):
 
     Example::
 
-        tts = SarvamTTSService(
+        tts = SarvamHttpTTSService(
             api_key="your-api-key",
             voice_id="anushka",
             model="bulbul:v2",
             aiohttp_session=session,
-            params=SarvamTTSService.InputParams(
+            params=SarvamHttpTTSService.InputParams(
                 language=Language.HI,
                 pitch=0.1,
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker:
+        tts_v3 = SarvamHttpTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            aiohttp_session=session,
+            params=SarvamHttpTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -105,6 +117,14 @@ class SarvamHttpTTSService(TTSService):
         pace: Optional[float] = Field(default=1.0, ge=0.3, le=3.0)
         loudness: Optional[float] = Field(default=1.0, ge=0.1, le=3.0)
         enable_preprocessing: Optional[bool] = False
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
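
Since `temperature` is declared as a pydantic `Field` with `ge=0.01` and `le=1.0`, out-of-range values are rejected when `InputParams` is constructed rather than when a request is sent. A minimal sketch of what callers can expect (assuming only that `InputParams` remains a pydantic `BaseModel`):

    from pydantic import ValidationError

    params = SarvamHttpTTSService.InputParams(temperature=0.8)  # accepted

    try:
        SarvamHttpTTSService.InputParams(temperature=1.5)  # above le=1.0
    except ValidationError:
        print("temperature must be between 0.01 and 1.0")
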
@@ -124,7 +144,7 @@ class SarvamHttpTTSService(TTSService):
             api_key: Sarvam AI API subscription key.
             aiohttp_session: Shared aiohttp session for making requests.
             voice_id: Speaker voice ID (e.g., "anushka", "meera"). Defaults to "anushka".
-            model: TTS model to use ("bulbul:v1" or "bulbul:v2"). Defaults to "bulbul:v2".
+            model: TTS model to use ("bulbul:v2", "bulbul:v3-beta", or "bulbul:v3"). Defaults to "bulbul:v2".
             base_url: Sarvam AI API base URL. Defaults to "https://api.sarvam.ai".
             sample_rate: Audio sample rate in Hz (8000, 16000, 22050, 24000). If None, uses default.
             params: Additional voice and preprocessing parameters. If None, uses defaults.
@@ -138,16 +158,32 @@ class SarvamHttpTTSService(TTSService):
         self._base_url = base_url
         self._session = aiohttp_session
 
+        # Build base settings common to all models
         self._settings = {
             "language": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
-            "loudness": params.loudness,
             "enable_preprocessing": params.enable_preprocessing,
         }
 
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
+
         self.set_model_name(model)
         self.set_voice(voice_id)
 
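
Isolated for clarity, the model-specific branch above reduces to the following sketch (not the service code itself): v3-family models carry `temperature`, while older models keep the prosody controls.

    def build_model_settings(model: str, params) -> dict:
        # v3 models accept temperature; pitch/pace/loudness apply to older models.
        if model in ("bulbul:v3-beta", "bulbul:v3"):
            return {"temperature": params.temperature, "model": model}
        return {
            "pitch": params.pitch,
            "pace": params.pace,
            "loudness": params.loudness,
            "model": model,
        }
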
@@ -275,6 +311,18 @@ class SarvamTTSService(InterruptibleTTSService):
                 pace=1.2
             )
         )
+
+        # For bulbul v3 beta with any speaker and temperature:
+        # Note: pace and loudness are not supported for bulbul v3 and bulbul v3 beta
+        tts_v3 = SarvamTTSService(
+            api_key="your-api-key",
+            voice_id="speaker_name",
+            model="bulbul:v3",
+            params=SarvamTTSService.InputParams(
+                language=Language.HI,
+                temperature=0.8
+            )
+        )
     """
 
     class InputParams(BaseModel):
@@ -310,6 +358,14 @@ class SarvamTTSService(InterruptibleTTSService):
         output_audio_codec: Optional[str] = "linear16"
         output_audio_bitrate: Optional[str] = "128k"
         language: Optional[Language] = Language.EN
+        temperature: Optional[float] = Field(
+            default=0.6,
+            ge=0.01,
+            le=1.0,
+            description="Controls the randomness of the output for bulbul v3 beta. "
+            "Lower values make the output more focused and deterministic, while "
+            "higher values make it more random. Range: 0.01 to 1.0. Default: 0.6.",
+        )
 
     def __init__(
         self,
@@ -329,6 +385,7 @@ class SarvamTTSService(InterruptibleTTSService):
         Args:
             api_key: Sarvam API key for authenticating TTS requests.
             model: Identifier of the Sarvam speech model (default "bulbul:v2").
+                Supports "bulbul:v2", "bulbul:v3-beta", and "bulbul:v3".
             voice_id: Voice identifier for synthesis (default "anushka").
             url: WebSocket URL for connecting to the TTS backend (default production URL).
             aiohttp_session: Optional shared aiohttp session. To maintain backward compatibility.
@@ -371,15 +428,12 @@ class SarvamTTSService(InterruptibleTTSService):
         self._api_key = api_key
         self.set_model_name(model)
         self.set_voice(voice_id)
-        # Configuration parameters
+        # Build base settings common to all models
         self._settings = {
             "target_language_code": (
                 self.language_to_service_language(params.language) if params.language else "en-IN"
             ),
-            "pitch": params.pitch,
-            "pace": params.pace,
             "speaker": voice_id,
-            "loudness": params.loudness,
             "speech_sample_rate": 0,
             "enable_preprocessing": params.enable_preprocessing,
             "min_buffer_size": params.min_buffer_size,
@@ -387,6 +441,24 @@ class SarvamTTSService(InterruptibleTTSService):
             "output_audio_codec": params.output_audio_codec,
             "output_audio_bitrate": params.output_audio_bitrate,
         }
+
+        # Add model-specific parameters
+        if model in ("bulbul:v3-beta", "bulbul:v3"):
+            self._settings.update(
+                {
+                    "temperature": getattr(params, "temperature", 0.6),
+                    "model": model,
+                }
+            )
+        else:
+            self._settings.update(
+                {
+                    "pitch": params.pitch,
+                    "pace": params.pace,
+                    "loudness": params.loudness,
+                    "model": model,
+                }
+            )
         self._started = False
 
         self._receive_task = None
@@ -455,7 +527,7 @@ class SarvamTTSService(InterruptibleTTSService):
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+        if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
            self._started = False
 
    async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -525,6 +597,7 @@ class SarvamTTSService(InterruptibleTTSService):
             logger.debug("Connected to Sarvam TTS Websocket")
             await self._send_config()
 
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._websocket = None
@@ -556,6 +629,10 @@ class SarvamTTSService(InterruptibleTTSService):
                 await self._websocket.close()
         except Exception as e:
             logger.error(f"{self} error closing websocket: {e}")
+        finally:
+            self._started = False
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
 
     def _get_websocket(self):
         if self._websocket:
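
With `on_connected` and `on_disconnected` now fired around the websocket lifecycle, callers can observe connection state using the same decorator pattern the service docstrings in this diff show; a minimal sketch:

    tts = SarvamTTSService(api_key="your-api-key", voice_id="anushka")

    @tts.event_handler("on_connected")
    async def handle_connected(tts):
        logger.debug("Sarvam TTS websocket connected")

    @tts.event_handler("on_disconnected")
    async def handle_disconnected(tts):
        logger.debug("Sarvam TTS websocket disconnected")
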
@@ -605,8 +682,15 @@ class SarvamTTSService(InterruptibleTTSService):
             logger.warning("Service is disconnecting, ignoring text send")
             return
 
+        # Validate text input
+        if not text or not isinstance(text, str) or not text.strip():
+            logger.warning(f"Invalid text input for Sarvam TTS: {repr(text)}")
+            return
+
         if self._websocket and self._websocket.state == State.OPEN:
-            msg = {"type": "text", "data": {"text": text}}
+            msg = {"type": "text", "data": {"text": text.strip()}}
+            logger.info(f"Sarvam TTS: Sending text message: {repr(text.strip())}")
+            logger.debug(f"Sarvam TTS: Full message payload: {msg}")
             await self._websocket.send(json.dumps(msg))
         else:
             logger.warning("WebSocket not ready, cannot send text")
@@ -15,8 +15,8 @@ from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
     Frame,
+    InterruptionFrame,
     OutputImageRawFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStoppedFrame,
     UserStartedSpeakingFrame,
@@ -179,7 +179,7 @@ class SimliVideoService(FrameProcessor):
             return
         elif isinstance(frame, (EndFrame, CancelFrame)):
             await self._stop()
-        elif isinstance(frame, (StartInterruptionFrame, UserStartedSpeakingFrame)):
+        elif isinstance(frame, (InterruptionFrame, UserStartedSpeakingFrame)):
             if not self._previously_interrupted:
                 await self._simli_client.clearBuffer()
             self._previously_interrupted = self._is_trinity_avatar
@@ -19,7 +19,6 @@ from loguru import logger
 from pydantic import BaseModel
 
 from pipecat.frames.frames import (
-    BotInterruptionFrame,
     CancelFrame,
     EndFrame,
     ErrorFrame,
@@ -551,7 +550,7 @@ class SpeechmaticsSTTService(STTService):
 
         @self._client.on(ServerMessageType.END_OF_UTTERANCE)
         def _evt_on_end_of_utterance(message: dict[str, Any]):
-            logger.debug("End of utterance received from STT")
+            self.logger.debug("End of utterance received from STT")
             asyncio.run_coroutine_threadsafe(
                 self._handle_end_of_utterance(), self.get_event_loop()
             )
@@ -578,6 +577,7 @@ class SpeechmaticsSTTService(STTService):
                 ),
             )
             logger.debug(f"{self} Connected to Speechmatics STT service")
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} Error connecting to Speechmatics: {e}")
             self._client = None
@@ -596,6 +596,7 @@ class SpeechmaticsSTTService(STTService):
             logger.error(f"{self} Error closing Speechmatics client: {e}")
         finally:
             self._client = None
+            await self._call_event_handler("on_disconnected")
 
     def _process_config(self) -> None:
         """Create a formatted STT transcription config.
@@ -619,7 +620,7 @@ class SpeechmaticsSTTService(STTService):
             transcription_config.additional_vocab = [
                 {
                     "content": e.content,
-                    "sounds_like": e.sounds_like,
+                    **({"sounds_like": e.sounds_like} if e.sounds_like else {}),
                 }
                 for e in self._params.additional_vocab
             ]
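
The replacement uses Python's conditional dict-unpacking idiom, so `sounds_like` is omitted from the entry entirely when it is unset instead of being sent as an explicit null; for example:

    def vocab_entry(content, sounds_like=None):
        # "sounds_like" appears only when a non-empty value is provided.
        return {"content": content, **({"sounds_like": sounds_like} if sounds_like else {})}

    vocab_entry("Pipecat")                # -> {'content': 'Pipecat'}
    vocab_entry("Pipecat", ["pipe cat"])  # -> {'content': 'Pipecat', 'sounds_like': ['pipe cat']}
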
@@ -749,14 +750,13 @@ class SpeechmaticsSTTService(STTService):
             return
 
         # Frames to send
-        upstream_frames: list[Frame] = []
         downstream_frames: list[Frame] = []
 
         # If VAD is enabled, then send a speaking frame
         if self._params.enable_vad and not self._is_speaking:
             logger.debug("User started speaking")
             self._is_speaking = True
-            upstream_frames += [BotInterruptionFrame()]
+            await self.push_interruption_task_frame_and_wait()
             downstream_frames += [UserStartedSpeakingFrame()]
 
         # If final, then re-parse into TranscriptionFrame
@@ -775,7 +775,7 @@ class SpeechmaticsSTTService(STTService):
             ]
 
             # Log transcript(s)
-            logger.debug(f"Finalized transcript: {[f.text for f in downstream_frames]}")
+            self.logger.debug(f"Finalized transcript: {[f.text for f in downstream_frames]}")
 
         # Return as interim results (unformatted)
         else:
@@ -794,10 +794,6 @@ class SpeechmaticsSTTService(STTService):
             self._is_speaking = False
             downstream_frames += [UserStoppedSpeakingFrame()]
 
-        # Send UPSTREAM frames
-        for frame in upstream_frames:
-            await self.push_frame(frame, FrameDirection.UPSTREAM)
-
         # Send the DOWNSTREAM frames
         for frame in downstream_frames:
             await self.push_frame(frame, FrameDirection.DOWNSTREAM)
@@ -996,6 +992,22 @@ def _language_to_speechmatics_language(language: Language) -> str:
     # List of supported input languages
     BASE_LANGUAGES = {
         Language.AR: "ar",
+        Language.AR_AE: "ar",
+        Language.AR_BH: "ar",
+        Language.AR_DZ: "ar",
+        Language.AR_EG: "ar",
+        Language.AR_IQ: "ar",
+        Language.AR_JO: "ar",
+        Language.AR_KW: "ar",
+        Language.AR_LB: "ar",
+        Language.AR_LY: "ar",
+        Language.AR_MA: "ar",
+        Language.AR_OM: "ar",
+        Language.AR_QA: "ar",
+        Language.AR_SA: "ar",
+        Language.AR_SY: "ar",
+        Language.AR_TN: "ar",
+        Language.AR_YE: "ar",
         Language.BA: "ba",
         Language.EU: "eu",
         Language.BE: "be",
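
The net effect is that every regional Arabic variant now resolves to Speechmatics' single `ar` code; roughly (a sketch, assuming the helper consults this table directly):

    for lang in (Language.AR, Language.AR_EG, Language.AR_SA):
        assert _language_to_speechmatics_language(lang) == "ar"
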
@@ -16,6 +16,7 @@ from loguru import logger
 from pipecat.frames.frames import (
     AudioRawFrame,
     BotStoppedSpeakingFrame,
+    ErrorFrame,
     Frame,
     StartFrame,
     STTMuteFrame,
@@ -25,6 +26,7 @@ from pipecat.frames.frames import (
 )
 from pipecat.processors.frame_processor import FrameDirection
 from pipecat.services.ai_service import AIService
+from pipecat.services.websocket_service import WebsocketService
 from pipecat.transcriptions.language import Language
 
 
@@ -34,6 +36,25 @@ class STTService(AIService):
     Provides common functionality for STT services including audio passthrough,
     muting, settings management, and audio processing. Subclasses must implement
     the run_stt method to provide actual speech recognition.
+
+    Event handlers:
+        on_connected: Called when connected to the STT service.
+        on_disconnected: Called when disconnected from the STT service.
+        on_connection_error: Called when a connection error occurs with the STT service.
+
+    Example::
+
+        @stt.event_handler("on_connected")
+        async def on_connected(stt: STTService):
+            logger.debug("STT connected")
+
+        @stt.event_handler("on_disconnected")
+        async def on_disconnected(stt: STTService):
+            logger.debug("STT disconnected")
+
+        @stt.event_handler("on_connection_error")
+        async def on_connection_error(stt: STTService, error: str):
+            logger.error(f"STT connection error: {error}")
     """
 
     def __init__(
@@ -64,6 +85,10 @@ class STTService(AIService):
         self._voicemail_detect: bool = False
         self._user_id: str = ""
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def is_muted(self) -> bool:
         """Check if the STT service is currently muted.
@@ -298,3 +323,25 @@ class SegmentedSTTService(STTService):
         if not self._user_speaking and len(self._audio_buffer) > self._audio_buffer_size_1s:
             discarded = len(self._audio_buffer) - self._audio_buffer_size_1s
             self._audio_buffer = self._audio_buffer[discarded:]
+
+
+class WebsocketSTTService(STTService, WebsocketService):
+    """Base class for websocket-based STT services.
+
+    Combines STT functionality with websocket connectivity, providing automatic
+    error handling and reconnection capabilities.
+    """
+
+    def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
+        """Initialize the Websocket STT service.
+
+        Args:
+            reconnect_on_error: Whether to automatically reconnect on websocket errors.
+            **kwargs: Additional arguments passed to parent classes.
+        """
+        STTService.__init__(self, **kwargs)
+        WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
+
+    async def _report_error(self, error: ErrorFrame):
+        await self._call_event_handler("on_connection_error", error.error)
+        await self.push_error(error)
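
A concrete subclass would combine the two bases the way the websocket TTS variants in this diff already do; a hypothetical sketch (only `run_stt` is required by `STTService`, everything else here is illustrative):

    class MyWebsocketSTTService(WebsocketSTTService):
        def __init__(self, *, api_key: str, **kwargs):
            super().__init__(**kwargs)
            self._api_key = api_key

        async def run_stt(self, audio: bytes):
            # Stream audio over the websocket and yield transcription frames.
            # Connection failures routed through _report_error() fire the
            # "on_connection_error" handler and push an ErrorFrame downstream.
            ...
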
@@ -23,12 +23,12 @@ from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
     Frame,
+    InterruptionFrame,
     OutputAudioRawFrame,
     OutputImageRawFrame,
     OutputTransportReadyFrame,
     SpeechOutputAudioRawFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
 )
@@ -222,7 +222,7 @@ class TavusVideoService(AIService):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             await self._handle_interruptions()
             await self.push_frame(frame, direction)
         elif isinstance(frame, TTSAudioRawFrame):
@@ -8,7 +8,18 @@
 
 import asyncio
 from abc import abstractmethod
-from typing import Any, AsyncGenerator, Callable, Dict, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Callable,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+)
 
 from loguru import logger
 
@@ -20,10 +31,10 @@ from pipecat.frames.frames import (
     ErrorFrame,
     Frame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     StartFrame,
-    StartInterruptionFrame,
     TextFrame,
     TranscriptionFrame,
     TTSAudioRawFrame,
@@ -49,6 +60,25 @@ class TTSService(AIService):
     Provides common functionality for TTS services including text aggregation,
     filtering, audio generation, and frame management. Supports configurable
     sentence aggregation, silence insertion, and frame processing control.
+
+    Event handlers:
+        on_connected: Called when connected to the TTS service.
+        on_disconnected: Called when disconnected from the TTS service.
+        on_connection_error: Called when a connection error occurs with the TTS service.
+
+    Example::
+
+        @tts.event_handler("on_connected")
+        async def on_connected(tts: TTSService):
+            logger.debug("TTS connected")
+
+        @tts.event_handler("on_disconnected")
+        async def on_disconnected(tts: TTSService):
+            logger.debug("TTS disconnected")
+
+        @tts.event_handler("on_connection_error")
+        async def on_connection_error(tts: TTSService, error: str):
+            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(
@@ -98,6 +128,7 @@ class TTSService(AIService):
 
                 .. deprecated:: 0.0.59
                     Use `text_filters` instead, which allows multiple filters.
+            text_formatter: Optional callable receiving text and language code, returning formatted text.
 
             transport_destination: Destination for generated audio frames.
             **kwargs: Additional arguments passed to the parent AIService.
@@ -124,7 +155,6 @@ class TTSService(AIService):
 
         self._tracing_enabled: bool = False
 
-
         if text_filter:
             import warnings
 
@@ -143,6 +173,10 @@ class TTSService(AIService):
 
         self._processing_text: bool = False
 
+        self._register_event_handler("on_connected")
+        self._register_event_handler("on_disconnected")
+        self._register_event_handler("on_connection_error")
+
     @property
     def sample_rate(self) -> int:
         """Get the current sample rate for audio output.
@@ -319,7 +353,7 @@ class TTSService(AIService):
             and not isinstance(frame, TranscriptionFrame)
         ):
             await self._process_text_frame(frame)
-        elif isinstance(frame, StartInterruptionFrame):
+        elif isinstance(frame, InterruptionFrame):
             await self._handle_interruption(frame, direction)
             await self.push_frame(frame, direction)
         elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
@@ -377,14 +411,44 @@ class TTSService(AIService):
         await super().push_frame(frame, direction)
 
         if self._push_stop_frames and (
-            isinstance(frame, StartInterruptionFrame)
+            isinstance(frame, InterruptionFrame)
             or isinstance(frame, TTSStartedFrame)
             or isinstance(frame, TTSAudioRawFrame)
            or isinstance(frame, TTSStoppedFrame)
         ):
             await self._stop_frame_queue.put(frame)
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _stream_audio_frames_from_iterator(
+        self, iterator: AsyncIterator[bytes], *, strip_wav_header: bool
+    ) -> AsyncGenerator[Frame, None]:
+        buffer = bytearray()
+        need_to_strip_wav_header = strip_wav_header
+        async for chunk in iterator:
+            if need_to_strip_wav_header and chunk.startswith(b"RIFF"):
+                chunk = chunk[44:]
+                need_to_strip_wav_header = False
+
+            # Append to current buffer.
+            buffer.extend(chunk)
+
+            # Round to nearest even number.
+            aligned_length = len(buffer) & ~1  # 111111111...11110
+            if aligned_length > 0:
+                aligned_chunk = buffer[:aligned_length]
+                buffer = buffer[aligned_length:]  # keep any leftover byte
+
+                if len(aligned_chunk) > 0:
+                    frame = TTSAudioRawFrame(bytes(aligned_chunk), self.sample_rate, 1)
+                    yield frame
+
+        if len(buffer) > 0:
+            # Make sure we don't need an extra padding byte.
+            if len(buffer) % 2 == 1:
+                buffer.extend(b"\x00")
+            frame = TTSAudioRawFrame(bytes(buffer), self.sample_rate, 1)
+            yield frame
+
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         self._processing_text = False
         await self._text_aggregator.handle_interruption()
         for filter in self._text_filters:
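
The `len(buffer) & ~1` mask clears the lowest bit, rounding the buffer length down to the nearest even number so every emitted chunk holds whole 16-bit PCM samples; any leftover byte waits for the next chunk, or is zero-padded at the very end. A quick illustration:

    for n in (5, 6, 7, 8):
        print(n, n & ~1)  # 5 -> 4, 6 -> 6, 7 -> 6, 8 -> 8
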
@@ -465,7 +529,7 @@ class TTSService(AIService):
                 )
                 if isinstance(frame, TTSStartedFrame):
                     has_started = True
-                elif isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+                elif isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
                     has_started = False
             except asyncio.TimeoutError:
                 if has_started:
@@ -550,7 +614,7 @@ class WordTTSService(TTSService):
         elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
             await self.flush_audio()
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         await super()._handle_interruption(frame, direction)
         self._llm_response_started = False
         self.reset_word_timestamps()
@@ -613,7 +677,6 @@ class WebsocketTTSService(TTSService, WebsocketService):
         """
         TTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -640,7 +703,7 @@ class InterruptibleTTSService(WebsocketTTSService):
         # user interrupts we need to reconnect.
         self._bot_speaking = False
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         await super()._handle_interruption(frame, direction)
         if self._bot_speaking:
             await self._disconnect()
@@ -665,15 +728,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
     """Base class for websocket-based TTS services that support word timestamps.
 
     Combines word timestamp functionality with websocket connectivity.
-
-    Event handlers:
-        on_connection_error: Called when a websocket connection error occurs.
-
-    Example::
-
-        @tts.event_handler("on_connection_error")
-        async def on_connection_error(tts: TTSService, error: str):
-            logger.error(f"TTS connection error: {error}")
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
@@ -685,7 +739,6 @@ class WebsocketWordTTSService(WordTTSService, WebsocketService):
         """
         WordTTSService.__init__(self, **kwargs)
         WebsocketService.__init__(self, reconnect_on_error=reconnect_on_error, **kwargs)
-        self._register_event_handler("on_connection_error")
 
     async def _report_error(self, error: ErrorFrame):
         await self._call_event_handler("on_connection_error", error.error)
@@ -712,7 +765,7 @@ class InterruptibleWordTTSService(WebsocketWordTTSService):
         # user interrupts we need to reconnect.
         self._bot_speaking = False
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         await super()._handle_interruption(frame, direction)
         if self._bot_speaking:
             await self._disconnect()
@@ -840,7 +893,7 @@ class AudioContextWordTTSService(WebsocketWordTTSService):
         await super().cancel(frame)
         await self._stop_audio_context_task()
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         await super()._handle_interruption(frame, direction)
         await self._stop_audio_context_task()
         self._create_audio_context_task()