dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (195) hide show
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -0,0 +1,220 @@
1
+ # Copyright (c) 2024–2025, Daily
2
+ #
3
+ # SPDX-License-Identifier: BSD 2-Clause License
4
+
5
+ """Hume Text-to-Speech service implementation."""
6
+
7
+ import base64
8
+ import os
9
+ from typing import Any, AsyncGenerator, Optional
10
+
11
+ from loguru import logger
12
+ from pydantic import BaseModel
13
+
14
+ from pipecat.frames.frames import (
15
+ ErrorFrame,
16
+ Frame,
17
+ StartFrame,
18
+ TTSAudioRawFrame,
19
+ TTSStartedFrame,
20
+ TTSStoppedFrame,
21
+ )
22
+ from pipecat.services.tts_service import TTSService
23
+ from pipecat.utils.tracing.service_decorators import traced_tts
24
+
25
+ try:
26
+ from hume import AsyncHumeClient
27
+ from hume.tts import (
28
+ FormatPcm,
29
+ PostedUtterance,
30
+ PostedUtteranceVoiceWithId,
31
+ )
32
+ except ModuleNotFoundError as e: # pragma: no cover - import-time guidance
33
+ logger.error(f"Exception: {e}")
34
+ logger.error("In order to use Hume, you need to `pip install pipecat-ai[hume]`.")
35
+ raise Exception(f"Missing module: {e}")
36
+
37
+
38
+ HUME_SAMPLE_RATE = 48_000 # Hume TTS streams at 48 kHz
39
+
40
+
41
class HumeTTSService(TTSService):
    """Hume Octave Text-to-Speech service.

    Streams PCM audio via Hume's HTTP output streaming (JSON chunks) endpoint
    using the Python SDK and emits ``TTSAudioRawFrame`` frames suitable for Pipecat transports.

    Supported features:

    - Generates speech from text using Hume TTS.
    - Streams PCM audio.
    - Supports dynamic updates of voice and synthesis parameters at runtime.
    - Provides metrics for Time To First Byte (TTFB) and TTS usage.
    """

    class InputParams(BaseModel):
        """Optional synthesis parameters for Hume TTS.

        Parameters:
            description: Natural-language acting directions (up to 100 characters).
            speed: Speaking-rate multiplier (0.5-2.0).
            trailing_silence: Seconds of silence to append at the end (0-5).
        """

        description: Optional[str] = None
        speed: Optional[float] = None
        trailing_silence: Optional[float] = None

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        voice_id: str,
        params: Optional[InputParams] = None,
        sample_rate: Optional[int] = HUME_SAMPLE_RATE,
        **kwargs,
    ) -> None:
        """Initialize the HumeTTSService.

        Args:
            api_key: Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
            voice_id: ID of the voice to use. Only voice IDs are supported; voice names are not.
            params: Optional synthesis controls (acting instructions, speed, trailing silence).
            sample_rate: Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
            **kwargs: Additional arguments passed to the parent class.

        Raises:
            ValueError: If no API key is given and ``HUME_API_KEY`` is not set.
        """
        api_key = api_key or os.getenv("HUME_API_KEY")
        if not api_key:
            raise ValueError("HumeTTSService requires an API key (env HUME_API_KEY or api_key=)")

        # Only warn: the configured value is still passed to the base class unchanged.
        if sample_rate != HUME_SAMPLE_RATE:
            logger.warning(
                f"Hume TTS streams at {HUME_SAMPLE_RATE} Hz; configured sample_rate={sample_rate}"
            )

        super().__init__(sample_rate=sample_rate, **kwargs)

        self._client = AsyncHumeClient(api_key=api_key)
        self._params = params or HumeTTSService.InputParams()

        # Store voice in the base class (mirrors other services)
        self.set_voice(voice_id)

        # Accumulates decoded PCM between emitted frames; reset on every run_tts call.
        self._audio_bytes = b""

    def can_generate_metrics(self) -> bool:
        """Can generate metrics.

        Returns:
            True if metrics can be generated, False otherwise.
        """
        return True

    async def start(self, frame: StartFrame) -> None:
        """Start the service.

        Args:
            frame: The start frame.
        """
        await super().start(frame)

    async def update_setting(self, key: str, value: Any) -> None:
        """Runtime updates via `TTSUpdateSettingsFrame`.

        Args:
            key: The name of the setting to update. Recognized keys are:
                - "voice_id"
                - "description"
                - "speed"
                - "trailing_silence"
            value: The new value for the setting. ``None`` clears the optional
                synthesis parameters.
        """
        # Keys are matched case-insensitively; a None key falls through to the base class.
        key_l = (key or "").lower()

        if key_l == "voice_id":
            self.set_voice(str(value))
            logger.info(f"HumeTTSService voice_id set to: {self.voice}")
        elif key_l == "description":
            self._params.description = None if value is None else str(value)
        elif key_l == "speed":
            self._params.speed = None if value is None else float(value)
        elif key_l == "trailing_silence":
            self._params.trailing_silence = None if value is None else float(value)
        else:
            # Defer unknown keys to the base class
            await super().update_setting(key, value)

    @traced_tts
    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Hume TTS.

        Decoded PCM is buffered until at least ``self.chunk_size`` bytes are
        available, then emitted as one frame. Any audio still buffered when the
        stream ends is emitted as a final, shorter frame so the end of the
        utterance is not lost.

        Args:
            text: The text to be synthesized.

        Returns:
            An async generator that yields `Frame` objects, including
            `TTSStartedFrame`, `TTSAudioRawFrame`, `ErrorFrame`, and
            `TTSStoppedFrame`.
        """
        logger.debug(f"{self}: Generating Hume TTS: [{text}]")

        # Build the request payload; optional parameters are sent only when set.
        utterance_kwargs: dict[str, Any] = {
            "text": text,
            "voice": PostedUtteranceVoiceWithId(id=self._voice_id),
        }
        if self._params.description is not None:
            utterance_kwargs["description"] = self._params.description
        if self._params.speed is not None:
            utterance_kwargs["speed"] = self._params.speed
        if self._params.trailing_silence is not None:
            utterance_kwargs["trailing_silence"] = self._params.trailing_silence

        utterance = PostedUtterance(**utterance_kwargs)

        # Request raw PCM chunks in the streaming JSON
        pcm_fmt = FormatPcm(type="pcm")

        await self.start_ttfb_metrics()
        await self.start_tts_usage_metrics(text)
        yield TTSStartedFrame()

        try:
            # Instant mode is always enabled here (not user-configurable)
            # Hume emits mono PCM at 48 kHz; downstream can resample if needed.
            # NOTE(review): frames are labelled with self.sample_rate, not
            # HUME_SAMPLE_RATE; if the two differ the label is wrong — confirm.
            # We buffer audio bytes before sending to prevent glitches.
            self._audio_bytes = b""
            first_audio_seen = False
            async for chunk in self._client.tts.synthesize_json_streaming(
                utterances=[utterance],
                format=pcm_fmt,
                instant_mode=True,
                version="2",
            ):
                audio_b64 = getattr(chunk, "audio", None)
                if not audio_b64:
                    continue

                if not first_audio_seen:
                    # Stop the TTFB timer on the first audio payload, not at the
                    # end of the stream, so it measures time to first byte.
                    # Assumes the later call in `finally` is then a no-op — confirm.
                    first_audio_seen = True
                    await self.stop_ttfb_metrics()

                self._audio_bytes += base64.b64decode(audio_b64)

                # Buffer audio until we have enough to avoid glitches
                if len(self._audio_bytes) < self.chunk_size:
                    continue

                yield TTSAudioRawFrame(
                    audio=self._audio_bytes,
                    sample_rate=self.sample_rate,
                    num_channels=1,
                )
                self._audio_bytes = b""

            # Flush the remainder: without this, the final < chunk_size bytes
            # (or a whole short utterance) would never be emitted.
            if self._audio_bytes:
                yield TTSAudioRawFrame(
                    audio=self._audio_bytes,
                    sample_rate=self.sample_rate,
                    num_channels=1,
                )
                self._audio_bytes = b""

        except Exception as e:
            logger.exception(f"{self} error generating TTS: {e}")
            await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
        finally:
            # Ensure TTFB timer is stopped even on early failures
            await self.stop_ttfb_metrics()
            yield TTSStoppedFrame()
@@ -38,7 +38,7 @@ Examples::
38
38
  model="inworld-tts-1",
39
39
  streaming=True, # Default
40
40
  params=InworldTTSService.InputParams(
41
- temperature=0.8, # Optional: control synthesis variability (range: [0, 2])
41
+ temperature=1.1, # Optional: control synthesis variability (range: [0, 2])
42
42
  ),
43
43
  )
44
44
 
@@ -50,7 +50,7 @@ Examples::
50
50
  model="inworld-tts-1",
51
51
  streaming=False,
52
52
  params=InworldTTSService.InputParams(
53
- temperature=0.8,
53
+ temperature=1.1,
54
54
  ),
55
55
  )
56
56
  """
@@ -123,7 +123,7 @@ class InworldTTSService(TTSService):
123
123
  model="inworld-tts-1",
124
124
  streaming=True, # Default behavior
125
125
  params=InworldTTSService.InputParams(
126
- temperature=0.8, # Add variability to speech synthesis (range: [0, 2])
126
+ temperature=1.1, # Add variability to speech synthesis (range: [0, 2])
127
127
  ),
128
128
  )
129
129
 
@@ -135,7 +135,7 @@ class InworldTTSService(TTSService):
135
135
  model="inworld-tts-1-max",
136
136
  streaming=False,
137
137
  params=InworldTTSService.InputParams(
138
- temperature=0.8,
138
+ temperature=1.1,
139
139
  ),
140
140
  )
141
141
  """
@@ -144,7 +144,7 @@ class InworldTTSService(TTSService):
144
144
  """Optional input parameters for Inworld TTS configuration.
145
145
 
146
146
  Parameters:
147
- temperature: Voice temperature control for synthesis variability (e.g., 0.8).
147
+ temperature: Voice temperature control for synthesis variability (e.g., 1.1).
148
148
  Valid range: [0, 2]. Higher values increase variability.
149
149
 
150
150
  Note:
@@ -197,7 +197,7 @@ class InworldTTSService(TTSService):
197
197
  - "LINEAR16" (default) - Uncompressed PCM, best quality
198
198
  - Other formats as supported by Inworld API
199
199
  params: Optional input parameters for additional configuration. Use this to specify:
200
- - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional)
200
+ - temperature: Voice temperature control for variability (range: [0, 2], e.g., 1.1, optional)
201
201
  Language is automatically inferred from input text.
202
202
  **kwargs: Additional arguments passed to the parent TTSService class.
203
203
 
@@ -36,15 +36,15 @@ from pipecat.frames.frames import (
36
36
  FunctionCallResultFrame,
37
37
  FunctionCallResultProperties,
38
38
  FunctionCallsStartedFrame,
39
+ InterruptionFrame,
39
40
  LLMConfigureOutputFrame,
40
41
  LLMFullResponseEndFrame,
41
42
  LLMFullResponseStartFrame,
42
43
  LLMTextFrame,
43
44
  StartFrame,
44
- StartInterruptionFrame,
45
45
  UserImageRequestFrame,
46
46
  )
47
- from pipecat.processors.aggregators.llm_context import LLMContext
47
+ from pipecat.processors.aggregators.llm_context import LLMContext, LLMSpecificMessage
48
48
  from pipecat.processors.aggregators.llm_response import (
49
49
  LLMAssistantAggregatorParams,
50
50
  LLMUserAggregatorParams,
@@ -195,6 +195,17 @@ class LLMService(AIService):
195
195
  """
196
196
  return self._adapter
197
197
 
198
+ def create_llm_specific_message(self, message: Any) -> LLMSpecificMessage:
199
+ """Create an LLM-specific message (as opposed to a standard message) for use in an LLMContext.
200
+
201
+ Args:
202
+ message: The message content.
203
+
204
+ Returns:
205
+ A LLMSpecificMessage instance.
206
+ """
207
+ return self.get_llm_adapter().create_llm_specific_message(message)
208
+
198
209
  async def run_inference(self, context: LLMContext | OpenAILLMContext) -> Optional[str]:
199
210
  """Run a one-shot, out-of-band (i.e. out-of-pipeline) inference with the given LLM context.
200
211
 
@@ -269,7 +280,7 @@ class LLMService(AIService):
269
280
  """
270
281
  await super().process_frame(frame, direction)
271
282
 
272
- if isinstance(frame, StartInterruptionFrame):
283
+ if isinstance(frame, InterruptionFrame):
273
284
  await self._handle_interruptions(frame)
274
285
  elif isinstance(frame, LLMConfigureOutputFrame):
275
286
  self._skip_tts = frame.skip_tts
@@ -286,8 +297,7 @@ class LLMService(AIService):
286
297
 
287
298
  await super().push_frame(frame, direction)
288
299
 
289
- async def _handle_interruptions(self, _: StartInterruptionFrame):
290
- # logger.info("In LLM Handling interruptions")
300
+ async def _handle_interruptions(self, _: InterruptionFrame):
291
301
  for function_name, entry in self._functions.items():
292
302
  if entry.cancel_on_interruption:
293
303
  await self._cancel_function_call(function_name)
@@ -16,8 +16,8 @@ from pipecat.frames.frames import (
16
16
  EndFrame,
17
17
  ErrorFrame,
18
18
  Frame,
19
+ InterruptionFrame,
19
20
  StartFrame,
20
- StartInterruptionFrame,
21
21
  TTSAudioRawFrame,
22
22
  TTSStartedFrame,
23
23
  TTSStoppedFrame,
@@ -180,7 +180,7 @@ class LmntTTSService(InterruptibleTTSService):
180
180
  direction: The direction to push the frame.
181
181
  """
182
182
  await super().push_frame(frame, direction)
183
- if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
183
+ if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
184
184
  self._started = False
185
185
 
186
186
  async def _connect(self):
@@ -222,6 +222,7 @@ class LmntTTSService(InterruptibleTTSService):
222
222
  # Send initialization message
223
223
  await self._websocket.send(json.dumps(init_msg))
224
224
 
225
+ await self._call_event_handler("on_connected")
225
226
  except Exception as e:
226
227
  logger.error(f"{self} initialization error: {e}")
227
228
  self._websocket = None
@@ -243,6 +244,7 @@ class LmntTTSService(InterruptibleTTSService):
243
244
  finally:
244
245
  self._started = False
245
246
  self._websocket = None
247
+ await self._call_event_handler("on_disconnected")
246
248
 
247
249
  def _get_websocket(self):
248
250
  """Get the WebSocket connection if available."""
@@ -7,7 +7,7 @@
7
7
  """MCP (Model Context Protocol) client for integrating external tools with LLMs."""
8
8
 
9
9
  import json
10
- from typing import Any, Dict, List, Tuple
10
+ from typing import Any, Dict, List, TypeAlias
11
11
 
12
12
  from loguru import logger
13
13
 
@@ -28,6 +28,8 @@ except ModuleNotFoundError as e:
28
28
  logger.error("In order to use an MCP client, you need to `pip install pipecat-ai[mcp]`.")
29
29
  raise Exception(f"Missing module: {e}")
30
30
 
31
+ ServerParameters: TypeAlias = StdioServerParameters | SseServerParameters | StreamableHttpParameters
32
+
31
33
 
32
34
  class MCPClient(BaseObject):
33
35
  """Client for Model Context Protocol (MCP) servers.
@@ -42,7 +44,7 @@ class MCPClient(BaseObject):
42
44
 
43
45
  def __init__(
44
46
  self,
45
- server_params: Tuple[StdioServerParameters, SseServerParameters, StreamableHttpParameters],
47
+ server_params: ServerParameters,
46
48
  **kwargs,
47
49
  ):
48
50
  """Initialize the MCP client with server parameters.
@@ -16,7 +16,8 @@ from typing import Any, Dict, List, Optional
16
16
  from loguru import logger
17
17
  from pydantic import BaseModel, Field
18
18
 
19
- from pipecat.frames.frames import ErrorFrame, Frame, LLMMessagesFrame
19
+ from pipecat.frames.frames import ErrorFrame, Frame, LLMContextFrame, LLMMessagesFrame
20
+ from pipecat.processors.aggregators.llm_context import LLMContext
20
21
  from pipecat.processors.aggregators.openai_llm_context import (
21
22
  OpenAILLMContext,
22
23
  OpenAILLMContextFrame,
@@ -180,11 +181,11 @@ class Mem0MemoryService(FrameProcessor):
180
181
  logger.error(f"Error retrieving memories from Mem0: {e}")
181
182
  return []
182
183
 
183
- def _enhance_context_with_memories(self, context: OpenAILLMContext, query: str):
184
+ def _enhance_context_with_memories(self, context: LLMContext | OpenAILLMContext, query: str):
184
185
  """Enhance the LLM context with relevant memories.
185
186
 
186
187
  Args:
187
- context: The OpenAILLMContext to enhance with memory information.
188
+ context: The LLM context to enhance with memory information.
188
189
  query: The query to search for relevant memories.
189
190
  """
190
191
  # Skip if this is the same query we just processed
@@ -222,11 +223,11 @@ class Mem0MemoryService(FrameProcessor):
222
223
  context = None
223
224
  messages = None
224
225
 
225
- if isinstance(frame, OpenAILLMContextFrame):
226
+ if isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
226
227
  context = frame.context
227
228
  elif isinstance(frame, LLMMessagesFrame):
228
229
  messages = frame.messages
229
- context = OpenAILLMContext.from_messages(messages)
230
+ context = LLMContext(messages)
230
231
 
231
232
  if context:
232
233
  try:
@@ -57,16 +57,18 @@ class MistralLLMService(OpenAILLMService):
57
57
  logger.debug(f"Creating Mistral client with api {base_url}")
58
58
  return super().create_client(api_key, base_url, **kwargs)
59
59
 
60
- def _apply_mistral_assistant_prefix(
60
+ def _apply_mistral_fixups(
61
61
  self, messages: List[ChatCompletionMessageParam]
62
62
  ) -> List[ChatCompletionMessageParam]:
63
- """Apply Mistral's assistant message prefix requirement.
63
+ """Apply fixups to messages to meet Mistral-specific requirements.
64
64
 
65
- Mistral requires assistant messages to have prefix=True when they
66
- are the final message in a conversation. According to Mistral's API:
67
- - Assistant messages with prefix=True MUST be the last message
68
- - Only add prefix=True to the final assistant message when needed
69
- - This allows assistant messages to be accepted as the last message
65
+ 1. A "tool"-role message must be followed by an assistant message.
66
+
67
+ 2. "system"-role messages must only appear at the start of a
68
+ conversation.
69
+
70
+ 3. Assistant messages must have prefix=True when they are the final
71
+ message in a conversation (but at no other point).
70
72
 
71
73
  Args:
72
74
  messages: The original list of messages.
@@ -80,6 +82,25 @@ class MistralLLMService(OpenAILLMService):
80
82
  # Create a copy to avoid modifying the original
81
83
  fixed_messages = [dict(msg) for msg in messages]
82
84
 
85
+ # Ensure all tool responses are followed by an assistant message
86
+ assistant_insert_indices = []
87
+ for i, msg in enumerate(fixed_messages):
88
+ if msg.get("role") == "tool":
89
+ # If this is the last message or the next message is not assistant
90
+ if i == len(fixed_messages) - 1 or fixed_messages[i + 1].get("role") != "assistant":
91
+ assistant_insert_indices.append(i + 1)
92
+ for idx in reversed(assistant_insert_indices):
93
+ fixed_messages.insert(idx, {"role": "assistant", "content": " "})
94
+
95
+ # Convert any "system" messages that aren't at the start (i.e., after the initial contiguous block) to "user"
96
+ first_non_system_idx = next(
97
+ (i for i, msg in enumerate(fixed_messages) if msg.get("role") != "system"),
98
+ len(fixed_messages),
99
+ )
100
+ for i, msg in enumerate(fixed_messages):
101
+ if msg.get("role") == "system" and i >= first_non_system_idx:
102
+ msg["role"] = "user"
103
+
83
104
  # Get the last message
84
105
  last_message = fixed_messages[-1]
85
106
 
@@ -158,7 +179,7 @@ class MistralLLMService(OpenAILLMService):
158
179
  - Core completion settings
159
180
  """
160
181
  # Apply Mistral's assistant prefix requirement for API compatibility
161
- fixed_messages = self._apply_mistral_assistant_prefix(params_from_context["messages"])
182
+ fixed_messages = self._apply_mistral_fixups(params_from_context["messages"])
162
183
 
163
184
  params = {
164
185
  "model": self.model_name,
@@ -11,17 +11,20 @@ for image analysis and description generation.
11
11
  """
12
12
 
13
13
  import asyncio
14
- from typing import AsyncGenerator
14
+ import base64
15
+ from io import BytesIO
16
+ from typing import AsyncGenerator, Optional
15
17
 
16
18
  from loguru import logger
17
19
  from PIL import Image
18
20
 
19
- from pipecat.frames.frames import ErrorFrame, Frame, TextFrame, VisionImageRawFrame
21
+ from pipecat.frames.frames import ErrorFrame, Frame, TextFrame
22
+ from pipecat.processors.aggregators.llm_context import LLMContext
20
23
  from pipecat.services.vision_service import VisionService
21
24
 
22
25
  try:
23
26
  import torch
24
- from transformers import AutoModelForCausalLM, AutoTokenizer
27
+ from transformers import AutoModelForCausalLM
25
28
  except ModuleNotFoundError as e:
26
29
  logger.error(f"Exception: {e}")
27
30
  logger.error("In order to use Moondream, you need to `pip install pipecat-ai[moondream]`.")
@@ -94,11 +97,11 @@ class MoondreamService(VisionService):
94
97
 
95
98
  logger.debug("Loaded Moondream model")
96
99
 
97
- async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
100
+ async def run_vision(self, context: LLMContext) -> AsyncGenerator[Frame, None]:
98
101
  """Analyze an image and generate a description.
99
102
 
100
103
  Args:
101
- frame: Vision frame containing the image data and optional question text.
104
+ context: The context to process, containing image data.
102
105
 
103
106
  Yields:
104
107
  Frame: TextFrame containing the generated image description, or ErrorFrame
@@ -109,22 +112,45 @@ class MoondreamService(VisionService):
109
112
  yield ErrorFrame("Moondream model not available")
110
113
  return
111
114
 
112
- logger.debug(f"Analyzing image: {frame}")
115
+ image_bytes = None
116
+ text = None
117
+ try:
118
+ messages = context.get_messages()
119
+ last_message = messages[-1]
120
+ last_message_content = last_message.get("content")
121
+
122
+ for item in last_message_content:
123
+ if isinstance(item, dict):
124
+ if (
125
+ "image_url" in item
126
+ and isinstance(item["image_url"], dict)
127
+ and item["image_url"].get("url")
128
+ ):
129
+ image_bytes = base64.b64decode(item["image_url"]["url"].split(",")[1])
130
+ elif "text" in item and isinstance(item["text"], str):
131
+ text = item["text"]
132
+
133
+ except Exception as e:
134
+ logger.error(f"Exception during image extraction: {e}")
135
+ yield ErrorFrame("Failed to extract image from context")
136
+ return
113
137
 
114
- def get_image_description(frame: VisionImageRawFrame):
115
- """Generate description for the given image frame.
138
+ if not image_bytes:
139
+ logger.error("No image found in context")
140
+ yield ErrorFrame("No image found in context")
141
+ return
116
142
 
117
- Args:
118
- frame: Vision frame containing image data and question.
143
+ logger.debug(
144
+ f"Analyzing image (bytes length: {len(image_bytes) if image_bytes else 'None'})"
145
+ )
119
146
 
120
- Returns:
121
- str: Generated description of the image.
122
- """
123
- image = Image.frombytes(frame.format, frame.size, frame.image)
147
+ def get_image_description(bytes: bytes, text: Optional[str]) -> str:
148
+ image_buffer = BytesIO(bytes)
149
+ image = Image.open(image_buffer)
124
150
  image_embeds = self._model.encode_image(image)
125
- description = self._model.query(image_embeds, frame.text)["answer"]
151
+ description = self._model.query(image_embeds, text)["answer"]
126
152
  return description
127
153
 
128
- description = await asyncio.to_thread(get_image_description, frame)
154
+ description = await asyncio.to_thread(get_image_description, image_bytes, text)
129
155
 
130
156
  yield TextFrame(text=description)
@@ -25,9 +25,9 @@ from pipecat.frames.frames import (
25
25
  EndFrame,
26
26
  ErrorFrame,
27
27
  Frame,
28
+ InterruptionFrame,
28
29
  LLMFullResponseEndFrame,
29
30
  StartFrame,
30
- StartInterruptionFrame,
31
31
  TTSAudioRawFrame,
32
32
  TTSSpeakFrame,
33
33
  TTSStartedFrame,
@@ -224,7 +224,7 @@ class NeuphonicTTSService(InterruptibleTTSService):
224
224
  direction: The direction to push the frame.
225
225
  """
226
226
  await super().push_frame(frame, direction)
227
- if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
227
+ if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
228
228
  self._started = False
229
229
 
230
230
  async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -293,6 +293,8 @@ class NeuphonicTTSService(InterruptibleTTSService):
293
293
  headers = {"x-api-key": self._api_key}
294
294
 
295
295
  self._websocket = await websocket_connect(url, additional_headers=headers)
296
+
297
+ await self._call_event_handler("on_connected")
296
298
  except Exception as e:
297
299
  logger.error(f"{self} initialization error: {e}")
298
300
  self._websocket = None
@@ -311,6 +313,7 @@ class NeuphonicTTSService(InterruptibleTTSService):
311
313
  finally:
312
314
  self._started = False
313
315
  self._websocket = None
316
+ await self._call_event_handler("on_disconnected")
314
317
 
315
318
  async def _receive_messages(self):
316
319
  """Receive and process messages from Neuphonic WebSocket."""
@@ -10,6 +10,7 @@ from pipecat.services import DeprecatedModuleProxy
10
10
 
11
11
  from .image import *
12
12
  from .llm import *
13
+ from .realtime import *
13
14
  from .stt import *
14
15
  from .tts import *
15
16