dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
@@ -6,6 +6,7 @@
6
6
 
7
7
  """Base OpenAI LLM service implementation."""
8
8
 
9
+ import asyncio
9
10
  import base64
10
11
  import json
11
12
  from typing import Any, Dict, List, Mapping, Optional
@@ -14,6 +15,7 @@ import httpx
14
15
  from loguru import logger
15
16
  from openai import (
16
17
  NOT_GIVEN,
18
+ APITimeoutError,
17
19
  AsyncOpenAI,
18
20
  AsyncStream,
19
21
  DefaultAsyncHttpxClient,
@@ -37,7 +39,6 @@ from pipecat.processors.aggregators.openai_llm_context import (
37
39
  )
38
40
  from pipecat.processors.frame_processor import FrameDirection
39
41
  from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
40
- from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
41
42
  from pipecat.utils.tracing.service_decorators import traced_llm
42
43
 
43
44
 
@@ -48,16 +49,6 @@ class BaseOpenAILLMService(LLMService):
48
49
  to an OpenAILLMContext object. The context defines what is sent to the LLM for
49
50
  completion, including user, assistant, and system messages, as well as tool
50
51
  choices and function call configurations.
51
-
52
- Args:
53
- model: The OpenAI model name to use (e.g., "gpt-4.1", "gpt-4o").
54
- api_key: OpenAI API key. If None, uses environment variable.
55
- base_url: Custom base URL for OpenAI API. If None, uses default.
56
- organization: OpenAI organization ID.
57
- project: OpenAI project ID.
58
- default_headers: Additional HTTP headers to include in requests.
59
- params: Input parameters for model configuration and behavior.
60
- **kwargs: Additional arguments passed to the parent LLMService.
61
52
  """
62
53
 
63
54
  class InputParams(BaseModel):
@@ -101,8 +92,24 @@ class BaseOpenAILLMService(LLMService):
101
92
  project=None,
102
93
  default_headers: Optional[Mapping[str, str]] = None,
103
94
  params: Optional[InputParams] = None,
95
+ retry_timeout_secs: Optional[float] = 5.0,
96
+ retry_on_timeout: Optional[bool] = False,
104
97
  **kwargs,
105
98
  ):
99
+ """Initialize the BaseOpenAILLMService.
100
+
101
+ Args:
102
+ model: The OpenAI model name to use (e.g., "gpt-4.1", "gpt-4o").
103
+ api_key: OpenAI API key. If None, uses environment variable.
104
+ base_url: Custom base URL for OpenAI API. If None, uses default.
105
+ organization: OpenAI organization ID.
106
+ project: OpenAI project ID.
107
+ default_headers: Additional HTTP headers to include in requests.
108
+ params: Input parameters for model configuration and behavior.
109
+ retry_timeout_secs: Request timeout in seconds. Defaults to 5.0 seconds.
110
+ retry_on_timeout: Whether to retry the request once if it times out.
111
+ **kwargs: Additional arguments passed to the parent LLMService.
112
+ """
106
113
  super().__init__(**kwargs)
107
114
 
108
115
  params = params or BaseOpenAILLMService.InputParams()
@@ -117,6 +124,8 @@ class BaseOpenAILLMService(LLMService):
117
124
  "max_completion_tokens": params.max_completion_tokens,
118
125
  "extra": params.extra if isinstance(params.extra, dict) else {},
119
126
  }
127
+ self._retry_timeout_secs = retry_timeout_secs
128
+ self._retry_on_timeout = retry_on_timeout
120
129
  self.set_model_name(model)
121
130
  self._client = self.create_client(
122
131
  api_key=api_key,
@@ -173,7 +182,7 @@ class BaseOpenAILLMService(LLMService):
173
182
  async def get_chat_completions(
174
183
  self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
175
184
  ) -> AsyncStream[ChatCompletionChunk]:
176
- """Get streaming chat completions from OpenAI API.
185
+ """Get streaming chat completions from OpenAI API with optional timeout and retry.
177
186
 
178
187
  Args:
179
188
  context: The LLM context containing tools and configuration.
@@ -182,6 +191,37 @@ class BaseOpenAILLMService(LLMService):
182
191
  Returns:
183
192
  Async stream of chat completion chunks.
184
193
  """
194
+ params = self.build_chat_completion_params(context, messages)
195
+
196
+ if self._retry_on_timeout:
197
+ try:
198
+ chunks = await asyncio.wait_for(
199
+ self._client.chat.completions.create(**params), timeout=self._retry_timeout_secs
200
+ )
201
+ return chunks
202
+ except (APITimeoutError, asyncio.TimeoutError):
203
+ # Retry, this time without a timeout so we get a response
204
+ logger.debug(f"{self}: Retrying chat completion due to timeout")
205
+ chunks = await self._client.chat.completions.create(**params)
206
+ return chunks
207
+ else:
208
+ chunks = await self._client.chat.completions.create(**params)
209
+ return chunks
210
+
211
+ def build_chat_completion_params(
212
+ self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
213
+ ) -> dict:
214
+ """Build parameters for chat completion request.
215
+
216
+ Subclasses can override this to customize parameters for different providers.
217
+
218
+ Args:
219
+ context: The LLM context containing tools and configuration.
220
+ messages: List of chat completion messages to send.
221
+
222
+ Returns:
223
+ Dictionary of parameters for the chat completion request.
224
+ """
185
225
  params = {
186
226
  "model": self.model_name,
187
227
  "stream": True,
@@ -199,9 +239,7 @@ class BaseOpenAILLMService(LLMService):
199
239
  }
200
240
 
201
241
  params.update(self._settings["extra"])
202
-
203
- chunks = await self._client.chat.completions.create(**params)
204
- return chunks
242
+ return params
205
243
 
206
244
  async def _stream_chat_completions(
207
245
  self, context: OpenAILLMContext
@@ -245,7 +283,7 @@ class BaseOpenAILLMService(LLMService):
245
283
  context
246
284
  )
247
285
 
248
- async for chunk in WatchdogAsyncIterator(chunk_stream, manager=self.task_manager):
286
+ async for chunk in chunk_stream:
249
287
  if chunk.usage:
250
288
  tokens = LLMTokenUsage(
251
289
  prompt_tokens=chunk.usage.prompt_tokens,
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """OpenAI image generation service implementation.
8
+
9
+ This module provides integration with OpenAI's DALL-E image generation API
10
+ for creating images from text prompts.
11
+ """
12
+
7
13
  import io
8
14
  from typing import AsyncGenerator, Literal, Optional
9
15
 
@@ -21,6 +27,13 @@ from pipecat.services.image_service import ImageGenService
21
27
 
22
28
 
23
29
  class OpenAIImageGenService(ImageGenService):
30
+ """OpenAI DALL-E image generation service.
31
+
32
+ Provides image generation capabilities using OpenAI's DALL-E models.
33
+ Supports various image sizes and can generate images from text prompts
34
+ with configurable quality and style parameters.
35
+ """
36
+
24
37
  def __init__(
25
38
  self,
26
39
  *,
@@ -30,6 +43,15 @@ class OpenAIImageGenService(ImageGenService):
30
43
  image_size: Literal["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"],
31
44
  model: str = "dall-e-3",
32
45
  ):
46
+ """Initialize the OpenAI image generation service.
47
+
48
+ Args:
49
+ api_key: OpenAI API key for authentication.
50
+ base_url: Custom base URL for OpenAI API. If None, uses default.
51
+ aiohttp_session: HTTP session for downloading generated images.
52
+ image_size: Target size for generated images.
53
+ model: DALL-E model to use for generation. Defaults to "dall-e-3".
54
+ """
33
55
  super().__init__()
34
56
  self.set_model_name(model)
35
57
  self._image_size = image_size
@@ -37,6 +59,14 @@ class OpenAIImageGenService(ImageGenService):
37
59
  self._aiohttp_session = aiohttp_session
38
60
 
39
61
  async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
62
+ """Generate an image from a text prompt using OpenAI's DALL-E.
63
+
64
+ Args:
65
+ prompt: Text description of the image to generate.
66
+
67
+ Yields:
68
+ Frame: URLImageRawFrame containing the generated image data.
69
+ """
40
70
  logger.debug(f"Generating image from prompt: {prompt}")
41
71
 
42
72
  image = await self._client.images.generate(
@@ -61,11 +61,6 @@ class OpenAILLMService(BaseOpenAILLMService):
61
61
  Provides a complete OpenAI LLM service with context aggregation support.
62
62
  Uses the BaseOpenAILLMService for core functionality and adds OpenAI-specific
63
63
  context aggregator creation.
64
-
65
- Args:
66
- model: The OpenAI model name to use. Defaults to "gpt-4.1".
67
- params: Input parameters for model configuration.
68
- **kwargs: Additional arguments passed to the parent BaseOpenAILLMService.
69
64
  """
70
65
 
71
66
  def __init__(
@@ -75,6 +70,13 @@ class OpenAILLMService(BaseOpenAILLMService):
75
70
  params: Optional[BaseOpenAILLMService.InputParams] = None,
76
71
  **kwargs,
77
72
  ):
73
+ """Initialize OpenAI LLM service.
74
+
75
+ Args:
76
+ model: The OpenAI model name to use. Defaults to "gpt-4.1".
77
+ params: Input parameters for model configuration.
78
+ **kwargs: Additional arguments passed to the parent BaseOpenAILLMService.
79
+ """
78
80
  super().__init__(model=model, params=params, **kwargs)
79
81
 
80
82
  def create_context_aggregator(
@@ -4,6 +4,8 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """OpenAI Speech-to-Text service implementation using OpenAI's transcription API."""
8
+
7
9
  from typing import Optional
8
10
 
9
11
  from pipecat.services.whisper.base_stt import BaseWhisperSTTService, Transcription
@@ -15,15 +17,6 @@ class OpenAISTTService(BaseWhisperSTTService):
15
17
 
16
18
  Uses OpenAI's transcription API to convert audio to text. Requires an OpenAI API key
17
19
  set via the api_key parameter or OPENAI_API_KEY environment variable.
18
-
19
- Args:
20
- model: Model to use — either gpt-4o or Whisper. Defaults to "gpt-4o-transcribe".
21
- api_key: OpenAI API key. Defaults to None.
22
- base_url: API base URL. Defaults to None.
23
- language: Language of the audio input. Defaults to English.
24
- prompt: Optional text to guide the model's style or continue a previous segment.
25
- temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
26
- **kwargs: Additional arguments passed to BaseWhisperSTTService.
27
20
  """
28
21
 
29
22
  def __init__(
@@ -37,6 +30,17 @@ class OpenAISTTService(BaseWhisperSTTService):
37
30
  temperature: Optional[float] = None,
38
31
  **kwargs,
39
32
  ):
33
+ """Initialize OpenAI STT service.
34
+
35
+ Args:
36
+ model: Model to use — either gpt-4o or Whisper. Defaults to "gpt-4o-transcribe".
37
+ api_key: OpenAI API key. Defaults to None.
38
+ base_url: API base URL. Defaults to None.
39
+ language: Language of the audio input. Defaults to English.
40
+ prompt: Optional text to guide the model's style or continue a previous segment.
41
+ temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
42
+ **kwargs: Additional arguments passed to BaseWhisperSTTService.
43
+ """
40
44
  super().__init__(
41
45
  model=model,
42
46
  api_key=api_key,
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """OpenAI text-to-speech service implementation.
8
+
9
+ This module provides integration with OpenAI's text-to-speech API for
10
+ generating high-quality synthetic speech from text input.
11
+ """
12
+
7
13
  from typing import AsyncGenerator, Dict, Literal, Optional
8
14
 
9
15
  from loguru import logger
@@ -43,16 +49,8 @@ class OpenAITTSService(TTSService):
43
49
  """OpenAI Text-to-Speech service that generates audio from text.
44
50
 
45
51
  This service uses the OpenAI TTS API to generate PCM-encoded audio at 24kHz.
46
-
47
- Args:
48
- api_key: OpenAI API key. Defaults to None.
49
- voice: Voice ID to use. Defaults to "alloy".
50
- model: TTS model to use. Defaults to "gpt-4o-mini-tts".
51
- sample_rate: Output audio sample rate in Hz. Defaults to None.
52
- **kwargs: Additional keyword arguments passed to TTSService.
53
-
54
- The service returns PCM-encoded audio at the specified sample rate.
55
-
52
+ Supports multiple voice models and configurable parameters for high-quality
53
+ speech synthesis with streaming audio output.
56
54
  """
57
55
 
58
56
  OPENAI_SAMPLE_RATE = 24000 # OpenAI TTS always outputs at 24kHz
@@ -68,6 +66,17 @@ class OpenAITTSService(TTSService):
68
66
  instructions: Optional[str] = None,
69
67
  **kwargs,
70
68
  ):
69
+ """Initialize OpenAI TTS service.
70
+
71
+ Args:
72
+ api_key: OpenAI API key for authentication. If None, uses environment variable.
73
+ base_url: Custom base URL for OpenAI API. If None, uses default.
74
+ voice: Voice ID to use for synthesis. Defaults to "alloy".
75
+ model: TTS model to use. Defaults to "gpt-4o-mini-tts".
76
+ sample_rate: Output audio sample rate in Hz. If None, uses OpenAI's default 24kHz.
77
+ instructions: Optional instructions to guide voice synthesis behavior.
78
+ **kwargs: Additional keyword arguments passed to TTSService.
79
+ """
71
80
  if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
72
81
  logger.warning(
73
82
  f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
@@ -81,13 +90,28 @@ class OpenAITTSService(TTSService):
81
90
  self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)
82
91
 
83
92
  def can_generate_metrics(self) -> bool:
93
+ """Check if this service can generate processing metrics.
94
+
95
+ Returns:
96
+ True, as OpenAI TTS service supports metrics generation.
97
+ """
84
98
  return True
85
99
 
86
100
  async def set_model(self, model: str):
101
+ """Set the TTS model to use.
102
+
103
+ Args:
104
+ model: The model name to use for text-to-speech synthesis.
105
+ """
87
106
  logger.info(f"Switching TTS model to: [{model}]")
88
107
  self.set_model_name(model)
89
108
 
90
109
  async def start(self, frame: StartFrame):
110
+ """Start the OpenAI TTS service.
111
+
112
+ Args:
113
+ frame: The start frame containing initialization parameters.
114
+ """
91
115
  await super().start(frame)
92
116
  if self.sample_rate != self.OPENAI_SAMPLE_RATE:
93
117
  logger.warning(
@@ -97,6 +121,14 @@ class OpenAITTSService(TTSService):
97
121
 
98
122
  @traced_tts
99
123
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
124
+ """Generate speech from text using OpenAI's TTS API.
125
+
126
+ Args:
127
+ text: The text to synthesize into speech.
128
+
129
+ Yields:
130
+ Frame: Audio frames containing the synthesized speech data.
131
+ """
100
132
  logger.debug(f"{self}: Generating TTS [{text}]")
101
133
  try:
102
134
  await self.start_ttfb_metrics()
@@ -11,7 +11,7 @@ from loguru import logger
11
11
  from .openai import OpenAIRealtimeBetaLLMService
12
12
 
13
13
  try:
14
- import websockets
14
+ from websockets.asyncio.client import connect as websocket_connect
15
15
  except ModuleNotFoundError as e:
16
16
  logger.error(f"Exception: {e}")
17
17
  logger.error(
@@ -26,12 +26,6 @@ class AzureRealtimeBetaLLMService(OpenAIRealtimeBetaLLMService):
26
26
  Extends the OpenAI Realtime service to work with Azure OpenAI endpoints,
27
27
  using Azure's authentication headers and endpoint format. Provides the same
28
28
  real-time audio and text communication capabilities as the base OpenAI service.
29
-
30
- Args:
31
- api_key: The API key for the Azure OpenAI service.
32
- base_url: The full Azure WebSocket endpoint URL including api-version and deployment.
33
- Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=my-realtime-deployment"
34
- **kwargs: Additional arguments passed to parent OpenAIRealtimeBetaLLMService.
35
29
  """
36
30
 
37
31
  def __init__(
@@ -41,6 +35,14 @@ class AzureRealtimeBetaLLMService(OpenAIRealtimeBetaLLMService):
41
35
  base_url: str,
42
36
  **kwargs,
43
37
  ):
38
+ """Initialize Azure Realtime Beta LLM service.
39
+
40
+ Args:
41
+ api_key: The API key for the Azure OpenAI service.
42
+ base_url: The full Azure WebSocket endpoint URL including api-version and deployment.
43
+ Example: "wss://my-project.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=my-realtime-deployment"
44
+ **kwargs: Additional arguments passed to parent OpenAIRealtimeBetaLLMService.
45
+ """
44
46
  super().__init__(base_url=base_url, api_key=api_key, **kwargs)
45
47
  self.api_key = api_key
46
48
  self.base_url = base_url
@@ -53,9 +55,9 @@ class AzureRealtimeBetaLLMService(OpenAIRealtimeBetaLLMService):
53
55
  return
54
56
 
55
57
  logger.info(f"Connecting to {self.base_url}, api key: {self.api_key}")
56
- self._websocket = await websockets.connect(
58
+ self._websocket = await websocket_connect(
57
59
  uri=self.base_url,
58
- extra_headers={
60
+ additional_headers={
59
61
  "api-key": self.api_key,
60
62
  },
61
63
  )
@@ -37,14 +37,16 @@ class OpenAIRealtimeLLMContext(OpenAILLMContext):
37
37
  Extends the standard OpenAI LLM context to support real-time session properties,
38
38
  instruction management, and conversion between standard message formats and
39
39
  realtime conversation items.
40
-
41
- Args:
42
- messages: Initial conversation messages. Defaults to None.
43
- tools: Available function tools. Defaults to None.
44
- **kwargs: Additional arguments passed to parent OpenAILLMContext.
45
40
  """
46
41
 
47
42
  def __init__(self, messages=None, tools=None, **kwargs):
43
+ """Initialize the OpenAIRealtimeLLMContext.
44
+
45
+ Args:
46
+ messages: Initial conversation messages. Defaults to None.
47
+ tools: Available function tools. Defaults to None.
48
+ **kwargs: Additional arguments passed to parent OpenAILLMContext.
49
+ """
48
50
  super().__init__(messages=messages, tools=tools, **kwargs)
49
51
  self.__setup_local()
50
52
 
@@ -18,13 +18,7 @@ from pydantic import BaseModel, ConfigDict, Field
18
18
 
19
19
 
20
20
  class InputAudioTranscription(BaseModel):
21
- """Configuration for audio transcription settings.
22
-
23
- Parameters:
24
- model: Transcription model to use (e.g., "gpt-4o-transcribe", "whisper-1").
25
- language: Optional language code for transcription.
26
- prompt: Optional transcription hint text.
27
- """
21
+ """Configuration for audio transcription settings."""
28
22
 
29
23
  model: str = "gpt-4o-transcribe"
30
24
  language: Optional[str]
@@ -36,6 +30,13 @@ class InputAudioTranscription(BaseModel):
36
30
  language: Optional[str] = None,
37
31
  prompt: Optional[str] = None,
38
32
  ):
33
+ """Initialize InputAudioTranscription.
34
+
35
+ Args:
36
+ model: Transcription model to use (e.g., "gpt-4o-transcribe", "whisper-1").
37
+ language: Optional language code for transcription.
38
+ prompt: Optional transcription hint text.
39
+ """
39
40
  super().__init__(model=model, language=language, prompt=prompt)
40
41
 
41
42
 
@@ -881,6 +882,8 @@ class TokenDetails(BaseModel):
881
882
  audio_tokens: Optional[int] = 0
882
883
 
883
884
  class Config:
885
+ """Pydantic configuration for TokenDetails."""
886
+
884
887
  extra = "allow"
885
888
 
886
889
 
@@ -53,7 +53,6 @@ from pipecat.processors.frame_processor import FrameDirection
53
53
  from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
54
54
  from pipecat.services.openai.llm import OpenAIContextAggregatorPair
55
55
  from pipecat.transcriptions.language import Language
56
- from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
57
56
  from pipecat.utils.time import time_now_iso8601
58
57
  from pipecat.utils.tracing.service_decorators import traced_openai_realtime, traced_stt
59
58
 
@@ -66,7 +65,7 @@ from .context import (
66
65
  from .frames import RealtimeFunctionCallResultFrame, RealtimeMessagesUpdateFrame
67
66
 
68
67
  try:
69
- import websockets
68
+ from websockets.asyncio.client import connect as websocket_connect
70
69
  except ModuleNotFoundError as e:
71
70
  logger.error(f"Exception: {e}")
72
71
  logger.error("In order to use OpenAI, you need to `pip install pipecat-ai[openai]`.")
@@ -96,17 +95,6 @@ class OpenAIRealtimeBetaLLMService(LLMService):
96
95
  Implements the OpenAI Realtime API Beta with WebSocket communication for low-latency
97
96
  bidirectional audio and text interactions. Supports function calling, conversation
98
97
  management, and real-time transcription.
99
-
100
- Args:
101
- api_key: OpenAI API key for authentication.
102
- model: OpenAI model name. Defaults to "gpt-4o-realtime-preview-2025-06-03".
103
- base_url: WebSocket base URL for the realtime API.
104
- Defaults to "wss://api.openai.com/v1/realtime".
105
- session_properties: Configuration properties for the realtime session.
106
- If None, uses default SessionProperties.
107
- start_audio_paused: Whether to start with audio input paused. Defaults to False.
108
- send_transcription_frames: Whether to emit transcription frames. Defaults to True.
109
- **kwargs: Additional arguments passed to parent LLMService.
110
98
  """
111
99
 
112
100
  # Overriding the default adapter to use the OpenAIRealtimeLLMAdapter one.
@@ -123,6 +111,19 @@ class OpenAIRealtimeBetaLLMService(LLMService):
123
111
  send_transcription_frames: bool = True,
124
112
  **kwargs,
125
113
  ):
114
+ """Initialize the OpenAI Realtime Beta LLM service.
115
+
116
+ Args:
117
+ api_key: OpenAI API key for authentication.
118
+ model: OpenAI model name. Defaults to "gpt-4o-realtime-preview-2025-06-03".
119
+ base_url: WebSocket base URL for the realtime API.
120
+ Defaults to "wss://api.openai.com/v1/realtime".
121
+ session_properties: Configuration properties for the realtime session.
122
+ If None, uses default SessionProperties.
123
+ start_audio_paused: Whether to start with audio input paused. Defaults to False.
124
+ send_transcription_frames: Whether to emit transcription frames. Defaults to True.
125
+ **kwargs: Additional arguments passed to parent LLMService.
126
+ """
126
127
  full_url = f"{base_url}?model={model}"
127
128
  super().__init__(base_url=full_url, **kwargs)
128
129
 
@@ -169,6 +170,15 @@ class OpenAIRealtimeBetaLLMService(LLMService):
169
170
  """
170
171
  self._audio_input_paused = paused
171
172
 
173
+ def _is_modality_enabled(self, modality: str) -> bool:
174
+ """Check if a specific modality is enabled, "text" or "audio"."""
175
+ modalities = self._session_properties.modalities or ["audio", "text"]
176
+ return modality in modalities
177
+
178
+ def _get_enabled_modalities(self) -> list[str]:
179
+ """Get the list of enabled modalities."""
180
+ return self._session_properties.modalities or ["audio", "text"]
181
+
172
182
  async def retrieve_conversation_item(self, item_id: str):
173
183
  """Retrieve a conversation item by ID from the server.
174
184
 
@@ -241,7 +251,9 @@ class OpenAIRealtimeBetaLLMService(LLMService):
241
251
  await self.stop_all_metrics()
242
252
  if self._current_assistant_response:
243
253
  await self.push_frame(LLMFullResponseEndFrame())
244
- await self.push_frame(TTSStoppedFrame())
254
+ # Only push TTSStoppedFrame if audio modality is enabled
255
+ if self._is_modality_enabled("audio"):
256
+ await self.push_frame(TTSStoppedFrame())
245
257
 
246
258
  async def _handle_user_started_speaking(self, frame):
247
259
  pass
@@ -385,9 +397,9 @@ class OpenAIRealtimeBetaLLMService(LLMService):
385
397
  # Here we assume that if we have a websocket, we are connected. We
386
398
  # handle disconnections in the send/recv code paths.
387
399
  return
388
- self._websocket = await websockets.connect(
400
+ self._websocket = await websocket_connect(
389
401
  uri=self.base_url,
390
- extra_headers={
402
+ additional_headers={
391
403
  "Authorization": f"Bearer {self.api_key}",
392
404
  "OpenAI-Beta": "realtime=v1",
393
405
  },
@@ -443,7 +455,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
443
455
  #
444
456
 
445
457
  async def _receive_task_handler(self):
446
- async for message in WatchdogAsyncIterator(self._websocket, manager=self.task_manager):
458
+ async for message in self._websocket:
447
459
  evt = events.parse_server_event(message)
448
460
  if evt.type == "session.created":
449
461
  await self._handle_evt_session_created(evt)
@@ -467,6 +479,8 @@ class OpenAIRealtimeBetaLLMService(LLMService):
467
479
  await self._handle_evt_speech_started(evt)
468
480
  elif evt.type == "input_audio_buffer.speech_stopped":
469
481
  await self._handle_evt_speech_stopped(evt)
482
+ elif evt.type == "response.text.delta":
483
+ await self._handle_evt_text_delta(evt)
470
484
  elif evt.type == "response.audio_transcript.delta":
471
485
  await self._handle_evt_audio_transcript_delta(evt)
472
486
  elif evt.type == "error":
@@ -615,6 +629,10 @@ class OpenAIRealtimeBetaLLMService(LLMService):
615
629
  # Response message without preceding user message. Add it to the context.
616
630
  await self._handle_assistant_output(evt.response.output)
617
631
 
632
+ async def _handle_evt_text_delta(self, evt):
633
+ if evt.delta:
634
+ await self.push_frame(LLMTextFrame(evt.delta))
635
+
618
636
  async def _handle_evt_audio_transcript_delta(self, evt):
619
637
  if evt.delta:
620
638
  await self.push_frame(LLMTextFrame(evt.delta))
@@ -637,6 +655,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
637
655
  """Maybe handle an error event related to retrieving a conversation item.
638
656
 
639
657
  If the given error event is an error retrieving a conversation item:
658
+
640
659
  - set an exception on the future that retrieve_conversation_item() is waiting on
641
660
  - return true
642
661
  Otherwise:
@@ -720,7 +739,7 @@ class OpenAIRealtimeBetaLLMService(LLMService):
720
739
  await self.start_ttfb_metrics()
721
740
  await self.send_client_event(
722
741
  events.ResponseCreateEvent(
723
- response=events.ResponseProperties(modalities=["audio", "text"])
742
+ response=events.ResponseProperties(modalities=self._get_enabled_modalities())
724
743
  )
725
744
  )
726
745