dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic; see the registry's advisory page for more details.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
@@ -26,6 +26,7 @@ from pydantic import BaseModel, Field
26
26
 
27
27
  from pipecat.adapters.services.anthropic_adapter import AnthropicLLMAdapter
28
28
  from pipecat.frames.frames import (
29
+ ErrorFrame,
29
30
  Frame,
30
31
  FunctionCallCancelFrame,
31
32
  FunctionCallInProgressFrame,
@@ -52,11 +53,10 @@ from pipecat.processors.aggregators.openai_llm_context import (
52
53
  )
53
54
  from pipecat.processors.frame_processor import FrameDirection
54
55
  from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
55
- from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
56
56
  from pipecat.utils.tracing.service_decorators import traced_llm
57
57
 
58
58
  try:
59
- from anthropic import NOT_GIVEN, AsyncAnthropic, NotGiven
59
+ from anthropic import NOT_GIVEN, APITimeoutError, AsyncAnthropic, NotGiven
60
60
  except ModuleNotFoundError as e:
61
61
  logger.error(f"Exception: {e}")
62
62
  logger.error("In order to use Anthropic, you need to `pip install pipecat-ai[anthropic]`.")
@@ -101,13 +101,6 @@ class AnthropicLLMService(LLMService):
101
101
  Provides inference capabilities with Claude models including support for
102
102
  function calling, vision processing, streaming responses, and prompt caching.
103
103
  Can use custom clients like AsyncAnthropicBedrock and AsyncAnthropicVertex.
104
-
105
- Args:
106
- api_key: Anthropic API key for authentication.
107
- model: Model name to use. Defaults to "claude-sonnet-4-20250514".
108
- params: Optional model parameters for inference.
109
- client: Optional custom Anthropic client instance.
110
- **kwargs: Additional arguments passed to parent LLMService.
111
104
  """
112
105
 
113
106
  # Overriding the default adapter to use the Anthropic one.
@@ -139,14 +132,29 @@ class AnthropicLLMService(LLMService):
139
132
  model: str = "claude-sonnet-4-20250514",
140
133
  params: Optional[InputParams] = None,
141
134
  client=None,
135
+ retry_timeout_secs: Optional[float] = 5.0,
136
+ retry_on_timeout: Optional[bool] = False,
142
137
  **kwargs,
143
138
  ):
139
+ """Initialize the Anthropic LLM service.
140
+
141
+ Args:
142
+ api_key: Anthropic API key for authentication.
143
+ model: Model name to use. Defaults to "claude-sonnet-4-20250514".
144
+ params: Optional model parameters for inference.
145
+ client: Optional custom Anthropic client instance.
146
+ retry_timeout_secs: Request timeout in seconds for retry logic.
147
+ retry_on_timeout: Whether to retry the request once if it times out.
148
+ **kwargs: Additional arguments passed to parent LLMService.
149
+ """
144
150
  super().__init__(**kwargs)
145
151
  params = params or AnthropicLLMService.InputParams()
146
152
  self._client = client or AsyncAnthropic(
147
153
  api_key=api_key
148
154
  ) # if the client is provided, use it and remove it, otherwise create a new one
149
155
  self.set_model_name(model)
156
+ self._retry_timeout_secs = retry_timeout_secs
157
+ self._retry_on_timeout = retry_on_timeout
150
158
  self._settings = {
151
159
  "max_tokens": params.max_tokens,
152
160
  "enable_prompt_caching_beta": params.enable_prompt_caching_beta or False,
@@ -164,6 +172,31 @@ class AnthropicLLMService(LLMService):
164
172
  """
165
173
  return True
166
174
 
175
+ async def _create_message_stream(self, api_call, params):
176
+ """Create message stream with optional timeout and retry.
177
+
178
+ Args:
179
+ api_call: The Anthropic API method to call.
180
+ params: Parameters for the API call.
181
+
182
+ Returns:
183
+ Async stream of message events.
184
+ """
185
+ if self._retry_on_timeout:
186
+ try:
187
+ response = await asyncio.wait_for(
188
+ api_call(**params), timeout=self._retry_timeout_secs
189
+ )
190
+ return response
191
+ except (APITimeoutError, asyncio.TimeoutError):
192
+ # Retry, this time without a timeout so we get a response
193
+ logger.debug(f"{self}: Retrying message creation due to timeout")
194
+ response = await api_call(**params)
195
+ return response
196
+ else:
197
+ response = await api_call(**params)
198
+ return response
199
+
167
200
  @property
168
201
  def enable_prompt_caching_beta(self) -> bool:
169
202
  """Check if prompt caching beta feature is enabled.
@@ -247,7 +280,7 @@ class AnthropicLLMService(LLMService):
247
280
 
248
281
  params.update(self._settings["extra"])
249
282
 
250
- response = await api_call(**params)
283
+ response = await self._create_message_stream(api_call, params)
251
284
 
252
285
  await self.stop_ttfb_metrics()
253
286
 
@@ -256,7 +289,7 @@ class AnthropicLLMService(LLMService):
256
289
  json_accumulator = ""
257
290
 
258
291
  function_calls = []
259
- async for event in WatchdogAsyncIterator(response, manager=self.task_manager):
292
+ async for event in response:
260
293
  # Aggregate streaming content, create frames, trigger events
261
294
 
262
295
  if event.type == "content_block_delta":
@@ -344,6 +377,7 @@ class AnthropicLLMService(LLMService):
344
377
  await self._call_event_handler("on_completion_timeout")
345
378
  except Exception as e:
346
379
  self.logger.exception(f"{self} exception: {e}")
380
+ await self.push_error(ErrorFrame(f"{e}"))
347
381
  finally:
348
382
  await self.stop_processing_metrics()
349
383
  await self.push_frame(LLMFullResponseEndFrame())
@@ -425,12 +459,6 @@ class AnthropicLLMContext(OpenAILLMContext):
425
459
  Extends OpenAILLMContext to handle Anthropic-specific features like
426
460
  system messages, prompt caching, and message format conversions.
427
461
  Manages conversation state and message history formatting.
428
-
429
- Args:
430
- messages: Initial list of conversation messages.
431
- tools: Available function calling tools.
432
- tool_choice: Tool selection preference.
433
- system: System message content.
434
462
  """
435
463
 
436
464
  def __init__(
@@ -441,15 +469,25 @@ class AnthropicLLMContext(OpenAILLMContext):
441
469
  *,
442
470
  system: Union[str, NotGiven] = NOT_GIVEN,
443
471
  ):
472
+ """Initialize the Anthropic LLM context.
473
+
474
+ Args:
475
+ messages: Initial list of conversation messages.
476
+ tools: Available function calling tools.
477
+ tool_choice: Tool selection preference.
478
+ system: System message content.
479
+ """
444
480
  super().__init__(messages=messages, tools=tools, tool_choice=tool_choice)
481
+ self.__setup_local()
482
+ self.system = system
445
483
 
484
+ def __setup_local(self):
446
485
  # For beta prompt caching. This is a counter that tracks the number of turns
447
486
  # we've seen above the cache threshold. We reset this when we reset the
448
487
  # messages list. We only care about this number being 0, 1, or 2. But
449
488
  # it's easiest just to treat it as a counter.
450
489
  self.turns_above_cache_threshold = 0
451
-
452
- self.system = system
490
+ return
453
491
 
454
492
  @staticmethod
455
493
  def upgrade_to_anthropic(obj: OpenAILLMContext) -> "AnthropicLLMContext":
@@ -466,6 +504,7 @@ class AnthropicLLMContext(OpenAILLMContext):
466
504
  logger.debug(f"Upgrading to Anthropic: {obj}")
467
505
  if isinstance(obj, OpenAILLMContext) and not isinstance(obj, AnthropicLLMContext):
468
506
  obj.__class__ = AnthropicLLMContext
507
+ obj.__setup_local()
469
508
  obj._restructure_from_openai_messages()
470
509
  return obj
471
510
 
@@ -534,20 +573,37 @@ class AnthropicLLMContext(OpenAILLMContext):
534
573
  Handles text content and function calls for both user and assistant messages.
535
574
 
536
575
  Args:
537
- obj: Message in Anthropic format:
538
- {
539
- "role": "user/assistant",
540
- "content": str | [{"type": "text/tool_use/tool_result", ...}]
541
- }
576
+ obj: Message in Anthropic format.
542
577
 
543
578
  Returns:
544
- List of messages in standard format:
545
- [
579
+ List of messages in standard format.
580
+
581
+ Examples:
582
+ Input Anthropic format::
583
+
546
584
  {
547
- "role": "user/assistant/tool",
548
- "content": [{"type": "text", "text": str}]
585
+ "role": "assistant",
586
+ "content": [
587
+ {"type": "text", "text": "Hello"},
588
+ {"type": "tool_use", "id": "123", "name": "search", "input": {"q": "test"}}
589
+ ]
549
590
  }
550
- ]
591
+
592
+ Output standard format::
593
+
594
+ [
595
+ {"role": "assistant", "content": [{"type": "text", "text": "Hello"}]},
596
+ {
597
+ "role": "assistant",
598
+ "tool_calls": [
599
+ {
600
+ "type": "function",
601
+ "id": "123",
602
+ "function": {"name": "search", "arguments": '{"q": "test"}'}
603
+ }
604
+ ]
605
+ }
606
+ ]
551
607
  """
552
608
  # todo: image format (?)
553
609
  # tool_use
@@ -609,23 +665,37 @@ class AnthropicLLMContext(OpenAILLMContext):
609
665
  Empty text content is converted to "(empty)".
610
666
 
611
667
  Args:
612
- message: Message in standard format:
668
+ message: Message in standard format.
669
+
670
+ Returns:
671
+ Message in Anthropic format.
672
+
673
+ Examples:
674
+ Input standard format::
675
+
613
676
  {
614
- "role": "user/assistant/tool",
615
- "content": str | [{"type": "text", ...}],
616
- "tool_calls": [{"id": str, "function": {"name": str, "arguments": str}}]
677
+ "role": "assistant",
678
+ "tool_calls": [
679
+ {
680
+ "id": "123",
681
+ "function": {"name": "search", "arguments": '{"q": "test"}'}
682
+ }
683
+ ]
617
684
  }
618
685
 
619
- Returns:
620
- Message in Anthropic format:
621
- {
622
- "role": "user/assistant",
623
- "content": str | [
624
- {"type": "text", "text": str} |
625
- {"type": "tool_use", "id": str, "name": str, "input": dict} |
626
- {"type": "tool_result", "tool_use_id": str, "content": str}
627
- ]
628
- }
686
+ Output Anthropic format::
687
+
688
+ {
689
+ "role": "assistant",
690
+ "content": [
691
+ {
692
+ "type": "tool_use",
693
+ "id": "123",
694
+ "name": "search",
695
+ "input": {"q": "test"}
696
+ }
697
+ ]
698
+ }
629
699
  """
630
700
  # todo: image messages (?)
631
701
  if message["role"] == "tool":
@@ -1,10 +1,30 @@
1
+ #
2
+ # Copyright (c) 2024–2025, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ """AssemblyAI WebSocket API message models and connection parameters.
8
+
9
+ This module defines Pydantic models for handling AssemblyAI's real-time
10
+ transcription WebSocket messages and connection configuration.
11
+ """
12
+
1
13
  from typing import List, Literal, Optional
2
14
 
3
15
  from pydantic import BaseModel, Field
4
16
 
5
17
 
6
18
  class Word(BaseModel):
7
- """Represents a single word in a transcription with timing and confidence."""
19
+ """Represents a single word in a transcription with timing and confidence.
20
+
21
+ Parameters:
22
+ start: Start time of the word in milliseconds.
23
+ end: End time of the word in milliseconds.
24
+ text: The transcribed word text.
25
+ confidence: Confidence score for the word (0.0 to 1.0).
26
+ word_is_final: Whether this word is finalized and won't change.
27
+ """
8
28
 
9
29
  start: int
10
30
  end: int
@@ -14,13 +34,23 @@ class Word(BaseModel):
14
34
 
15
35
 
16
36
  class BaseMessage(BaseModel):
17
- """Base class for all AssemblyAI WebSocket messages."""
37
+ """Base class for all AssemblyAI WebSocket messages.
38
+
39
+ Parameters:
40
+ type: The message type identifier.
41
+ """
18
42
 
19
43
  type: str
20
44
 
21
45
 
22
46
  class BeginMessage(BaseMessage):
23
- """Message sent when a new session begins."""
47
+ """Message sent when a new session begins.
48
+
49
+ Parameters:
50
+ type: Always "Begin" for this message type.
51
+ id: Unique session identifier.
52
+ expires_at: Unix timestamp when the session expires.
53
+ """
24
54
 
25
55
  type: Literal["Begin"] = "Begin"
26
56
  id: str
@@ -28,7 +58,17 @@ class BeginMessage(BaseMessage):
28
58
 
29
59
 
30
60
  class TurnMessage(BaseMessage):
31
- """Message containing transcription data for a turn of speech."""
61
+ """Message containing transcription data for a turn of speech.
62
+
63
+ Parameters:
64
+ type: Always "Turn" for this message type.
65
+ turn_order: Sequential number of this turn in the session.
66
+ turn_is_formatted: Whether the transcript has been formatted.
67
+ end_of_turn: Whether this marks the end of a speaking turn.
68
+ transcript: The transcribed text for this turn.
69
+ end_of_turn_confidence: Confidence score for end-of-turn detection.
70
+ words: List of individual words with timing and confidence data.
71
+ """
32
72
 
33
73
  type: Literal["Turn"] = "Turn"
34
74
  turn_order: int
@@ -40,7 +80,13 @@ class TurnMessage(BaseMessage):
40
80
 
41
81
 
42
82
  class TerminationMessage(BaseMessage):
43
- """Message sent when the session is terminated."""
83
+ """Message sent when the session is terminated.
84
+
85
+ Parameters:
86
+ type: Always "Termination" for this message type.
87
+ audio_duration_seconds: Total duration of audio processed.
88
+ session_duration_seconds: Total duration of the session.
89
+ """
44
90
 
45
91
  type: Literal["Termination"] = "Termination"
46
92
  audio_duration_seconds: float
@@ -52,6 +98,18 @@ AnyMessage = BeginMessage | TurnMessage | TerminationMessage
52
98
 
53
99
 
54
100
  class AssemblyAIConnectionParams(BaseModel):
101
+ """Configuration parameters for AssemblyAI WebSocket connection.
102
+
103
+ Parameters:
104
+ sample_rate: Audio sample rate in Hz. Defaults to 16000.
105
+ encoding: Audio encoding format. Defaults to "pcm_s16le".
106
+ formatted_finals: Whether to enable transcript formatting. Defaults to True.
107
+ word_finalization_max_wait_time: Maximum time to wait for word finalization in milliseconds.
108
+ end_of_turn_confidence_threshold: Confidence threshold for end-of-turn detection.
109
+ min_end_of_turn_silence_when_confident: Minimum silence duration when confident about end-of-turn.
110
+ max_turn_silence: Maximum silence duration before forcing end-of-turn.
111
+ """
112
+
55
113
  sample_rate: int = 16000
56
114
  encoding: Literal["pcm_s16le", "pcm_mulaw"] = "pcm_s16le"
57
115
  formatted_finals: bool = True
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """AssemblyAI speech-to-text service implementation.
8
+
9
+ This module provides integration with AssemblyAI's real-time speech-to-text
10
+ WebSocket API for streaming audio transcription.
11
+ """
12
+
7
13
  import asyncio
8
14
  import json
9
15
  from typing import Any, AsyncGenerator, Dict
@@ -38,6 +44,7 @@ from .models import (
38
44
 
39
45
  try:
40
46
  import websockets
47
+ from websockets.asyncio.client import connect as websocket_connect
41
48
  except ModuleNotFoundError as e:
42
49
  logger.error(f"Exception: {e}")
43
50
  logger.error('In order to use AssemblyAI, you need to `pip install "pipecat-ai[assemblyai]"`.')
@@ -45,6 +52,13 @@ except ModuleNotFoundError as e:
45
52
 
46
53
 
47
54
  class AssemblyAISTTService(STTService):
55
+ """AssemblyAI real-time speech-to-text service.
56
+
57
+ Provides real-time speech transcription using AssemblyAI's WebSocket API.
58
+ Supports both interim and final transcriptions with configurable parameters
59
+ for audio processing and connection management.
60
+ """
61
+
48
62
  def __init__(
49
63
  self,
50
64
  *,
@@ -55,6 +69,16 @@ class AssemblyAISTTService(STTService):
55
69
  vad_force_turn_endpoint: bool = True,
56
70
  **kwargs,
57
71
  ):
72
+ """Initialize the AssemblyAI STT service.
73
+
74
+ Args:
75
+ api_key: AssemblyAI API key for authentication.
76
+ language: Language code for transcription. Defaults to English (Language.EN).
77
+ api_endpoint_base_url: WebSocket endpoint URL. Defaults to AssemblyAI's streaming endpoint.
78
+ connection_params: Connection configuration parameters. Defaults to AssemblyAIConnectionParams().
79
+ vad_force_turn_endpoint: Whether to force turn endpoint on VAD stop. Defaults to True.
80
+ **kwargs: Additional arguments passed to parent STTService class.
81
+ """
58
82
  self._api_key = api_key
59
83
  self._language = language
60
84
  self._api_endpoint_base_url = api_endpoint_base_url
@@ -75,22 +99,50 @@ class AssemblyAISTTService(STTService):
75
99
  self._chunk_size_bytes = 0
76
100
 
77
101
  def can_generate_metrics(self) -> bool:
102
+ """Check if the service can generate metrics.
103
+
104
+ Returns:
105
+ True if metrics generation is supported.
106
+ """
78
107
  return True
79
108
 
80
109
  async def start(self, frame: StartFrame):
110
+ """Start the speech-to-text service.
111
+
112
+ Args:
113
+ frame: Start frame to begin processing.
114
+ """
81
115
  await super().start(frame)
82
116
  self._chunk_size_bytes = int(self._chunk_size_ms * self._sample_rate * 2 / 1000)
83
117
  await self._connect()
84
118
 
85
119
  async def stop(self, frame: EndFrame):
120
+ """Stop the speech-to-text service.
121
+
122
+ Args:
123
+ frame: End frame to stop processing.
124
+ """
86
125
  await super().stop(frame)
87
126
  await self._disconnect()
88
127
 
89
128
  async def cancel(self, frame: CancelFrame):
129
+ """Cancel the speech-to-text service.
130
+
131
+ Args:
132
+ frame: Cancel frame to abort processing.
133
+ """
90
134
  await super().cancel(frame)
91
135
  await self._disconnect()
92
136
 
93
137
  async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
138
+ """Process audio data for speech-to-text conversion.
139
+
140
+ Args:
141
+ audio: Raw audio bytes to process.
142
+
143
+ Yields:
144
+ None (processing handled via WebSocket messages).
145
+ """
94
146
  self._audio_buffer.extend(audio)
95
147
 
96
148
  while len(self._audio_buffer) >= self._chunk_size_bytes:
@@ -101,6 +153,12 @@ class AssemblyAISTTService(STTService):
101
153
  yield None
102
154
 
103
155
  async def process_frame(self, frame: Frame, direction: FrameDirection):
156
+ """Process frames for VAD and metrics handling.
157
+
158
+ Args:
159
+ frame: Frame to process.
160
+ direction: Direction of frame processing.
161
+ """
104
162
  await super().process_frame(frame, direction)
105
163
  if isinstance(frame, UserStartedSpeakingFrame):
106
164
  await self.start_ttfb_metrics()
@@ -133,9 +191,9 @@ class AssemblyAISTTService(STTService):
133
191
  "Authorization": self._api_key,
134
192
  "User-Agent": f"AssemblyAI/1.0 (integration=Pipecat/{pipecat_version})",
135
193
  }
136
- self._websocket = await websockets.connect(
194
+ self._websocket = await websocket_connect(
137
195
  ws_url,
138
- extra_headers=headers,
196
+ additional_headers=headers,
139
197
  )
140
198
  self._connected = True
141
199
  self._receive_task = self.create_task(self._receive_task_handler())
@@ -161,10 +219,7 @@ class AssemblyAISTTService(STTService):
161
219
  await self._websocket.send(json.dumps({"type": "Terminate"}))
162
220
 
163
221
  try:
164
- await asyncio.wait_for(
165
- self._termination_event.wait(),
166
- timeout=5.0,
167
- )
222
+ await asyncio.wait_for(self._termination_event.wait(), timeout=5.0)
168
223
  except asyncio.TimeoutError:
169
224
  logger.warning("Timed out waiting for termination message from server")
170
225
 
@@ -189,11 +244,9 @@ class AssemblyAISTTService(STTService):
189
244
  try:
190
245
  while self._connected:
191
246
  try:
192
- message = await asyncio.wait_for(self._websocket.recv(), timeout=1.0)
247
+ message = await self._websocket.recv()
193
248
  data = json.loads(message)
194
249
  await self._handle_message(data)
195
- except asyncio.TimeoutError:
196
- self.reset_watchdog()
197
250
  except websockets.exceptions.ConnectionClosedOK:
198
251
  break
199
252
  except Exception as e:
@@ -254,7 +307,7 @@ class AssemblyAISTTService(STTService):
254
307
  await self.push_frame(
255
308
  TranscriptionFrame(
256
309
  message.transcript,
257
- "", # participant
310
+ self._user_id,
258
311
  time_now_iso8601(),
259
312
  self._language,
260
313
  message,
@@ -266,7 +319,7 @@ class AssemblyAISTTService(STTService):
266
319
  await self.push_frame(
267
320
  InterimTranscriptionFrame(
268
321
  message.transcript,
269
- "", # participant
322
+ self._user_id,
270
323
  time_now_iso8601(),
271
324
  self._language,
272
325
  message,
File without changes