dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/aws/utils.py
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""AWS Transcribe utility functions and classes for WebSocket streaming.
+
+This module provides utilities for creating presigned URLs, building event messages,
+and handling AWS event stream protocol for real-time transcription services.
+"""
+
 import binascii
 import datetime
 import hashlib
@@ -29,7 +35,29 @@ def get_presigned_url(
     show_speaker_label: bool = False,
     enable_channel_identification: bool = False,
 ) -> str:
-    """Create a presigned URL for AWS Transcribe streaming."""
+    """Create a presigned URL for AWS Transcribe streaming.
+
+    Args:
+        region: AWS region for the service.
+        credentials: Dictionary containing AWS credentials. Must include
+            'access_key' and 'secret_key', with optional 'session_token'.
+        language_code: Language code for transcription (e.g., "en-US").
+        media_encoding: Audio encoding format. Defaults to "pcm".
+        sample_rate: Audio sample rate in Hz. Defaults to 16000.
+        number_of_channels: Number of audio channels. Defaults to 1.
+        enable_partial_results_stabilization: Whether to enable partial result stabilization.
+        partial_results_stability: Stability level for partial results.
+        vocabulary_name: Custom vocabulary name to use.
+        vocabulary_filter_name: Vocabulary filter name to apply.
+        show_speaker_label: Whether to include speaker labels.
+        enable_channel_identification: Whether to enable channel identification.
+
+    Returns:
+        Presigned WebSocket URL for AWS Transcribe streaming.
+
+    Raises:
+        ValueError: If required AWS credentials are missing.
+    """
     access_key = credentials.get("access_key")
     secret_key = credentials.get("secret_key")
     session_token = credentials.get("session_token")
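Taken together, the documented parameters make the call contract clear; a minimal usage sketch follows (hedged: the module path is inferred from pipecat/services/aws/utils.py in the file list above, and all credential values are placeholders):

from pipecat.services.aws.utils import get_presigned_url

url = get_presigned_url(
    region="us-east-1",
    credentials={
        "access_key": "AKIA...",  # required; a missing key raises ValueError
        "secret_key": "...",      # required
        "session_token": None,    # optional, for temporary credentials
    },
    language_code="en-US",
    sample_rate=16000,  # default per the docstring
)
# url is a presigned wss:// URL that a WebSocket client can open directly.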
@@ -58,9 +86,23 @@ def get_presigned_url(
 
 
 class AWSTranscribePresignedURL:
+    """Generator for AWS Transcribe presigned WebSocket URLs.
+
+    Handles AWS Signature Version 4 signing process to create authenticated
+    WebSocket URLs for streaming transcription requests.
+    """
+
     def __init__(
         self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1"
     ):
+        """Initialize the presigned URL generator.
+
+        Args:
+            access_key: AWS access key ID.
+            secret_key: AWS secret access key.
+            session_token: AWS session token for temporary credentials.
+            region: AWS region for the service. Defaults to "us-east-1".
+        """
         self.access_key = access_key
         self.secret_key = secret_key
         self.session_token = session_token
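The class form wraps the same Signature Version 4 flow for reuse across requests; a construction sketch using only the arguments shown above (values are placeholders):

signer = AWSTranscribePresignedURL(
    access_key="AKIA...",
    secret_key="...",
    session_token="",    # supply an STS token when using temporary credentials
    region="us-east-1",  # matches the documented default
)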
@@ -96,6 +138,23 @@ class AWSTranscribePresignedURL:
         enable_partial_results_stabilization: bool = False,
         partial_results_stability: str = "",
     ) -> str:
+        """Generate a presigned WebSocket URL for AWS Transcribe.
+
+        Args:
+            sample_rate: Audio sample rate in Hz.
+            language_code: Language code for transcription.
+            media_encoding: Audio encoding format.
+            vocabulary_name: Custom vocabulary name.
+            vocabulary_filter_name: Vocabulary filter name.
+            show_speaker_label: Whether to include speaker labels.
+            enable_channel_identification: Whether to enable channel identification.
+            number_of_channels: Number of audio channels.
+            enable_partial_results_stabilization: Whether to enable partial result stabilization.
+            partial_results_stability: Stability level for partial results.
+
+        Returns:
+            Presigned WebSocket URL with authentication parameters.
+        """
         self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443"
         self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443"
 
@@ -172,7 +231,15 @@ class AWSTranscribePresignedURL:
 
 
 def get_headers(header_name: str, header_value: str) -> bytearray:
-    """Build a header following AWS event stream format."""
+    """Build a header following AWS event stream format.
+
+    Args:
+        header_name: Name of the header.
+        header_value: Value of the header.
+
+    Returns:
+        Encoded header as a bytearray following AWS event stream protocol.
+    """
     name = header_name.encode("utf-8")
     name_byte_length = bytes([len(name)])
     value_type = bytes([7])  # 7 represents a string
@@ -190,9 +257,21 @@ def get_headers(header_name: str, header_value: str) -> bytearray:
 
 
 def build_event_message(payload: bytes) -> bytes:
-    """
-    Build an event message for AWS Transcribe streaming.
-    Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py
+    """Build an event message for AWS Transcribe streaming.
+
+    Creates a properly formatted AWS event stream message containing audio data
+    for real-time transcription. Follows the AWS event stream protocol with
+    prelude, headers, payload, and CRC checksums.
+
+    Args:
+        payload: Raw audio bytes to include in the event message.
+
+    Returns:
+        Complete event message as bytes, ready to send via WebSocket.
+
+    Note:
+        Implementation matches AWS sample:
+        https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py
     """
     # Build headers
     content_type_header = get_headers(":content-type", "application/octet-stream")
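For orientation, the AWS event stream framing that get_headers and build_event_message implement can be sketched independently. Only the :content-type header appears in the hunk above; the :event-type and :message-type headers come from the linked AWS sample and should be treated as assumptions here:

import binascii
import struct


def encode_header(name: str, value: str) -> bytes:
    # [name length: 1 byte][name][value type 7 = string][value length: 2 bytes][value]
    n = name.encode("utf-8")
    v = value.encode("utf-8")
    return bytes([len(n)]) + n + bytes([7]) + struct.pack(">H", len(v)) + v


def encode_event(payload: bytes) -> bytes:
    headers = (
        encode_header(":content-type", "application/octet-stream")
        + encode_header(":event-type", "AudioEvent")  # assumed, per AWS sample
        + encode_header(":message-type", "event")  # assumed, per AWS sample
    )
    # Prelude: total message length and headers length, both big-endian uint32.
    # Total = prelude (8) + prelude CRC (4) + headers + payload + message CRC (4).
    total_length = 16 + len(headers) + len(payload)
    prelude = struct.pack(">II", total_length, len(headers))
    prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF)
    message = prelude + prelude_crc + headers + payload
    return message + struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF)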
@@ -235,6 +314,23 @@ def build_event_message(payload: bytes) -> bytes:
 
 
 def decode_event(message):
+    """Decode an AWS event stream message.
+
+    Parses an AWS event stream message to extract headers and payload,
+    verifying CRC checksums for data integrity.
+
+    Args:
+        message: Raw event stream message bytes received from AWS.
+
+    Returns:
+        A tuple of (headers, payload) where:
+
+        - headers: Dictionary of parsed headers
+        - payload: Dictionary of parsed JSON payload
+
+    Raises:
+        AssertionError: If CRC checksum verification fails.
+    """
     # Extract the prelude, headers, payload and CRC
     prelude = message[:8]
     total_length, headers_length = struct.unpack(">II", prelude)
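The decode path reverses that framing. A simplified sketch consistent with the docstring above (it assumes all header values are string-typed, which matches what get_headers produces):

import binascii
import json
import struct


def decode_event_sketch(message: bytes):
    total_length, headers_length = struct.unpack(">II", message[:8])
    (prelude_crc,) = struct.unpack(">I", message[8:12])
    assert prelude_crc == binascii.crc32(message[:8]) & 0xFFFFFFFF

    headers_bytes = message[12 : 12 + headers_length]
    payload = message[12 + headers_length : total_length - 4]  # trailing 4 bytes are the message CRC

    headers, pos = {}, 0
    while pos < len(headers_bytes):
        name_len = headers_bytes[pos]
        pos += 1
        name = headers_bytes[pos : pos + name_len].decode("utf-8")
        pos += name_len + 1  # skip the value-type byte (7 = string)
        (value_len,) = struct.unpack(">H", headers_bytes[pos : pos + 2])
        pos += 2
        headers[name] = headers_bytes[pos : pos + value_len].decode("utf-8")
        pos += value_len

    return headers, json.loads(payload)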
pipecat/services/aws_nova_sonic/aws.py
@@ -95,7 +95,13 @@ class AWSNovaSonicUnhandledFunctionException(Exception):
 
 
 class ContentType(Enum):
-    """Content types supported by AWS Nova Sonic."""
+    """Content types supported by AWS Nova Sonic.
+
+    Parameters:
+        AUDIO: Audio content type.
+        TEXT: Text content type.
+        TOOL: Tool content type.
+    """
 
     AUDIO = "AUDIO"
     TEXT = "TEXT"
@@ -103,7 +109,12 @@ class ContentType(Enum):
 
 
 class TextStage(Enum):
-    """Text generation stages in AWS Nova Sonic responses."""
+    """Text generation stages in AWS Nova Sonic responses.
+
+    Parameters:
+        FINAL: Final text that has been fully generated.
+        SPECULATIVE: Speculative text that is still being generated.
+    """
 
     FINAL = "FINAL"  # what has been said
     SPECULATIVE = "SPECULATIVE"  # what's planned to be said
@@ -126,6 +137,7 @@ class CurrentContent:
     text_content: str  # starts as None, then fills in if text
 
     def __str__(self):
+        """String representation of the current content."""
         return (
             f"CurrentContent(\n"
             f"  type={self.type.name},\n"
@@ -138,7 +150,7 @@ class CurrentContent:
 class Params(BaseModel):
     """Configuration parameters for AWS Nova Sonic.
 
-    Attributes:
+    Parameters:
         input_sample_rate: Audio input sample rate in Hz.
         input_sample_size: Audio input sample size in bits.
         input_channel_count: Number of input audio channels.
@@ -171,18 +183,6 @@ class AWSNovaSonicLLMService(LLMService):
 
     Provides bidirectional audio streaming, real-time transcription, text generation,
     and function calling capabilities using AWS Nova Sonic model.
-
-    Args:
-        secret_access_key: AWS secret access key for authentication.
-        access_key_id: AWS access key ID for authentication.
-        region: AWS region where the service is hosted.
-        model: Model identifier. Defaults to "amazon.nova-sonic-v1:0".
-        voice_id: Voice ID for speech synthesis. Options: matthew, tiffany, amy.
-        params: Model parameters for audio configuration and inference.
-        system_instruction: System-level instruction for the model.
-        tools: Available tools/functions for the model to use.
-        send_transcription_frames: Whether to emit transcription frames.
-        **kwargs: Additional arguments passed to the parent LLMService.
     """
 
     # Override the default adapter to use the AWSNovaSonicLLMAdapter one
@@ -193,6 +193,7 @@ class AWSNovaSonicLLMService(LLMService):
         *,
         secret_access_key: str,
         access_key_id: str,
+        session_token: Optional[str] = None,
         region: str,
         model: str = "amazon.nova-sonic-v1:0",
         voice_id: str = "matthew",  # matthew, tiffany, amy
@@ -202,9 +203,25 @@ class AWSNovaSonicLLMService(LLMService):
         send_transcription_frames: bool = True,
         **kwargs,
     ):
+        """Initializes the AWS Nova Sonic LLM service.
+
+        Args:
+            secret_access_key: AWS secret access key for authentication.
+            access_key_id: AWS access key ID for authentication.
+            session_token: AWS session token for authentication.
+            region: AWS region where the service is hosted.
+            model: Model identifier. Defaults to "amazon.nova-sonic-v1:0".
+            voice_id: Voice ID for speech synthesis. Options: matthew, tiffany, amy.
+            params: Model parameters for audio configuration and inference.
+            system_instruction: System-level instruction for the model.
+            tools: Available tools/functions for the model to use.
+            send_transcription_frames: Whether to emit transcription frames.
+            **kwargs: Additional arguments passed to the parent LLMService.
+        """
         super().__init__(**kwargs)
         self._secret_access_key = secret_access_key
         self._access_key_id = access_key_id
+        self._session_token = session_token
         self._region = region
         self._model = model
         self._client: Optional[BedrockRuntimeClient] = None
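With session_token now threaded through to the credentials identity, the service can run on temporary STS credentials; a construction sketch limited to the arguments documented above (all values are placeholders):

from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService

llm = AWSNovaSonicLLMService(
    secret_access_key="...",
    access_key_id="AKIA...",
    session_token="...",  # optional; omit for long-lived credentials
    region="us-east-1",
    voice_id="matthew",  # matthew, tiffany, amy
)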
@@ -456,7 +473,6 @@ class AWSNovaSonicLLMService(LLMService):
         # If we need to, send assistant response trigger (depends on self._connected_time)
         if self._triggering_assistant_response:
             await self._send_assistant_response_trigger()
-            self._triggering_assistant_response = False
 
     async def _disconnect(self):
         try:
@@ -508,7 +524,9 @@ class AWSNovaSonicLLMService(LLMService):
             region=self._region,
             aws_credentials_identity_resolver=StaticCredentialsResolver(
                 credentials=AWSCredentialsIdentity(
-                    access_key_id=self._access_key_id, secret_access_key=self._secret_access_key
+                    access_key_id=self._access_key_id,
+                    secret_access_key=self._secret_access_key,
+                    session_token=self._session_token,
                 )
             ),
             http_auth_scheme_resolver=HTTPAuthSchemeResolver(),
@@ -776,9 +794,7 @@ class AWSNovaSonicLLMService(LLMService):
         try:
             while self._stream and not self._disconnecting:
                 output = await self._stream.await_output()
-                result = await asyncio.wait_for(output[1].receive(), timeout=1.0)
-
-                self.reset_watchdog()
+                result = await output[1].receive()
 
                 if result.value and result.value.bytes_:
                     response_data = result.value.bytes_.decode("utf-8")
@@ -807,8 +823,6 @@ class AWSNovaSonicLLMService(LLMService):
                 elif "completionEnd" in event_json:
                     # Handle the LLM completion ending
                     await self._handle_completion_end_event(event_json)
-        except asyncio.TimeoutError:
-            self.reset_watchdog()
         except Exception as e:
             logger.error(f"{self} error processing responses: {e}")
             if self._wants_connection:
@@ -1089,7 +1103,6 @@ class AWSNovaSonicLLMService(LLMService):
         # Send the trigger audio, if we're fully connected and set up
         if self._connected_time is not None:
             await self._send_assistant_response_trigger()
-            self._triggering_assistant_response = False
 
     async def _send_assistant_response_trigger(self):
         if (
@@ -1097,46 +1110,51 @@ class AWSNovaSonicLLMService(LLMService):
         ):  # should never happen
             return
 
-        logger.debug("Sending assistant response trigger...")
-
-        chunk_duration = 0.02  # what we might get from InputAudioRawFrame
-        chunk_size = int(
-            chunk_duration
-            * self._params.input_sample_rate
-            * self._params.input_channel_count
-            * (self._params.input_sample_size / 8)
-        )  # e.g. 0.02 seconds of 16-bit (2-byte) PCM mono audio at 16kHz is 640 bytes
-
-        # Lead with a bit of blank audio, if needed.
-        # It seems like the LLM can't quite "hear" the first little bit of audio sent on a
-        # connection.
-        current_time = time.time()
-        max_blank_audio_duration = 0.5
-        blank_audio_duration = (
-            max_blank_audio_duration - (current_time - self._connected_time)
-            if self._connected_time is not None
-            and (current_time - self._connected_time) < max_blank_audio_duration
-            else None
-        )
-        if blank_audio_duration:
-            logger.debug(
-                f"Leading assistant response trigger with {blank_audio_duration}s of blank audio"
+        try:
+            logger.debug("Sending assistant response trigger...")
+
+            chunk_duration = 0.02  # what we might get from InputAudioRawFrame
+            chunk_size = int(
+                chunk_duration
+                * self._params.input_sample_rate
+                * self._params.input_channel_count
+                * (self._params.input_sample_size / 8)
+            )  # e.g. 0.02 seconds of 16-bit (2-byte) PCM mono audio at 16kHz is 640 bytes
+
+            # Lead with a bit of blank audio, if needed.
+            # It seems like the LLM can't quite "hear" the first little bit of audio sent on a
+            # connection.
+            current_time = time.time()
+            max_blank_audio_duration = 0.5
+            blank_audio_duration = (
+                max_blank_audio_duration - (current_time - self._connected_time)
+                if self._connected_time is not None
+                and (current_time - self._connected_time) < max_blank_audio_duration
+                else None
             )
-            blank_audio_chunk = b"\x00" * chunk_size
-            num_chunks = int(blank_audio_duration / chunk_duration)
-            for _ in range(num_chunks):
-                await self._send_user_audio_event(blank_audio_chunk)
+            if blank_audio_duration:
+                logger.debug(
+                    f"Leading assistant response trigger with {blank_audio_duration}s of blank audio"
+                )
+                blank_audio_chunk = b"\x00" * chunk_size
+                num_chunks = int(blank_audio_duration / chunk_duration)
+                for _ in range(num_chunks):
+                    await self._send_user_audio_event(blank_audio_chunk)
+                    await asyncio.sleep(chunk_duration)
+
+            # Send trigger audio
+            # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK:
+            # if we ever need to seed this service again with context it would make sense to include it
+            # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the
+            # context as well.
+            audio_chunks = [
+                self._assistant_response_trigger_audio[i : i + chunk_size]
+                for i in range(0, len(self._assistant_response_trigger_audio), chunk_size)
+            ]
+            for chunk in audio_chunks:
+                await self._send_user_audio_event(chunk)
                 await asyncio.sleep(chunk_duration)
-
-        # Send trigger audio
-        # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK:
-        # if we ever need to seed this service again with context it would make sense to include it
-        # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the
-        # context as well.
-        audio_chunks = [
-            self._assistant_response_trigger_audio[i : i + chunk_size]
-            for i in range(0, len(self._assistant_response_trigger_audio), chunk_size)
-        ]
-        for chunk in audio_chunks:
-            await self._send_user_audio_event(chunk)
-            await asyncio.sleep(chunk_duration)
+        finally:
+            # We need to clean up in case sending the trigger was cancelled, e.g. in the case of a user interruption.
+            # (An asyncio.CancelledError would be raised in that case.)
+            self._triggering_assistant_response = False
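The try/finally wrapper matters because every await in the trigger send is a cancellation point: on a user interruption the task is cancelled and asyncio raises CancelledError at the pending await, so only a finally block reliably resets the flag. A generic sketch of the pattern (names are illustrative, not from the source):

import asyncio


async def send_with_cleanup(state: dict):
    state["sending"] = True
    try:
        for _ in range(100):
            await asyncio.sleep(0.02)  # cancellation can land on any await
    finally:
        state["sending"] = False  # runs on success, on error, and on CancelledError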
pipecat/services/aws_nova_sonic/context.py
@@ -41,7 +41,14 @@ from pipecat.services.openai.llm import (
 
 
 class Role(Enum):
-    """Roles supported in AWS Nova Sonic conversations."""
+    """Roles supported in AWS Nova Sonic conversations.
+
+    Parameters:
+        SYSTEM: System-level messages (not used in conversation history).
+        USER: Messages sent by the user.
+        ASSISTANT: Messages sent by the assistant.
+        TOOL: Messages sent by tools (not used in conversation history).
+    """
 
     SYSTEM = "SYSTEM"
     USER = "USER"
@@ -80,14 +87,16 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
 
     Extends OpenAI context with Nova Sonic-specific message handling,
     conversation history management, and text buffering capabilities.
-
-    Args:
-        messages: Initial messages for the context.
-        tools: Available tools for the context.
-        **kwargs: Additional arguments passed to parent class.
     """
 
     def __init__(self, messages=None, tools=None, **kwargs):
+        """Initialize AWS Nova Sonic LLM context.
+
+        Args:
+            messages: Initial messages for the context.
+            tools: Available tools for the context.
+            **kwargs: Additional arguments passed to parent class.
+        """
         super().__init__(messages=messages, tools=tools, **kwargs)
         self.__setup_local()
 
pipecat/services/azure/common.py
@@ -4,14 +4,22 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-from typing import Optional
+"""Language conversion utilities for Azure services."""
 
-from loguru import logger
+from typing import Optional
 
 from pipecat.transcriptions.language import Language
 
 
 def language_to_azure_language(language: Language) -> Optional[str]:
+    """Convert a Language enum to Azure language code.
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding Azure language code, or None if not supported.
+    """
     language_map = {
        # Afrikaans
        Language.AF: "af-ZA",
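Usage is a plain dictionary lookup; using the first mapping entry shown above:

from pipecat.services.azure.common import language_to_azure_language
from pipecat.transcriptions.language import Language

assert language_to_azure_language(Language.AF) == "af-ZA"
# Unsupported languages return None rather than raising.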
pipecat/services/azure/image.py
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Azure OpenAI image generation service implementation.
+
+This module provides integration with Azure's OpenAI image generation API
+using REST endpoints for creating images from text prompts.
+"""
+
 import asyncio
 import io
 from typing import AsyncGenerator
@@ -17,6 +23,13 @@ from pipecat.services.image_service import ImageGenService
 
 
 class AzureImageGenServiceREST(ImageGenService):
+    """Azure OpenAI REST-based image generation service.
+
+    Provides image generation using Azure's OpenAI service via REST API.
+    Supports asynchronous image generation with polling for completion
+    and automatic image download and processing.
+    """
+
     def __init__(
         self,
         *,
@@ -27,6 +40,16 @@ class AzureImageGenServiceREST(ImageGenService):
         aiohttp_session: aiohttp.ClientSession,
         api_version="2023-06-01-preview",
     ):
+        """Initialize the AzureImageGenServiceREST.
+
+        Args:
+            image_size: Size specification for generated images (e.g., "1024x1024").
+            api_key: Azure OpenAI API key for authentication.
+            endpoint: Azure OpenAI endpoint URL.
+            model: The image generation model to use.
+            aiohttp_session: Shared aiohttp session for HTTP requests.
+            api_version: Azure API version string. Defaults to "2023-06-01-preview".
+        """
         super().__init__()
 
         self._api_key = api_key
@@ -37,6 +60,15 @@ class AzureImageGenServiceREST(ImageGenService):
         self._aiohttp_session = aiohttp_session
 
     async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
+        """Generate an image from a text prompt using Azure OpenAI.
+
+        Args:
+            prompt: The text prompt describing the desired image.
+
+        Yields:
+            URLImageRawFrame containing the generated image data, or
+            ErrorFrame if generation fails.
+        """
         url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
 
         headers = {"api-key": self._api_key, "Content-Type": "application/json"}
pipecat/services/azure/llm.py
@@ -17,13 +17,6 @@ class AzureLLMService(OpenAILLMService):
     This service extends OpenAILLMService to connect to Azure's OpenAI endpoint while
     maintaining full compatibility with OpenAI's interface and functionality.
-
-    Args:
-        api_key: The API key for accessing Azure OpenAI.
-        endpoint: The Azure endpoint URL.
-        model: The model identifier to use.
-        api_version: Azure API version. Defaults to "2024-09-01-preview".
-        **kwargs: Additional keyword arguments passed to OpenAILLMService.
     """
 
     def __init__(
         self,
@@ -35,6 +28,15 @@ class AzureLLMService(OpenAILLMService):
         api_version: str = "2024-09-01-preview",
         **kwargs,
     ):
+        """Initialize the Azure LLM service.
+
+        Args:
+            api_key: The API key for accessing Azure OpenAI.
+            endpoint: The Azure endpoint URL.
+            model: The model identifier to use.
+            api_version: Azure API version. Defaults to "2024-09-01-preview".
+            **kwargs: Additional keyword arguments passed to OpenAILLMService.
+        """
         # Initialize variables before calling parent __init__() because that
         # will call create_client() and we need those values there.
         self._endpoint = endpoint
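Since the constructor only adds Azure-specific wiring on top of OpenAILLMService, basic usage reduces to the documented arguments (values are placeholders):

from pipecat.services.azure.llm import AzureLLMService

llm = AzureLLMService(
    api_key="...",
    endpoint="https://my-resource.openai.azure.com/",
    model="gpt-4o",  # placeholder deployment name
    # api_version defaults to "2024-09-01-preview"
)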
pipecat/services/azure/stt.py
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Azure Speech-to-Text service implementation for Pipecat.
+
+This module provides speech-to-text functionality using Azure Cognitive Services
+Speech SDK for real-time audio transcription.
+"""
+
 import asyncio
 from typing import AsyncGenerator, List, Optional  # Add List
 
@@ -42,6 +48,13 @@ except ModuleNotFoundError as e:
 
 
 class AzureSTTService(STTService):
+    """Azure Speech-to-Text service for real-time audio transcription.
+
+    This service uses Azure Cognitive Services Speech SDK to convert speech
+    audio into text transcriptions. It supports continuous recognition and
+    provides real-time transcription results with timing information.
+    """
+
     def __init__(
         self,
         *,
@@ -50,8 +63,19 @@ class AzureSTTService(STTService):
         language: Language = Language.EN_US,
         additional_languages: list[Language] = None,
         sample_rate: Optional[int] = None,
+        endpoint_id: Optional[str] = None,
         **kwargs,
     ):
+        """Initialize the Azure STT service.
+
+        Args:
+            api_key: Azure Cognitive Services subscription key.
+            region: Azure region for the Speech service (e.g., 'eastus').
+            language: Language for speech recognition. Defaults to English (US).
+            sample_rate: Audio sample rate in Hz. If None, uses service default.
+            endpoint_id: Custom model endpoint id.
+            **kwargs: Additional arguments passed to parent STTService.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
         self._vocab: Optional[List[str]] = kwargs.pop("vocab", None)  # Get vocab from kwargs
 
@@ -65,6 +89,8 @@ class AzureSTTService(STTService):
         self._speech_config.set_property(PropertyId.Speech_SegmentationSilenceTimeoutMs, "400")
         self._primary_language = language
         self._additional_languages = additional_languages
+        if endpoint_id:
+            self._speech_config.endpoint_id = endpoint_id
 
         self._audio_stream = None
         self._speech_recognizer = None
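The new endpoint_id parameter routes recognition through a custom speech model; a construction sketch with placeholder values:

from pipecat.services.azure.stt import AzureSTTService

stt = AzureSTTService(
    api_key="...",
    region="eastus",
    endpoint_id="my-custom-model-endpoint",  # optional; omit to use the base model
)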
@@ -75,10 +101,25 @@ class AzureSTTService(STTService):
         }
 
     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate performance metrics.
+
+        Returns:
+            True as this service supports metrics generation.
+        """
         return True
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        # Entry point for streaming audio to Azure STT and yielding transcription frames
+        """Process audio data for speech-to-text conversion.
+
+        Feeds audio data to the Azure speech recognizer for processing.
+        Recognition results are handled asynchronously through callbacks.
+
+        Args:
+            audio: Raw audio bytes to process.
+
+        Yields:
+            None - actual transcription frames are pushed via callbacks.
+        """
         await self.start_processing_metrics()
         await self.start_ttfb_metrics()
         if self._audio_stream:
@@ -87,6 +128,14 @@ class AzureSTTService(STTService):
         yield None
 
     async def start(self, frame: StartFrame):
+        """Start the speech recognition service.
+
+        Initializes the Azure speech recognizer with audio stream configuration
+        and begins continuous speech recognition.
+
+        Args:
+            frame: Frame indicating the start of processing.
+        """
         await super().start(frame)
 
         if self._audio_stream:
@@ -139,6 +188,13 @@ class AzureSTTService(STTService):
         self._speech_recognizer.start_continuous_recognition_async()
 
     async def stop(self, frame: EndFrame):
+        """Stop the speech recognition service.
+
+        Cleanly shuts down the Azure speech recognizer and closes audio streams.
+
+        Args:
+            frame: Frame indicating the end of processing.
+        """
         await super().stop(frame)
 
         if self._speech_recognizer:
@@ -150,6 +206,13 @@ class AzureSTTService(STTService):
         self._audio_stream.close()
 
     async def cancel(self, frame: CancelFrame):
+        """Cancel the speech recognition service.
+
+        Immediately stops recognition and closes resources.
+
+        Args:
+            frame: Frame indicating cancellation.
+        """
         await super().cancel(frame)
 
         if self._speech_recognizer:
@@ -175,7 +238,7 @@ class AzureSTTService(STTService):
         language = getattr(event.result, "language", None) or self._settings.get("language")
         frame = TranscriptionFrame(
             event.result.text,
-            "",
+            self._user_id,
             time_now_iso8601(),
             language,
             result=event,