dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dv-pipecat-ai might be problematic.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/soniox/stt.py

@@ -0,0 +1,398 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Soniox speech-to-text service implementation."""
+
+import asyncio
+import json
+import time
+from typing import AsyncGenerator, List, Optional
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import STTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_stt
+
+try:
+    import websockets
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Soniox, you need to `pip install pipecat-ai[soniox]`.")
+    raise Exception(f"Missing module: {e}")
+
+
+KEEPALIVE_MESSAGE = '{"type": "keepalive"}'
+
+FINALIZE_MESSAGE = '{"type": "finalize"}'
+
+END_TOKEN = "<end>"
+
+FINALIZED_TOKEN = "<fin>"
+
+
+class SonioxInputParams(BaseModel):
+    """Real-time transcription settings.
+
+    See Soniox WebSocket API documentation for more details:
+    https://soniox.com/docs/speech-to-text/api-reference/websocket-api#configuration-parameters
+
+    Parameters:
+        model: Model to use for transcription.
+        audio_format: Audio format to use for transcription.
+        num_channels: Number of channels to use for transcription.
+        language_hints: List of language hints to use for transcription.
+        context: Customization for transcription.
+        enable_non_final_tokens: Whether to enable non-final tokens. If false, only final tokens will be returned.
+        max_non_final_tokens_duration_ms: Maximum duration of non-final tokens.
+        client_reference_id: Client reference ID to use for transcription.
+    """
+
+    model: str = "stt-rt-preview"
+
+    audio_format: Optional[str] = "pcm_s16le"
+    num_channels: Optional[int] = 1
+
+    language_hints: Optional[List[Language]] = None
+    context: Optional[str] = None
+
+    enable_non_final_tokens: Optional[bool] = True
+    max_non_final_tokens_duration_ms: Optional[int] = None
+
+    client_reference_id: Optional[str] = None
+
+
+def is_end_token(token: dict) -> bool:
+    """Determine if a token is an end token."""
+    return token["text"] == END_TOKEN or token["text"] == FINALIZED_TOKEN
+
+
+def language_to_soniox_language(language: Language) -> str:
+    """Pipecat Language enum uses same ISO 2-letter codes as Soniox, except with added regional variants.
+
+    For a list of all supported languages, see: https://soniox.com/docs/speech-to-text/core-concepts/supported-languages
+    """
+    lang_str = str(language.value).lower()
+    if "-" in lang_str:
+        return lang_str.split("-")[0]
+    return lang_str
+
+
+def _prepare_language_hints(
+    language_hints: Optional[List[Language]],
+) -> Optional[List[str]]:
+    if language_hints is None:
+        return None
+
+    prepared_languages = [language_to_soniox_language(lang) for lang in language_hints]
+    # Remove duplicates (in case of language_hints with multiple regions).
+    return list(set(prepared_languages))
+
+
+class SonioxSTTService(STTService):
+    """Speech-to-Text service using Soniox's WebSocket API.
+
+    This service connects to Soniox's WebSocket API for real-time transcription
+    with support for multiple languages, custom context, speaker diarization,
+    and more.
+
+    For complete API documentation, see: https://soniox.com/docs/speech-to-text/api-reference/websocket-api
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        url: str = "wss://stt-rt.soniox.com/transcribe-websocket",
+        sample_rate: Optional[int] = None,
+        params: Optional[SonioxInputParams] = None,
+        vad_force_turn_endpoint: bool = False,
+        **kwargs,
+    ):
+        """Initialize the Soniox STT service.
+
+        Args:
+            api_key: Soniox API key.
+            url: Soniox WebSocket API URL.
+            sample_rate: Audio sample rate.
+            params: Additional configuration parameters, such as language hints, context and
+                speaker diarization.
+            vad_force_turn_endpoint: Listen to `UserStoppedSpeakingFrame` to send finalize message to Soniox. If disabled, Soniox will detect the end of the speech.
+            **kwargs: Additional arguments passed to the STTService.
+        """
+        super().__init__(sample_rate=sample_rate, **kwargs)
+        params = params or SonioxInputParams()
+
+        self._api_key = api_key
+        self._url = url
+        self.set_model_name(params.model)
+        self._params = params
+        self._vad_force_turn_endpoint = vad_force_turn_endpoint
+        self._websocket = None
+
+        self._final_transcription_buffer = []
+        self._last_tokens_received: Optional[float] = None
+
+        self._receive_task = None
+        self._keepalive_task = None
+
+    async def start(self, frame: StartFrame):
+        """Start the Soniox STT websocket connection.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+        if self._websocket:
+            return
+
+        self._websocket = await websocket_connect(self._url)
+
+        if not self._websocket:
+            logger.error(f"Unable to connect to Soniox API at {self._url}")
+
+        # If vad_force_turn_endpoint is not enabled, we need to enable endpoint detection.
+        # Either one or the other is required.
+        enable_endpoint_detection = not self._vad_force_turn_endpoint
+
+        # Send the initial configuration message.
+        config = {
+            "api_key": self._api_key,
+            "model": self._model_name,
+            "audio_format": self._params.audio_format,
+            "num_channels": self._params.num_channels or 1,
+            "enable_endpoint_detection": enable_endpoint_detection,
+            "sample_rate": self.sample_rate,
+            "language_hints": _prepare_language_hints(self._params.language_hints),
+            "context": self._params.context,
+            "enable_non_final_tokens": self._params.enable_non_final_tokens,
+            "max_non_final_tokens_duration_ms": self._params.max_non_final_tokens_duration_ms,
+            "client_reference_id": self._params.client_reference_id,
+        }
+
+        # Send the configuration message.
+        await self._websocket.send(json.dumps(config))
+
+        if self._websocket and not self._receive_task:
+            self._receive_task = self.create_task(self._receive_task_handler())
+        if self._websocket and not self._keepalive_task:
+            self._keepalive_task = self.create_task(self._keepalive_task_handler())
+
+    async def _cleanup(self):
+        if self._keepalive_task:
+            await self.cancel_task(self._keepalive_task)
+            self._keepalive_task = None
+
+        if self._websocket:
+            await self._websocket.close()
+            self._websocket = None
+
+        if self._receive_task:
+            # Task cannot cancel itself. If task called _cleanup() we expect it to cancel itself.
+            if self._receive_task != asyncio.current_task():
+                await self._receive_task
+            self._receive_task = None
+
+    async def stop(self, frame: EndFrame):
+        """Stop the Soniox STT websocket connection.
+
+        Stopping waits for the server to close the connection as we might receive
+        additional final tokens after sending the stop recording message.
+
+        Args:
+            frame: The end frame.
+        """
+        await super().stop(frame)
+        await self._send_stop_recording()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the Soniox STT websocket connection.
+
+        Compared to stop, this method closes the connection immediately without waiting
+        for the server to close it. This is useful when we want to stop the connection
+        immediately without waiting for the server to send any final tokens.
+
+        Args:
+            frame: The cancel frame.
+        """
+        await super().cancel(frame)
+        await self._cleanup()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Send audio data to Soniox STT Service.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            Frame: None (transcription results come via WebSocket callbacks).
+        """
+        await self.start_processing_metrics()
+        if self._websocket and self._websocket.state is State.OPEN:
+            await self._websocket.send(audio)
+        await self.stop_processing_metrics()
+
+        yield None
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Processes a frame of audio data, either buffering or transcribing it.
+
+        Args:
+            frame: The frame to process.
+            direction: The direction of frame processing.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStoppedSpeakingFrame) and self._vad_force_turn_endpoint:
+            # Send finalize message to Soniox so we get the final tokens asap.
+            if self._websocket and self._websocket.state is State.OPEN:
+                await self._websocket.send(FINALIZE_MESSAGE)
+                logger.debug(f"Triggered finalize event on: {frame.name=}, {direction=}")
+
+    async def _send_stop_recording(self):
+        """Send stop recording message to Soniox."""
+        if self._websocket and self._websocket.state is State.OPEN:
+            # Send stop recording message
+            await self._websocket.send("")
+
+    async def _keepalive_task_handler(self):
+        """Connection has to be open all the time."""
+        try:
+            while True:
+                logger.trace("Sending keepalive message")
+                if self._websocket and self._websocket.state is State.OPEN:
+                    await self._websocket.send(KEEPALIVE_MESSAGE)
+                else:
+                    logger.debug("WebSocket connection closed.")
+                    break
+                await asyncio.sleep(5)
+
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection
+            logger.debug("WebSocket connection closed, keepalive task stopped.")
+        except Exception as e:
+            logger.error(f"{self} error (_keepalive_task_handler): {e}")
+            await self.push_error(ErrorFrame(f"{self} error (_keepalive_task_handler): {e}"))
+
+    async def _receive_task_handler(self):
+        if not self._websocket:
+            return
+
+        # Transcription frame will be only sent after we get the "endpoint" event.
+        self._final_transcription_buffer = []
+
+        async def send_endpoint_transcript():
+            if self._final_transcription_buffer:
+                text = "".join(map(lambda token: token["text"], self._final_transcription_buffer))
+                await self.push_frame(
+                    TranscriptionFrame(
+                        text=text,
+                        user_id=self._user_id,
+                        timestamp=time_now_iso8601(),
+                        result=self._final_transcription_buffer,
+                    )
+                )
+                await self._handle_transcription(text, is_final=True)
+                await self.stop_processing_metrics()
+                self._final_transcription_buffer = []

+        try:
+            async for message in self._websocket:
+                content = json.loads(message)
+
+                tokens = content["tokens"]
+
+                if tokens:
+                    if len(tokens) == 1 and tokens[0]["text"] == FINALIZED_TOKEN:
+                        # Ignore finalized token, prevent auto-finalize cycling.
+                        pass
+                    else:
+                        # Got at least one token, so we can reset the auto finalize delay.
+                        self._last_tokens_received = time.time()
+
+                # We will only send the final tokens after we get the "endpoint" event.
+                non_final_transcription = []
+
+                for token in tokens:
+                    if token["is_final"]:
+                        if is_end_token(token):
+                            # Found an endpoint, tokens until here will be sent as transcript,
+                            # the rest will be sent as interim tokens (even final tokens).
+                            await send_endpoint_transcript()
+                        else:
+                            self._final_transcription_buffer.append(token)
+                    else:
+                        non_final_transcription.append(token)
+
+                if self._final_transcription_buffer or non_final_transcription:
+                    final_text = "".join(
+                        map(lambda token: token["text"], self._final_transcription_buffer)
+                    )
+                    non_final_text = "".join(
+                        map(lambda token: token["text"], non_final_transcription)
+                    )
+
+                    await self.push_frame(
+                        InterimTranscriptionFrame(
+                            # Even final tokens are sent as interim tokens as we want to send
+                            # nicely formatted messages - therefore waiting for the endpoint.
+                            text=final_text + non_final_text,
+                            user_id=self._user_id,
+                            timestamp=time_now_iso8601(),
+                            result=self._final_transcription_buffer + non_final_transcription,
+                        )
+                    )
+
+                error_code = content.get("error_code")
+                error_message = content.get("error_message")
+                if error_code or error_message:
+                    # In case of error, still send the final transcript (if any remaining in the buffer).
+                    await send_endpoint_transcript()
+                    logger.error(
+                        f"{self} error: {error_code} (_receive_task_handler) - {error_message}"
+                    )
+                    await self.push_error(
+                        ErrorFrame(
+                            f"{self} error: {error_code} (_receive_task_handler) - {error_message}"
+                        )
+                    )
+
+                finished = content.get("finished")
+                if finished:
+                    # When finished, still send the final transcript (if any remaining in the buffer).
+                    await send_endpoint_transcript()
+                    logger.debug("Transcription finished.")
+                    await self._cleanup()
+                    return
+
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection.
+            pass
+        except Exception as e:
+            logger.error(f"{self} error: {e}")
+            await self.push_error(ErrorFrame(f"{self} error: {e}"))
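For orientation, here is a minimal usage sketch of the new service. The class names, constructor arguments, and defaults come from the diff above; the environment-variable name and the particular language hints are illustrative, and the pipeline the service would be placed into is omitted.

    import os

    from pipecat.services.soniox.stt import SonioxInputParams, SonioxSTTService
    from pipecat.transcriptions.language import Language

    # Assumes SONIOX_API_KEY is set in the environment (placeholder, not from the diff).
    stt = SonioxSTTService(
        api_key=os.environ["SONIOX_API_KEY"],
        params=SonioxInputParams(
            language_hints=[Language.EN, Language.ES],  # reduced to ISO codes internally
            context="Pipecat, Soniox",
        ),
        # False: rely on Soniox endpoint detection; True: finalize on
        # UserStoppedSpeakingFrame from a VAD elsewhere in the pipeline.
        vad_force_turn_endpoint=False,
    )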
pipecat/services/speechmatics/stt.py

@@ -23,6 +23,7 @@ from pipecat.frames.frames import (
     BotInterruptionFrame,
     CancelFrame,
     EndFrame,
+    ErrorFrame,
     Frame,
     InterimTranscriptionFrame,
     StartFrame,
@@ -463,8 +464,14 @@ class SpeechmaticsSTTService(STTService):
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
         """Adds audio to the audio buffer and yields None."""
-        await self._client.send_audio(audio)
-        yield None
+        try:
+            if self._client:
+                await self._client.send_audio(audio)
+            yield None
+        except Exception as e:
+            logger.error(f"Speechmatics error: {e}")
+            yield ErrorFrame(f"Speechmatics error: {e}", fatal=False)
+            await self._disconnect()
 
     def update_params(
         self,
@@ -520,7 +527,7 @@ class SpeechmaticsSTTService(STTService):
         )
 
         # Log the event
-        logger.debug("Connected to Speechmatics STT service")
+        logger.debug(f"{self} Connecting to Speechmatics STT service")
 
         # Recognition started event
         @self._client.on(ServerMessageType.RECOGNITION_STARTED)
@@ -562,31 +569,36 @@ class SpeechmaticsSTTService(STTService):
         )
 
         # Start session
-        await self._client.start_session(
-            transcription_config=self._transcription_config,
-            audio_format=AudioFormat(
-                encoding=self._params.audio_encoding,
-                sample_rate=self.sample_rate,
-                chunk_size=self._params.chunk_size,
-            ),
-        )
+        try:
+            await self._client.start_session(
+                transcription_config=self._transcription_config,
+                audio_format=AudioFormat(
+                    encoding=self._params.audio_encoding,
+                    sample_rate=self.sample_rate,
+                    chunk_size=self._params.chunk_size,
+                ),
+            )
+            logger.debug(f"{self} Connected to Speechmatics STT service")
+        except Exception as e:
+            logger.error(f"{self} Error connecting to Speechmatics: {e}")
+        finally:
+            self._client = None
 
     async def _disconnect(self) -> None:
         """Disconnect from the STT service."""
         # Disconnect the client
+        self.logger.debug(f"{self} Disconnecting from Speechmatics STT service")
         try:
             if self._client:
-                await asyncio.wait_for(self._client.close(), timeout=1.0)
+                await asyncio.wait_for(self._client.close(), timeout=5.0)
+                self.logger.debug(f"{self} Disconnected from Speechmatics STT service")
         except asyncio.TimeoutError:
-            logger.warning("Timeout while closing Speechmatics client connection")
+            logger.warning(f"{self} Timeout while closing Speechmatics client connection")
         except Exception as e:
-            logger.error(f"Error closing Speechmatics client: {e}")
+            logger.error(f"{self} Error closing Speechmatics client: {e}")
         finally:
             self._client = None
 
-        # Log the event
-        logger.debug("Disconnected from Speechmatics STT service")
-
     def _process_config(self) -> None:
         """Create a formatted STT transcription config.
 
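The run_stt change above converts transport failures into pipeline data instead of raised exceptions: guard the send behind an existence check, yield None on the happy path, and on error yield a non-fatal error frame before disconnecting. A standalone sketch of that generator pattern, using a stand-in client rather than the Speechmatics SDK:

    import asyncio
    from typing import AsyncGenerator, Optional


    class FakeClient:
        """Stand-in for the Speechmatics client; raises to simulate a dropped connection."""

        async def send_audio(self, audio: bytes) -> None:
            if not audio:
                raise RuntimeError("connection lost")


    async def run_stt(
        client: Optional[FakeClient], audio: bytes
    ) -> AsyncGenerator[Optional[str], None]:
        try:
            if client:
                await client.send_audio(audio)
            yield None  # success: transcripts arrive via server callbacks, not here
        except Exception as e:
            # Stands in for `yield ErrorFrame(..., fatal=False)`: the pipeline
            # keeps running and the service can reconnect.
            yield f"error: {e}"


    async def main() -> None:
        async for result in run_stt(FakeClient(), b""):
            print(result)  # -> error: connection lost


    asyncio.run(main())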
pipecat/services/stt_service.py

@@ -34,13 +34,6 @@ class STTService(AIService):
     Provides common functionality for STT services including audio passthrough,
     muting, settings management, and audio processing. Subclasses must implement
     the run_stt method to provide actual speech recognition.
-
-    Args:
-        audio_passthrough: Whether to pass audio frames downstream after processing.
-            Defaults to True.
-        sample_rate: The sample rate for audio input. If None, will be determined
-            from the start frame.
-        **kwargs: Additional arguments passed to the parent AIService.
     """
 
     def __init__(
@@ -50,15 +43,26 @@ class STTService(AIService):
         sample_rate: Optional[int] = None,
         **kwargs,
     ):
+        """Initialize the STT service.
+
+        Args:
+            audio_passthrough: Whether to pass audio frames downstream after processing.
+                Defaults to True.
+            sample_rate: The sample rate for audio input. If None, will be determined
+                from the start frame.
+            **kwargs: Additional arguments passed to the parent AIService.
+        """
         super().__init__(**kwargs)
         self._audio_passthrough = audio_passthrough
         self._init_sample_rate = sample_rate
         self._sample_rate = 0
         self._settings: Dict[str, Any] = {}
+        self._tracing_enabled: bool = False
         self._muted: bool = False
         # Custom fields from ai_services.py for voicemail and first speech handling
         self._first_speech_handled: bool = False
         self._voicemail_detect: bool = False
+        self._user_id: str = ""
 
     @property
     def is_muted(self) -> bool:
@@ -119,6 +123,7 @@ class STTService(AIService):
         self._sample_rate = self._init_sample_rate or frame.audio_in_sample_rate
         if hasattr(frame, "metadata") and "voicemail_detect" in frame.metadata:
             self._voicemail_detect = frame.metadata["voicemail_detect"]
+        self._tracing_enabled = frame.enable_tracing
 
     async def _update_settings(self, settings: Mapping[str, Any]):
         self.logger.info(f"Updating STT settings: {self._settings}")
@@ -138,6 +143,11 @@ class STTService(AIService):
     async def process_audio_frame(self, frame: AudioRawFrame, direction: FrameDirection):
         """Process an audio frame for speech recognition.
 
+        If the service is muted, this method does nothing. Otherwise, it
+        processes the audio frame and runs speech-to-text on it, yielding
+        transcription results. If the frame has a user_id, it is stored
+        for later use in transcription.
+
         Args:
             frame: The audio frame to process.
             direction: The direction of frame processing.
@@ -146,6 +156,21 @@ class STTService(AIService):
         # If first speech is handled, we dont need to worry anymore.
         if self._muted and ((not self._voicemail_detect) or self._first_speech_handled):
             return
+
+        # UserAudioRawFrame contains a user_id (e.g. Daily, Livekit)
+        if hasattr(frame, "user_id"):
+            self._user_id = frame.user_id
+        # AudioRawFrame does not have a user_id (e.g. SmallWebRTCTransport, websockets)
+        else:
+            self._user_id = ""
+
+        if not frame.audio:
+            # Ignoring in case we don't have audio to transcribe.
+            logger.warning(
+                f"Empty audio frame received for STT service: {self.name} {frame.num_frames}"
+            )
+            return
+
         await self.process_generator(self.run_stt(frame.audio))
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -187,14 +212,16 @@ class SegmentedSTTService(STTService):
     Requires VAD to be enabled in the pipeline to function properly. Maintains a
     small audio buffer to account for the delay between actual speech start and
     VAD detection.
+    """
+
+    def __init__(self, *, sample_rate: Optional[int] = None, **kwargs):
+        """Initialize the segmented STT service.
 
-    Args:
+        Args:
             sample_rate: The sample rate for audio input. If None, will be determined
                 from the start frame.
             **kwargs: Additional arguments passed to the parent STTService.
-    """
-
-    def __init__(self, *, sample_rate: Optional[int] = None, **kwargs):
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
         self._content = None
         self._wave = None
@@ -251,10 +278,19 @@ class SegmentedSTTService(STTService):
         Continuously buffers audio, growing the buffer while user is speaking and
         maintaining a small buffer when not speaking to account for VAD delay.
 
+        If the frame has a user_id, it is stored for later use in transcription.
+
         Args:
            frame: The audio frame to process.
            direction: The direction of frame processing.
        """
+        # UserAudioRawFrame contains a user_id (e.g. Daily, Livekit)
+        if hasattr(frame, "user_id"):
+            self._user_id = frame.user_id
+        # AudioRawFrame does not have a user_id (e.g. SmallWebRTCTransport, websockets)
+        else:
+            self._user_id = ""
+
         # If the user is speaking the audio buffer will keep growing.
         self._audio_buffer += frame.audio
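The user_id handling added in both process_audio_frame implementations is duck-typed: UserAudioRawFrame (emitted by transports that know who is speaking, e.g. Daily or LiveKit) carries a user_id, while a plain AudioRawFrame (e.g. SmallWebRTCTransport, websocket transports) does not. A one-line sketch of the equivalent lookup (resolve_user_id is an illustrative name, not part of the package):

    def resolve_user_id(frame) -> str:
        # getattr with a default collapses the hasattr/else branch from the diff:
        # frames without a user_id attribute resolve to the anonymous "".
        return getattr(frame, "user_id", "")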