dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (244) hide show
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
@@ -4,10 +4,16 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Rime text-to-speech service implementations.
8
+
9
+ This module provides both WebSocket and HTTP-based text-to-speech services
10
+ using Rime's API for streaming and batch audio synthesis.
11
+ """
12
+
7
13
  import base64
8
14
  import json
9
15
  import uuid
10
- from typing import AsyncGenerator, Optional
16
+ from typing import Any, AsyncGenerator, Mapping, Optional
11
17
 
12
18
  import aiohttp
13
19
  from loguru import logger
@@ -33,7 +39,8 @@ from pipecat.utils.text.skip_tags_aggregator import SkipTagsAggregator
33
39
  from pipecat.utils.tracing.service_decorators import traced_tts
34
40
 
35
41
  try:
36
- import websockets
42
+ from websockets.asyncio.client import connect as websocket_connect
43
+ from websockets.protocol import State
37
44
  except ModuleNotFoundError as e:
38
45
  logger.error(f"Exception: {e}")
39
46
  logger.error("In order to use Rime, you need to `pip install pipecat-ai[rime]`.")
@@ -47,7 +54,7 @@ def language_to_rime_language(language: Language) -> str:
47
54
  language: The pipecat Language enum value.
48
55
 
49
56
  Returns:
50
- str: Three-letter language code used by Rime (e.g., 'eng' for English).
57
+ Three-letter language code used by Rime (e.g., 'eng' for English).
51
58
  """
52
59
  LANGUAGE_MAP = {
53
60
  Language.DE: "ger",
@@ -67,7 +74,15 @@ class RimeTTSService(AudioContextWordTTSService):
67
74
  """
68
75
 
69
76
  class InputParams(BaseModel):
70
- """Configuration parameters for Rime TTS service."""
77
+ """Configuration parameters for Rime TTS service.
78
+
79
+ Parameters:
80
+ language: Language for synthesis. Defaults to English.
81
+ speed_alpha: Speech speed multiplier. Defaults to 1.0.
82
+ reduce_latency: Whether to reduce latency at potential quality cost.
83
+ pause_between_brackets: Whether to add pauses between bracketed content.
84
+ phonemize_between_brackets: Whether to phonemize bracketed content.
85
+ """
71
86
 
72
87
  language: Optional[Language] = Language.EN
73
88
  speed_alpha: Optional[float] = 1.0
@@ -85,6 +100,7 @@ class RimeTTSService(AudioContextWordTTSService):
85
100
  sample_rate: Optional[int] = None,
86
101
  params: Optional[InputParams] = None,
87
102
  text_aggregator: Optional[BaseTextAggregator] = None,
103
+ aggregate_sentences: Optional[bool] = True,
88
104
  **kwargs,
89
105
  ):
90
106
  """Initialize Rime TTS service.
@@ -96,10 +112,13 @@ class RimeTTSService(AudioContextWordTTSService):
96
112
  model: Model ID to use for synthesis.
97
113
  sample_rate: Audio sample rate in Hz.
98
114
  params: Additional configuration parameters.
115
+ text_aggregator: Custom text aggregator for processing input text.
116
+ aggregate_sentences: Whether to aggregate sentences within the TTSService.
117
+ **kwargs: Additional arguments passed to parent class.
99
118
  """
100
119
  # Initialize with parent class settings for proper frame handling
101
120
  super().__init__(
102
- aggregate_sentences=True,
121
+ aggregate_sentences=aggregate_sentences,
103
122
  push_text_frames=False,
104
123
  push_stop_frames=True,
105
124
  pause_frame_processing=True,
@@ -135,17 +154,43 @@ class RimeTTSService(AudioContextWordTTSService):
135
154
  self._cumulative_time = 0 # Accumulates time across messages
136
155
 
137
156
  def can_generate_metrics(self) -> bool:
157
+ """Check if this service can generate processing metrics.
158
+
159
+ Returns:
160
+ True, as Rime service supports metrics generation.
161
+ """
138
162
  return True
139
163
 
140
164
  def language_to_service_language(self, language: Language) -> str | None:
141
- """Convert pipecat language to Rime language code."""
165
+ """Convert pipecat language to Rime language code.
166
+
167
+ Args:
168
+ language: The language to convert.
169
+
170
+ Returns:
171
+ The Rime-specific language code, or None if not supported.
172
+ """
142
173
  return language_to_rime_language(language)
143
174
 
144
175
  async def set_model(self, model: str):
145
- """Update the TTS model."""
176
+ """Update the TTS model.
177
+
178
+ Args:
179
+ model: The model name to use for synthesis.
180
+ """
146
181
  self._model = model
147
182
  await super().set_model(model)
148
183
 
184
+ async def _update_settings(self, settings: Mapping[str, Any]):
185
+ """Update service settings and reconnect if voice changed."""
186
+ prev_voice = self._voice_id
187
+ await super()._update_settings(settings)
188
+ if not prev_voice == self._voice_id:
189
+ self._settings["speaker"] = self._voice_id
190
+ logger.info(f"Switching TTS voice to: [{self._voice_id}]")
191
+ await self._disconnect()
192
+ await self._connect()
193
+
149
194
  def _build_msg(self, text: str = "") -> dict:
150
195
  """Build JSON message for Rime API."""
151
196
  return {"text": text, "contextId": self._context_id}
@@ -159,18 +204,30 @@ class RimeTTSService(AudioContextWordTTSService):
159
204
  return {"operation": "eos"}
160
205
 
161
206
  async def start(self, frame: StartFrame):
162
- """Start the service and establish websocket connection."""
207
+ """Start the service and establish websocket connection.
208
+
209
+ Args:
210
+ frame: The start frame containing initialization parameters.
211
+ """
163
212
  await super().start(frame)
164
213
  self._settings["samplingRate"] = self.sample_rate
165
214
  await self._connect()
166
215
 
167
216
  async def stop(self, frame: EndFrame):
168
- """Stop the service and close connection."""
217
+ """Stop the service and close connection.
218
+
219
+ Args:
220
+ frame: The end frame.
221
+ """
169
222
  await super().stop(frame)
170
223
  await self._disconnect()
171
224
 
172
225
  async def cancel(self, frame: CancelFrame):
173
- """Cancel current operation and clean up."""
226
+ """Cancel current operation and clean up.
227
+
228
+ Args:
229
+ frame: The cancel frame.
230
+ """
174
231
  await super().cancel(frame)
175
232
  await self._disconnect()
176
233
 
@@ -192,13 +249,13 @@ class RimeTTSService(AudioContextWordTTSService):
192
249
  async def _connect_websocket(self):
193
250
  """Connect to Rime websocket API with configured settings."""
194
251
  try:
195
- if self._websocket and self._websocket.open:
252
+ if self._websocket and self._websocket.state is State.OPEN:
196
253
  return
197
254
 
198
255
  params = "&".join(f"{k}={v}" for k, v in self._settings.items())
199
256
  url = f"{self._url}?{params}"
200
257
  headers = {"Authorization": f"Bearer {self._api_key}"}
201
- self._websocket = await websockets.connect(url, extra_headers=headers)
258
+ self._websocket = await websocket_connect(url, additional_headers=headers)
202
259
  except Exception as e:
203
260
  logger.error(f"{self} initialization error: {e}")
204
261
  self._websocket = None
@@ -261,6 +318,7 @@ class RimeTTSService(AudioContextWordTTSService):
261
318
  return word_pairs
262
319
 
263
320
  async def flush_audio(self):
321
+ """Flush any pending audio synthesis."""
264
322
  if not self._context_id or not self._websocket:
265
323
  return
266
324
 
@@ -310,7 +368,12 @@ class RimeTTSService(AudioContextWordTTSService):
310
368
  self._context_id = None
311
369
 
312
370
  async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
313
- """Push frame and handle end-of-turn conditions."""
371
+ """Push frame and handle end-of-turn conditions.
372
+
373
+ Args:
374
+ frame: The frame to push.
375
+ direction: The direction to push the frame.
376
+ """
314
377
  await super().push_frame(frame, direction)
315
378
  if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
316
379
  if isinstance(frame, TTSStoppedFrame):
@@ -318,17 +381,17 @@ class RimeTTSService(AudioContextWordTTSService):
318
381
 
319
382
  @traced_tts
320
383
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
321
- """Generate speech from text.
384
+ """Generate speech from text using Rime's streaming API.
322
385
 
323
386
  Args:
324
387
  text: The text to convert to speech.
325
388
 
326
389
  Yields:
327
- Frames containing audio data and timing information.
390
+ Frame: Audio frames containing the synthesized speech.
328
391
  """
329
392
  logger.debug(f"{self}: Generating TTS [{text}]")
330
393
  try:
331
- if not self._websocket or self._websocket.closed:
394
+ if not self._websocket or self._websocket.state is State.CLOSED:
332
395
  await self._connect()
333
396
 
334
397
  try:
@@ -354,7 +417,24 @@ class RimeTTSService(AudioContextWordTTSService):
354
417
 
355
418
 
356
419
  class RimeHttpTTSService(TTSService):
420
+ """Rime HTTP-based text-to-speech service.
421
+
422
+ Provides text-to-speech synthesis using Rime's HTTP API for batch processing.
423
+ Suitable for use cases where streaming is not required.
424
+ """
425
+
357
426
  class InputParams(BaseModel):
427
+ """Configuration parameters for Rime HTTP TTS service.
428
+
429
+ Parameters:
430
+ language: Language for synthesis. Defaults to English.
431
+ pause_between_brackets: Whether to add pauses between bracketed content.
432
+ phonemize_between_brackets: Whether to phonemize bracketed content.
433
+ inline_speed_alpha: Inline speed control markup.
434
+ speed_alpha: Speech speed multiplier. Defaults to 1.0.
435
+ reduce_latency: Whether to reduce latency at potential quality cost.
436
+ """
437
+
358
438
  language: Optional[Language] = Language.EN
359
439
  pause_between_brackets: Optional[bool] = False
360
440
  phonemize_between_brackets: Optional[bool] = False
@@ -373,6 +453,17 @@ class RimeHttpTTSService(TTSService):
373
453
  params: Optional[InputParams] = None,
374
454
  **kwargs,
375
455
  ):
456
+ """Initialize Rime HTTP TTS service.
457
+
458
+ Args:
459
+ api_key: Rime API key for authentication.
460
+ voice_id: ID of the voice to use.
461
+ aiohttp_session: Shared aiohttp session for HTTP requests.
462
+ model: Model ID to use for synthesis.
463
+ sample_rate: Audio sample rate in Hz.
464
+ params: Additional configuration parameters.
465
+ **kwargs: Additional arguments passed to parent TTSService.
466
+ """
376
467
  super().__init__(sample_rate=sample_rate, **kwargs)
377
468
 
378
469
  params = params or RimeHttpTTSService.InputParams()
@@ -396,14 +487,34 @@ class RimeHttpTTSService(TTSService):
396
487
  self._settings["inlineSpeedAlpha"] = params.inline_speed_alpha
397
488
 
398
489
  def can_generate_metrics(self) -> bool:
490
+ """Check if this service can generate processing metrics.
491
+
492
+ Returns:
493
+ True, as Rime HTTP service supports metrics generation.
494
+ """
399
495
  return True
400
496
 
401
497
  def language_to_service_language(self, language: Language) -> str | None:
402
- """Convert pipecat language to Rime language code."""
498
+ """Convert pipecat language to Rime language code.
499
+
500
+ Args:
501
+ language: The language to convert.
502
+
503
+ Returns:
504
+ The Rime-specific language code, or None if not supported.
505
+ """
403
506
  return language_to_rime_language(language)
404
507
 
405
508
  @traced_tts
406
509
  async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
510
+ """Generate speech from text using Rime's HTTP API.
511
+
512
+ Args:
513
+ text: The text to synthesize into speech.
514
+
515
+ Yields:
516
+ Frame: Audio frames containing the synthesized speech.
517
+ """
407
518
  logger.debug(f"{self}: Generating TTS [{text}]")
408
519
 
409
520
  headers = {
@@ -4,7 +4,10 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """NVIDIA Riva Speech-to-Text service implementations for real-time and batch transcription."""
8
+
7
9
  import asyncio
10
+ from concurrent.futures import CancelledError as FuturesCancelledError
8
11
  from typing import AsyncGenerator, List, Mapping, Optional
9
12
 
10
13
  from loguru import logger
@@ -21,7 +24,6 @@ from pipecat.frames.frames import (
21
24
  )
22
25
  from pipecat.services.stt_service import SegmentedSTTService, STTService
23
26
  from pipecat.transcriptions.language import Language
24
- from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue
25
27
  from pipecat.utils.time import time_now_iso8601
26
28
  from pipecat.utils.tracing.service_decorators import traced_stt
27
29
 
@@ -87,7 +89,20 @@ def language_to_riva_language(language: Language) -> Optional[str]:
87
89
 
88
90
 
89
91
  class RivaSTTService(STTService):
92
+ """Real-time speech-to-text service using NVIDIA Riva streaming ASR.
93
+
94
+ Provides real-time transcription capabilities using NVIDIA's Riva ASR models
95
+ through streaming recognition. Supports interim results and continuous audio
96
+ processing for low-latency applications.
97
+ """
98
+
90
99
  class InputParams(BaseModel):
100
+ """Configuration parameters for Riva STT service.
101
+
102
+ Parameters:
103
+ language: Target language for transcription. Defaults to EN_US.
104
+ """
105
+
91
106
  language: Optional[Language] = Language.EN_US
92
107
 
93
108
  def __init__(
@@ -103,6 +118,16 @@ class RivaSTTService(STTService):
103
118
  params: Optional[InputParams] = None,
104
119
  **kwargs,
105
120
  ):
121
+ """Initialize the Riva STT service.
122
+
123
+ Args:
124
+ api_key: NVIDIA API key for authentication.
125
+ server: Riva server address. Defaults to NVIDIA Cloud Function endpoint.
126
+ model_function_map: Mapping containing 'function_id' and 'model_name' for the ASR model.
127
+ sample_rate: Audio sample rate in Hz. If None, uses pipeline default.
128
+ params: Additional configuration parameters for Riva.
129
+ **kwargs: Additional arguments passed to STTService.
130
+ """
106
131
  super().__init__(sample_rate=sample_rate, **kwargs)
107
132
 
108
133
  params = params or RivaSTTService.InputParams()
@@ -142,15 +167,29 @@ class RivaSTTService(STTService):
142
167
 
143
168
  self._asr_service = riva.client.ASRService(auth)
144
169
 
145
- self._queue = asyncio.Queue()
170
+ self._queue = None
146
171
  self._config = None
147
172
  self._thread_task = None
148
173
  self._response_task = None
149
174
 
150
175
  def can_generate_metrics(self) -> bool:
176
+ """Check if this service can generate processing metrics.
177
+
178
+ Returns:
179
+ False - this service does not support metrics generation.
180
+ """
151
181
  return False
152
182
 
153
183
  async def set_model(self, model: str):
184
+ """Set the ASR model for transcription.
185
+
186
+ Args:
187
+ model: Model name to set.
188
+
189
+ Note:
190
+ Model cannot be changed after initialization. Use model_function_map
191
+ parameter in constructor instead.
192
+ """
154
193
  logger.warning(f"Cannot set model after initialization. Set model and function id like so:")
155
194
  example = {"function_id": "<UUID>", "model_name": "<model_name>"}
156
195
  logger.warning(
@@ -158,6 +197,11 @@ class RivaSTTService(STTService):
158
197
  )
159
198
 
160
199
  async def start(self, frame: StartFrame):
200
+ """Start the Riva STT service and initialize streaming configuration.
201
+
202
+ Args:
203
+ frame: StartFrame indicating pipeline start.
204
+ """
161
205
  await super().start(frame)
162
206
 
163
207
  if self._config:
@@ -194,19 +238,30 @@ class RivaSTTService(STTService):
194
238
  riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
195
239
 
196
240
  self._config = config
241
+ self._queue = asyncio.Queue()
197
242
 
198
243
  if not self._thread_task:
199
244
  self._thread_task = self.create_task(self._thread_task_handler())
200
245
 
201
246
  if not self._response_task:
202
- self._response_queue = WatchdogQueue(self.task_manager)
247
+ self._response_queue = asyncio.Queue()
203
248
  self._response_task = self.create_task(self._response_task_handler())
204
249
 
205
250
  async def stop(self, frame: EndFrame):
251
+ """Stop the Riva STT service and clean up resources.
252
+
253
+ Args:
254
+ frame: EndFrame indicating pipeline stop.
255
+ """
206
256
  await super().stop(frame)
207
257
  await self._stop_tasks()
208
258
 
209
259
  async def cancel(self, frame: CancelFrame):
260
+ """Cancel the Riva STT service operation.
261
+
262
+ Args:
263
+ frame: CancelFrame indicating operation cancellation.
264
+ """
210
265
  await super().cancel(frame)
211
266
  await self._stop_tasks()
212
267
 
@@ -225,7 +280,6 @@ class RivaSTTService(STTService):
225
280
  streaming_config=self._config,
226
281
  )
227
282
  for response in responses:
228
- self.reset_watchdog()
229
283
  if not response.results:
230
284
  continue
231
285
  asyncio.run_coroutine_threadsafe(
@@ -260,7 +314,7 @@ class RivaSTTService(STTService):
260
314
  await self.push_frame(
261
315
  TranscriptionFrame(
262
316
  transcript,
263
- "",
317
+ self._user_id,
264
318
  time_now_iso8601(),
265
319
  self._language_code,
266
320
  result=result,
@@ -275,7 +329,7 @@ class RivaSTTService(STTService):
275
329
  await self.push_frame(
276
330
  InterimTranscriptionFrame(
277
331
  transcript,
278
- "",
332
+ self._user_id,
279
333
  time_now_iso8601(),
280
334
  self._language_code,
281
335
  result=result,
@@ -289,18 +343,43 @@ class RivaSTTService(STTService):
289
343
  self._response_queue.task_done()
290
344
 
291
345
  async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
346
+ """Process audio data for speech-to-text transcription.
347
+
348
+ Args:
349
+ audio: Raw audio bytes to transcribe.
350
+
351
+ Yields:
352
+ None - transcription results are pushed to the pipeline via frames.
353
+ """
292
354
  await self.start_ttfb_metrics()
293
355
  await self.start_processing_metrics()
294
356
  await self._queue.put(audio)
295
357
  yield None
296
358
 
297
359
  def __next__(self) -> bytes:
360
+ """Get the next audio chunk for Riva processing.
361
+
362
+ Returns:
363
+ Audio bytes from the queue.
364
+
365
+ Raises:
366
+ StopIteration: When the thread is no longer running.
367
+ """
298
368
  if not self._thread_running:
299
369
  raise StopIteration
300
- future = asyncio.run_coroutine_threadsafe(self._queue.get(), self.get_event_loop())
301
- return future.result()
370
+
371
+ try:
372
+ future = asyncio.run_coroutine_threadsafe(self._queue.get(), self.get_event_loop())
373
+ return future.result()
374
+ except FuturesCancelledError:
375
+ raise StopIteration
302
376
 
303
377
  def __iter__(self):
378
+ """Return iterator for audio chunk processing.
379
+
380
+ Returns:
381
+ Self as iterator.
382
+ """
304
383
  return self
305
384
 
306
385
 
@@ -310,17 +389,20 @@ class RivaSegmentedSTTService(SegmentedSTTService):
310
389
  By default, this service uses NVIDIA's Riva Canary ASR API to perform speech-to-text
311
390
  transcription on audio segments. It inherits from SegmentedSTTService to handle
312
391
  audio buffering and speech detection.
313
-
314
- Args:
315
- api_key: NVIDIA API key for authentication
316
- server: Riva server address (defaults to NVIDIA Cloud Function endpoint)
317
- model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID
318
- sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate
319
- params: Additional configuration parameters for Riva
320
- **kwargs: Additional arguments passed to SegmentedSTTService
321
392
  """
322
393
 
323
394
  class InputParams(BaseModel):
395
+ """Configuration parameters for Riva segmented STT service.
396
+
397
+ Parameters:
398
+ language: Target language for transcription. Defaults to EN_US.
399
+ profanity_filter: Whether to filter profanity from results.
400
+ automatic_punctuation: Whether to add automatic punctuation.
401
+ verbatim_transcripts: Whether to return verbatim transcripts.
402
+ boosted_lm_words: List of words to boost in language model.
403
+ boosted_lm_score: Score boost for specified words.
404
+ """
405
+
324
406
  language: Optional[Language] = Language.EN_US
325
407
  profanity_filter: bool = False
326
408
  automatic_punctuation: bool = True
@@ -341,6 +423,16 @@ class RivaSegmentedSTTService(SegmentedSTTService):
341
423
  params: Optional[InputParams] = None,
342
424
  **kwargs,
343
425
  ):
426
+ """Initialize the Riva segmented STT service.
427
+
428
+ Args:
429
+ api_key: NVIDIA API key for authentication
430
+ server: Riva server address (defaults to NVIDIA Cloud Function endpoint)
431
+ model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID
432
+ sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate
433
+ params: Additional configuration parameters for Riva
434
+ **kwargs: Additional arguments passed to SegmentedSTTService
435
+ """
344
436
  super().__init__(sample_rate=sample_rate, **kwargs)
345
437
 
346
438
  params = params or RivaSegmentedSTTService.InputParams()
@@ -380,7 +472,14 @@ class RivaSegmentedSTTService(SegmentedSTTService):
380
472
  self._settings = {"language": self._language_enum}
381
473
 
382
474
  def language_to_service_language(self, language: Language) -> Optional[str]:
383
- """Convert pipecat Language enum to Riva's language code."""
475
+ """Convert pipecat Language enum to Riva's language code.
476
+
477
+ Args:
478
+ language: Language enum value.
479
+
480
+ Returns:
481
+ Riva language code or None if not supported.
482
+ """
384
483
  return language_to_riva_language(language)
385
484
 
386
485
  def _initialize_client(self):
@@ -435,10 +534,23 @@ class RivaSegmentedSTTService(SegmentedSTTService):
435
534
  return config
436
535
 
437
536
  def can_generate_metrics(self) -> bool:
438
- """Indicates whether this service can generate processing metrics."""
537
+ """Check if this service can generate processing metrics.
538
+
539
+ Returns:
540
+ True - this service supports metrics generation.
541
+ """
439
542
  return True
440
543
 
441
544
  async def set_model(self, model: str):
545
+ """Set the ASR model for transcription.
546
+
547
+ Args:
548
+ model: Model name to set.
549
+
550
+ Note:
551
+ Model cannot be changed after initialization. Use model_function_map
552
+ parameter in constructor instead.
553
+ """
442
554
  logger.warning(f"Cannot set model after initialization. Set model and function id like so:")
443
555
  example = {"function_id": "<UUID>", "model_name": "<model_name>"}
444
556
  logger.warning(
@@ -446,13 +558,21 @@ class RivaSegmentedSTTService(SegmentedSTTService):
446
558
  )
447
559
 
448
560
  async def start(self, frame: StartFrame):
449
- """Initialize the service when the pipeline starts."""
561
+ """Initialize the service when the pipeline starts.
562
+
563
+ Args:
564
+ frame: StartFrame indicating pipeline start.
565
+ """
450
566
  await super().start(frame)
451
567
  self._initialize_client()
452
568
  self._config = self._create_recognition_config()
453
569
 
454
570
  async def set_language(self, language: Language):
455
- """Set the language for the STT service."""
571
+ """Set the language for the STT service.
572
+
573
+ Args:
574
+ language: Target language for transcription.
575
+ """
456
576
  logger.info(f"Switching STT language to: [{language}]")
457
577
  self._language_enum = language
458
578
  self._language = self.language_to_service_language(language) or "en-US"
@@ -520,7 +640,10 @@ class RivaSegmentedSTTService(SegmentedSTTService):
520
640
  if text:
521
641
  logger.debug(f"Transcription: [{text}]")
522
642
  yield TranscriptionFrame(
523
- text, "", time_now_iso8601(), self._language_enum
643
+ text,
644
+ self._user_id,
645
+ time_now_iso8601(),
646
+ self._language_enum,
524
647
  )
525
648
  transcription_found = True
526
649
 
@@ -539,7 +662,12 @@ class RivaSegmentedSTTService(SegmentedSTTService):
539
662
 
540
663
 
541
664
  class ParakeetSTTService(RivaSTTService):
542
- """Deprecated: Use RivaSTTService instead."""
665
+ """Deprecated speech-to-text service using NVIDIA Parakeet models.
666
+
667
+ .. deprecated:: 0.0.66
668
+ This class is deprecated. Use `RivaSTTService` instead for equivalent functionality
669
+ with Parakeet models by specifying the appropriate model_function_map.
670
+ """
543
671
 
544
672
  def __init__(
545
673
  self,
@@ -554,6 +682,16 @@ class ParakeetSTTService(RivaSTTService):
554
682
  params: Optional[RivaSTTService.InputParams] = None, # Use parent class's type
555
683
  **kwargs,
556
684
  ):
685
+ """Initialize the Parakeet STT service.
686
+
687
+ Args:
688
+ api_key: NVIDIA API key for authentication.
689
+ server: Riva server address. Defaults to NVIDIA Cloud Function endpoint.
690
+ model_function_map: Mapping containing 'function_id' and 'model_name' for Parakeet model.
691
+ sample_rate: Audio sample rate in Hz. If None, uses pipeline default.
692
+ params: Additional configuration parameters for Riva.
693
+ **kwargs: Additional arguments passed to RivaSTTService.
694
+ """
557
695
  super().__init__(
558
696
  api_key=api_key,
559
697
  server=server,