dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/inworld/tts.py (new file)
@@ -0,0 +1,592 @@
+ #
+ # Copyright (c) 2024–2025, Daily
+ #
+ # SPDX-License-Identifier: BSD 2-Clause License
+ #
+
+ """Inworld AI Text-to-Speech Service Implementation.
+
+ This module provides integration with Inworld AI's HTTP-based TTS API, enabling
+ both streaming and non-streaming text-to-speech synthesis with high-quality,
+ natural-sounding voices.
+
+ Key Features:
+
+ - HTTP streaming and non-streaming API support for flexible audio generation
+ - Multiple voice options (Ashley, Hades, etc.)
+ - Automatic language detection from input text (no manual language setting required)
+ - Real-time audio chunk processing with proper buffering
+ - WAV header handling and audio format conversion
+ - Comprehensive error handling and metrics tracking
+
+ Technical Implementation:
+
+ - Uses aiohttp for HTTP connections
+ - Implements both JSON line-by-line parsing (streaming) and complete response (non-streaming)
+ - Handles base64-encoded audio data with proper decoding
+ - Manages audio continuity to prevent clicks and artifacts
+ - Integrates with Pipecat's frame-based pipeline architecture
+
+ Examples::
+
+     async with aiohttp.ClientSession() as session:
+         # Streaming mode (default) - real-time audio generation
+         tts = InworldTTSService(
+             api_key=os.getenv("INWORLD_API_KEY"),
+             aiohttp_session=session,
+             voice_id="Ashley",
+             model="inworld-tts-1",
+             streaming=True,  # Default
+             params=InworldTTSService.InputParams(
+                 temperature=0.8,  # Optional: control synthesis variability (range: [0, 2])
+             ),
+         )
+
+         # Non-streaming mode - complete audio generation then playback
+         tts = InworldTTSService(
+             api_key=os.getenv("INWORLD_API_KEY"),
+             aiohttp_session=session,
+             voice_id="Ashley",
+             model="inworld-tts-1",
+             streaming=False,
+             params=InworldTTSService.InputParams(
+                 temperature=0.8,
+             ),
+         )
+ """
+
+ import base64
+ import json
+ from typing import AsyncGenerator, Optional
+
+ import aiohttp
+ from loguru import logger
+ from pydantic import BaseModel
+
+ from pipecat.frames.frames import (
+     CancelFrame,
+     EndFrame,
+     ErrorFrame,
+     Frame,
+     StartFrame,
+     TTSAudioRawFrame,
+     TTSStartedFrame,
+     TTSStoppedFrame,
+ )
+ from pipecat.services.tts_service import TTSService
+ from pipecat.utils.tracing.service_decorators import traced_tts
+
+
+ class InworldTTSService(TTSService):
+     """Inworld AI HTTP-based Text-to-Speech Service.
+
+     This unified service integrates Inworld AI's high-quality TTS API with Pipecat's pipeline
+     architecture. It supports both streaming and non-streaming modes, providing flexible
+     speech synthesis with natural-sounding voices.
+
+     Key Features:
+
+     - **Streaming Mode**: Real-time HTTP streaming for minimal latency
+     - **Non-Streaming Mode**: Complete audio synthesis then chunked playback
+     - Multiple voice options (Ashley, Hades, etc.)
+     - High-quality audio output (48kHz LINEAR16 PCM)
+     - Automatic audio format handling and header stripping
+     - Comprehensive error handling and recovery
+     - Built-in performance metrics and monitoring
+     - Unified interface for both modes
+
+     Technical Architecture:
+
+     - Uses aiohttp for non-blocking HTTP requests
+     - **Streaming**: Implements JSON line-by-line streaming protocol
+     - **Non-Streaming**: Single HTTP POST with complete response
+     - Processes base64-encoded audio chunks in real-time or batch
+     - Manages audio continuity to prevent artifacts
+     - Integrates with Pipecat's frame-based pipeline system
+
+     Supported Configuration:
+
+     - Voice Selection: Ashley, Hades, and other Inworld voices
+     - Models: inworld-tts-1 and other available models
+     - Audio Formats: LINEAR16 PCM at various sample rates
+     - Language Detection: Automatically inferred from input text (no explicit language setting required)
+     - Mode Selection: streaming=True for real-time, streaming=False for complete synthesis
+
+     Examples::
+
+         async with aiohttp.ClientSession() as session:
+             # Streaming mode (default) - Real-time audio generation
+             tts_streaming = InworldTTSService(
+                 api_key=os.getenv("INWORLD_API_KEY"),
+                 aiohttp_session=session,
+                 voice_id="Ashley",
+                 model="inworld-tts-1",
+                 streaming=True,  # Default behavior
+                 params=InworldTTSService.InputParams(
+                     temperature=0.8,  # Add variability to speech synthesis (range: [0, 2])
+                 ),
+             )
+
+             # Non-streaming mode - Complete audio then playback
+             tts_complete = InworldTTSService(
+                 api_key=os.getenv("INWORLD_API_KEY"),
+                 aiohttp_session=session,
+                 voice_id="Hades",
+                 model="inworld-tts-1-max",
+                 streaming=False,
+                 params=InworldTTSService.InputParams(
+                     temperature=0.8,
+                 ),
+             )
+     """
+
+     class InputParams(BaseModel):
+         """Optional input parameters for Inworld TTS configuration.
+
+         Parameters:
+             temperature: Voice temperature control for synthesis variability (e.g., 0.8).
+                 Valid range: [0, 2]. Higher values increase variability.
+
+         Note:
+             Language is automatically inferred from the input text by Inworld's TTS models,
+             so no explicit language parameter is required.
+         """
+
+         temperature: Optional[float] = None  # optional temperature control (range: [0, 2])
+
+     def __init__(
+         self,
+         *,
+         api_key: str,
+         aiohttp_session: aiohttp.ClientSession,
+         voice_id: str = "Ashley",
+         model: str = "inworld-tts-1",
+         streaming: bool = True,
+         sample_rate: Optional[int] = None,
+         encoding: str = "LINEAR16",
+         params: Optional[InputParams] = None,
+         **kwargs,
+     ):
+         """Initialize the Inworld TTS service.
+
+         Sets up the TTS service with Inworld AI's API configuration.
+         This constructor prepares all necessary parameters for speech synthesis.
+
+         Args:
+             api_key: Inworld API key for authentication (base64-encoded from Inworld Portal).
+                 Get this from: Inworld Portal > Settings > API Keys > Runtime API Key
+             aiohttp_session: Shared aiohttp session for HTTP requests. Must be provided
+                 for proper connection pooling and resource management.
+             voice_id: Voice selection for speech synthesis. Common options include:
+                 - "Ashley": Clear, professional female voice (default)
+                 - "Hades": Deep, authoritative male voice
+                 - And many more available in your Inworld account
+             model: TTS model to use for speech synthesis:
+                 - "inworld-tts-1": Standard quality model (default)
+                 - "inworld-tts-1-max": Higher quality model
+                 - Other models as available in your Inworld account
+             streaming: Whether to use streaming mode (True) or non-streaming mode (False).
+                 - True: Real-time audio chunks as they're generated (lower latency)
+                 - False: Complete audio file generated first, then chunked for playback (simpler)
+                 The base URL is automatically selected based on this mode:
+                 - Streaming: "https://api.inworld.ai/tts/v1/voice:stream"
+                 - Non-streaming: "https://api.inworld.ai/tts/v1/voice"
+             sample_rate: Audio sample rate in Hz. If None, uses default from StartFrame.
+                 Common values: 48000 (high quality), 24000 (good quality), 16000 (basic)
+             encoding: Audio encoding format. Supported options:
+                 - "LINEAR16" (default) - Uncompressed PCM, best quality
+                 - Other formats as supported by Inworld API
+             params: Optional input parameters for additional configuration. Use this to specify:
+                 - temperature: Voice temperature control for variability (range: [0, 2], e.g., 0.8, optional)
+                 Language is automatically inferred from input text.
+             **kwargs: Additional arguments passed to the parent TTSService class.
+
+         Note:
+             The aiohttp_session parameter is required because Inworld's HTTP API
+             benefits from connection reuse and proper async session management.
+         """
+         # Initialize parent TTSService with audio configuration
+         super().__init__(sample_rate=sample_rate, **kwargs)
+
+         # Use provided params or create default configuration
+         params = params or InworldTTSService.InputParams()
+
+         # Store core configuration for API requests
+         self._api_key = api_key  # Authentication credentials
+         self._session = aiohttp_session  # HTTP session for requests
+         self._streaming = streaming  # Streaming mode selection
+
+         # Set base URL based on streaming mode
+         if streaming:
+             self._base_url = "https://api.inworld.ai/tts/v1/voice:stream"  # Streaming endpoint
+         else:
+             self._base_url = "https://api.inworld.ai/tts/v1/voice"  # Non-streaming endpoint
+
+         # Build settings dictionary that matches Inworld's API expectations
+         # This will be sent as JSON payload in each TTS request
+         # Note: Language is automatically inferred from text by Inworld's models
+         self._settings = {
+             "voiceId": voice_id,  # Voice selection from direct parameter
+             "modelId": model,  # TTS model selection from direct parameter
+             "audio_config": {  # Audio format configuration
+                 "audio_encoding": encoding,  # Format: LINEAR16, MP3, etc.
+                 "sample_rate_hertz": 0,  # Will be set in start() from parent service
+             },
+         }
+
+         # Add optional temperature parameter if provided (valid range: [0, 2])
+         if params and params.temperature is not None:
+             self._settings["temperature"] = params.temperature
+
+         # Register voice and model with parent service for metrics and tracking
+         self.set_voice(voice_id)  # Used for logging and metrics
+         self.set_model_name(model)  # Used for performance tracking
+
+     def can_generate_metrics(self) -> bool:
+         """Check if this service can generate processing metrics.
+
+         Returns:
+             True, as Inworld TTS service supports metrics generation.
+         """
+         return True
+
+     async def start(self, frame: StartFrame):
+         """Start the Inworld TTS service.
+
+         Args:
+             frame: The start frame containing initialization parameters.
+         """
+         await super().start(frame)
+         self._settings["audio_config"]["sample_rate_hertz"] = self.sample_rate
+
+     async def stop(self, frame: EndFrame):
+         """Stop the Inworld TTS service.
+
+         Args:
+             frame: The end frame.
+         """
+         await super().stop(frame)
+
+     async def cancel(self, frame: CancelFrame):
+         """Cancel the Inworld TTS service.
+
+         Args:
+             frame: The cancel frame.
+         """
+         await super().cancel(frame)
+
+     @traced_tts
+     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+         """Generate speech from text using Inworld's HTTP API.
+
+         This is the core TTS processing function that adapts its behavior based on the streaming mode:
+
+         **Streaming Mode (streaming=True)**:
+         1. Sends text to Inworld's streaming TTS endpoint
+         2. Receives JSON-streamed audio chunks in real-time
+         3. Processes and cleans audio data (removes WAV headers, validates content)
+         4. Yields audio frames for immediate playback in the pipeline
+
+         **Non-Streaming Mode (streaming=False)**:
+         1. Sends text to Inworld's non-streaming TTS endpoint
+         2. Receives complete audio file as base64-encoded response
+         3. Processes entire audio and chunks for playback
+         4. Yields audio frames in manageable pieces
+
+         Technical Details:
+
+         - **Streaming**: Uses HTTP streaming with JSON line-by-line responses
+         - **Non-Streaming**: Single HTTP POST with complete JSON response
+         - Each audio chunk contains base64-encoded audio data
+         - Implements buffering to handle partial data (streaming mode)
+         - Strips WAV headers to prevent audio artifacts/clicks
+         - Provides optimized audio delivery for each mode
+
+         Args:
+             text: The text to synthesize into speech.
+
+         Yields:
+             Frame: Audio frames containing the synthesized speech, plus control frames.
+
+         Raises:
+             ErrorFrame: If API errors occur or audio processing fails.
+         """
+         logger.debug(f"{self}: Generating TTS [{text}] (streaming={self._streaming})")
+
+         # ================================================================================
+         # STEP 1: PREPARE API REQUEST
+         # ================================================================================
+         # Build the JSON payload according to Inworld's API specification
+         # This matches the format shown in their documentation examples
+         # Note: Language is automatically inferred from the input text by Inworld's models
+         payload = {
+             "text": text,  # Text to synthesize
+             "voiceId": self._settings["voiceId"],  # Voice selection (Ashley, Hades, etc.)
+             "modelId": self._settings["modelId"],  # TTS model (inworld-tts-1)
+             "audio_config": self._settings[
+                 "audio_config"
+             ],  # Audio format settings (LINEAR16, 48kHz)
+         }
+
+         # Add optional temperature parameter if configured (valid range: [0, 2])
+         if "temperature" in self._settings:
+             payload["temperature"] = self._settings["temperature"]
+
+         # Set up HTTP headers for authentication and content type
+         # Inworld requires Basic auth with base64-encoded API key
+         headers = {
+             "Authorization": f"Basic {self._api_key}",  # Base64 API key from Inworld Portal
+             "Content-Type": "application/json",  # JSON request body
+         }
+
+         try:
+             # ================================================================================
+             # STEP 2: INITIALIZE METRICS AND PROCESSING
+             # ================================================================================
+             # Start measuring Time To First Byte (TTFB) for performance tracking
+             await self.start_ttfb_metrics()
+
+             # Signal to the pipeline that TTS generation has started
+             # This allows downstream processors to prepare for incoming audio
+             yield TTSStartedFrame()
+
+             # ================================================================================
+             # STEP 3: MAKE HTTP REQUEST (MODE-SPECIFIC)
+             # ================================================================================
+             # Use aiohttp to make request to Inworld's endpoint
+             # Behavior differs based on streaming mode
+             async with self._session.post(
+                 self._base_url, json=payload, headers=headers
+             ) as response:
+                 # ================================================================================
+                 # STEP 4: HANDLE HTTP ERRORS
+                 # ================================================================================
+                 # Check for API errors (expired keys, invalid requests, etc.)
+                 if response.status != 200:
+                     error_text = await response.text()
+                     logger.error(f"Inworld API error: {error_text}")
+                     await self.push_error(ErrorFrame(f"Inworld API error: {error_text}"))
+                     return
+
+                 # ================================================================================
+                 # STEP 5: PROCESS RESPONSE (MODE-SPECIFIC)
+                 # ================================================================================
+                 # Choose processing method based on streaming mode
+                 if self._streaming:
+                     # Stream processing: JSON line-by-line with real-time audio
+                     async for frame in self._process_streaming_response(response):
+                         yield frame
+                 else:
+                     # Non-stream processing: Complete JSON response with batch audio
+                     async for frame in self._process_non_streaming_response(response):
+                         yield frame
+
+                 # ================================================================================
+                 # STEP 6: FINALIZE METRICS AND CLEANUP
+                 # ================================================================================
+                 # Start usage metrics tracking after successful completion
+                 await self.start_tts_usage_metrics(text)
+
+         except Exception as e:
+             # ================================================================================
+             # STEP 7: ERROR HANDLING
+             # ================================================================================
+             # Log any unexpected errors and notify the pipeline
+             logger.error(f"{self} exception: {e}")
+             await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
+         finally:
+             # ================================================================================
+             # STEP 8: CLEANUP AND COMPLETION
+             # ================================================================================
+             # Always stop metrics tracking, even if errors occurred
+             await self.stop_all_metrics()
+
+             # Signal to pipeline that TTS generation is complete
+             # This allows downstream processors to finalize audio processing
+             yield TTSStoppedFrame()
+
+     async def _process_streaming_response(
+         self, response: aiohttp.ClientResponse
+     ) -> AsyncGenerator[Frame, None]:
+         """Process streaming JSON response with real-time audio chunks.
+
+         This method handles Inworld's streaming endpoint response format:
+         - JSON lines containing base64-encoded audio chunks
+         - Real-time processing as data arrives
+         - Line buffering to handle partial JSON data
+
+         Args:
+             response: The aiohttp response object from streaming endpoint.
+
+         Yields:
+             Frame: Audio frames as they're processed from the stream.
+         """
+         # ================================================================================
+         # STREAMING: PROCESS JSON LINE-BY-LINE RESPONSE
+         # ================================================================================
+         # Inworld streams JSON lines where each line contains audio data
+         # We need to buffer incoming data and process complete lines
+
+         # Buffer to accumulate incoming text data
+         # This handles cases where JSON lines are split across HTTP chunks
+         buffer = ""
+
+         # Read HTTP response in manageable chunks (1KB each)
+         # This prevents memory issues with large responses
+         async for chunk in response.content.iter_chunked(1024):
+             if not chunk:
+                 continue
+
+             # ============================================================================
+             # BUFFER MANAGEMENT
+             # ============================================================================
+             # Decode binary chunk to text and add to our line buffer
+             # Each chunk may contain partial JSON lines, so we need to accumulate
+             buffer += chunk.decode("utf-8")
+
+             # ============================================================================
+             # LINE-BY-LINE JSON PROCESSING
+             # ============================================================================
+             # Process all complete lines in the buffer (lines ending with \n)
+             # Leave partial lines in buffer for next iteration
+             while "\n" in buffer:
+                 # Split on first newline, keeping remainder in buffer
+                 line, buffer = buffer.split("\n", 1)
+                 line_str = line.strip()
+
+                 # Skip empty lines (common in streaming responses)
+                 if not line_str:
+                     continue
+
+                 try:
+                     # ================================================================
+                     # PARSE JSON AND EXTRACT AUDIO
+                     # ================================================================
+                     # Parse the JSON line - should contain audio data
+                     chunk_data = json.loads(line_str)
+
+                     # Check if this line contains audio content
+                     # Inworld's response format: {"result": {"audioContent": "base64data"}}
+                     if "result" in chunk_data and "audioContent" in chunk_data["result"]:
+                         # Process the audio chunk
+                         await self.stop_ttfb_metrics()
+                         async for frame in self._process_audio_chunk(
+                             base64.b64decode(chunk_data["result"]["audioContent"])
+                         ):
+                             yield frame
+
+                 except json.JSONDecodeError:
+                     # Ignore malformed JSON lines - streaming can have partial data
+                     # This is normal in HTTP streaming scenarios
+                     continue
+
+     async def _process_non_streaming_response(
+         self, response: aiohttp.ClientResponse
+     ) -> AsyncGenerator[Frame, None]:
+         """Process complete JSON response with full audio content.
+
+         This method handles Inworld's non-streaming endpoint response format:
+         - Single JSON response with complete base64-encoded audio
+         - Full audio download then chunked playback
+         - Simpler processing without line buffering
+
+         Args:
+             response: The aiohttp response object from non-streaming endpoint.
+
+         Yields:
+             Frame: Audio frames chunked from the complete audio.
+         """
+         # ================================================================================
+         # NON-STREAMING: PARSE COMPLETE JSON RESPONSE
+         # ================================================================================
+         # Parse the complete JSON response containing base64 audio data
+         response_data = await response.json()
+
+         # ================================================================================
+         # EXTRACT AND VALIDATE AUDIO CONTENT
+         # ================================================================================
+         # Extract the base64-encoded audio content from response
+         if "audioContent" not in response_data:
+             logger.error("No audioContent in Inworld API response")
+             await self.push_error(ErrorFrame("No audioContent in response"))
+             return
+
+         # ================================================================================
+         # DECODE AND PROCESS COMPLETE AUDIO DATA
+         # ================================================================================
+         # Decode the base64 audio data to binary
+         audio_data = base64.b64decode(response_data["audioContent"])
+
+         # Strip WAV header if present (Inworld may include WAV header)
+         # This prevents audio clicks and ensures clean audio playback
+         if len(audio_data) > 44 and audio_data.startswith(b"RIFF"):
+             audio_data = audio_data[44:]
+
+         # ================================================================================
+         # CHUNK AND YIELD COMPLETE AUDIO FOR PLAYBACK
+         # ================================================================================
+         # Chunk the complete audio for streaming playback
+         # This allows the pipeline to process audio in manageable pieces
+         CHUNK_SIZE = self.chunk_size
+
+         for i in range(0, len(audio_data), CHUNK_SIZE):
+             chunk = audio_data[i : i + CHUNK_SIZE]
+             if len(chunk) > 0:
+                 await self.stop_ttfb_metrics()
+                 yield TTSAudioRawFrame(
+                     audio=chunk,
+                     sample_rate=self.sample_rate,
+                     num_channels=1,
+                 )
+
+     async def _process_audio_chunk(self, audio_chunk: bytes) -> AsyncGenerator[Frame, None]:
+         """Process a single audio chunk (common logic for both modes).
+
+         This method handles audio chunk processing that's common to both streaming
+         and non-streaming modes:
+         - WAV header removal
+         - Audio validation
+         - Frame creation and yielding
+
+         Args:
+             audio_chunk: Raw audio data bytes to process.
+
+         Yields:
+             Frame: Audio frame if chunk contains valid audio data.
+         """
+         # ========================================================
+         # AUDIO DATA VALIDATION
+         # ========================================================
+         # Skip empty audio chunks that could cause discontinuities
+         # Empty chunks can create gaps or clicks in audio playback
+         if not audio_chunk:
+             return
+
+         # Start with the raw audio data
+         audio_data = audio_chunk
+
+         # ========================================================
+         # WAV HEADER REMOVAL (CRITICAL FOR AUDIO QUALITY)
+         # ========================================================
+         # Each audio chunk may have its own WAV header (44 bytes)
+         # These headers contain metadata and will sound like clicks if played
+         # We must strip them from EVERY chunk, not just the first one
+         if (
+             len(audio_chunk) > 44  # Ensure chunk is large enough
+             and audio_chunk.startswith(b"RIFF")  # Check for WAV header magic bytes
+         ):
+             # Remove the 44-byte WAV header to get pure audio data
+             audio_data = audio_chunk[44:]
+
+         # ========================================================
+         # YIELD AUDIO FRAME TO PIPELINE
+         # ========================================================
+         # Only yield frames with actual audio content
+         # Empty frames can cause pipeline issues
+         if len(audio_data) > 0:
+             # Create Pipecat audio frame with processed audio data
+             yield TTSAudioRawFrame(
+                 audio=audio_data,  # Clean audio without headers
+                 sample_rate=self.sample_rate,  # Configured sample rate (48kHz)
+                 num_channels=1,  # Mono audio
+             )
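
For orientation, below is a minimal usage sketch (not part of the diff) showing how the new InworldTTSService might be driven inside a Pipecat pipeline. It assumes Pipecat's standard Pipeline/PipelineTask/PipelineRunner classes and the TTSSpeakFrame/EndFrame queuing pattern from Pipecat's examples; in a real bot, an output transport (Daily, WebRTC, local audio, etc.) would follow the TTS stage so the yielded TTSAudioRawFrame chunks are actually played.

# Hypothetical sketch, assuming Pipecat's public pipeline API; no output transport is wired in.
import asyncio
import os

import aiohttp

from pipecat.frames.frames import EndFrame, TTSSpeakFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask
from pipecat.services.inworld.tts import InworldTTSService


async def main():
    async with aiohttp.ClientSession() as session:
        tts = InworldTTSService(
            api_key=os.getenv("INWORLD_API_KEY"),
            aiohttp_session=session,
            voice_id="Ashley",
            model="inworld-tts-1",
        )

        # A minimal pipeline containing only the TTS stage: a TTSSpeakFrame triggers
        # run_tts(), which yields TTSAudioRawFrame chunks downstream until EndFrame.
        task = PipelineTask(Pipeline([tts]))
        await task.queue_frames([TTSSpeakFrame("Hello from Inworld."), EndFrame()])
        await PipelineRunner().run(task)


if __name__ == "__main__":
    asyncio.run(main())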