dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (244) hide show
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
@@ -0,0 +1,620 @@
1
+ #
2
+ # Copyright (c) 2024–2025, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ """HeyGen implementation for Pipecat.
8
+
9
+ This module provides integration with the HeyGen platform for creating conversational
10
+ AI applications with avatars. It manages conversation sessions and provides real-time
11
+ audio/video streaming capabilities through the HeyGen API.
12
+ """
13
+
14
+ import asyncio
15
+ import base64
16
+ import json
17
+ import time
18
+ import uuid
19
+ from typing import Awaitable, Callable, Optional
20
+
21
+ import aiohttp
22
+ from loguru import logger
23
+ from pydantic import BaseModel
24
+
25
+ from pipecat.frames.frames import (
26
+ AudioRawFrame,
27
+ ImageRawFrame,
28
+ StartFrame,
29
+ )
30
+ from pipecat.processors.frame_processor import FrameProcessorSetup
31
+ from pipecat.services.heygen.api import HeyGenApi, HeyGenSession, NewSessionRequest
32
+ from pipecat.transports.base_transport import TransportParams
33
+ from pipecat.utils.asyncio.task_manager import BaseTaskManager
34
+
35
+ try:
36
+ from livekit import rtc
37
+ from livekit.rtc._proto.video_frame_pb2 import VideoBufferType
38
+ from websockets.asyncio.client import connect as websocket_connect
39
+ from websockets.exceptions import ConnectionClosedOK
40
+ except ModuleNotFoundError as e:
41
+ logger.error(f"Exception: {e}")
42
+ logger.error("In order to use HeyGen, you need to `pip install pipecat-ai[heygen]`.")
43
+ raise Exception(f"Missing module: {e}")
44
+
45
# Sample rate, in Hz, that HeyGen requires for its audio processing.
HEY_GEN_SAMPLE_RATE = 24_000
46
+
47
+
48
class HeyGenCallbacks(BaseModel):
    """Asynchronous event hooks supplied to the HeyGen client.

    Both fields are required; each is an async callable that accepts one string
    argument and returns nothing.

    Parameters:
        on_participant_connected: Invoked when a participant connects. The client
            passes the participant's identity.
        on_participant_disconnected: Invoked when a participant disconnects
            (presumably also with the participant's identity; confirm against the caller).
    """

    on_participant_connected: Callable[[str], Awaitable[None]]
    on_participant_disconnected: Callable[[str], Awaitable[None]]
58
+
59
+
60
+ class HeyGenClient:
61
+ """A client for interacting with HeyGen's Interactive Avatar Realtime API.
62
+
63
+ This client manages both WebSocket and LiveKit connections for real-time avatar streaming,
64
+ handling bi-directional audio/video communication and avatar control. It implements the API defined in
65
+ https://docs.heygen.com/docs/interactive-avatar-realtime-api
66
+
67
+ The client manages the following connections:
68
+ 1. WebSocket connection for avatar control and audio streaming
69
+ 2. LiveKit connection for receiving avatar video and audio
70
+
71
+ Attributes:
72
+ HEY_GEN_SAMPLE_RATE (int): The required sample rate for HeyGen's audio processing (24000 Hz)
73
+ """
74
+
75
def __init__(
    self,
    *,
    api_key: str,
    session: aiohttp.ClientSession,
    params: TransportParams,
    session_request: NewSessionRequest = NewSessionRequest(
        avatarName="Shawn_Therapist_public",
        version="v2",
    ),
    callbacks: HeyGenCallbacks,
) -> None:
    """Initialize the HeyGen client.

    No network activity happens here; the session, websocket and LiveKit room are
    created later by ``setup()`` and ``start()``.

    Args:
        api_key: HeyGen API key for authentication
        session: HTTP client session for API requests
        params: Transport configuration parameters
        session_request: Configuration for the HeyGen session (default: uses
            Shawn_Therapist_public avatar). Note that the default instance is built
            once, when the function is defined, and is shared by every client that
            does not pass its own request.
        callbacks: Callback handlers for HeyGen events
    """
    self._api = HeyGenApi(api_key, session=session)
    self._heyGen_session: Optional[HeyGenSession] = None
    self._websocket = None
    # Task that reads from the websocket; created when the websocket connects.
    self._receive_task = None
    # LiveKit room; created when the LiveKit connection is established. It is
    # initialised here so that code testing `self._livekit_room` before the
    # connection exists sees None instead of raising AttributeError.
    self._livekit_room = None
    self._task_manager: Optional[BaseTaskManager] = None
    self._params = params
    self._in_sample_rate = 0
    self._out_sample_rate = 0
    self._connected = False
    self._session_request = session_request
    self._callbacks = callbacks
    self._event_queue: Optional[asyncio.Queue] = None
    self._event_task = None
    # Currently supporting to capture the audio and video from a single participant
    self._video_task = None
    self._audio_task = None
    self._video_frame_callback = None
    self._audio_frame_callback = None
    # write_audio_frame() is called quickly, as soon as we get audio
    # (e.g. from the TTS), and since this is just a network connection we
    # would be sending it too quickly. Instead, we want to block to emulate
    # an audio device, this is what the send interval is. It will be
    # computed on StartFrame.
    self._send_interval = 0
    self._next_send_time = 0
    self._audio_seconds_sent = 0.0
    self._transport_ready = False
122
+
123
async def _initialize(self):
    """Create a new HeyGen session, log its connection details, and start it.

    The session returned by the API is stored on ``self._heyGen_session``. The
    LiveKit access token is a credential, so its value is never written to the logs.
    """
    self._heyGen_session = await self._api.new_session(self._session_request)
    session = self._heyGen_session
    logger.debug(f"HeyGen sessionId: {session.session_id}")
    logger.debug(f"HeyGen realtime_endpoint: {session.realtime_endpoint}")
    logger.debug(f"HeyGen livekit URL: {session.url}")
    # Only record that a token was issued, not the token itself.
    logger.debug("HeyGen livekit token received")
    # The join link is still logged for convenience, but with the token redacted:
    # anyone able to read the logs could otherwise join the room.
    logger.info(
        f"Full Link: https://meet.livekit.io/custom?liveKitUrl={session.url}&token=<redacted>"
    )

    await self._api.start_session(session.session_id)
    logger.info("HeyGen session started")
135
+
136
async def setup(self, setup: FrameProcessorSetup) -> None:
    """Prepare the client: create the HeyGen session and the event callback task.

    If a session already exists this is a no-op. Otherwise the task manager from
    ``setup`` is stored, the session is initialized, and a task is created to run
    the callback handler over a fresh event queue. Any exception is logged and
    followed by ``cleanup()``; it is not re-raised.

    Args:
        setup: The frame processor setup configuration (provides the task manager).
    """
    if self._heyGen_session is not None:
        logger.debug("heygen_session already initialized")
        return

    self._task_manager = setup.task_manager
    try:
        await self._initialize()

        queue = asyncio.Queue()
        self._event_queue = queue
        spawn = self._task_manager.create_task
        self._event_task = spawn(
            self._callback_task_handler(queue),
            f"{self}::event_callback_task",
        )
    except Exception as e:
        logger.error(f"Failed to setup HeyGenClient: {e}")
        await self.cleanup()
159
+
160
async def cleanup(self) -> None:
    """Cleanup client resources.

    Closes the active HeyGen session (if any) and cancels the event callback task
    (if any). The two steps are independent and best-effort: a failure in one is
    logged and does not prevent the other from running. No exception is re-raised.
    """
    # Step 1: close the remote session and reset the connection state.
    try:
        if self._heyGen_session is not None:
            await self._api.close_session(self._heyGen_session.session_id)
            self._heyGen_session = None
            self._connected = False
    except Exception as e:
        logger.exception(f"Exception during cleanup: {e}")

    # Step 2: cancel the event task even when closing the session failed, so the
    # background task is not left running.
    try:
        if self._event_task and self._task_manager:
            await self._task_manager.cancel_task(self._event_task)
            self._event_task = None
    except Exception as e:
        logger.exception(f"Exception during cleanup: {e}")
176
+
177
async def start(self, frame: StartFrame, audio_chunk_size: int) -> None:
    """Start the client: resolve sample rates, compute pacing, and connect.

    Does nothing if a websocket is already set. Sample rates come from the
    transport params when they are truthy, otherwise from ``frame``. The send
    interval is half of ``audio_chunk_size / out_sample_rate``. The websocket is
    connected first, then LiveKit.

    Args:
        frame: Initial configuration frame containing audio parameters
        audio_chunk_size: Audio chunk size for output processing
    """
    if self._websocket:
        logger.debug("heygen client already started")
        return

    logger.debug("HeyGenClient starting")
    params = self._params
    self._in_sample_rate = params.audio_in_sample_rate or frame.audio_in_sample_rate
    self._out_sample_rate = params.audio_out_sample_rate or frame.audio_out_sample_rate

    chunk_duration = audio_chunk_size / self._out_sample_rate
    self._send_interval = chunk_duration / 2
    logger.debug(f"HeyGenClient send_interval: {self._send_interval}")

    await self._ws_connect()
    await self._livekit_connect()
198
+
199
async def stop(self) -> None:
    """Stop the client and terminate all connections.

    Disconnects the websocket, then LiveKit, then runs ``cleanup()``. Each later
    step runs even if an earlier one raises, so the session is always cleaned up;
    the exception then propagates to the caller.
    """
    logger.debug("HeyGenClient stopping")
    try:
        await self._ws_disconnect()
    finally:
        try:
            await self._livekit_disconnect()
        finally:
            await self.cleanup()
208
+
209
+ # websocket connection methods
210
async def _ws_connect(self):
    """Open the websocket to the session's realtime endpoint and start the receive task.

    Does nothing if a websocket is already set. On success the client is marked
    connected and a task running the receive handler is created. Any exception is
    logged and ``self._websocket`` is reset to ``None``; nothing is re-raised.
    """
    try:
        if self._websocket:
            logger.debug("HeyGenClient ws already connected!")
            return

        logger.debug("HeyGenClient ws connecting")
        self._websocket = await websocket_connect(uri=self._heyGen_session.realtime_endpoint)
        self._connected = True

        spawn = self._task_manager.create_task
        self._receive_task = spawn(self._ws_receive_task_handler(), name="HeyGenClient_Websocket")
    except Exception as e:
        logger.error(f"{self} initialization error: {e}")
        self._websocket = None
227
+
228
async def _ws_receive_task_handler(self):
    """Read websocket messages and dispatch them until the connection ends.

    While the client is marked connected, each received message is decoded as
    JSON and passed to the server-event handler. The loop ends on a normal close
    (``ConnectionClosedOK``) or on the first other exception, which is logged.
    """
    try:
        while self._connected:
            raw = await self._websocket.recv()
            event = json.loads(raw)
            await self._handle_ws_server_event(event)
    except ConnectionClosedOK:
        # Normal closure: stop quietly.
        pass
    except Exception as e:
        logger.error(f"Error processing WebSocket message: {e}")
240
+
241
async def _handle_ws_server_event(self, event: dict) -> None:
    """Log an event received from the HeyGen websocket.

    ``agent.state`` events are logged in full at debug level; for any other event
    only its type is logged, at trace level.

    Args:
        event: Decoded JSON message from the websocket.
    """
    kind = event.get("type")
    if kind != "agent.state":
        logger.trace(f"HeyGenClient ws received unknown event: {kind}")
        return
    logger.debug(f"HeyGenClient ws received agent status: {event}")
248
+
249
async def _ws_disconnect(self) -> None:
    """Mark the client disconnected and close the websocket, if one is open.

    Errors raised while closing are logged, not re-raised. ``self._websocket`` is
    always reset to ``None`` afterwards.
    """
    self._connected = False
    try:
        socket = self._websocket
        if socket:
            await socket.close()
    except Exception as e:
        logger.error(f"{self} disconnect error: {e}")
    finally:
        self._websocket = None
259
+
260
async def _ws_send(self, message: dict) -> None:
    """Serialize ``message`` as JSON and send it over the websocket.

    If the client is not marked connected, the message is dropped with a debug
    log. If no websocket is set, nothing is sent. Send errors are logged and then
    re-raised to the caller.

    Args:
        message: JSON-serializable payload to send.
    """
    if not self._connected:
        logger.debug(f"{self} websocket is not connected anymore.")
        return

    try:
        if self._websocket:
            payload = json.dumps(message)
            await self._websocket.send(payload)
    except Exception as e:
        logger.error(f"Error sending message to HeyGen websocket: {e}")
        raise
271
+
272
async def interrupt(self, event_id: str) -> None:
    """Interrupt the avatar's current action.

    Resets the local audio pacing state, then sends an ``agent.interrupt`` message
    over the websocket.

    Args:
        event_id: Identifier attached to the interrupt message.
    """
    logger.debug("HeyGenClient interrupt")
    self._reset_audio_timing()
    message = {
        "type": "agent.interrupt",
        "event_id": event_id,
    }
    await self._ws_send(message)
286
+
287
async def start_agent_listening(self) -> None:
    """Ask the avatar to start its listening animation.

    Sends an ``agent.start_listening`` message with a newly generated UUID as the
    event id.
    """
    logger.debug("HeyGenClient start_agent_listening")
    message = {
        "type": "agent.start_listening",
        "event_id": str(uuid.uuid4()),
    }
    await self._ws_send(message)
299
+
300
async def stop_agent_listening(self) -> None:
    """Ask the avatar to stop its listening animation.

    Sends an ``agent.stop_listening`` message with a newly generated UUID as the
    event id.
    """
    message = {
        "type": "agent.stop_listening",
        "event_id": str(uuid.uuid4()),
    }
    await self._ws_send(message)
311
+
312
def transport_ready(self) -> None:
    """Mark the output transport as ready to receive frames.

    Received audio and video frames are only forwarded to their callbacks once
    this flag is set.
    """
    self._transport_ready = True
315
+
316
@property
def out_sample_rate(self) -> int:
    """Output sample rate in Hz.

    Returns:
        The value resolved by ``start()``; 0 until then.
    """
    return self._out_sample_rate
324
+
325
@property
def in_sample_rate(self) -> int:
    """Input sample rate in Hz.

    Returns:
        The value resolved by ``start()``; 0 until then.
    """
    return self._in_sample_rate
333
+
334
async def agent_speak(self, audio: bytes, event_id: str) -> None:
    """Send a chunk of audio for the avatar to speak.

    The raw bytes are base64-encoded here and sent in an ``agent.speak`` message.
    Afterwards the pacing helper is awaited so callers are throttled as if
    writing to an audio device.

    Args:
        audio: Raw audio bytes (not yet base64-encoded)
        event_id: Unique identifier for the event
    """
    encoded = base64.b64encode(audio).decode("utf-8")
    message = {
        "type": "agent.speak",
        "audio": encoded,
        "event_id": event_id,
    }
    await self._ws_send(message)
    # Pace the caller instead of flooding the connection.
    await self._write_audio_sleep()
351
+
352
def _reset_audio_timing(self):
    """Zero the pacing state: seconds of audio sent and the next send time."""
    self._audio_seconds_sent, self._next_send_time = 0.0, 0
356
+
357
async def _write_audio_sleep(self):
    """Pace audio sends to emulate an audio device.

    While fewer than 3.0 accumulated seconds have been counted, no sleep happens:
    the counter advances by one send interval and the next send time is
    scheduled. After that, the call sleeps until the scheduled send time when it
    lies in the future; otherwise it reschedules from the current time.
    """
    # Warm-up phase: send without delay. The original author notes that this
    # appears to reduce the latency of HeyGen's answer.
    if self._audio_seconds_sent < 3.0:
        self._audio_seconds_sent += self._send_interval
        self._next_send_time = time.monotonic() + self._send_interval
        return

    # Steady state: wait for the scheduled slot, or resync if we are late.
    remaining = max(0, self._next_send_time - time.monotonic())
    if remaining > 0:
        await asyncio.sleep(remaining)
        self._next_send_time += self._send_interval
        return
    self._next_send_time = time.monotonic() + self._send_interval
374
+
375
async def agent_speak_end(self, event_id: str) -> None:
    """Signal that the agent has finished speaking.

    Resets the local audio pacing state, then sends an ``agent.speak_end`` message.

    Args:
        event_id: Unique identifier for the event
    """
    self._reset_audio_timing()
    message = {
        "type": "agent.speak_end",
        "event_id": event_id,
    }
    await self._ws_send(message)
388
+
389
async def capture_participant_audio(self, participant_id: str, callback) -> None:
    """Register an audio callback and start capturing the participant's audio.

    The callback is stored first. If an audio task already exists, a warning is
    logged and nothing else happens. Otherwise, if the participant is already in
    the LiveKit room, a processing task is started for the first audio
    publication whose track is available.

    Args:
        participant_id: Identifier of the participant to capture audio from
        callback: Async function to handle received audio frames
    """
    logger.debug(f"capture_participant_audio: {participant_id}")
    self._audio_frame_callback = callback
    if self._audio_task is not None:
        logger.warning(
            "Trying to capture more than one audio stream. It is currently not supported."
        )
        return

    # Only an already-connected participant can have tracks to attach to right now.
    if not (self._livekit_room and participant_id in self._livekit_room.remote_participants):
        return

    participant = self._livekit_room.remote_participants[participant_id]
    for publication in participant.track_publications.values():
        if publication.kind != rtc.TrackKind.KIND_AUDIO or publication.track is None:
            continue
        logger.debug(f"Starting audio capture for existing track: {publication.sid}")
        stream = rtc.AudioStream(publication.track)
        self._audio_task = self._task_manager.create_task(
            self._process_audio_frames(stream), name="HeyGenClient_Receive_Audio"
        )
        break
415
+
416
async def capture_participant_video(self, participant_id: str, callback) -> None:
    """Capture video frames from the HeyGen avatar.

    The callback is stored first. If a video task already exists, a warning is
    logged and nothing else happens. Otherwise, if the participant is already in
    the LiveKit room, a processing task is started for the first video
    publication whose track is available.

    Args:
        participant_id: Identifier of the participant to capture video from
        callback: Async function to handle received video frames
    """
    logger.debug(f"capture_participant_video: {participant_id}")
    self._video_frame_callback = callback
    if self._video_task is not None:
        # This is the video path, so the warning must say "video", not "audio".
        logger.warning(
            "Trying to capture more than one video stream. It is currently not supported."
        )
        return

    # Check if we already have video tracks and participant is connected
    if self._livekit_room and participant_id in self._livekit_room.remote_participants:
        participant = self._livekit_room.remote_participants[participant_id]
        for track_pub in participant.track_publications.values():
            if track_pub.kind == rtc.TrackKind.KIND_VIDEO and track_pub.track is not None:
                logger.debug(f"Starting video capture for existing track: {track_pub.sid}")
                video_stream = rtc.VideoStream(track_pub.track)
                self._video_task = self._task_manager.create_task(
                    self._process_video_frames(video_stream), name="HeyGenClient_Receive_Video"
                )
                break
442
+
443
+ # Livekit integration to receive audio and video
444
async def _process_audio_frames(self, stream: rtc.AudioStream):
    """Convert LiveKit audio frames to ``AudioRawFrame`` and forward them.

    Each frame is wrapped as a single-channel ``AudioRawFrame`` and passed to the
    audio callback, but only once the transport is ready and a callback is set.
    A failure on one frame is logged and the loop continues; a failure of the
    stream itself is logged and ends processing.

    Args:
        stream: LiveKit audio stream to consume.
    """
    try:
        logger.debug("Starting audio frame processing...")
        async for event in stream:
            try:
                source = event.frame
                # Convert audio to raw bytes
                pcm = bytes(source.data)

                raw_frame = AudioRawFrame(
                    audio=pcm,
                    sample_rate=source.sample_rate,
                    num_channels=1,  # HeyGen uses mono audio
                )
                if not (self._transport_ready and self._audio_frame_callback):
                    continue
                await self._audio_frame_callback(raw_frame)
            except Exception as e:
                logger.error(f"Error processing audio frame: {e}")
    except Exception as e:
        logger.error(f"Error processing audio frames: {e}")
    finally:
        logger.debug("Audio frame processing ended.")
468
+
469
async def _process_video_frames(self, stream: rtc.VideoStream):
    """Convert LiveKit video frames to ``ImageRawFrame`` and forward them.

    Frames not already in RGB24 are converted. Each frame keeps its original
    dimensions and gets a ``pts`` derived from the event timestamp. It is passed
    to the video callback only once the transport is ready and a callback is set.
    A failure on one frame is logged and the loop continues; a failure of the
    stream itself is logged and ends processing.

    Args:
        stream: LiveKit video stream to consume.
    """
    try:
        logger.debug("Starting video frame processing...")
        async for event in stream:
            try:
                buffer = event.frame

                # Convert to RGB24 if not already
                if buffer.type != VideoBufferType.RGB24:
                    buffer = buffer.convert(VideoBufferType.RGB24)

                # Create frame with original dimensions
                dimensions = (buffer.width, buffer.height)
                image = ImageRawFrame(
                    image=bytes(buffer.data),
                    size=dimensions,
                    format="RGB",
                )
                image.pts = event.timestamp_us // 1000  # Convert to milliseconds

                if not (self._transport_ready and self._video_frame_callback):
                    continue
                await self._video_frame_callback(image)
            except Exception as e:
                logger.error(f"Error processing individual video frame: {e}")
    except Exception as e:
        logger.error(f"Error processing video frames: {e}")
    finally:
        logger.debug("Video frame processing ended.")
497
+
498
async def _livekit_connect(self):
    """Create a LiveKit room, register its event handlers and connect to it.

    The room URL and access token are read from ``self._heyGen_session``.
    Handlers registered on the room:

    * ``participant_connected`` / ``participant_disconnected``: log the
      participant and queue the matching callback from ``self._callbacks``.
    * ``track_subscribed``: start the video or audio processing task for the
      track, provided the matching frame callback is set and no such task
      exists yet.
    * ``track_unsubscribed``: log only.

    After connecting, participants already present in the room are logged and
    reported through the ``on_participant_connected`` callback. Any exception
    is logged and ``self._livekit_room`` is reset to ``None``.
    """
    try:
        session = self._heyGen_session
        logger.debug(f"HeyGenClient livekit connecting to room URL: {session.url}")
        self._livekit_room = rtc.Room()
        room = self._livekit_room

        @room.on("participant_connected")
        def on_participant_connected(participant: rtc.RemoteParticipant):
            logger.debug(
                f"Participant connected - SID: {participant.sid}, Identity: {participant.identity}"
            )
            for track_pub in participant.track_publications.values():
                logger.debug(
                    f"Available track - SID: {track_pub.sid}, Kind: {track_pub.kind}, Name: {track_pub.name}"
                )
            connected_cb = self._callbacks.on_participant_connected
            self._call_event_callback(connected_cb, participant.identity)

        @room.on("track_subscribed")
        def on_track_subscribed(
            track: rtc.Track,
            publication: rtc.RemoteTrackPublication,
            participant: rtc.RemoteParticipant,
        ):
            kind = track.kind

            if kind == rtc.TrackKind.KIND_VIDEO:
                # Need a consumer for the frames and at most one video task.
                if self._video_frame_callback is None or self._video_task is not None:
                    return
                logger.debug(f"Creating video stream processor for track: {publication.sid}")
                incoming_video = rtc.VideoStream(track)
                self._video_task = self._task_manager.create_task(
                    self._process_video_frames(incoming_video),
                    name="HeyGenClient_Receive_Video",
                )
                return

            if kind == rtc.TrackKind.KIND_AUDIO:
                # Need a consumer for the frames and at most one audio task.
                if self._audio_frame_callback is None or self._audio_task is not None:
                    return
                logger.debug(f"Creating audio stream processor for track: {publication.sid}")
                incoming_audio = rtc.AudioStream(track)
                self._audio_task = self._task_manager.create_task(
                    self._process_audio_frames(incoming_audio),
                    name="HeyGenClient_Receive_Audio",
                )

        @room.on("track_unsubscribed")
        def on_track_unsubscribed(
            track: rtc.Track,
            publication: rtc.RemoteTrackPublication,
            participant: rtc.RemoteParticipant,
        ):
            logger.debug(f"Track unsubscribed - SID: {publication.sid}, Kind: {track.kind}")

        @room.on("participant_disconnected")
        def on_participant_disconnected(participant: rtc.RemoteParticipant):
            logger.debug(
                f"Participant disconnected - SID: {participant.sid}, Identity: {participant.identity}"
            )
            disconnected_cb = self._callbacks.on_participant_disconnected
            self._call_event_callback(disconnected_cb, participant.identity)

        await room.connect(session.url, session.access_token)

        # Re-read the attribute after the await, as the rest of the method
        # reports on whatever room the client currently holds.
        connected_room = self._livekit_room
        logger.debug(f"Successfully connected to LiveKit room: {connected_room.name}")
        logger.debug(f"Local participant SID: {connected_room.local_participant.sid}")
        logger.debug(f"Number of remote participants: {len(connected_room.remote_participants)}")

        # Participants that joined before us never fire "participant_connected",
        # so report them (and log their tracks) here.
        for existing in connected_room.remote_participants.values():
            logger.debug(
                f"Existing participant - SID: {existing.sid}, Identity: {existing.identity}"
            )
            self._call_event_callback(
                self._callbacks.on_participant_connected, existing.identity
            )
            for track_pub in existing.track_publications.values():
                logger.debug(
                    f"Existing track - SID: {track_pub.sid}, Kind: {track_pub.kind}, Name: {track_pub.name}"
                )

    except Exception as exc:
        logger.error(f"LiveKit initialization error: {exc}")
        self._livekit_room = None
586
+
587
async def _livekit_disconnect(self):
    """Stop the frame-processing tasks and leave the LiveKit room.

    The video task and then the audio task are cancelled through the task
    manager (when set) and cleared. If a room is held, it is disconnected and
    ``self._livekit_room`` is reset to ``None``. Any exception raised along the
    way is logged rather than propagated.
    """
    try:
        logger.debug("Starting LiveKit disconnect...")

        # Video first, then audio, matching the order the tasks are torn down in.
        for task_attr in ("_video_task", "_audio_task"):
            running = getattr(self, task_attr)
            if running:
                await self._task_manager.cancel_task(running)
                setattr(self, task_attr, None)

        room = self._livekit_room
        if room:
            logger.debug("Disconnecting from LiveKit room")
            await room.disconnect()
            self._livekit_room = None
            logger.debug("Successfully disconnected from LiveKit room")
    except Exception as exc:
        logger.error(f"LiveKit disconnect error: {exc}")
606
+
607
+ #
608
+ # Queue callback handling
609
+ #
610
+
611
def _call_event_callback(self, callback, *args):
    """Put an event callback and its arguments on the event queue.

    The entry is stored as a single tuple ``(callback, arg1, arg2, ...)`` and
    added with ``put_nowait``, so this method never waits.

    Args:
        callback: The callable to queue.
        *args: Positional arguments queued alongside the callback.
    """
    queued_entry = (callback,) + args
    self._event_queue.put_nowait(queued_entry)
614
+
615
async def _callback_task_handler(self, queue: asyncio.Queue):
    """Run queued event callbacks one at a time, forever.

    Each queue entry is a tuple ``(callback, *args)``; the callback is awaited
    with those arguments. A callback that raises is logged and skipped so that
    one faulty handler cannot stop delivery of later events, and every entry
    is marked done even on failure so ``queue.join()`` cannot hang on it.

    The loop only ends when the task is cancelled.

    Args:
        queue: The queue of ``(callback, *args)`` entries to drain.
    """
    while True:
        (callback, *args) = await queue.get()
        try:
            await callback(*args)
        except Exception as e:
            # Cancellation is a BaseException and still propagates; only
            # ordinary callback failures are contained here.
            logger.error(f"Error in event callback: {e}")
        finally:
            # Keep the queue's unfinished-task count accurate in every case.
            queue.task_done()