dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
|
@@ -0,0 +1,620 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024–2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""HeyGen implementation for Pipecat.
|
|
8
|
+
|
|
9
|
+
This module provides integration with the HeyGen platform for creating conversational
|
|
10
|
+
AI applications with avatars. It manages conversation sessions and provides real-time
|
|
11
|
+
audio/video streaming capabilities through the HeyGen API.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
import base64
|
|
16
|
+
import json
|
|
17
|
+
import time
|
|
18
|
+
import uuid
|
|
19
|
+
from typing import Awaitable, Callable, Optional
|
|
20
|
+
|
|
21
|
+
import aiohttp
|
|
22
|
+
from loguru import logger
|
|
23
|
+
from pydantic import BaseModel
|
|
24
|
+
|
|
25
|
+
from pipecat.frames.frames import (
|
|
26
|
+
AudioRawFrame,
|
|
27
|
+
ImageRawFrame,
|
|
28
|
+
StartFrame,
|
|
29
|
+
)
|
|
30
|
+
from pipecat.processors.frame_processor import FrameProcessorSetup
|
|
31
|
+
from pipecat.services.heygen.api import HeyGenApi, HeyGenSession, NewSessionRequest
|
|
32
|
+
from pipecat.transports.base_transport import TransportParams
|
|
33
|
+
from pipecat.utils.asyncio.task_manager import BaseTaskManager
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from livekit import rtc
|
|
37
|
+
from livekit.rtc._proto.video_frame_pb2 import VideoBufferType
|
|
38
|
+
from websockets.asyncio.client import connect as websocket_connect
|
|
39
|
+
from websockets.exceptions import ConnectionClosedOK
|
|
40
|
+
except ModuleNotFoundError as e:
|
|
41
|
+
logger.error(f"Exception: {e}")
|
|
42
|
+
logger.error("In order to use HeyGen, you need to `pip install pipecat-ai[heygen]`.")
|
|
43
|
+
raise Exception(f"Missing module: {e}")
|
|
44
|
+
|
|
45
|
+
HEY_GEN_SAMPLE_RATE = 24000
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class HeyGenCallbacks(BaseModel):
    """Container for the async callbacks invoked on HeyGen room events.

    Parameters:
        on_participant_connected: Awaited with the participant identifier
            when a participant joins the room.
        on_participant_disconnected: Awaited with the participant identifier
            when a participant leaves the room.
    """

    on_participant_connected: Callable[[str], Awaitable[None]]
    on_participant_disconnected: Callable[[str], Awaitable[None]]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class HeyGenClient:
|
|
61
|
+
"""A client for interacting with HeyGen's Interactive Avatar Realtime API.
|
|
62
|
+
|
|
63
|
+
This client manages both WebSocket and LiveKit connections for real-time avatar streaming,
|
|
64
|
+
handling bi-directional audio/video communication and avatar control. It implements the API defined in
|
|
65
|
+
https://docs.heygen.com/docs/interactive-avatar-realtime-api
|
|
66
|
+
|
|
67
|
+
The client manages the following connections:
|
|
68
|
+
1. WebSocket connection for avatar control and audio streaming
|
|
69
|
+
2. LiveKit connection for receiving avatar video and audio
|
|
70
|
+
|
|
71
|
+
Attributes:
|
|
72
|
+
HEY_GEN_SAMPLE_RATE (int): The required sample rate for HeyGen's audio processing (24000 Hz)
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
def __init__(
|
|
76
|
+
self,
|
|
77
|
+
*,
|
|
78
|
+
api_key: str,
|
|
79
|
+
session: aiohttp.ClientSession,
|
|
80
|
+
params: TransportParams,
|
|
81
|
+
session_request: NewSessionRequest = NewSessionRequest(
|
|
82
|
+
avatarName="Shawn_Therapist_public",
|
|
83
|
+
version="v2",
|
|
84
|
+
),
|
|
85
|
+
callbacks: HeyGenCallbacks,
|
|
86
|
+
) -> None:
|
|
87
|
+
"""Initialize the HeyGen client.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
api_key: HeyGen API key for authentication
|
|
91
|
+
session: HTTP client session for API requests
|
|
92
|
+
params: Transport configuration parameters
|
|
93
|
+
session_request: Configuration for the HeyGen session (default: uses Shawn_Therapist_public avatar)
|
|
94
|
+
callbacks: Callback handlers for HeyGen events
|
|
95
|
+
"""
|
|
96
|
+
self._api = HeyGenApi(api_key, session=session)
|
|
97
|
+
self._heyGen_session: Optional[HeyGenSession] = None
|
|
98
|
+
self._websocket = None
|
|
99
|
+
self._task_manager: Optional[BaseTaskManager] = None
|
|
100
|
+
self._params = params
|
|
101
|
+
self._in_sample_rate = 0
|
|
102
|
+
self._out_sample_rate = 0
|
|
103
|
+
self._connected = False
|
|
104
|
+
self._session_request = session_request
|
|
105
|
+
self._callbacks = callbacks
|
|
106
|
+
self._event_queue: Optional[asyncio.Queue] = None
|
|
107
|
+
self._event_task = None
|
|
108
|
+
# Currently supporting to capture the audio and video from a single participant
|
|
109
|
+
self._video_task = None
|
|
110
|
+
self._audio_task = None
|
|
111
|
+
self._video_frame_callback = None
|
|
112
|
+
self._audio_frame_callback = None
|
|
113
|
+
# write_audio_frame() is called quickly, as soon as we get audio
|
|
114
|
+
# (e.g. from the TTS), and since this is just a network connection we
|
|
115
|
+
# would be sending it to quickly. Instead, we want to block to emulate
|
|
116
|
+
# an audio device, this is what the send interval is. It will be
|
|
117
|
+
# computed on StartFrame.
|
|
118
|
+
self._send_interval = 0
|
|
119
|
+
self._next_send_time = 0
|
|
120
|
+
self._audio_seconds_sent = 0.0
|
|
121
|
+
self._transport_ready = False
|
|
122
|
+
|
|
123
|
+
async def _initialize(self):
|
|
124
|
+
self._heyGen_session = await self._api.new_session(self._session_request)
|
|
125
|
+
logger.debug(f"HeyGen sessionId: {self._heyGen_session.session_id}")
|
|
126
|
+
logger.debug(f"HeyGen realtime_endpoint: {self._heyGen_session.realtime_endpoint}")
|
|
127
|
+
logger.debug(f"HeyGen livekit URL: {self._heyGen_session.url}")
|
|
128
|
+
logger.debug(f"HeyGen livekit toke: {self._heyGen_session.access_token}")
|
|
129
|
+
logger.info(
|
|
130
|
+
f"Full Link: https://meet.livekit.io/custom?liveKitUrl={self._heyGen_session.url}&token={self._heyGen_session.access_token}"
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
await self._api.start_session(self._heyGen_session.session_id)
|
|
134
|
+
logger.info("HeyGen session started")
|
|
135
|
+
|
|
136
|
+
async def setup(self, setup: FrameProcessorSetup) -> None:
|
|
137
|
+
"""Setup the client and initialize the conversation.
|
|
138
|
+
|
|
139
|
+
Establishes a new session with HeyGen's API if one doesn't exist.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
setup: The frame processor setup configuration.
|
|
143
|
+
"""
|
|
144
|
+
if self._heyGen_session is not None:
|
|
145
|
+
logger.debug("heygen_session already initialized")
|
|
146
|
+
return
|
|
147
|
+
self._task_manager = setup.task_manager
|
|
148
|
+
try:
|
|
149
|
+
await self._initialize()
|
|
150
|
+
|
|
151
|
+
self._event_queue = asyncio.Queue()
|
|
152
|
+
self._event_task = self._task_manager.create_task(
|
|
153
|
+
self._callback_task_handler(self._event_queue),
|
|
154
|
+
f"{self}::event_callback_task",
|
|
155
|
+
)
|
|
156
|
+
except Exception as e:
|
|
157
|
+
logger.error(f"Failed to setup HeyGenClient: {e}")
|
|
158
|
+
await self.cleanup()
|
|
159
|
+
|
|
160
|
+
async def cleanup(self) -> None:
|
|
161
|
+
"""Cleanup client resources.
|
|
162
|
+
|
|
163
|
+
Closes the active HeyGen session and resets internal state.
|
|
164
|
+
"""
|
|
165
|
+
try:
|
|
166
|
+
if self._heyGen_session is not None:
|
|
167
|
+
await self._api.close_session(self._heyGen_session.session_id)
|
|
168
|
+
self._heyGen_session = None
|
|
169
|
+
self._connected = False
|
|
170
|
+
|
|
171
|
+
if self._event_task and self._task_manager:
|
|
172
|
+
await self._task_manager.cancel_task(self._event_task)
|
|
173
|
+
self._event_task = None
|
|
174
|
+
except Exception as e:
|
|
175
|
+
logger.exception(f"Exception during cleanup: {e}")
|
|
176
|
+
|
|
177
|
+
async def start(self, frame: StartFrame, audio_chunk_size: int) -> None:
|
|
178
|
+
"""Start the client and establish all necessary connections.
|
|
179
|
+
|
|
180
|
+
Initializes WebSocket and LiveKit connections using the provided configuration.
|
|
181
|
+
Sets up audio processing with the specified sample rates.
|
|
182
|
+
|
|
183
|
+
Args:
|
|
184
|
+
frame: Initial configuration frame containing audio parameters
|
|
185
|
+
audio_chunk_size: Audio chunk size for output processing
|
|
186
|
+
"""
|
|
187
|
+
if self._websocket:
|
|
188
|
+
logger.debug("heygen client already started")
|
|
189
|
+
return
|
|
190
|
+
|
|
191
|
+
logger.debug(f"HeyGenClient starting")
|
|
192
|
+
self._in_sample_rate = self._params.audio_in_sample_rate or frame.audio_in_sample_rate
|
|
193
|
+
self._out_sample_rate = self._params.audio_out_sample_rate or frame.audio_out_sample_rate
|
|
194
|
+
self._send_interval = (audio_chunk_size / self._out_sample_rate) / 2
|
|
195
|
+
logger.debug(f"HeyGenClient send_interval: {self._send_interval}")
|
|
196
|
+
await self._ws_connect()
|
|
197
|
+
await self._livekit_connect()
|
|
198
|
+
|
|
199
|
+
async def stop(self) -> None:
|
|
200
|
+
"""Stop the client and terminate all connections.
|
|
201
|
+
|
|
202
|
+
Disconnects from WebSocket and LiveKit endpoints, and performs cleanup.
|
|
203
|
+
"""
|
|
204
|
+
logger.debug(f"HeyGenVideoService stopping")
|
|
205
|
+
await self._ws_disconnect()
|
|
206
|
+
await self._livekit_disconnect()
|
|
207
|
+
await self.cleanup()
|
|
208
|
+
|
|
209
|
+
# websocket connection methods
|
|
210
|
+
async def _ws_connect(self):
|
|
211
|
+
"""Connect to HeyGen websocket endpoint."""
|
|
212
|
+
try:
|
|
213
|
+
if self._websocket:
|
|
214
|
+
logger.debug(f"HeyGenClient ws already connected!")
|
|
215
|
+
return
|
|
216
|
+
logger.debug(f"HeyGenClient ws connecting")
|
|
217
|
+
self._websocket = await websocket_connect(
|
|
218
|
+
uri=self._heyGen_session.realtime_endpoint,
|
|
219
|
+
)
|
|
220
|
+
self._connected = True
|
|
221
|
+
self._receive_task = self._task_manager.create_task(
|
|
222
|
+
self._ws_receive_task_handler(), name="HeyGenClient_Websocket"
|
|
223
|
+
)
|
|
224
|
+
except Exception as e:
|
|
225
|
+
logger.error(f"{self} initialization error: {e}")
|
|
226
|
+
self._websocket = None
|
|
227
|
+
|
|
228
|
+
async def _ws_receive_task_handler(self):
|
|
229
|
+
"""Handle incoming WebSocket messages."""
|
|
230
|
+
while self._connected:
|
|
231
|
+
try:
|
|
232
|
+
message = await self._websocket.recv()
|
|
233
|
+
parsed_message = json.loads(message)
|
|
234
|
+
await self._handle_ws_server_event(parsed_message)
|
|
235
|
+
except ConnectionClosedOK:
|
|
236
|
+
break
|
|
237
|
+
except Exception as e:
|
|
238
|
+
logger.error(f"Error processing WebSocket message: {e}")
|
|
239
|
+
break
|
|
240
|
+
|
|
241
|
+
async def _handle_ws_server_event(self, event: dict) -> None:
|
|
242
|
+
"""Handle an event from HeyGen websocket."""
|
|
243
|
+
event_type = event.get("type")
|
|
244
|
+
if event_type == "agent.state":
|
|
245
|
+
logger.debug(f"HeyGenClient ws received agent status: {event}")
|
|
246
|
+
else:
|
|
247
|
+
logger.trace(f"HeyGenClient ws received unknown event: {event_type}")
|
|
248
|
+
|
|
249
|
+
async def _ws_disconnect(self) -> None:
|
|
250
|
+
"""Disconnect from HeyGen websocket endpoint."""
|
|
251
|
+
try:
|
|
252
|
+
self._connected = False
|
|
253
|
+
if self._websocket:
|
|
254
|
+
await self._websocket.close()
|
|
255
|
+
except Exception as e:
|
|
256
|
+
logger.error(f"{self} disconnect error: {e}")
|
|
257
|
+
finally:
|
|
258
|
+
self._websocket = None
|
|
259
|
+
|
|
260
|
+
async def _ws_send(self, message: dict) -> None:
|
|
261
|
+
"""Send a message to HeyGen websocket."""
|
|
262
|
+
if not self._connected:
|
|
263
|
+
logger.debug(f"{self} websocket is not connected anymore.")
|
|
264
|
+
return
|
|
265
|
+
try:
|
|
266
|
+
if self._websocket:
|
|
267
|
+
await self._websocket.send(json.dumps(message))
|
|
268
|
+
except Exception as e:
|
|
269
|
+
logger.error(f"Error sending message to HeyGen websocket: {e}")
|
|
270
|
+
raise e
|
|
271
|
+
|
|
272
|
+
async def interrupt(self, event_id: str) -> None:
|
|
273
|
+
"""Interrupt the avatar's current action.
|
|
274
|
+
|
|
275
|
+
Stops the current animation/speech and returns the avatar to idle state.
|
|
276
|
+
Useful for handling user interruptions during avatar speech.
|
|
277
|
+
"""
|
|
278
|
+
logger.debug("HeyGenClient interrupt")
|
|
279
|
+
self._reset_audio_timing()
|
|
280
|
+
await self._ws_send(
|
|
281
|
+
{
|
|
282
|
+
"type": "agent.interrupt",
|
|
283
|
+
"event_id": event_id,
|
|
284
|
+
}
|
|
285
|
+
)
|
|
286
|
+
|
|
287
|
+
async def start_agent_listening(self) -> None:
|
|
288
|
+
"""Start the avatar's listening animation.
|
|
289
|
+
|
|
290
|
+
Triggers visual cues indicating the avatar is listening to user input.
|
|
291
|
+
"""
|
|
292
|
+
logger.debug("HeyGenClient start_agent_listening")
|
|
293
|
+
await self._ws_send(
|
|
294
|
+
{
|
|
295
|
+
"type": "agent.start_listening",
|
|
296
|
+
"event_id": str(uuid.uuid4()),
|
|
297
|
+
}
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
async def stop_agent_listening(self) -> None:
|
|
301
|
+
"""Stop the avatar's listening animation.
|
|
302
|
+
|
|
303
|
+
Returns the avatar to idle state from listening state.
|
|
304
|
+
"""
|
|
305
|
+
await self._ws_send(
|
|
306
|
+
{
|
|
307
|
+
"type": "agent.stop_listening",
|
|
308
|
+
"event_id": str(uuid.uuid4()),
|
|
309
|
+
}
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
def transport_ready(self) -> None:
|
|
313
|
+
"""Indicates that the output transport is ready and able to receive frames."""
|
|
314
|
+
self._transport_ready = True
|
|
315
|
+
|
|
316
|
+
@property
|
|
317
|
+
def out_sample_rate(self) -> int:
|
|
318
|
+
"""Get the output sample rate.
|
|
319
|
+
|
|
320
|
+
Returns:
|
|
321
|
+
The output sample rate in Hz.
|
|
322
|
+
"""
|
|
323
|
+
return self._out_sample_rate
|
|
324
|
+
|
|
325
|
+
@property
|
|
326
|
+
def in_sample_rate(self) -> int:
|
|
327
|
+
"""Get the input sample rate.
|
|
328
|
+
|
|
329
|
+
Returns:
|
|
330
|
+
The input sample rate in Hz.
|
|
331
|
+
"""
|
|
332
|
+
return self._in_sample_rate
|
|
333
|
+
|
|
334
|
+
async def agent_speak(self, audio: bytes, event_id: str) -> None:
|
|
335
|
+
"""Send audio data to the agent speak.
|
|
336
|
+
|
|
337
|
+
Args:
|
|
338
|
+
audio: Audio data in base64 encoded format
|
|
339
|
+
event_id: Unique identifier for the event
|
|
340
|
+
"""
|
|
341
|
+
audio_base64 = base64.b64encode(audio).decode("utf-8")
|
|
342
|
+
await self._ws_send(
|
|
343
|
+
{
|
|
344
|
+
"type": "agent.speak",
|
|
345
|
+
"audio": audio_base64,
|
|
346
|
+
"event_id": event_id,
|
|
347
|
+
}
|
|
348
|
+
)
|
|
349
|
+
# Simulate audio playback with a sleep.
|
|
350
|
+
await self._write_audio_sleep()
|
|
351
|
+
|
|
352
|
+
def _reset_audio_timing(self):
|
|
353
|
+
"""Reset audio timing control variables."""
|
|
354
|
+
self._audio_seconds_sent = 0.0
|
|
355
|
+
self._next_send_time = 0
|
|
356
|
+
|
|
357
|
+
    async def _write_audio_sleep(self):
        """Simulate audio playback timing with appropriate delays."""
        # Burst the first 3 seconds of audio without sleeping (the guard is
        # 3.0, not 1.0 as a previous comment suggested). This appears to
        # reduce the latency to receive the answer from HeyGen.
        if self._audio_seconds_sent < 3.0:
            self._audio_seconds_sent += self._send_interval
            self._next_send_time = time.monotonic() + self._send_interval
            return

        # After the initial burst, pace sends one interval apart.
        current_time = time.monotonic()
        sleep_duration = max(0, self._next_send_time - current_time)
        if sleep_duration > 0:
            await asyncio.sleep(sleep_duration)
            self._next_send_time += self._send_interval
        else:
            # We're behind schedule; restart pacing from now.
            self._next_send_time = time.monotonic() + self._send_interval
|
|
374
|
+
|
|
375
|
+
async def agent_speak_end(self, event_id: str) -> None:
|
|
376
|
+
"""Send signaling that the agent has finished speaking.
|
|
377
|
+
|
|
378
|
+
Args:
|
|
379
|
+
event_id: Unique identifier for the event
|
|
380
|
+
"""
|
|
381
|
+
self._reset_audio_timing()
|
|
382
|
+
await self._ws_send(
|
|
383
|
+
{
|
|
384
|
+
"type": "agent.speak_end",
|
|
385
|
+
"event_id": event_id,
|
|
386
|
+
}
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
async def capture_participant_audio(self, participant_id: str, callback) -> None:
|
|
390
|
+
"""Capture audio frames from the HeyGen avatar.
|
|
391
|
+
|
|
392
|
+
Args:
|
|
393
|
+
participant_id: Identifier of the participant to capture audio from
|
|
394
|
+
callback: Async function to handle received audio frames
|
|
395
|
+
"""
|
|
396
|
+
logger.debug(f"capture_participant_audio: {participant_id}")
|
|
397
|
+
self._audio_frame_callback = callback
|
|
398
|
+
if self._audio_task is not None:
|
|
399
|
+
logger.warning(
|
|
400
|
+
"Trying to capture more than one audio stream. It is currently not supported."
|
|
401
|
+
)
|
|
402
|
+
return
|
|
403
|
+
|
|
404
|
+
# Check if we already have audio tracks and participant is connected
|
|
405
|
+
if self._livekit_room and participant_id in self._livekit_room.remote_participants:
|
|
406
|
+
participant = self._livekit_room.remote_participants[participant_id]
|
|
407
|
+
for track_pub in participant.track_publications.values():
|
|
408
|
+
if track_pub.kind == rtc.TrackKind.KIND_AUDIO and track_pub.track is not None:
|
|
409
|
+
logger.debug(f"Starting audio capture for existing track: {track_pub.sid}")
|
|
410
|
+
audio_stream = rtc.AudioStream(track_pub.track)
|
|
411
|
+
self._audio_task = self._task_manager.create_task(
|
|
412
|
+
self._process_audio_frames(audio_stream), name="HeyGenClient_Receive_Audio"
|
|
413
|
+
)
|
|
414
|
+
break
|
|
415
|
+
|
|
416
|
+
async def capture_participant_video(self, participant_id: str, callback) -> None:
|
|
417
|
+
"""Capture video frames from the HeyGen avatar.
|
|
418
|
+
|
|
419
|
+
Args:
|
|
420
|
+
participant_id: Identifier of the participant to capture video from
|
|
421
|
+
callback: Async function to handle received video frames
|
|
422
|
+
"""
|
|
423
|
+
logger.debug(f"capture_participant_video: {participant_id}")
|
|
424
|
+
self._video_frame_callback = callback
|
|
425
|
+
if self._video_task is not None:
|
|
426
|
+
logger.warning(
|
|
427
|
+
"Trying to capture more than one audio stream. It is currently not supported."
|
|
428
|
+
)
|
|
429
|
+
return
|
|
430
|
+
|
|
431
|
+
# Check if we already have video tracks and participant is connected
|
|
432
|
+
if self._livekit_room and participant_id in self._livekit_room.remote_participants:
|
|
433
|
+
participant = self._livekit_room.remote_participants[participant_id]
|
|
434
|
+
for track_pub in participant.track_publications.values():
|
|
435
|
+
if track_pub.kind == rtc.TrackKind.KIND_VIDEO and track_pub.track is not None:
|
|
436
|
+
logger.debug(f"Starting video capture for existing track: {track_pub.sid}")
|
|
437
|
+
video_stream = rtc.VideoStream(track_pub.track)
|
|
438
|
+
self._video_task = self._task_manager.create_task(
|
|
439
|
+
self._process_video_frames(video_stream), name="HeyGenClient_Receive_Video"
|
|
440
|
+
)
|
|
441
|
+
break
|
|
442
|
+
|
|
443
|
+
# Livekit integration to receive audio and video
|
|
444
|
+
async def _process_audio_frames(self, stream: rtc.AudioStream):
|
|
445
|
+
"""Process audio frames from LiveKit stream."""
|
|
446
|
+
try:
|
|
447
|
+
logger.debug("Starting audio frame processing...")
|
|
448
|
+
async for frame_event in stream:
|
|
449
|
+
try:
|
|
450
|
+
audio_frame = frame_event.frame
|
|
451
|
+
# Convert audio to raw bytes
|
|
452
|
+
audio_data = bytes(audio_frame.data)
|
|
453
|
+
|
|
454
|
+
audio_frame = AudioRawFrame(
|
|
455
|
+
audio=audio_data,
|
|
456
|
+
sample_rate=audio_frame.sample_rate,
|
|
457
|
+
num_channels=1, # HeyGen uses mono audio
|
|
458
|
+
)
|
|
459
|
+
if self._transport_ready and self._audio_frame_callback:
|
|
460
|
+
await self._audio_frame_callback(audio_frame)
|
|
461
|
+
|
|
462
|
+
except Exception as e:
|
|
463
|
+
logger.error(f"Error processing audio frame: {e}")
|
|
464
|
+
except Exception as e:
|
|
465
|
+
logger.error(f"Error processing audio frames: {e}")
|
|
466
|
+
finally:
|
|
467
|
+
logger.debug(f"Audio frame processing ended.")
|
|
468
|
+
|
|
469
|
+
async def _process_video_frames(self, stream: rtc.VideoStream):
|
|
470
|
+
"""Process video frames from LiveKit stream."""
|
|
471
|
+
try:
|
|
472
|
+
logger.debug("Starting video frame processing...")
|
|
473
|
+
async for frame_event in stream:
|
|
474
|
+
try:
|
|
475
|
+
video_frame = frame_event.frame
|
|
476
|
+
|
|
477
|
+
# Convert to RGB24 if not already
|
|
478
|
+
if video_frame.type != VideoBufferType.RGB24:
|
|
479
|
+
video_frame = video_frame.convert(VideoBufferType.RGB24)
|
|
480
|
+
|
|
481
|
+
# Create frame with original dimensions
|
|
482
|
+
image_frame = ImageRawFrame(
|
|
483
|
+
image=bytes(video_frame.data),
|
|
484
|
+
size=(video_frame.width, video_frame.height),
|
|
485
|
+
format="RGB",
|
|
486
|
+
)
|
|
487
|
+
image_frame.pts = frame_event.timestamp_us // 1000 # Convert to milliseconds
|
|
488
|
+
|
|
489
|
+
if self._transport_ready and self._video_frame_callback:
|
|
490
|
+
await self._video_frame_callback(image_frame)
|
|
491
|
+
except Exception as e:
|
|
492
|
+
logger.error(f"Error processing individual video frame: {e}")
|
|
493
|
+
except Exception as e:
|
|
494
|
+
logger.error(f"Error processing video frames: {e}")
|
|
495
|
+
finally:
|
|
496
|
+
logger.debug(f"Video frame processing ended.")
|
|
497
|
+
|
|
498
|
+
    async def _livekit_connect(self):
        """Connect to the HeyGen session's LiveKit room and wire up event handlers.

        Creates the room, registers participant/track event handlers, then
        connects using the session URL and access token. On any failure the
        room reference is cleared and the error is logged (not raised).
        """
        try:
            logger.debug(f"HeyGenClient livekit connecting to room URL: {self._heyGen_session.url}")
            self._livekit_room = rtc.Room()

            # Handlers are registered before connect() so that events emitted
            # during/after connection are not missed.
            @self._livekit_room.on("participant_connected")
            def on_participant_connected(participant: rtc.RemoteParticipant):
                logger.debug(
                    f"Participant connected - SID: {participant.sid}, Identity: {participant.identity}"
                )
                for track_pub in participant.track_publications.values():
                    logger.debug(
                        f"Available track - SID: {track_pub.sid}, Kind: {track_pub.kind}, Name: {track_pub.name}"
                    )
                # LiveKit handlers are sync callbacks; the event is queued for
                # async execution rather than awaited inline.
                self._call_event_callback(
                    self._callbacks.on_participant_connected, participant.identity
                )

            @self._livekit_room.on("track_subscribed")
            def on_track_subscribed(
                track: rtc.Track,
                publication: rtc.RemoteTrackPublication,
                participant: rtc.RemoteParticipant,
            ):
                # The `is None` guards ensure only one receive task per media
                # kind is ever created.
                if (
                    track.kind == rtc.TrackKind.KIND_VIDEO
                    and self._video_frame_callback is not None
                    and self._video_task is None
                ):
                    logger.debug(f"Creating video stream processor for track: {publication.sid}")
                    video_stream = rtc.VideoStream(track)
                    self._video_task = self._task_manager.create_task(
                        self._process_video_frames(video_stream), name="HeyGenClient_Receive_Video"
                    )
                elif (
                    track.kind == rtc.TrackKind.KIND_AUDIO
                    and self._audio_frame_callback is not None
                    and self._audio_task is None
                ):
                    logger.debug(f"Creating audio stream processor for track: {publication.sid}")
                    audio_stream = rtc.AudioStream(track)
                    self._audio_task = self._task_manager.create_task(
                        self._process_audio_frames(audio_stream), name="HeyGenClient_Receive_Audio"
                    )

            @self._livekit_room.on("track_unsubscribed")
            def on_track_unsubscribed(
                track: rtc.Track,
                publication: rtc.RemoteTrackPublication,
                participant: rtc.RemoteParticipant,
            ):
                # Log only; receive tasks are cleaned up in _livekit_disconnect.
                logger.debug(f"Track unsubscribed - SID: {publication.sid}, Kind: {track.kind}")

            @self._livekit_room.on("participant_disconnected")
            def on_participant_disconnected(participant: rtc.RemoteParticipant):
                logger.debug(
                    f"Participant disconnected - SID: {participant.sid}, Identity: {participant.identity}"
                )
                self._call_event_callback(
                    self._callbacks.on_participant_disconnected, participant.identity
                )

            await self._livekit_room.connect(
                self._heyGen_session.url, self._heyGen_session.access_token
            )
            logger.debug(f"Successfully connected to LiveKit room: {self._livekit_room.name}")
            logger.debug(f"Local participant SID: {self._livekit_room.local_participant.sid}")
            logger.debug(
                f"Number of remote participants: {len(self._livekit_room.remote_participants)}"
            )

            # Log existing participants and their tracks
            # Participants already in the room never fire "participant_connected",
            # so their connect callbacks are queued here explicitly.
            for participant in self._livekit_room.remote_participants.values():
                logger.debug(
                    f"Existing participant - SID: {participant.sid}, Identity: {participant.identity}"
                )
                self._call_event_callback(
                    self._callbacks.on_participant_connected, participant.identity
                )
                for track_pub in participant.track_publications.values():
                    logger.debug(
                        f"Existing track - SID: {track_pub.sid}, Kind: {track_pub.kind}, Name: {track_pub.name}"
                    )

        except Exception as e:
            # Best-effort: clear the room reference so callers see "not connected".
            logger.error(f"LiveKit initialization error: {e}")
            self._livekit_room = None
|
|
586
|
+
|
|
587
|
+
async def _livekit_disconnect(self):
|
|
588
|
+
"""Disconnect from LiveKit room."""
|
|
589
|
+
try:
|
|
590
|
+
logger.debug("Starting LiveKit disconnect...")
|
|
591
|
+
if self._video_task:
|
|
592
|
+
await self._task_manager.cancel_task(self._video_task)
|
|
593
|
+
self._video_task = None
|
|
594
|
+
|
|
595
|
+
if self._audio_task:
|
|
596
|
+
await self._task_manager.cancel_task(self._audio_task)
|
|
597
|
+
self._audio_task = None
|
|
598
|
+
|
|
599
|
+
if self._livekit_room:
|
|
600
|
+
logger.debug("Disconnecting from LiveKit room")
|
|
601
|
+
await self._livekit_room.disconnect()
|
|
602
|
+
self._livekit_room = None
|
|
603
|
+
logger.debug("Successfully disconnected from LiveKit room")
|
|
604
|
+
except Exception as e:
|
|
605
|
+
logger.error(f"LiveKit disconnect error: {e}")
|
|
606
|
+
|
|
607
|
+
#
|
|
608
|
+
# Queue callback handling
|
|
609
|
+
#
|
|
610
|
+
|
|
611
|
+
def _call_event_callback(self, callback, *args):
|
|
612
|
+
"""Queue an event callback for async execution."""
|
|
613
|
+
self._event_queue.put_nowait((callback, *args))
|
|
614
|
+
|
|
615
|
+
async def _callback_task_handler(self, queue: asyncio.Queue):
|
|
616
|
+
"""Handle queued callbacks from the specified queue."""
|
|
617
|
+
while True:
|
|
618
|
+
(callback, *args) = await queue.get()
|
|
619
|
+
await callback(*args)
|
|
620
|
+
queue.task_done()
|