dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024–2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Hamsa Speech-to-Text service implementation.
|
|
8
|
+
|
|
9
|
+
This module implements speech-to-text transcription using the Hamsa API.
|
|
10
|
+
Hamsa supports Arabic and English languages via HTTP POST requests.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import base64
|
|
15
|
+
import json
|
|
16
|
+
from typing import AsyncGenerator, Optional
|
|
17
|
+
|
|
18
|
+
import aiohttp
|
|
19
|
+
from loguru import logger
|
|
20
|
+
from pydantic import BaseModel, Field
|
|
21
|
+
from typing_extensions import override
|
|
22
|
+
|
|
23
|
+
from pipecat.frames.frames import (
|
|
24
|
+
ErrorFrame,
|
|
25
|
+
Frame,
|
|
26
|
+
TranscriptionFrame,
|
|
27
|
+
)
|
|
28
|
+
from pipecat.services.stt_service import SegmentedSTTService
|
|
29
|
+
from pipecat.transcriptions.language import Language
|
|
30
|
+
from pipecat.utils.time import time_now_iso8601
|
|
31
|
+
from pipecat.utils.tracing.service_decorators import traced_stt
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def language_to_hamsa_language(language: Language) -> Optional[str]:
    """Map a Language enum value to Hamsa's two-letter language code.

    Hamsa only recognizes Arabic ("ar") and English ("en"); every regional
    variant collapses onto its base code.

    Args:
        language: The Language enum value to convert.

    Returns:
        "ar" or "en" when the language is supported, otherwise None.
    """
    arabic_variants = (
        Language.AR, Language.AR_AE, Language.AR_BH, Language.AR_DZ,
        Language.AR_EG, Language.AR_IQ, Language.AR_JO, Language.AR_KW,
        Language.AR_LB, Language.AR_LY, Language.AR_MA, Language.AR_OM,
        Language.AR_QA, Language.AR_SA, Language.AR_SY, Language.AR_TN,
        Language.AR_YE,
    )
    english_variants = (
        Language.EN, Language.EN_AU, Language.EN_CA, Language.EN_GB,
        Language.EN_HK, Language.EN_IE, Language.EN_IN, Language.EN_KE,
        Language.EN_NG, Language.EN_NZ, Language.EN_PH, Language.EN_SG,
        Language.EN_TZ, Language.EN_US, Language.EN_ZA,
    )
    if language in arabic_variants:
        return "ar"
    if language in english_variants:
        return "en"
    return None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class HamsaSTTService(SegmentedSTTService):
    """Hamsa Speech-to-Text service.

    Sends complete audio segments to the Hamsa API via HTTP POST and yields
    transcription frames. Inherits the audio buffering behavior of
    ``SegmentedSTTService``, so a request is made once the user stops
    speaking.

    Features:
        - Supports Arabic and English languages
        - Uses HTTP POST requests (not streaming)
        - Configurable End of Speech (EOS) detection
        - Base64 audio encoding

    Args:
        api_key: Hamsa API key for authentication.
        language: Language for transcription (defaults to Arabic).
        eos_threshold: End of speech threshold (0.0-1.0, default 0.3).
        base_url: Hamsa API base URL.
        aiohttp_session: Optional aiohttp session for connection pooling.
        **kwargs: Additional arguments passed to SegmentedSTTService.
    """

    class InputParams(BaseModel):
        """Tunable parameters for the Hamsa STT request."""

        language: str = Field(default="ar", description="Language code ('ar' or 'en')")
        eos_threshold: float = Field(default=0.3, description="End of speech threshold")

    def __init__(
        self,
        *,
        api_key: str,
        language: Language = Language.AR,
        eos_threshold: float = 0.3,
        base_url: str = "https://api.tryhamsa.com",
        aiohttp_session: Optional[aiohttp.ClientSession] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self._api_key = api_key
        self._base_url = base_url.rstrip("/")
        # Fall back to Arabic when the requested language is unsupported.
        self._language = language_to_hamsa_language(language) or "ar"
        self._eos_threshold = eos_threshold
        self._aiohttp_session = aiohttp_session
        self._endpoint = f"{self._base_url}/v1/realtime/stt"

        # Snapshot of the active settings.
        self._settings = {
            "language": self._language,
            "eos_threshold": self._eos_threshold,
        }

    async def set_language(self, language: Language):
        """Set the language for speech recognition.

        Unsupported languages are ignored with a warning; the current
        language is left unchanged.

        Args:
            language: The language to use for speech recognition.
        """
        hamsa_language = language_to_hamsa_language(language)
        if not hamsa_language:
            logger.warning(f"Language {language} not supported by Hamsa STT")
            return
        self._language = hamsa_language
        self._settings["language"] = hamsa_language
        logger.info(f"Updated Hamsa STT language to: {hamsa_language}")

    @traced_stt
    @override
    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
        """Run speech-to-text transcription on the provided audio.

        Args:
            audio: Raw audio bytes (WAV format) to transcribe.

        Yields:
            Frame: TranscriptionFrame with transcription results or ErrorFrame on failure.
        """
        try:
            request_body = {
                "audioBase64": base64.b64encode(audio).decode("utf-8"),
                "language": self._language,
                "eos_threshold": self._eos_threshold,
            }
            request_headers = {
                "Authorization": f"Token {self._api_key}",
                "Content-Type": "application/json",
            }

            # Reuse the injected session when available, otherwise create a
            # throwaway one and close it when the request finishes.
            http_session = self._aiohttp_session
            owns_session = not http_session
            if owns_session:
                http_session = aiohttp.ClientSession()

            try:
                async with http_session.post(
                    self._endpoint,
                    json=request_body,
                    headers=request_headers,
                    timeout=aiohttp.ClientTimeout(total=30),
                ) as response:
                    if response.status == 200:
                        body = await response.json()
                        text = body.get("text", "").strip()
                        if not text:
                            logger.debug("Hamsa STT returned empty transcription")
                        else:
                            logger.debug(f"Hamsa STT transcription: {text}")
                            yield TranscriptionFrame(
                                text=text,
                                user_id="user",
                                timestamp=time_now_iso8601(),
                            )
                    elif response.status == 401:
                        error_msg = "Hamsa STT authentication failed - check API key"
                        logger.error(error_msg)
                        yield ErrorFrame(error=error_msg)
                    elif response.status == 400:
                        error_msg = f"Hamsa STT bad request: {await response.text()}"
                        logger.error(error_msg)
                        yield ErrorFrame(error=error_msg)
                    else:
                        error_msg = (
                            f"Hamsa STT request failed: {response.status} - {await response.text()}"
                        )
                        logger.error(error_msg)
                        yield ErrorFrame(error=error_msg)
            finally:
                if owns_session and http_session:
                    await http_session.close()

        except asyncio.TimeoutError:
            error_msg = "Hamsa STT request timed out"
            logger.error(error_msg)
            yield ErrorFrame(error=error_msg)

        except aiohttp.ClientError as e:
            error_msg = f"Hamsa STT client error: {str(e)}"
            logger.error(error_msg)
            yield ErrorFrame(error=error_msg)

        except Exception as e:
            error_msg = f"Hamsa STT unexpected error: {str(e)}"
            logger.error(error_msg)
            yield ErrorFrame(error=error_msg)
@@ -0,0 +1,281 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024–2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""HeyGen API.
|
|
8
|
+
|
|
9
|
+
API to communicate with HeyGen Streaming API.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from enum import Enum
|
|
13
|
+
from typing import Any, Dict, Literal, Optional
|
|
14
|
+
|
|
15
|
+
import aiohttp
|
|
16
|
+
from loguru import logger
|
|
17
|
+
from pydantic import BaseModel, Field
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AvatarQuality(str, Enum):
    """Enum representing different avatar quality levels.

    Values are sent verbatim as the ``quality`` field when creating a new
    HeyGen streaming session (see ``HeyGenApi.new_session``).
    """

    low = "low"
    medium = "medium"
    high = "high"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class VideoEncoding(str, Enum):
    """Enum representing the video encoding.

    Sent verbatim as the ``video_encoding`` field of a new-session request.
    """

    H264 = "H264"
    VP8 = "VP8"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class VoiceEmotion(str, Enum):
    """Enum representing different voice emotion types.

    Used as the ``emotion`` value in ``VoiceSettings``; sent verbatim in the
    ``voice`` section of a new-session request.
    """

    EXCITED = "excited"
    SERIOUS = "serious"
    FRIENDLY = "friendly"
    SOOTHING = "soothing"
    BROADCASTER = "broadcaster"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ElevenLabsSettings(BaseModel):
    """Settings for ElevenLabs voice configuration.

    All fields are optional; unset fields are left as ``None``.

    Parameters:
        stability (Optional[float]): Stability of the voice synthesis.
        similarity_boost (Optional[float]): Adjustment for similarity in voice performance.
        model_id (Optional[str]): Identifier for the ElevenLabs model to use.
        style (Optional[int]): Style metric to apply for the voice.
        use_speaker_boost (Optional[bool]): Flag to enable speaker boost.
    """

    stability: Optional[float] = None
    similarity_boost: Optional[float] = None
    model_id: Optional[str] = None
    style: Optional[int] = None
    use_speaker_boost: Optional[bool] = None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class VoiceSettings(BaseModel):
    """Voice configuration settings.

    NOTE: ``voiceId`` and ``elevenlabsSettings`` are pydantic *population
    aliases* only — attribute access on instances uses the snake_case field
    names (``voice_id``, ``elevenlabs_settings``).

    Parameters:
        voice_id (Optional[str]): ID of the voice to be used.
        rate (Optional[float]): Speaking rate for the voice.
        emotion (Optional[VoiceEmotion]): Emotion tone for the voice.
        elevenlabs_settings (Optional[ElevenLabsSettings]): Details for ElevenLabs configuration.
    """

    voice_id: Optional[str] = Field(None, alias="voiceId")
    rate: Optional[float] = None
    emotion: Optional[VoiceEmotion] = None
    elevenlabs_settings: Optional[ElevenLabsSettings] = Field(None, alias="elevenlabsSettings")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class NewSessionRequest(BaseModel):
    """Request model for creating a new HeyGen session.

    Field values are forwarded by ``HeyGenApi.new_session`` into the
    ``/streaming.new`` request payload.

    Parameters:
        quality (Optional[AvatarQuality]): Desired quality of the avatar.
        avatar_id (Optional[str]): Unique identifier for the avatar.
        voice (Optional[VoiceSettings]): Voice configurations for the session.
        video_encoding (Optional[VideoEncoding]): Desired encoding for the video stream.
        knowledge_id (Optional[str]): Identifier for the knowledge base (if applicable).
        knowledge_base (Optional[str]): Details of any external knowledge base.
        version (Literal["v2"]): API version to use.
        disable_idle_timeout (Optional[bool]): Flag to disable automatic idle timeout.
        activity_idle_timeout (Optional[int]): Timeout in seconds for activity-based idle detection.
    """

    quality: Optional[AvatarQuality] = None
    avatar_id: Optional[str] = None
    voice: Optional[VoiceSettings] = None
    video_encoding: Optional[VideoEncoding] = None
    knowledge_id: Optional[str] = None
    knowledge_base: Optional[str] = None
    version: Literal["v2"] = "v2"
    disable_idle_timeout: Optional[bool] = None
    activity_idle_timeout: Optional[int] = None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class HeyGenSession(BaseModel):
    """Response model for a HeyGen session.

    Validated from the ``data`` object returned by ``/streaming.new``
    (see ``HeyGenApi.new_session``).

    Parameters:
        session_id (str): Unique identifier for the streaming session.
        access_token (str): Token for accessing the session securely.
        realtime_endpoint (str): Real-time communication endpoint URL.
        url (str): Direct URL for the session.
    """

    session_id: str
    access_token: str
    realtime_endpoint: str
    url: str
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class HeygenApiError(Exception):
    """Raised when a HeyGen API call returns a non-OK HTTP response.

    Carries the HTTP status code and the raw response body alongside the
    standard exception message.
    """

    def __init__(self, message: str, status: int, response_text: str) -> None:
        """Build the error.

        Args:
            message: Human-readable error message.
            status: HTTP status code of the failed response.
            response_text: Raw response body returned by the API.
        """
        super().__init__(message)
        self.status, self.response_text = status, response_text
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class HeyGenApi:
    """HeyGen Streaming API client.

    Thin async wrapper over the HeyGen Streaming REST endpoints. All calls
    go through the supplied aiohttp session; authentication uses the
    ``x-api-key`` header.
    """

    BASE_URL = "https://api.heygen.com/v1"

    def __init__(self, api_key: str, session: aiohttp.ClientSession) -> None:
        """Initialize the HeyGen API.

        Args:
            api_key: HeyGen API key.
            session: aiohttp client session used for all requests.
        """
        self.api_key = api_key
        self.session = session

    async def _request(self, path: str, params: Dict[str, Any], expect_data: bool = True) -> Any:
        """Make a POST request to the HeyGen API.

        Args:
            path: API endpoint path.
            params: JSON-serializable parameters.
            expect_data: Whether to expect and extract 'data' field from response (default: True).

        Returns:
            Parsed JSON response data.

        Raises:
            HeygenApiError: If the API response is not successful or data is missing when expected.
            aiohttp.ClientError: For network-related errors.
        """
        url = f"{self.BASE_URL}{path}"
        headers = {
            "x-api-key": self.api_key,
            "Content-Type": "application/json",
        }

        logger.debug(f"HeyGen API request: {url}")

        try:
            async with self.session.post(url, json=params, headers=headers) as response:
                if not response.ok:
                    response_text = await response.text()
                    logger.error(f"HeyGen API error: {response_text}")
                    raise HeygenApiError(
                        f"API request failed with status {response.status}",
                        response.status,
                        response_text,
                    )
                if expect_data:
                    json_data = await response.json()
                    data = json_data.get("data")
                    return data
                return await response.text()
        except aiohttp.ClientError as e:
            logger.error(f"Network error while calling HeyGen API: {str(e)}")
            raise

    async def new_session(self, request_data: NewSessionRequest) -> HeyGenSession:
        """Create a new streaming session.

        https://docs.heygen.com/reference/new-session

        Args:
            request_data: Session configuration parameters.

        Returns:
            Session information, including ID and access token.
        """
        voice = request_data.voice
        # BUGFIX: pydantic exposes fields by their Python names (voice_id,
        # elevenlabs_settings); the camelCase names (voiceId,
        # elevenlabsSettings) are population aliases only, so attribute
        # access with them raised AttributeError. The nested settings model
        # is also dumped to a plain dict so the payload is JSON-serializable.
        params = {
            "quality": request_data.quality,
            "avatar_id": request_data.avatar_id,
            "voice": {
                "voice_id": voice.voice_id if voice else None,
                "rate": voice.rate if voice else None,
                "emotion": voice.emotion if voice else None,
                "elevenlabs_settings": (
                    voice.elevenlabs_settings.model_dump(exclude_none=True)
                    if voice and voice.elevenlabs_settings
                    else None
                ),
            },
            "knowledge_id": request_data.knowledge_id,
            "knowledge_base": request_data.knowledge_base,
            "version": request_data.version,
            "video_encoding": request_data.video_encoding,
            "disable_idle_timeout": request_data.disable_idle_timeout,
            "activity_idle_timeout": request_data.activity_idle_timeout,
        }
        session_info = await self._request("/streaming.new", params)
        # Was a bare print(); route through the logger like the rest of the client.
        logger.debug(f"HeyGen session info: {session_info}")

        return HeyGenSession.model_validate(session_info)

    async def start_session(self, session_id: str) -> Any:
        """Start the streaming session.

        https://docs.heygen.com/reference/start-session

        Args:
            session_id: ID of the session to start.

        Returns:
            Response data from the start session API call.

        Raises:
            ValueError: If session ID is not set.
        """
        if not session_id:
            raise ValueError("Session ID is not set. Call new_session first.")

        params = {
            "session_id": session_id,
        }
        return await self._request("/streaming.start", params)

    async def close_session(self, session_id: str) -> Any:
        """Terminate an active streaming session.

        https://docs.heygen.com/reference/close-session

        Args:
            session_id: ID of the session to stop.

        Returns:
            Response data from the stop session API call.

        Raises:
            ValueError: If session ID is not set.
        """
        if not session_id:
            raise ValueError("Session ID is not set. Call new_session first.")

        params = {
            "session_id": session_id,
        }
        return await self._request("/streaming.stop", params, expect_data=False)

    async def create_token(self) -> str:
        """Create a streaming token.

        https://docs.heygen.com/reference/streaming-token

        Returns:
            str: The generated access token for the streaming session
        """
        token_info = await self._request("/streaming.create_token", {})
        return token_info["token"]
|