videosdk-plugins-azure 0.0.61__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- videosdk_plugins_azure-0.0.61/.gitignore +19 -0
- videosdk_plugins_azure-0.0.61/PKG-INFO +31 -0
- videosdk_plugins_azure-0.0.61/README.md +9 -0
- videosdk_plugins_azure-0.0.61/pyproject.toml +33 -0
- videosdk_plugins_azure-0.0.61/videosdk/plugins/azure/__init__.py +5 -0
- videosdk_plugins_azure-0.0.61/videosdk/plugins/azure/stt.py +277 -0
- videosdk_plugins_azure-0.0.61/videosdk/plugins/azure/tts.py +313 -0
- videosdk_plugins_azure-0.0.61/videosdk/plugins/azure/version.py +2 -0
- videosdk_plugins_azure-0.0.61/videosdk/plugins/azure/voice_live.py +542 -0

--- /dev/null
+++ videosdk_plugins_azure-0.0.61/PKG-INFO
@@ -0,0 +1,31 @@
+Metadata-Version: 2.4
+Name: videosdk-plugins-azure
+Version: 0.0.61
+Summary: VideoSDK Agent Framework plugin for azure
+Author: videosdk
+License-Expression: Apache-2.0
+Keywords: ai,audio,azure,video,videosdk
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Communications :: Conferencing
+Classifier: Topic :: Multimedia :: Sound/Audio
+Classifier: Topic :: Multimedia :: Video
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.11
+Requires-Dist: azure-ai-voicelive>=1.0.0b1
+Requires-Dist: azure-cognitiveservices-speech>=1.46.0
+Requires-Dist: azure-core==1.35.0
+Requires-Dist: azure-identity==1.22.0
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: videosdk-agents>=0.0.61
+Description-Content-Type: text/markdown
+
+# VideoSDK Azure Plugin
+
+Agent Framework plugin for AzureVoiceLive(realtime), STT And TTS services from azure.
+
+## Installation
+
+```bash
+pip install videosdk-plugins-azure
+```
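
The modules added in this release expose three public entry points: `AzureSTT` (stt.py), `AzureTTS` with its `VoiceTuning`/`SpeakingStyle` helpers (tts.py), and the realtime `AzureVoiceLive` model (voice_live.py). As a quick orientation, the hedged sketch below shows how they might be imported after installation; the submodule paths are taken from the files in this diff, while importing the same names from the package root is an assumption, since the 5-line `__init__.py` body is not shown here.

```python
# Hedged import check after `pip install videosdk-plugins-azure`.
# Submodule paths come from the files in this diff; whether __init__.py
# re-exports the same names at the package root is an assumption.
from videosdk.plugins.azure.stt import AzureSTT
from videosdk.plugins.azure.tts import AzureTTS, VoiceTuning, SpeakingStyle
from videosdk.plugins.azure.voice_live import AzureVoiceLive, AzureVoiceLiveConfig

print(AzureSTT, AzureTTS, AzureVoiceLive)  # confirms the plugin is importable
```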

--- /dev/null
+++ videosdk_plugins_azure-0.0.61/pyproject.toml
@@ -0,0 +1,33 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "videosdk-plugins-azure"
+dynamic = ["version"]
+description = "VideoSDK Agent Framework plugin for azure"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.11"
+authors = [{ name = "videosdk" }]
+keywords = ["video", "audio", "ai", "videosdk", "azure"]
+classifiers = [
+    "Intended Audience :: Developers",
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Topic :: Communications :: Conferencing",
+    "Topic :: Multimedia :: Sound/Audio",
+    "Topic :: Multimedia :: Video",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = ["videosdk-agents>=0.0.61","azure-cognitiveservices-speech>=1.46.0","azure-ai-voicelive>=1.0.0b1","azure-core==1.35.0",
+    "azure-identity==1.22.0","httpx>=0.28.1"]
+
+[tool.hatch.version]
+path = "videosdk/plugins/azure/version.py"
+
+[tool.hatch.build.targets.wheel]
+packages = ["videosdk"]
+
+[tool.hatch.build.targets.sdist]
+include = ["/videosdk"]

--- /dev/null
+++ videosdk_plugins_azure-0.0.61/videosdk/plugins/azure/stt.py
@@ -0,0 +1,277 @@
+from __future__ import annotations
+
+import asyncio
+import os
+import time
+import threading
+from dataclasses import dataclass
+from typing import Any, Optional, List
+
+import azure.cognitiveservices.speech as speechsdk
+
+from videosdk.agents import (
+    STT as BaseSTT,
+    STTResponse,
+    SpeechEventType,
+    SpeechData,
+    global_event_emitter,
+)
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+try:
+    from scipy import signal
+    import numpy as np
+
+    SCIPY_AVAILABLE = True
+except ImportError:
+    SCIPY_AVAILABLE = False
+
+
+@dataclass
+class AzureSTTConfig:
+    """Configuration for Azure STT"""
+
+    speech_key: str
+    speech_region: str
+    language: str = "en-US"
+    sample_rate: int = 16000
+    enable_phrase_list: bool = False
+    phrase_list: Optional[List[str]] = None
+
+
+class AzureSTT(BaseSTT):
+    def __init__(
+        self,
+        *,
+        speech_key: Optional[str] = None,
+        speech_region: Optional[str] = None,
+        language: str = "en-US",
+        sample_rate: int = 16000,
+        enable_phrase_list: bool = False,
+        phrase_list: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the Azure STT plugin.
+
+        Args:
+            speech_key (Optional[str]): Azure Speech API key. Uses AZURE_SPEECH_KEY environment variable if not provided.
+            speech_region (Optional[str]): Azure Speech region. Uses AZURE_SPEECH_REGION environment variable if not provided.
+            language (str): The language to use for the STT plugin. Defaults to "en-US".
+            sample_rate (int): Sample rate to use for the STT plugin. Defaults to 16000.
+            enable_phrase_list (bool): Whether to enable phrase list for better recognition. Defaults to False.
+            phrase_list (Optional[List[str]]): List of phrases to boost recognition. Defaults to None.
+        """
+        super().__init__()
+
+        if not SCIPY_AVAILABLE:
+            raise ImportError(
+                "scipy and numpy are required for Azure STT. Please install with 'pip install scipy numpy'"
+            )
+
+        self.speech_key = speech_key or os.getenv("AZURE_SPEECH_KEY")
+        self.speech_region = speech_region or os.getenv("AZURE_SPEECH_REGION")
+
+        if not self.speech_key or not self.speech_region:
+            raise ValueError(
+                "Azure Speech key and region must be provided either through parameters or "
+                "AZURE_SPEECH_KEY and AZURE_SPEECH_REGION environment variables"
+            )
+
+        self.config = AzureSTTConfig(
+            speech_key=self.speech_key,
+            speech_region=self.speech_region,
+            language=language,
+            sample_rate=sample_rate,
+            enable_phrase_list=enable_phrase_list,
+            phrase_list=phrase_list,
+        )
+
+        self.input_sample_rate = 48000
+        self.target_sample_rate = sample_rate
+
+        self._speech_processor: Optional[speechsdk.SpeechRecognizer] = None
+        self._audio_stream: Optional[speechsdk.audio.PushAudioInputStream] = None
+        self._is_speaking = False
+        self._last_speech_time = 0.0
+
+        self._loop = asyncio.get_running_loop()
+        self._event_queue = asyncio.Queue()
+        self._processing_task: Optional[asyncio.Task] = None
+
+    async def process_audio(
+        self, audio_frames: bytes, language: Optional[str] = None, **kwargs: Any
+    ) -> None:
+        """Process audio frames and send to Azure Speech Service"""
+        try:
+            if not self._speech_processor:
+                await self._setup_speech_processor(language)
+
+            if self._audio_stream and SCIPY_AVAILABLE:
+                audio_data = np.frombuffer(audio_frames, dtype=np.int16)
+
+                if len(audio_data) > 0:
+                    stereo_data = audio_data.reshape(-1, 2)
+                    mono_data = stereo_data.mean(axis=1)
+
+                    resampled_data = signal.resample(
+                        mono_data,
+                        int(
+                            len(mono_data)
+                            * self.target_sample_rate
+                            / self.input_sample_rate
+                        ),
+                    )
+                    resampled_bytes = resampled_data.astype(np.int16).tobytes()
+                    self._audio_stream.write(resampled_bytes)
+
+        except Exception as e:
+            logger.error(f"Error in process_audio: {str(e)}")
+            self.emit("error", str(e))
+            await self._cleanup_speech_processor()
+
+    async def _setup_speech_processor(self, language: Optional[str] = None) -> None:
+        """Setup Azure speech processor"""
+        try:
+            self._processing_task = self._loop.create_task(self._process_events())
+
+            speech_config = speechsdk.SpeechConfig(
+                subscription=self.config.speech_key, region=self.config.speech_region
+            )
+            speech_config.speech_recognition_language = language or self.config.language
+
+            stream_format = speechsdk.audio.AudioStreamFormat(
+                samples_per_second=self.config.sample_rate,
+                bits_per_sample=16,
+                channels=1,
+            )
+            self._audio_stream = speechsdk.audio.PushAudioInputStream(
+                stream_format=stream_format
+            )
+
+            audio_config = speechsdk.audio.AudioConfig(stream=self._audio_stream)
+
+            self._speech_processor = speechsdk.SpeechRecognizer(
+                speech_config=speech_config, audio_config=audio_config
+            )
+
+            if self.config.enable_phrase_list and self.config.phrase_list:
+                phrase_list_grammar = speechsdk.PhraseListGrammar.from_recognizer(
+                    self._speech_processor
+                )
+                for phrase in self.config.phrase_list:
+                    phrase_list_grammar.addPhrase(phrase)
+
+            self._speech_processor.recognized.connect(self._on_final_transcript)
+            self._speech_processor.recognizing.connect(self._on_interim_transcript)
+            self._speech_processor.speech_start_detected.connect(self._on_user_started_speaking)
+            self._speech_processor.speech_end_detected.connect(self._on_user_stopped_speaking)
+            self._speech_processor.canceled.connect(self._on_speech_processing_error)
+
+            self._speech_processor.start_continuous_recognition()
+            logger.info("Azure STT speech processor started")
+
+        except Exception as e:
+            logger.error(f"Failed to setup speech processor: {str(e)}")
+            raise
+
+    def _on_final_transcript(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
+        """Handle final recognition results"""
+        text = evt.result.text.strip()
+        if not text:
+            return
+
+        if self._transcript_callback:
+            response = STTResponse(
+                event_type=SpeechEventType.FINAL,
+                data=SpeechData(
+                    text=text, language=self.config.language, confidence=1.0
+                ),
+                metadata={"provider": "azure", "result_reason": str(evt.result.reason)},
+            )
+            self._event_queue.put_nowait(response)
+
+    def _on_interim_transcript(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
+        """Handle interim recognition results"""
+        text = evt.result.text.strip()
+        if not text:
+            return
+
+        if self._transcript_callback:
+            response = STTResponse(
+                event_type=SpeechEventType.INTERIM,
+                data=SpeechData(
+                    text=text, language=self.config.language, confidence=0.5
+                ),
+                metadata={"provider": "azure", "result_reason": str(evt.result.reason)},
+            )
+            self._event_queue.put_nowait(response)
+
+    def _on_user_started_speaking(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
+        """Handle speech start detection"""
+        if self._is_speaking:
+            return
+
+        self._is_speaking = True
+        current_time = time.time()
+
+        if self._last_speech_time == 0.0:
+            self._last_speech_time = current_time
+        else:
+            if current_time - self._last_speech_time < 1.0:
+                global_event_emitter.emit("speech_started")
+
+        self._last_speech_time = current_time
+
+    def _on_user_stopped_speaking(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
+        """Handle speech end detection"""
+        if not self._is_speaking:
+            return
+
+        self._is_speaking = False
+        global_event_emitter.emit("speech_stopped")
+
+    def _on_speech_processing_error(self, evt: speechsdk.SpeechRecognitionCanceledEventArgs) -> None:
+        """Handle speech processing errors and cancellations"""
+        if evt.cancellation_details.reason == speechsdk.CancellationReason.Error:
+            error_msg = f"Speech recognition canceled due to error: {evt.cancellation_details.error_details}"
+            logger.error(error_msg)
+            self.emit("error", error_msg)
+
+    async def _process_events(self) -> None:
+        """Process STT events from the queue"""
+        while True:
+            try:
+                response = await self._event_queue.get()
+                if self._transcript_callback:
+                    await self._transcript_callback(response)
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                logger.error("Error processing STT event: %s", str(e), exc_info=True)
+
+    async def _cleanup_speech_processor(self) -> None:
+        """Cleanup speech processor resources"""
+        try:
+            if self._speech_processor:
+                self._speech_processor.stop_continuous_recognition()
+                self._speech_processor = None
+
+            if self._audio_stream:
+                self._audio_stream.close()
+                self._audio_stream = None
+
+        except Exception as e:
+            logger.error(f"Error during speech processor cleanup: {str(e)}")
+
+    async def aclose(self) -> None:
+        """Cleanup resources"""
+        if self._processing_task:
+            self._processing_task.cancel()
+            await asyncio.gather(self._processing_task, return_exceptions=True)
+
+        await self._cleanup_speech_processor()
+        logger.info("Azure STT closed")
+        await super().aclose()
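
As a rough usage sketch (not part of the package itself): `AzureSTT.__init__` calls `asyncio.get_running_loop()`, so the object must be created inside a running event loop, and `process_audio` expects 48 kHz stereo 16-bit PCM, which it downmixes and resamples to the configured 16 kHz before pushing into the Azure recognizer. Credentials, region, and the silence frame below are placeholders; in the agent framework the transcript callback is wired up by the pipeline rather than by user code.

```python
import asyncio
from videosdk.plugins.azure.stt import AzureSTT

async def main() -> None:
    # Must run inside an event loop: __init__ calls asyncio.get_running_loop().
    stt = AzureSTT(
        speech_key="<azure-speech-key>",   # placeholder; or export AZURE_SPEECH_KEY
        speech_region="eastus",            # placeholder; or export AZURE_SPEECH_REGION
        language="en-US",
        enable_phrase_list=True,
        phrase_list=["VideoSDK", "Voice Live"],  # boosts recognition of these terms
    )
    try:
        # 20 ms of 48 kHz stereo int16 silence as a stand-in for real room audio;
        # the plugin downmixes to mono and resamples to 16 kHz internally.
        audio_frames = b"\x00\x00" * 960 * 2
        await stt.process_audio(audio_frames)
    finally:
        await stt.aclose()

asyncio.run(main())
```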

--- /dev/null
+++ videosdk_plugins_azure-0.0.61/videosdk/plugins/azure/tts.py
@@ -0,0 +1,313 @@
+from __future__ import annotations
+
+import asyncio
+import os
+from dataclasses import dataclass, field
+from typing import Literal, AsyncIterator, Optional, Any
+
+import httpx
+
+from videosdk.agents import TTS, segment_text
+import logging
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class VoiceTuning:
+    """Configuration for speech tuning (rate, volume, pitch)."""
+
+    _rate: Literal["x-slow", "slow", "medium", "fast", "x-fast"] | float | None = None
+    _volume: Literal["silent", "x-soft", "soft", "medium", "loud", "x-loud"] | float | None = None
+    _pitch: Literal["x-low", "low", "medium", "high", "x-high"] | None = None
+
+    @property
+    def rate(self):
+        return self._rate
+
+    @rate.setter
+    def rate(self, value):
+        if value:
+            if isinstance(value, float) and not 0.5 <= value <= 2.0:
+                raise ValueError("Rate must be a float between 0.5 and 2.0")
+            if isinstance(value, str) and value not in ["x-slow", "slow", "medium", "fast", "x-fast"]:
+                raise ValueError("Rate must be one of 'x-slow', 'slow', 'medium', 'fast', 'x-fast'")
+        self._rate = value
+
+    @property
+    def volume(self):
+        return self._volume
+
+    @volume.setter
+    def volume(self, value):
+        if value:
+            if isinstance(value, float) and not 0 <= value <= 100.0:
+                raise ValueError("Volume must be a float between 0 and 100")
+            if isinstance(value, str) and value not in ["silent", "x-soft", "soft", "medium", "loud", "x-loud"]:
+                raise ValueError("Volume must be one of 'silent', 'x-soft', 'soft', 'medium', 'loud', 'x-loud'")
+        self._volume = value
+
+    @property
+    def pitch(self):
+        return self._pitch
+
+    @pitch.setter
+    def pitch(self, value):
+        if value and value not in ["x-low", "low", "medium", "high", "x-high"]:
+            raise ValueError("Pitch must be one of 'x-low', 'low', 'medium', 'high', 'x-high'")
+        self._pitch = value
+
+    def __init__(self, rate=None, volume=None, pitch=None):
+        self.rate = rate
+        self.volume = volume
+        self.pitch = pitch
+
+
+@dataclass
+class SpeakingStyle:
+    """Configuration for speech expressive style."""
+    style: str
+    _degree: float | None = None
+
+    @property
+    def degree(self):
+        return self._degree
+
+    @degree.setter
+    def degree(self, value: float | None):
+        if value is not None and not 0.1 <= value <= 2.0:
+            raise ValueError("Style degree must be between 0.1 and 2.0")
+        self._degree = value
+
+    def __init__(self, style: str, degree: float | None = None):
+        self.style = style
+        self.degree = degree
+
+
+class AzureTTS(TTS):
+    """
+    Initialize the Azure TTS plugin.
+
+    Args:
+        voice (str): Name of the Azure neural voice to use (default: "en-US-EmmaNeural").
+            For a full list of available voices, see:
+            https://eastus2.tts.speech.microsoft.com/cognitiveservices/voices/list
+            (Requires: curl --location --request GET with header 'Ocp-Apim-Subscription-Key')
+        language (str, optional): Language code for the voice (e.g., "en-US"). If not provided, defaults to the voice's language.
+        tuning (VoiceTuning, optional): VoiceTuning object to control speech rate, volume, and pitch.
+        style (SpeakingStyle, optional): SpeakingStyle object for expressive speech synthesis.
+        speech_key (str, optional): Azure Speech API key. If not provided, uses the AZURE_SPEECH_KEY environment variable.
+        speech_region (str, optional): Azure Speech region. If not provided, uses the AZURE_SPEECH_REGION environment variable.
+        speech_endpoint (str, optional): Custom endpoint URL. If not provided, uses the AZURE_SPEECH_ENDPOINT environment variable.
+        deployment_id (str, optional): Custom deployment ID for model deployment scenarios.
+        speech_auth_token (str, optional): Azure Speech authorization token for token-based authentication.
+
+    """
+    FIXED_SAMPLE_RATE = 24000
+    AZURE_OUTPUT_FORMAT = "raw-24khz-16bit-mono-pcm"
+
+    def __init__(
+        self,
+        *,
+        voice: str = "en-US-EmmaNeural",
+        language: str | None = None,
+        tuning: Optional[VoiceTuning] = None,
+        style: Optional[SpeakingStyle] = None,
+        speech_key: str | None = None,
+        speech_region: str | None = None,
+        speech_endpoint: str | None = None,
+        deployment_id: str | None = None,
+        speech_auth_token: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(
+            sample_rate=self.FIXED_SAMPLE_RATE,
+            num_channels=1,
+        )
+
+        self.speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
+        self.speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")
+        self.speech_endpoint = speech_endpoint or os.environ.get(
+            "AZURE_SPEECH_ENDPOINT"
+        )
+        self.speech_auth_token = speech_auth_token
+        self.deployment_id = deployment_id
+
+        has_endpoint = bool(self.speech_endpoint)
+        has_key_and_region = bool(self.speech_key and self.speech_region)
+        has_token_and_region = bool(self.speech_auth_token and self.speech_region)
+
+        if not (has_endpoint or has_key_and_region or has_token_and_region):
+            raise ValueError(
+                "Authentication requires one of: speech_endpoint, (speech_key & speech_region), or (speech_auth_token & speech_region)."
+            )
+
+        self.voice = voice
+        self.language = language
+        self.tuning = tuning
+        self.style = style
+
+        self._first_chunk_sent = False
+        self._interrupted = False
+        self._http_client: Optional[httpx.AsyncClient] = None
+
+
+    def reset_first_audio_tracking(self) -> None:
+        self._first_chunk_sent = False
+
+    def _get_endpoint_url(self) -> str:
+        if self.speech_endpoint:
+            base = self.speech_endpoint.rstrip("/")
+            if not base.endswith("/cognitiveservices/v1"):
+                base = f"{base}/cognitiveservices/v1"
+        else:
+            base = f"https://{self.speech_region}.tts.speech.microsoft.com/cognitiveservices/v1"
+
+        if self.deployment_id:
+            return f"{base}?deploymentId={self.deployment_id}"
+        return base
+
+    def _get_http_client(self) -> httpx.AsyncClient:
+        if not self._http_client:
+            self._http_client = httpx.AsyncClient(
+                timeout=httpx.Timeout(
+                    connect=15.0, read=30.0, write=5.0, pool=5.0
+                ),
+                follow_redirects=True,
+                limits=httpx.Limits(
+                    max_connections=50,
+                    max_keepalive_connections=50,
+                    keepalive_expiry=120,
+                ),
+            )
+        return self._http_client
+
+    async def synthesize(
+        self,
+        text: AsyncIterator[str] | str,
+        voice_id: Optional[str] = None,
+        **kwargs: Any,
+    ) -> None:
+        try:
+            if not self.audio_track or not self.loop:
+                self.emit("error", "Audio track or event loop not set")
+                return
+
+            self._interrupted = False
+
+            if isinstance(text, AsyncIterator):
+                async for segment in segment_text(text):
+                    if self._interrupted:
+                        break
+                    await self._synthesize_segment(segment, voice_id, **kwargs)
+            else:
+                if not self._interrupted:
+                    await self._synthesize_segment(text, voice_id, **kwargs)
+
+        except Exception as e:
+            logger.error("Azure TTS synthesis failed: %s", str(e), exc_info=True)
+            self.emit("error", f"Azure TTS synthesis failed: {str(e)}")
+            raise
+
+    async def _synthesize_segment(
+        self, text: str, voice_id: Optional[str] = None, **kwargs: Any
+    ) -> None:
+        if not text.strip() or self._interrupted:
+            return
+
+        try:
+
+            headers = {
+                "Content-Type": "application/ssml+xml",
+                "X-Microsoft-OutputFormat": self.AZURE_OUTPUT_FORMAT,
+                "User-Agent": "VideoSDK Agents",
+            }
+
+            if self.speech_auth_token:
+                headers["Authorization"] = f"Bearer {self.speech_auth_token}"
+            elif self.speech_key:
+                headers["Ocp-Apim-Subscription-Key"] = self.speech_key
+
+            ssml_data = self._build_ssml(text, voice_id or self.voice)
+
+            response = await self._get_http_client().post(
+                url=self._get_endpoint_url(),
+                headers=headers,
+                content=ssml_data,
+            )
+            response.raise_for_status()
+
+            audio_data = b""
+            async for chunk in response.aiter_bytes(chunk_size=8192):
+                if self._interrupted:
+                    break
+                if chunk:
+                    audio_data += chunk
+
+            if audio_data and not self._interrupted:
+                await self._stream_audio_chunks(audio_data)
+
+        except httpx.TimeoutException:
+            logger.error("Azure TTS request timeout")
+            self.emit("error", "Azure TTS request timeout")
+        except httpx.HTTPStatusError as e:
+            logger.error("Azure TTS HTTP error: %s - %s", e.response.status_code, e.response.text)
+            self.emit("error", f"Azure TTS HTTP error: {e.response.status_code} - {e.response.text}")
+        except Exception as e:
+            if not self._interrupted:
+                logger.error("Azure TTS synthesis failed: %s", str(e), exc_info=True)
+                self.emit("error", f"Azure TTS synthesis failed: {str(e)}")
+                raise
+
+    def _build_ssml(self, text: str, voice: str) -> str:
+        lang = self.language or "en-US"
+        ssml = (
+            f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" '
+            f'xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang="{lang}">'
+        )
+        ssml += f'<voice name="{voice}">'
+
+        if self.style:
+            degree = f' styledegree="{self.style.degree}"' if self.style.degree else ""
+            ssml += f'<mstts:express-as style="{self.style.style}"{degree}>'
+
+        if self.tuning:
+            t = self.tuning
+            rate_attr = f' rate="{t.rate}"' if t.rate is not None else ""
+            vol_attr = f' volume="{t.volume}"' if t.volume is not None else ""
+            pitch_attr = f' pitch="{t.pitch}"' if t.pitch is not None else ""
+            ssml += f"<prosody{rate_attr}{vol_attr}{pitch_attr}>{text}</prosody>"
+        else:
+            ssml += text
+
+        if self.style:
+            ssml += "</mstts:express-as>"
+
+        ssml += "</voice></speak>"
+        return ssml
+
+    async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
+        chunk_size = int(self.FIXED_SAMPLE_RATE * 2 * 20 / 1000)
+        for i in range(0, len(audio_bytes), chunk_size):
+            if self._interrupted: break
+            chunk = audio_bytes[i : i + chunk_size]
+            if len(chunk) < chunk_size and len(chunk) > 0:
+                padding_needed = chunk_size - len(chunk)
+                chunk += b"\x00" * padding_needed
+            if len(chunk) == chunk_size:
+                if not self._first_chunk_sent and self._first_audio_callback:
+                    self._first_chunk_sent = True
+                    await self._first_audio_callback()
+                if self.audio_track:
+                    asyncio.create_task(self.audio_track.add_new_bytes(chunk))
+                await asyncio.sleep(0.001)
+
+    async def interrupt(self) -> None:
+        self._interrupted = True
+        if self.audio_track:
+            self.audio_track.interrupt()
+
+    async def aclose(self) -> None:
+        if self._http_client:
+            await self._http_client.aclose()
+            self._http_client = None
+        await super().aclose()
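
A minimal sketch of how the tuning and style objects combine, assuming key/region auth and assuming the base `TTS` class can be constructed outside a pipeline; the commented SSML shows roughly what `_build_ssml` (a private helper, used here only for illustration) produces for this configuration, with the `prosody` element nested inside `mstts:express-as`.

```python
from videosdk.plugins.azure.tts import AzureTTS, VoiceTuning, SpeakingStyle

# Placeholder credentials; AZURE_SPEECH_KEY / AZURE_SPEECH_REGION work too.
tts = AzureTTS(
    voice="en-US-EmmaNeural",
    tuning=VoiceTuning(rate="fast", volume=80.0, pitch="medium"),
    style=SpeakingStyle(style="cheerful", degree=1.5),
    speech_key="<azure-speech-key>",
    speech_region="eastus",
)

# Roughly what the request body looks like for this configuration:
# <speak ...><voice name="en-US-EmmaNeural">
#   <mstts:express-as style="cheerful" styledegree="1.5">
#     <prosody rate="fast" volume="80.0" pitch="medium">Hello there</prosody>
#   </mstts:express-as></voice></speak>
print(tts._build_ssml("Hello there", tts.voice))
```

Note that synthesis itself needs the framework-provided `audio_track` and `loop`; outside a pipeline, `synthesize()` just emits an error event and returns.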

--- /dev/null
+++ videosdk_plugins_azure-0.0.61/videosdk/plugins/azure/voice_live.py
@@ -0,0 +1,542 @@
+from __future__ import annotations
+
+import asyncio
+import os
+import logging
+import traceback
+import base64
+from typing import Any, Optional, Literal, List, Union, Dict
+from dataclasses import dataclass, field
+import numpy as np
+from scipy import signal
+from dotenv import load_dotenv
+
+from videosdk.agents import (
+    Agent,
+    CustomAudioStreamTrack,
+    RealtimeBaseModel,
+    realtime_metrics_collector,
+)
+from videosdk.agents.event_bus import global_event_emitter
+
+from azure.core.credentials import AzureKeyCredential, TokenCredential
+from azure.identity import DefaultAzureCredential
+from azure.ai.voicelive.aio import connect
+from azure.ai.voicelive.models import (
+    RequestSession,
+    ServerVad,
+    AzureStandardVoice,
+    Modality,
+    InputAudioFormat,
+    ServerEventType,
+)
+
+load_dotenv()
+
+logger = logging.getLogger(__name__)
+
+AZURE_VOICE_LIVE_SAMPLE_RATE = 24000
+VIDEOSDK_INPUT_SAMPLE_RATE = 48000
+
+AzureVoiceLiveEventTypes = Literal["user_speech_started", "text_response", "error"]
+
+
+@dataclass
+class AzureVoiceLiveConfig:
+    """Configuration for Azure Voice Live API (Beta)
+
+    Args:
+        voice: Voice ID for audio output. Can be Azure voice (e.g., 'en-US-AvaNeural') or OpenAI voice ('alloy', 'echo', etc.). Default is 'en-US-AvaNeural'
+        modalities: List of enabled response types. Options: [Modality.TEXT, Modality.AUDIO]. Default includes both
+        input_audio_format: Audio format for input. Default is AudioFormat.PCM16
+        output_audio_format: Audio format for output. Default is AudioFormat.PCM16
+        turn_detection_threshold: Voice activity detection threshold (0.0-1.0). Default is 0.5
+        turn_detection_prefix_padding_ms: Padding before speech start (ms). Default is 300
+        turn_detection_silence_duration_ms: Silence duration to mark end (ms). Default is 500
+        temperature: Controls randomness in response generation. Higher values make output more random. Default is None
+        max_completion_tokens: Maximum number of tokens in response. Default is None
+    """
+
+    voice: str = "en-US-AvaNeural"
+    modalities: List[Modality] = field(
+        default_factory=lambda: [Modality.TEXT, Modality.AUDIO]
+    )
+    input_audio_format: InputAudioFormat = InputAudioFormat.PCM16
+    output_audio_format: InputAudioFormat = InputAudioFormat.PCM16
+    turn_detection_threshold: float = 0.5
+    turn_detection_prefix_padding_ms: int = 300
+    turn_detection_silence_duration_ms: int = 500
+    temperature: Optional[float] = None
+    max_completion_tokens: Optional[int] = None
+
+
+@dataclass
+class AzureVoiceLiveSession:
+    """Represents an Azure Voice Live session"""
+
+    connection: Any
+    session_id: Optional[str] = None
+    tasks: list[asyncio.Task] = field(default_factory=list)
+
+
+class AzureVoiceLive(RealtimeBaseModel[AzureVoiceLiveEventTypes]):
+    """Azure Voice Live realtime model implementation"""
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        endpoint: str | None = None,
+        model: str,
+        config: AzureVoiceLiveConfig | None = None,
+        credential: Union[AzureKeyCredential, TokenCredential] | None = None,
+    ) -> None:
+        """
+        Initialize Azure Voice Live realtime model.
+
+        Args:
+            api_key: Azure Voice Live API key. If not provided, will attempt to read from AZURE_VOICE_LIVE_API_KEY env var
+            endpoint: Azure Voice Live endpoint. If not provided, will attempt to read from AZURE_VOICE_LIVE_ENDPOINT env var
+            model: The model identifier to use (e.g., 'gpt-4o-realtime-preview')
+            config: Optional configuration object for customizing model behavior. Contains settings for:
+                - voice: Voice ID for audio output (Azure or OpenAI voices)
+                - modalities: List of enabled response types [TEXT, AUDIO]
+                - turn_detection: Voice activity detection settings
+                - temperature: Response randomness control
+            credential: Azure credential object. If provided, takes precedence over api_key
+
+        Raises:
+            ValueError: If no API key or credential is provided and none found in environment variables
+        """
+        super().__init__()
+        self.model = model
+        self.endpoint = endpoint or os.getenv(
+            "AZURE_VOICE_LIVE_ENDPOINT", "wss://api.voicelive.com/v1"
+        )
+
+        if credential:
+            self.credential = credential
+        elif api_key:
+            self.credential = AzureKeyCredential(api_key)
+        else:
+            env_api_key = os.getenv("AZURE_VOICE_LIVE_API_KEY")
+            if env_api_key:
+                self.credential = AzureKeyCredential(env_api_key)
+            else:
+                try:
+                    self.credential = DefaultAzureCredential()
+                except Exception:
+                    self.emit("error", "Azure Voice Live credentials required")
+                    raise ValueError(
+                        "Azure Voice Live credentials required. Provide api_key, credential, or set AZURE_VOICE_LIVE_API_KEY environment variable"
+                    )
+
+        self._session: Optional[AzureVoiceLiveSession] = None
+        self._closing = False
+        self._instructions: str = (
+            "You are a helpful voice assistant that can answer questions and help with tasks."
+        )
+        self.loop = None
+        self.audio_track: Optional[CustomAudioStreamTrack] = None
+        self.config: AzureVoiceLiveConfig = config or AzureVoiceLiveConfig()
+        self.input_sample_rate = VIDEOSDK_INPUT_SAMPLE_RATE
+        self.target_sample_rate = AZURE_VOICE_LIVE_SAMPLE_RATE
+        self._agent_speaking = False
+        self._user_speaking = False
+        self.session_ready = False
+        self._session_ready_event = asyncio.Event()
+
+    def set_agent(self, agent: Agent) -> None:
+        """Set the agent configuration"""
+        self._instructions = agent.instructions
+
+    async def connect(self) -> None:
+        """Connect to Azure Voice Live API"""
+        if self._session:
+            await self._cleanup_session(self._session)
+            self._session = None
+
+        self._closing = False
+
+        try:
+            if (
+                not self.audio_track
+                and self.loop
+                and Modality.AUDIO in self.config.modalities
+            ):
+                self.audio_track = CustomAudioStreamTrack(self.loop)
+            elif not self.loop and Modality.AUDIO in self.config.modalities:
+                self.emit(
+                    "error", "Event loop not initialized. Audio playback will not work."
+                )
+                raise RuntimeError(
+                    "Event loop not initialized. Audio playback will not work."
+                )
+
+            session = await self._create_session()
+            if session:
+                self._session = session
+
+            if self._session:
+                asyncio.create_task(
+                    self._process_events(), name="azure-voice-live-events"
+                )
+                try:
+                    logger.info("Waiting for Azure Voice Live session to be ready...")
+                    await asyncio.wait_for(
+                        self._session_ready_event.wait(), timeout=10.0
+                    )
+                    logger.info("Azure Voice Live session is ready.")
+                except asyncio.TimeoutError:
+                    self.emit("error", "Azure Voice Live session ready timeout")
+                    raise RuntimeError(
+                        "Azure Voice Live session did not become ready in time"
+                    )
+
+        except Exception as e:
+            self.emit("error", f"Error connecting to Azure Voice Live API: {e}")
+            traceback.print_exc()
+            raise
+
+    async def _create_session(self) -> AzureVoiceLiveSession:
+        """Create a new Azure Voice Live session"""
+        try:
+            connection_cm = connect(
+                endpoint=self.endpoint,
+                credential=self.credential,
+                model=self.model,
+                connection_options={
+                    "max_msg_size": 10 * 1024 * 1024,
+                    "heartbeat": 20,
+                    "timeout": 20,
+                },
+            )
+
+            connection = await connection_cm.__aenter__()
+
+            await self._setup_session(connection)
+
+            return AzureVoiceLiveSession(
+                connection=connection, session_id=None, tasks=[]
+            )
+
+        except Exception as e:
+            self.emit("error", f"Failed to create Azure Voice Live session: {e}")
+            traceback.print_exc()
+            raise
+
+    async def _setup_session(self, connection) -> None:
+        """Configure the Azure Voice Live session"""
+        logger.info("Setting up Azure Voice Live session...")
+
+        voice_config: Union[AzureStandardVoice, str]
+        if (
+            self.config.voice.startswith("en-US-")
+            or self.config.voice.startswith("en-CA-")
+            or "-" in self.config.voice
+        ):
+            voice_config = AzureStandardVoice(
+                name=self.config.voice, type="azure-standard"
+            )
+        else:
+            voice_config = self.config.voice
+
+        turn_detection_config = ServerVad(
+            threshold=self.config.turn_detection_threshold,
+            prefix_padding_ms=self.config.turn_detection_prefix_padding_ms,
+            silence_duration_ms=self.config.turn_detection_silence_duration_ms,
+        )
+
+        session_config = RequestSession(
+            modalities=self.config.modalities,
+            instructions=self._instructions,
+            voice=voice_config,
+            input_audio_format=self.config.input_audio_format,
+            output_audio_format=self.config.output_audio_format,
+            turn_detection=turn_detection_config,
+        )
+
+        if self.config.temperature is not None:
+            session_config.temperature = self.config.temperature
+        if self.config.max_completion_tokens is not None:
+            session_config.max_completion_tokens = self.config.max_completion_tokens
+
+        await connection.session.update(session=session_config)
+        logger.info("Azure Voice Live session configuration sent")
+
+    async def _process_events(self) -> None:
+        """Process events from the Azure Voice Live connection"""
+        try:
+            if not self._session or not self._session.connection:
+                return
+
+            async for event in self._session.connection:
+                if self._closing:
+                    break
+                await self._handle_event(event)
+
+        except asyncio.CancelledError:
+            logger.info("Event processing cancelled")
+        except Exception as e:
+            self.emit("error", f"Error processing events: {e}")
+            traceback.print_exc()
+
+    async def _handle_event(self, event) -> None:
+        """Handle different types of events from Azure Voice Live"""
+        try:
+            logger.debug(f"Received event: {event.type}")
+
+            if event.type == ServerEventType.SESSION_UPDATED:
+                logger.info(f"Session ready: {event.session.id}")
+                if self._session:
+                    self._session.session_id = event.session.id
+                self.session_ready = True
+                self._session_ready_event.set()
+
+            elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED:
+                logger.info("User started speaking")
+                if not self._user_speaking:
+                    await realtime_metrics_collector.set_user_speech_start()
+                    self._user_speaking = True
+                    self.emit("user_speech_started", {"type": "done"})
+
+                if self.audio_track and Modality.AUDIO in self.config.modalities:
+                    self.audio_track.interrupt()
+
+                if self._session and self._session.connection:
+                    try:
+                        await self._session.connection.response.cancel()
+                    except Exception as e:
+                        logger.debug(f"No response to cancel: {e}")
+
+            elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED:
+                logger.info("User stopped speaking")
+                if self._user_speaking:
+                    await realtime_metrics_collector.set_user_speech_end()
+                    self._user_speaking = False
+
+            elif event.type == ServerEventType.RESPONSE_CREATED:
+                logger.info("Assistant response created")
+
+            elif event.type == ServerEventType.RESPONSE_AUDIO_DELTA:
+                logger.debug("Received audio delta")
+                if Modality.AUDIO in self.config.modalities:
+                    if not self._agent_speaking:
+                        await realtime_metrics_collector.set_agent_speech_start()
+                        self._agent_speaking = True
+
+                    if self.audio_track and self.loop:
+                        asyncio.create_task(self.audio_track.add_new_bytes(event.delta))
+
+            elif event.type == ServerEventType.RESPONSE_AUDIO_DONE:
+                logger.info("Assistant finished speaking")
+                if self._agent_speaking:
+                    await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+                    self._agent_speaking = False
+
+            elif event.type == ServerEventType.RESPONSE_TEXT_DELTA:
+                if hasattr(self, "_current_text_response"):
+                    self._current_text_response += event.delta
+                else:
+                    self._current_text_response = event.delta
+
+            elif event.type == ServerEventType.RESPONSE_TEXT_DONE:
+                if hasattr(self, "_current_text_response"):
+                    global_event_emitter.emit(
+                        "text_response",
+                        {"text": self._current_text_response, "type": "done"},
+                    )
+                    await realtime_metrics_collector.set_agent_response(
+                        self._current_text_response
+                    )
+                    try:
+                        self.emit(
+                            "realtime_model_transcription",
+                            {
+                                "role": "agent",
+                                "text": self._current_text_response,
+                                "is_final": True,
+                            },
+                        )
+                    except Exception:
+                        pass
+                    self._current_text_response = ""
+
+            elif event.type == ServerEventType.RESPONSE_DONE:
+                logger.info("Response complete")
+                if self._agent_speaking:
+                    await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+                    self._agent_speaking = False
+
+            elif event.type == ServerEventType.ERROR:
+                logger.error(f"Azure Voice Live error: {event.error.message}")
+                self.emit("error", f"Azure Voice Live error: {event.error.message}")
+
+            elif event.type == ServerEventType.CONVERSATION_ITEM_CREATED:
+                logger.debug(f"Conversation item created: {event.item.id}")
+
+                if (
+                    hasattr(event.item, "content")
+                    and event.item.content
+                    and hasattr(event.item.content[0], "transcript")
+                ):
+                    transcript = event.item.content[0].transcript
+                    if transcript and event.item.role == "user":
+                        await realtime_metrics_collector.set_user_transcript(transcript)
+                        try:
+                            self.emit(
+                                "realtime_model_transcription",
+                                {"role": "user", "text": transcript, "is_final": True},
+                            )
+                        except Exception:
+                            pass
+
+            else:
+                logger.debug(f"Unhandled event type: {event.type}")
+
+        except Exception as e:
+            self.emit("error", f"Error handling event {event.type}: {e}")
+            traceback.print_exc()
+
+    async def handle_audio_input(self, audio_data: bytes) -> None:
+        """Handle incoming audio data from the user"""
+        if not self._session or self._closing or not self.session_ready:
+            return
+
+        if Modality.AUDIO not in self.config.modalities:
+            return
+
+        try:
+            audio_array = np.frombuffer(audio_data, dtype=np.int16)
+
+            if len(audio_array) % 2 == 0:
+                audio_array = audio_array.reshape(-1, 2)
+                audio_array = np.mean(audio_array, axis=1).astype(np.int16)
+
+            target_length = int(
+                len(audio_array) * self.target_sample_rate / self.input_sample_rate
+            )
+            resampled_float = signal.resample(
+                audio_array.astype(np.float32), target_length
+            )
+            resampled_int16 = np.clip(resampled_float, -32768, 32767).astype(np.int16)
+            resampled_bytes = resampled_int16.tobytes()
+
+            encoded_audio = base64.b64encode(resampled_bytes).decode("utf-8")
+
+            await self._session.connection.input_audio_buffer.append(
+                audio=encoded_audio
+            )
+
+        except Exception as e:
+            self.emit("error", f"Error processing audio input: {e}")
+
+    async def interrupt(self) -> None:
+        """Interrupt current response"""
+        if not self._session or self._closing:
+            return
+
+        try:
+            if self._session.connection:
+                await self._session.connection.response.cancel()
+
+            if self.audio_track and Modality.AUDIO in self.config.modalities:
+                self.audio_track.interrupt()
+
+            await realtime_metrics_collector.set_interrupted()
+
+            if self._agent_speaking:
+                await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
+                self._agent_speaking = False
+
+        except Exception as e:
+            self.emit("error", f"Interrupt error: {e}")
+
+    async def send_message(self, message: str) -> None:
+        """Send a text message to get audio response"""
+        retry_count = 0
+        max_retries = 5
+        while not self._session or not self.session_ready:
+            if retry_count >= max_retries:
+                raise RuntimeError(
+                    "No active Azure Voice Live session after maximum retries"
+                )
+            logger.debug("No active session, waiting for connection...")
+            await asyncio.sleep(1)
+            retry_count += 1
+
+        try:
+            await self._session.connection.conversation.item.create(
+                item={
+                    "type": "message",
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"Repeat the user's exact message back to them [DO NOT ADD ANYTHING ELSE]: {message}",
+                        }
+                    ],
+                }
+            )
+
+            await self._session.connection.response.create()
+
+        except Exception as e:
+            self.emit("error", f"Error sending message: {e}")
+
+    async def send_text_message(self, message: str) -> None:
+        """Send a text message for text-only communication"""
+        retry_count = 0
+        max_retries = 5
+        while not self._session or not self.session_ready:
+            if retry_count >= max_retries:
+                raise RuntimeError(
+                    "No active Azure Voice Live session after maximum retries"
+                )
+            logger.debug("No active session, waiting for connection...")
+            await asyncio.sleep(1)
+            retry_count += 1
+
+        try:
+            await self._session.connection.conversation.item.create(
+                item={
+                    "type": "message",
+                    "role": "user",
+                    "content": [{"type": "input_text", "text": message}],
+                }
+            )
+
+            await self._session.connection.response.create()
+
+        except Exception as e:
+            self.emit("error", f"Error sending text message: {e}")
+
+    async def _cleanup_session(self, session: AzureVoiceLiveSession) -> None:
+        """Clean up session resources"""
+        for task in session.tasks:
+            if not task.done():
+                task.cancel()
+
+        try:
+            if session.connection:
+                if hasattr(session.connection, "close"):
+                    await session.connection.close()
+        except Exception as e:
+            self.emit("error", f"Error closing session: {e}")
+
+    async def aclose(self) -> None:
+        """Clean up all resources"""
+        if self._closing:
+            return
+
+        self._closing = True
+
+        if self._session:
+            await self._cleanup_session(self._session)
+            self._session = None
+
+        if hasattr(self.audio_track, "cleanup") and self.audio_track:
+            try:
+                await self.audio_track.cleanup()
+            except Exception as e:
+                self.emit("error", f"Error cleaning up audio track: {e}")
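
To round out the section, a hedged standalone sketch of the realtime model; in practice the VideoSDK agent pipeline owns this object and assigns the event loop and audio track itself, so the manual `loop` assignment below is an assumption made only so `connect()` can build the audio track. Credentials are placeholders, and the model id mirrors the example in the class docstring.

```python
import asyncio
from azure.ai.voicelive.models import Modality
from videosdk.plugins.azure.voice_live import AzureVoiceLive, AzureVoiceLiveConfig

async def main() -> None:
    config = AzureVoiceLiveConfig(
        voice="en-US-AvaNeural",
        modalities=[Modality.TEXT, Modality.AUDIO],
        turn_detection_silence_duration_ms=500,
        temperature=0.7,
    )
    model = AzureVoiceLive(
        model="gpt-4o-realtime-preview",   # example id from the docstring
        api_key="<voice-live-api-key>",    # placeholder; or export AZURE_VOICE_LIVE_API_KEY
        config=config,                     # endpoint falls back to AZURE_VOICE_LIVE_ENDPOINT / default
    )
    # Normally set by the agent pipeline; done manually here so connect()
    # can construct the CustomAudioStreamTrack for audio output.
    model.loop = asyncio.get_running_loop()

    await model.connect()                    # waits for SESSION_UPDATED (10 s timeout)
    await model.send_text_message("Hello!")  # user text in, audio/text response out
    await asyncio.sleep(5)                   # give the response time to stream back
    await model.aclose()

asyncio.run(main())
```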