videosdk_plugins_azure-0.0.58-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@ videosdk/plugins/azure/__init__.py
1
+ from .stt import AzureSTT
2
+ from .tts import AzureTTS, VoiceTuning, SpeakingStyle
3
+ from .voice_live import AzureVoiceLive, AzureVoiceLiveConfig
4
+
5
+ __all__ = ["AzureSTT", "AzureTTS", "VoiceTuning", "SpeakingStyle", "AzureVoiceLive", "AzureVoiceLiveConfig"]
@@ -0,0 +1,277 @@ videosdk/plugins/azure/stt.py
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ import time
6
+ import threading
7
+ from dataclasses import dataclass
8
+ from typing import Any, Optional, List
9
+
10
+ import azure.cognitiveservices.speech as speechsdk
11
+
12
+ from videosdk.agents import (
13
+ STT as BaseSTT,
14
+ STTResponse,
15
+ SpeechEventType,
16
+ SpeechData,
17
+ global_event_emitter,
18
+ )
19
+
20
+ import logging
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ try:
25
+ from scipy import signal
26
+ import numpy as np
27
+
28
+ SCIPY_AVAILABLE = True
29
+ except ImportError:
30
+ SCIPY_AVAILABLE = False
31
+
32
+
33
+ @dataclass
34
+ class AzureSTTConfig:
35
+ """Configuration for Azure STT"""
36
+
37
+ speech_key: str
38
+ speech_region: str
39
+ language: str = "en-US"
40
+ sample_rate: int = 16000
41
+ enable_phrase_list: bool = False
42
+ phrase_list: Optional[List[str]] = None
43
+
44
+
45
+ class AzureSTT(BaseSTT):
46
+ def __init__(
47
+ self,
48
+ *,
49
+ speech_key: Optional[str] = None,
50
+ speech_region: Optional[str] = None,
51
+ language: str = "en-US",
52
+ sample_rate: int = 16000,
53
+ enable_phrase_list: bool = False,
54
+ phrase_list: Optional[List[str]] = None,
55
+ **kwargs: Any,
56
+ ) -> None:
57
+ """Initialize the Azure STT plugin.
58
+
59
+ Args:
60
+ speech_key (Optional[str]): Azure Speech API key. Uses AZURE_SPEECH_KEY environment variable if not provided.
61
+ speech_region (Optional[str]): Azure Speech region. Uses AZURE_SPEECH_REGION environment variable if not provided.
62
+ language (str): The language to use for the STT plugin. Defaults to "en-US".
63
+ sample_rate (int): Sample rate to use for the STT plugin. Defaults to 16000.
64
+ enable_phrase_list (bool): Whether to enable phrase list for better recognition. Defaults to False.
65
+ phrase_list (Optional[List[str]]): List of phrases to boost recognition. Defaults to None.
66
+ """
67
+ super().__init__()
68
+
69
+ if not SCIPY_AVAILABLE:
70
+ raise ImportError(
71
+ "scipy and numpy are required for Azure STT. Please install with 'pip install scipy numpy'"
72
+ )
73
+
74
+ self.speech_key = speech_key or os.getenv("AZURE_SPEECH_KEY")
75
+ self.speech_region = speech_region or os.getenv("AZURE_SPEECH_REGION")
76
+
77
+ if not self.speech_key or not self.speech_region:
78
+ raise ValueError(
79
+ "Azure Speech key and region must be provided either through parameters or "
80
+ "AZURE_SPEECH_KEY and AZURE_SPEECH_REGION environment variables"
81
+ )
82
+
83
+ self.config = AzureSTTConfig(
84
+ speech_key=self.speech_key,
85
+ speech_region=self.speech_region,
86
+ language=language,
87
+ sample_rate=sample_rate,
88
+ enable_phrase_list=enable_phrase_list,
89
+ phrase_list=phrase_list,
90
+ )
91
+
92
+ self.input_sample_rate = 48000
93
+ self.target_sample_rate = sample_rate
94
+
95
+ self._speech_processor: Optional[speechsdk.SpeechRecognizer] = None
96
+ self._audio_stream: Optional[speechsdk.audio.PushAudioInputStream] = None
97
+ self._is_speaking = False
98
+ self._last_speech_time = 0.0
99
+
100
+ self._loop = asyncio.get_running_loop()
101
+ self._event_queue = asyncio.Queue()
102
+ self._processing_task: Optional[asyncio.Task] = None
103
+
104
+ async def process_audio(
105
+ self, audio_frames: bytes, language: Optional[str] = None, **kwargs: Any
106
+ ) -> None:
107
+ """Process audio frames and send to Azure Speech Service"""
108
+ try:
109
+ if not self._speech_processor:
110
+ await self._setup_speech_processor(language)
111
+
112
+ if self._audio_stream and SCIPY_AVAILABLE:
113
+ audio_data = np.frombuffer(audio_frames, dtype=np.int16)
114
+
115
+ if len(audio_data) > 0:
116
+ stereo_data = audio_data.reshape(-1, 2)
117
+ mono_data = stereo_data.mean(axis=1)
118
+
119
+ resampled_data = signal.resample(
120
+ mono_data,
121
+ int(
122
+ len(mono_data)
123
+ * self.target_sample_rate
124
+ / self.input_sample_rate
125
+ ),
126
+ )
127
+ resampled_bytes = resampled_data.astype(np.int16).tobytes()
128
+ self._audio_stream.write(resampled_bytes)
129
+
130
+ except Exception as e:
131
+ logger.error(f"Error in process_audio: {str(e)}")
132
+ self.emit("error", str(e))
133
+ await self._cleanup_speech_processor()
134
+
135
+ async def _setup_speech_processor(self, language: Optional[str] = None) -> None:
136
+ """Setup Azure speech processor"""
137
+ try:
138
+ self._processing_task = self._loop.create_task(self._process_events())
139
+
140
+ speech_config = speechsdk.SpeechConfig(
141
+ subscription=self.config.speech_key, region=self.config.speech_region
142
+ )
143
+ speech_config.speech_recognition_language = language or self.config.language
144
+
145
+ stream_format = speechsdk.audio.AudioStreamFormat(
146
+ samples_per_second=self.config.sample_rate,
147
+ bits_per_sample=16,
148
+ channels=1,
149
+ )
150
+ self._audio_stream = speechsdk.audio.PushAudioInputStream(
151
+ stream_format=stream_format
152
+ )
153
+
154
+ audio_config = speechsdk.audio.AudioConfig(stream=self._audio_stream)
155
+
156
+ self._speech_processor = speechsdk.SpeechRecognizer(
157
+ speech_config=speech_config, audio_config=audio_config
158
+ )
159
+
160
+ if self.config.enable_phrase_list and self.config.phrase_list:
161
+ phrase_list_grammar = speechsdk.PhraseListGrammar.from_recognizer(
162
+ self._speech_processor
163
+ )
164
+ for phrase in self.config.phrase_list:
165
+ phrase_list_grammar.addPhrase(phrase)
166
+
167
+ self._speech_processor.recognized.connect(self._on_final_transcript)
168
+ self._speech_processor.recognizing.connect(self._on_interim_transcript)
169
+ self._speech_processor.speech_start_detected.connect(self._on_user_started_speaking)
170
+ self._speech_processor.speech_end_detected.connect(self._on_user_stopped_speaking)
171
+ self._speech_processor.canceled.connect(self._on_speech_processing_error)
172
+
173
+ self._speech_processor.start_continuous_recognition()
174
+ logger.info("Azure STT speech processor started")
175
+
176
+ except Exception as e:
177
+ logger.error(f"Failed to setup speech processor: {str(e)}")
178
+ raise
179
+
180
+ def _on_final_transcript(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
181
+ """Handle final recognition results"""
182
+ text = evt.result.text.strip()
183
+ if not text:
184
+ return
185
+
186
+ if self._transcript_callback:
187
+ response = STTResponse(
188
+ event_type=SpeechEventType.FINAL,
189
+ data=SpeechData(
190
+ text=text, language=self.config.language, confidence=1.0
191
+ ),
192
+ metadata={"provider": "azure", "result_reason": str(evt.result.reason)},
193
+ )
194
+ self._event_queue.put_nowait(response)
195
+
196
+ def _on_interim_transcript(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
197
+ """Handle interim recognition results"""
198
+ text = evt.result.text.strip()
199
+ if not text:
200
+ return
201
+
202
+ if self._transcript_callback:
203
+ response = STTResponse(
204
+ event_type=SpeechEventType.INTERIM,
205
+ data=SpeechData(
206
+ text=text, language=self.config.language, confidence=0.5
207
+ ),
208
+ metadata={"provider": "azure", "result_reason": str(evt.result.reason)},
209
+ )
210
+ self._event_queue.put_nowait(response)
211
+
212
+ def _on_user_started_speaking(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
213
+ """Handle speech start detection"""
214
+ if self._is_speaking:
215
+ return
216
+
217
+ self._is_speaking = True
218
+ current_time = time.time()
219
+
220
+ if self._last_speech_time == 0.0:
221
+ self._last_speech_time = current_time
222
+ else:
223
+ if current_time - self._last_speech_time < 1.0:
224
+ global_event_emitter.emit("speech_started")
225
+
226
+ self._last_speech_time = current_time
227
+
228
+ def _on_user_stopped_speaking(self, evt: speechsdk.SpeechRecognitionEventArgs) -> None:
229
+ """Handle speech end detection"""
230
+ if not self._is_speaking:
231
+ return
232
+
233
+ self._is_speaking = False
234
+ global_event_emitter.emit("speech_stopped")
235
+
236
+ def _on_speech_processing_error(self, evt: speechsdk.SpeechRecognitionCanceledEventArgs) -> None:
237
+ """Handle speech processing errors and cancellations"""
238
+ if evt.cancellation_details.reason == speechsdk.CancellationReason.Error:
239
+ error_msg = f"Speech recognition canceled due to error: {evt.cancellation_details.error_details}"
240
+ logger.error(error_msg)
241
+ self.emit("error", error_msg)
242
+
243
+ async def _process_events(self) -> None:
244
+ """Process STT events from the queue"""
245
+ while True:
246
+ try:
247
+ response = await self._event_queue.get()
248
+ if self._transcript_callback:
249
+ await self._transcript_callback(response)
250
+ except asyncio.CancelledError:
251
+ break
252
+ except Exception as e:
253
+ logger.error("Error processing STT event: %s", str(e), exc_info=True)
254
+
255
+ async def _cleanup_speech_processor(self) -> None:
256
+ """Cleanup speech processor resources"""
257
+ try:
258
+ if self._speech_processor:
259
+ self._speech_processor.stop_continuous_recognition()
260
+ self._speech_processor = None
261
+
262
+ if self._audio_stream:
263
+ self._audio_stream.close()
264
+ self._audio_stream = None
265
+
266
+ except Exception as e:
267
+ logger.error(f"Error during speech processor cleanup: {str(e)}")
268
+
269
+ async def aclose(self) -> None:
270
+ """Cleanup resources"""
271
+ if self._processing_task:
272
+ self._processing_task.cancel()
273
+ await asyncio.gather(self._processing_task, return_exceptions=True)
274
+
275
+ await self._cleanup_speech_processor()
276
+ logger.info("Azure STT closed")
277
+ await super().aclose()
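The `process_audio` path above assumes 48 kHz interleaved stereo int16 PCM from VideoSDK and converts it to the 16 kHz mono stream declared to Azure's `PushAudioInputStream`. A minimal sketch of that conversion using the same scipy/numpy calls; the 20 ms frame size is illustrative:

```python
import numpy as np
from scipy import signal

INPUT_RATE = 48000   # rate the plugin assumes for incoming VideoSDK audio
TARGET_RATE = 16000  # rate declared to Azure's PushAudioInputStream

def to_azure_pcm(frames: bytes) -> bytes:
    """Downmix 48 kHz stereo int16 PCM to 16 kHz mono int16 PCM."""
    audio = np.frombuffer(frames, dtype=np.int16)
    if audio.size == 0:
        return b""
    mono = audio.reshape(-1, 2).mean(axis=1)          # average L/R channels
    target_len = int(len(mono) * TARGET_RATE / INPUT_RATE)
    return signal.resample(mono, target_len).astype(np.int16).tobytes()

# 20 ms at 48 kHz stereo = 960 frames * 2 channels * 2 bytes = 3840 bytes in,
# 320 mono samples * 2 bytes = 640 bytes out.
print(len(to_azure_pcm(b"\x00" * 3840)))  # -> 640
```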
@@ -0,0 +1,313 @@ videosdk/plugins/azure/tts.py
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ from dataclasses import dataclass, field
6
+ from typing import Literal, AsyncIterator, Optional, Any
7
+
8
+ import httpx
9
+
10
+ from videosdk.agents import TTS, segment_text
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ @dataclass
16
+ class VoiceTuning:
17
+ """Configuration for speech tuning (rate, volume, pitch)."""
18
+
19
+ _rate: Literal["x-slow", "slow", "medium", "fast", "x-fast"] | float | None = None
20
+ _volume: Literal["silent", "x-soft", "soft", "medium", "loud", "x-loud"] | float | None = None
21
+ _pitch: Literal["x-low", "low", "medium", "high", "x-high"] | None = None
22
+
23
+ @property
24
+ def rate(self):
25
+ return self._rate
26
+
27
+ @rate.setter
28
+ def rate(self, value):
29
+ if value:
30
+ if isinstance(value, float) and not 0.5 <= value <= 2.0:
31
+ raise ValueError("Rate must be a float between 0.5 and 2.0")
32
+ if isinstance(value, str) and value not in ["x-slow", "slow", "medium", "fast", "x-fast"]:
33
+ raise ValueError("Rate must be one of 'x-slow', 'slow', 'medium', 'fast', 'x-fast'")
34
+ self._rate = value
35
+
36
+ @property
37
+ def volume(self):
38
+ return self._volume
39
+
40
+ @volume.setter
41
+ def volume(self, value):
42
+ if value:
43
+ if isinstance(value, float) and not 0 <= value <= 100.0:
44
+ raise ValueError("Volume must be a float between 0 and 100")
45
+ if isinstance(value, str) and value not in ["silent", "x-soft", "soft", "medium", "loud", "x-loud"]:
46
+ raise ValueError("Volume must be one of 'silent', 'x-soft', 'soft', 'medium', 'loud', 'x-loud'")
47
+ self._volume = value
48
+
49
+ @property
50
+ def pitch(self):
51
+ return self._pitch
52
+
53
+ @pitch.setter
54
+ def pitch(self, value):
55
+ if value and value not in ["x-low", "low", "medium", "high", "x-high"]:
56
+ raise ValueError("Pitch must be one of 'x-low', 'low', 'medium', 'high', 'x-high'")
57
+ self._pitch = value
58
+
59
+ def __init__(self, rate=None, volume=None, pitch=None):
60
+ self.rate = rate
61
+ self.volume = volume
62
+ self.pitch = pitch
63
+
64
+
65
+ @dataclass
66
+ class SpeakingStyle:
67
+ """Configuration for speech expressive style."""
68
+ style: str
69
+ _degree: float | None = None
70
+
71
+ @property
72
+ def degree(self):
73
+ return self._degree
74
+
75
+ @degree.setter
76
+ def degree(self, value: float | None):
77
+ if value is not None and not 0.1 <= value <= 2.0:
78
+ raise ValueError("Style degree must be between 0.1 and 2.0")
79
+ self._degree = value
80
+
81
+ def __init__(self, style: str, degree: float | None = None):
82
+ self.style = style
83
+ self.degree = degree
84
+
85
+
86
+ class AzureTTS(TTS):
87
+ """
88
+ Initialize the Azure TTS plugin.
89
+
90
+ Args:
91
+ voice (str): Name of the Azure neural voice to use (default: "en-US-EmmaNeural").
92
+ For a full list of available voices, see:
93
+ https://eastus2.tts.speech.microsoft.com/cognitiveservices/voices/list
94
+ (requires a GET request with an 'Ocp-Apim-Subscription-Key' header)
95
+ language (str, optional): Language code for the voice (e.g., "en-US"). If not provided, defaults to the voice's language.
96
+ tuning (VoiceTuning, optional): VoiceTuning object to control speech rate, volume, and pitch.
97
+ style (SpeakingStyle, optional): SpeakingStyle object for expressive speech synthesis.
98
+ speech_key (str, optional): Azure Speech API key. If not provided, uses the AZURE_SPEECH_KEY environment variable.
99
+ speech_region (str, optional): Azure Speech region. If not provided, uses the AZURE_SPEECH_REGION environment variable.
100
+ speech_endpoint (str, optional): Custom endpoint URL. If not provided, uses the AZURE_SPEECH_ENDPOINT environment variable.
101
+ deployment_id (str, optional): Custom deployment ID for model deployment scenarios.
102
+ speech_auth_token (str, optional): Azure Speech authorization token for token-based authentication.
103
+
104
+ """
105
+ FIXED_SAMPLE_RATE = 24000
106
+ AZURE_OUTPUT_FORMAT = "raw-24khz-16bit-mono-pcm"
107
+
108
+ def __init__(
109
+ self,
110
+ *,
111
+ voice: str = "en-US-EmmaNeural",
112
+ language: str | None = None,
113
+ tuning: Optional[VoiceTuning] = None,
114
+ style: Optional[SpeakingStyle] = None,
115
+ speech_key: str | None = None,
116
+ speech_region: str | None = None,
117
+ speech_endpoint: str | None = None,
118
+ deployment_id: str | None = None,
119
+ speech_auth_token: str | None = None,
120
+ **kwargs: Any,
121
+ ) -> None:
122
+ super().__init__(
123
+ sample_rate=self.FIXED_SAMPLE_RATE,
124
+ num_channels=1,
125
+ )
126
+
127
+ self.speech_key = speech_key or os.environ.get("AZURE_SPEECH_KEY")
128
+ self.speech_region = speech_region or os.environ.get("AZURE_SPEECH_REGION")
129
+ self.speech_endpoint = speech_endpoint or os.environ.get(
130
+ "AZURE_SPEECH_ENDPOINT"
131
+ )
132
+ self.speech_auth_token = speech_auth_token
133
+ self.deployment_id = deployment_id
134
+
135
+ has_endpoint = bool(self.speech_endpoint)
136
+ has_key_and_region = bool(self.speech_key and self.speech_region)
137
+ has_token_and_region = bool(self.speech_auth_token and self.speech_region)
138
+
139
+ if not (has_endpoint or has_key_and_region or has_token_and_region):
140
+ raise ValueError(
141
+ "Authentication requires one of: speech_endpoint, (speech_key & speech_region), or (speech_auth_token & speech_region)."
142
+ )
143
+
144
+ self.voice = voice
145
+ self.language = language
146
+ self.tuning = tuning
147
+ self.style = style
148
+
149
+ self._first_chunk_sent = False
150
+ self._interrupted = False
151
+ self._http_client: Optional[httpx.AsyncClient] = None
152
+
153
+
154
+ def reset_first_audio_tracking(self) -> None:
155
+ self._first_chunk_sent = False
156
+
157
+ def _get_endpoint_url(self) -> str:
158
+ if self.speech_endpoint:
159
+ base = self.speech_endpoint.rstrip("/")
160
+ if not base.endswith("/cognitiveservices/v1"):
161
+ base = f"{base}/cognitiveservices/v1"
162
+ else:
163
+ base = f"https://{self.speech_region}.tts.speech.microsoft.com/cognitiveservices/v1"
164
+
165
+ if self.deployment_id:
166
+ return f"{base}?deploymentId={self.deployment_id}"
167
+ return base
168
+
169
+ def _get_http_client(self) -> httpx.AsyncClient:
170
+ if not self._http_client:
171
+ self._http_client = httpx.AsyncClient(
172
+ timeout=httpx.Timeout(
173
+ connect=15.0, read=30.0, write=5.0, pool=5.0
174
+ ),
175
+ follow_redirects=True,
176
+ limits=httpx.Limits(
177
+ max_connections=50,
178
+ max_keepalive_connections=50,
179
+ keepalive_expiry=120,
180
+ ),
181
+ )
182
+ return self._http_client
183
+
184
+ async def synthesize(
185
+ self,
186
+ text: AsyncIterator[str] | str,
187
+ voice_id: Optional[str] = None,
188
+ **kwargs: Any,
189
+ ) -> None:
190
+ try:
191
+ if not self.audio_track or not self.loop:
192
+ self.emit("error", "Audio track or event loop not set")
193
+ return
194
+
195
+ self._interrupted = False
196
+
197
+ if isinstance(text, AsyncIterator):
198
+ async for segment in segment_text(text):
199
+ if self._interrupted:
200
+ break
201
+ await self._synthesize_segment(segment, voice_id, **kwargs)
202
+ else:
203
+ if not self._interrupted:
204
+ await self._synthesize_segment(text, voice_id, **kwargs)
205
+
206
+ except Exception as e:
207
+ logger.error("Azure TTS synthesis failed: %s", str(e), exc_info=True)
208
+ self.emit("error", f"Azure TTS synthesis failed: {str(e)}")
209
+ raise
210
+
211
+ async def _synthesize_segment(
212
+ self, text: str, voice_id: Optional[str] = None, **kwargs: Any
213
+ ) -> None:
214
+ if not text.strip() or self._interrupted:
215
+ return
216
+
217
+ try:
218
+
219
+ headers = {
220
+ "Content-Type": "application/ssml+xml",
221
+ "X-Microsoft-OutputFormat": self.AZURE_OUTPUT_FORMAT,
222
+ "User-Agent": "VideoSDK Agents",
223
+ }
224
+
225
+ if self.speech_auth_token:
226
+ headers["Authorization"] = f"Bearer {self.speech_auth_token}"
227
+ elif self.speech_key:
228
+ headers["Ocp-Apim-Subscription-Key"] = self.speech_key
229
+
230
+ ssml_data = self._build_ssml(text, voice_id or self.voice)
231
+
232
+ response = await self._get_http_client().post(
233
+ url=self._get_endpoint_url(),
234
+ headers=headers,
235
+ content=ssml_data,
236
+ )
237
+ response.raise_for_status()
238
+
239
+ audio_data = b""
240
+ async for chunk in response.aiter_bytes(chunk_size=8192):
241
+ if self._interrupted:
242
+ break
243
+ if chunk:
244
+ audio_data += chunk
245
+
246
+ if audio_data and not self._interrupted:
247
+ await self._stream_audio_chunks(audio_data)
248
+
249
+ except httpx.TimeoutException:
250
+ logger.error("Azure TTS request timeout")
251
+ self.emit("error", "Azure TTS request timeout")
252
+ except httpx.HTTPStatusError as e:
253
+ logger.error("Azure TTS HTTP error: %s - %s", e.response.status_code, e.response.text)
254
+ self.emit("error", f"Azure TTS HTTP error: {e.response.status_code} - {e.response.text}")
255
+ except Exception as e:
256
+ if not self._interrupted:
257
+ logger.error("Azure TTS synthesis failed: %s", str(e), exc_info=True)
258
+ self.emit("error", f"Azure TTS synthesis failed: {str(e)}")
259
+ raise
260
+
261
+ def _build_ssml(self, text: str, voice: str) -> str:
262
+ lang = self.language or "en-US"
263
+ ssml = (
264
+ f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" '
265
+ f'xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang="{lang}">'
266
+ )
267
+ ssml += f'<voice name="{voice}">'
268
+
269
+ if self.style:
270
+ degree = f' styledegree="{self.style.degree}"' if self.style.degree else ""
271
+ ssml += f'<mstts:express-as style="{self.style.style}"{degree}>'
272
+
273
+ if self.tuning:
274
+ t = self.tuning
275
+ rate_attr = f' rate="{t.rate}"' if t.rate is not None else ""
276
+ vol_attr = f' volume="{t.volume}"' if t.volume is not None else ""
277
+ pitch_attr = f' pitch="{t.pitch}"' if t.pitch is not None else ""
278
+ ssml += f"<prosody{rate_attr}{vol_attr}{pitch_attr}>{text}</prosody>"
279
+ else:
280
+ ssml += text
281
+
282
+ if self.style:
283
+ ssml += "</mstts:express-as>"
284
+
285
+ ssml += "</voice></speak>"
286
+ return ssml
287
+
288
+ async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
289
+ chunk_size = int(self.FIXED_SAMPLE_RATE * 2 * 20 / 1000)
290
+ for i in range(0, len(audio_bytes), chunk_size):
291
+ if self._interrupted: break
292
+ chunk = audio_bytes[i : i + chunk_size]
293
+ if len(chunk) < chunk_size and len(chunk) > 0:
294
+ padding_needed = chunk_size - len(chunk)
295
+ chunk += b"\x00" * padding_needed
296
+ if len(chunk) == chunk_size:
297
+ if not self._first_chunk_sent and self._first_audio_callback:
298
+ self._first_chunk_sent = True
299
+ await self._first_audio_callback()
300
+ if self.audio_track:
301
+ asyncio.create_task(self.audio_track.add_new_bytes(chunk))
302
+ await asyncio.sleep(0.001)
303
+
304
+ async def interrupt(self) -> None:
305
+ self._interrupted = True
306
+ if self.audio_track:
307
+ self.audio_track.interrupt()
308
+
309
+ async def aclose(self) -> None:
310
+ if self._http_client:
311
+ await self._http_client.aclose()
312
+ self._http_client = None
313
+ await super().aclose()
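For a sense of what `_build_ssml` emits, here is a hedged sketch that constructs `AzureTTS` with a `VoiceTuning` and a `SpeakingStyle` and prints the resulting SSML; the voice, style, and tuning values are illustrative, and `AZURE_SPEECH_KEY` / `AZURE_SPEECH_REGION` must be set for the constructor's auth check to pass:

```python
from videosdk.plugins.azure import AzureTTS, VoiceTuning, SpeakingStyle

# Assumes AZURE_SPEECH_KEY and AZURE_SPEECH_REGION are set in the environment.
tts = AzureTTS(
    voice="en-US-EmmaNeural",
    tuning=VoiceTuning(rate="fast", volume="loud", pitch="high"),
    style=SpeakingStyle(style="cheerful", degree=1.5),
)

print(tts._build_ssml("Hello from VideoSDK.", tts.voice))
# Emitted as a single line; wrapped here for readability:
# <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
#        xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang="en-US">
#   <voice name="en-US-EmmaNeural">
#     <mstts:express-as style="cheerful" styledegree="1.5">
#       <prosody rate="fast" volume="loud" pitch="high">Hello from VideoSDK.</prosody>
#     </mstts:express-as>
#   </voice>
# </speak>
```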
@@ -0,0 +1,2 @@ videosdk/plugins/azure/version.py
1
+ __version__ = "0.0.58"
2
+
@@ -0,0 +1,542 @@ videosdk/plugins/azure/voice_live.py
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ import logging
6
+ import traceback
7
+ import base64
8
+ from typing import Any, Optional, Literal, List, Union, Dict
9
+ from dataclasses import dataclass, field
10
+ import numpy as np
11
+ from scipy import signal
12
+ from dotenv import load_dotenv
13
+
14
+ from videosdk.agents import (
15
+ Agent,
16
+ CustomAudioStreamTrack,
17
+ RealtimeBaseModel,
18
+ realtime_metrics_collector,
19
+ )
20
+ from videosdk.agents.event_bus import global_event_emitter
21
+
22
+ from azure.core.credentials import AzureKeyCredential, TokenCredential
23
+ from azure.identity import DefaultAzureCredential
24
+ from azure.ai.voicelive.aio import connect
25
+ from azure.ai.voicelive.models import (
26
+ RequestSession,
27
+ ServerVad,
28
+ AzureStandardVoice,
29
+ Modality,
30
+ InputAudioFormat,
31
+ ServerEventType,
32
+ )
33
+
34
+ load_dotenv()
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+ AZURE_VOICE_LIVE_SAMPLE_RATE = 24000
39
+ VIDEOSDK_INPUT_SAMPLE_RATE = 48000
40
+
41
+ AzureVoiceLiveEventTypes = Literal["user_speech_started", "text_response", "error"]
42
+
43
+
44
+ @dataclass
45
+ class AzureVoiceLiveConfig:
46
+ """Configuration for Azure Voice Live API (Beta)
47
+
48
+ Args:
49
+ voice: Voice ID for audio output. Can be Azure voice (e.g., 'en-US-AvaNeural') or OpenAI voice ('alloy', 'echo', etc.). Default is 'en-US-AvaNeural'
50
+ modalities: List of enabled response types. Options: [Modality.TEXT, Modality.AUDIO]. Default includes both
51
+ input_audio_format: Audio format for input. Default is InputAudioFormat.PCM16
52
+ output_audio_format: Audio format for output. Default is InputAudioFormat.PCM16
53
+ turn_detection_threshold: Voice activity detection threshold (0.0-1.0). Default is 0.5
54
+ turn_detection_prefix_padding_ms: Padding before speech start (ms). Default is 300
55
+ turn_detection_silence_duration_ms: Silence duration to mark end (ms). Default is 500
56
+ temperature: Controls randomness in response generation. Higher values make output more random. Default is None
57
+ max_completion_tokens: Maximum number of tokens in response. Default is None
58
+ """
59
+
60
+ voice: str = "en-US-AvaNeural"
61
+ modalities: List[Modality] = field(
62
+ default_factory=lambda: [Modality.TEXT, Modality.AUDIO]
63
+ )
64
+ input_audio_format: InputAudioFormat = InputAudioFormat.PCM16
65
+ output_audio_format: InputAudioFormat = InputAudioFormat.PCM16
66
+ turn_detection_threshold: float = 0.5
67
+ turn_detection_prefix_padding_ms: int = 300
68
+ turn_detection_silence_duration_ms: int = 500
69
+ temperature: Optional[float] = None
70
+ max_completion_tokens: Optional[int] = None
71
+
72
+
73
+ @dataclass
74
+ class AzureVoiceLiveSession:
75
+ """Represents an Azure Voice Live session"""
76
+
77
+ connection: Any
78
+ session_id: Optional[str] = None
79
+ tasks: list[asyncio.Task] = field(default_factory=list)
80
+
81
+
82
+ class AzureVoiceLive(RealtimeBaseModel[AzureVoiceLiveEventTypes]):
83
+ """Azure Voice Live realtime model implementation"""
84
+
85
+ def __init__(
86
+ self,
87
+ *,
88
+ api_key: str | None = None,
89
+ endpoint: str | None = None,
90
+ model: str,
91
+ config: AzureVoiceLiveConfig | None = None,
92
+ credential: Union[AzureKeyCredential, TokenCredential] | None = None,
93
+ ) -> None:
94
+ """
95
+ Initialize Azure Voice Live realtime model.
96
+
97
+ Args:
98
+ api_key: Azure Voice Live API key. If not provided, will attempt to read from AZURE_VOICE_LIVE_API_KEY env var
99
+ endpoint: Azure Voice Live endpoint. If not provided, will attempt to read from AZURE_VOICE_LIVE_ENDPOINT env var
100
+ model: The model identifier to use (e.g., 'gpt-4o-realtime-preview')
101
+ config: Optional configuration object for customizing model behavior. Contains settings for:
102
+ - voice: Voice ID for audio output (Azure or OpenAI voices)
103
+ - modalities: List of enabled response types [TEXT, AUDIO]
104
+ - turn_detection: Voice activity detection settings
105
+ - temperature: Response randomness control
106
+ credential: Azure credential object. If provided, takes precedence over api_key
107
+
108
+ Raises:
109
+ ValueError: If no API key or credential is provided and none found in environment variables
110
+ """
111
+ super().__init__()
112
+ self.model = model
113
+ self.endpoint = endpoint or os.getenv(
114
+ "AZURE_VOICE_LIVE_ENDPOINT", "wss://api.voicelive.com/v1"
115
+ )
116
+
117
+ if credential:
118
+ self.credential = credential
119
+ elif api_key:
120
+ self.credential = AzureKeyCredential(api_key)
121
+ else:
122
+ env_api_key = os.getenv("AZURE_VOICE_LIVE_API_KEY")
123
+ if env_api_key:
124
+ self.credential = AzureKeyCredential(env_api_key)
125
+ else:
126
+ try:
127
+ self.credential = DefaultAzureCredential()
128
+ except Exception:
129
+ self.emit("error", "Azure Voice Live credentials required")
130
+ raise ValueError(
131
+ "Azure Voice Live credentials required. Provide api_key, credential, or set AZURE_VOICE_LIVE_API_KEY environment variable"
132
+ )
133
+
134
+ self._session: Optional[AzureVoiceLiveSession] = None
135
+ self._closing = False
136
+ self._instructions: str = (
137
+ "You are a helpful voice assistant that can answer questions and help with tasks."
138
+ )
139
+ self.loop = None
140
+ self.audio_track: Optional[CustomAudioStreamTrack] = None
141
+ self.config: AzureVoiceLiveConfig = config or AzureVoiceLiveConfig()
142
+ self.input_sample_rate = VIDEOSDK_INPUT_SAMPLE_RATE
143
+ self.target_sample_rate = AZURE_VOICE_LIVE_SAMPLE_RATE
144
+ self._agent_speaking = False
145
+ self._user_speaking = False
146
+ self.session_ready = False
147
+ self._session_ready_event = asyncio.Event()
148
+
149
+ def set_agent(self, agent: Agent) -> None:
150
+ """Set the agent configuration"""
151
+ self._instructions = agent.instructions
152
+
153
+ async def connect(self) -> None:
154
+ """Connect to Azure Voice Live API"""
155
+ if self._session:
156
+ await self._cleanup_session(self._session)
157
+ self._session = None
158
+
159
+ self._closing = False
160
+
161
+ try:
162
+ if (
163
+ not self.audio_track
164
+ and self.loop
165
+ and Modality.AUDIO in self.config.modalities
166
+ ):
167
+ self.audio_track = CustomAudioStreamTrack(self.loop)
168
+ elif not self.loop and Modality.AUDIO in self.config.modalities:
169
+ self.emit(
170
+ "error", "Event loop not initialized. Audio playback will not work."
171
+ )
172
+ raise RuntimeError(
173
+ "Event loop not initialized. Audio playback will not work."
174
+ )
175
+
176
+ session = await self._create_session()
177
+ if session:
178
+ self._session = session
179
+
180
+ if self._session:
181
+ asyncio.create_task(
182
+ self._process_events(), name="azure-voice-live-events"
183
+ )
184
+ try:
185
+ logger.info("Waiting for Azure Voice Live session to be ready...")
186
+ await asyncio.wait_for(
187
+ self._session_ready_event.wait(), timeout=10.0
188
+ )
189
+ logger.info("Azure Voice Live session is ready.")
190
+ except asyncio.TimeoutError:
191
+ self.emit("error", "Azure Voice Live session ready timeout")
192
+ raise RuntimeError(
193
+ "Azure Voice Live session did not become ready in time"
194
+ )
195
+
196
+ except Exception as e:
197
+ self.emit("error", f"Error connecting to Azure Voice Live API: {e}")
198
+ traceback.print_exc()
199
+ raise
200
+
201
+ async def _create_session(self) -> AzureVoiceLiveSession:
202
+ """Create a new Azure Voice Live session"""
203
+ try:
204
+ connection_cm = connect(
205
+ endpoint=self.endpoint,
206
+ credential=self.credential,
207
+ model=self.model,
208
+ connection_options={
209
+ "max_msg_size": 10 * 1024 * 1024,
210
+ "heartbeat": 20,
211
+ "timeout": 20,
212
+ },
213
+ )
214
+
215
+ connection = await connection_cm.__aenter__()
216
+
217
+ await self._setup_session(connection)
218
+
219
+ return AzureVoiceLiveSession(
220
+ connection=connection, session_id=None, tasks=[]
221
+ )
222
+
223
+ except Exception as e:
224
+ self.emit("error", f"Failed to create Azure Voice Live session: {e}")
225
+ traceback.print_exc()
226
+ raise
227
+
228
+ async def _setup_session(self, connection) -> None:
229
+ """Configure the Azure Voice Live session"""
230
+ logger.info("Setting up Azure Voice Live session...")
231
+
232
+ voice_config: Union[AzureStandardVoice, str]
233
+ if (
234
+ self.config.voice.startswith("en-US-")
235
+ or self.config.voice.startswith("en-CA-")
236
+ or "-" in self.config.voice
237
+ ):
238
+ voice_config = AzureStandardVoice(
239
+ name=self.config.voice, type="azure-standard"
240
+ )
241
+ else:
242
+ voice_config = self.config.voice
243
+
244
+ turn_detection_config = ServerVad(
245
+ threshold=self.config.turn_detection_threshold,
246
+ prefix_padding_ms=self.config.turn_detection_prefix_padding_ms,
247
+ silence_duration_ms=self.config.turn_detection_silence_duration_ms,
248
+ )
249
+
250
+ session_config = RequestSession(
251
+ modalities=self.config.modalities,
252
+ instructions=self._instructions,
253
+ voice=voice_config,
254
+ input_audio_format=self.config.input_audio_format,
255
+ output_audio_format=self.config.output_audio_format,
256
+ turn_detection=turn_detection_config,
257
+ )
258
+
259
+ if self.config.temperature is not None:
260
+ session_config.temperature = self.config.temperature
261
+ if self.config.max_completion_tokens is not None:
262
+ session_config.max_completion_tokens = self.config.max_completion_tokens
263
+
264
+ await connection.session.update(session=session_config)
265
+ logger.info("Azure Voice Live session configuration sent")
266
+
267
+ async def _process_events(self) -> None:
268
+ """Process events from the Azure Voice Live connection"""
269
+ try:
270
+ if not self._session or not self._session.connection:
271
+ return
272
+
273
+ async for event in self._session.connection:
274
+ if self._closing:
275
+ break
276
+ await self._handle_event(event)
277
+
278
+ except asyncio.CancelledError:
279
+ logger.info("Event processing cancelled")
280
+ except Exception as e:
281
+ self.emit("error", f"Error processing events: {e}")
282
+ traceback.print_exc()
283
+
284
+ async def _handle_event(self, event) -> None:
285
+ """Handle different types of events from Azure Voice Live"""
286
+ try:
287
+ logger.debug(f"Received event: {event.type}")
288
+
289
+ if event.type == ServerEventType.SESSION_UPDATED:
290
+ logger.info(f"Session ready: {event.session.id}")
291
+ if self._session:
292
+ self._session.session_id = event.session.id
293
+ self.session_ready = True
294
+ self._session_ready_event.set()
295
+
296
+ elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED:
297
+ logger.info("User started speaking")
298
+ if not self._user_speaking:
299
+ await realtime_metrics_collector.set_user_speech_start()
300
+ self._user_speaking = True
301
+ self.emit("user_speech_started", {"type": "done"})
302
+
303
+ if self.audio_track and Modality.AUDIO in self.config.modalities:
304
+ self.audio_track.interrupt()
305
+
306
+ if self._session and self._session.connection:
307
+ try:
308
+ await self._session.connection.response.cancel()
309
+ except Exception as e:
310
+ logger.debug(f"No response to cancel: {e}")
311
+
312
+ elif event.type == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STOPPED:
313
+ logger.info("User stopped speaking")
314
+ if self._user_speaking:
315
+ await realtime_metrics_collector.set_user_speech_end()
316
+ self._user_speaking = False
317
+
318
+ elif event.type == ServerEventType.RESPONSE_CREATED:
319
+ logger.info("Assistant response created")
320
+
321
+ elif event.type == ServerEventType.RESPONSE_AUDIO_DELTA:
322
+ logger.debug("Received audio delta")
323
+ if Modality.AUDIO in self.config.modalities:
324
+ if not self._agent_speaking:
325
+ await realtime_metrics_collector.set_agent_speech_start()
326
+ self._agent_speaking = True
327
+
328
+ if self.audio_track and self.loop:
329
+ asyncio.create_task(self.audio_track.add_new_bytes(event.delta))
330
+
331
+ elif event.type == ServerEventType.RESPONSE_AUDIO_DONE:
332
+ logger.info("Assistant finished speaking")
333
+ if self._agent_speaking:
334
+ await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
335
+ self._agent_speaking = False
336
+
337
+ elif event.type == ServerEventType.RESPONSE_TEXT_DELTA:
338
+ if hasattr(self, "_current_text_response"):
339
+ self._current_text_response += event.delta
340
+ else:
341
+ self._current_text_response = event.delta
342
+
343
+ elif event.type == ServerEventType.RESPONSE_TEXT_DONE:
344
+ if hasattr(self, "_current_text_response"):
345
+ global_event_emitter.emit(
346
+ "text_response",
347
+ {"text": self._current_text_response, "type": "done"},
348
+ )
349
+ await realtime_metrics_collector.set_agent_response(
350
+ self._current_text_response
351
+ )
352
+ try:
353
+ self.emit(
354
+ "realtime_model_transcription",
355
+ {
356
+ "role": "agent",
357
+ "text": self._current_text_response,
358
+ "is_final": True,
359
+ },
360
+ )
361
+ except Exception:
362
+ pass
363
+ self._current_text_response = ""
364
+
365
+ elif event.type == ServerEventType.RESPONSE_DONE:
366
+ logger.info("Response complete")
367
+ if self._agent_speaking:
368
+ await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
369
+ self._agent_speaking = False
370
+
371
+ elif event.type == ServerEventType.ERROR:
372
+ logger.error(f"Azure Voice Live error: {event.error.message}")
373
+ self.emit("error", f"Azure Voice Live error: {event.error.message}")
374
+
375
+ elif event.type == ServerEventType.CONVERSATION_ITEM_CREATED:
376
+ logger.debug(f"Conversation item created: {event.item.id}")
377
+
378
+ if (
379
+ hasattr(event.item, "content")
380
+ and event.item.content
381
+ and hasattr(event.item.content[0], "transcript")
382
+ ):
383
+ transcript = event.item.content[0].transcript
384
+ if transcript and event.item.role == "user":
385
+ await realtime_metrics_collector.set_user_transcript(transcript)
386
+ try:
387
+ self.emit(
388
+ "realtime_model_transcription",
389
+ {"role": "user", "text": transcript, "is_final": True},
390
+ )
391
+ except Exception:
392
+ pass
393
+
394
+ else:
395
+ logger.debug(f"Unhandled event type: {event.type}")
396
+
397
+ except Exception as e:
398
+ self.emit("error", f"Error handling event {event.type}: {e}")
399
+ traceback.print_exc()
400
+
401
+ async def handle_audio_input(self, audio_data: bytes) -> None:
402
+ """Handle incoming audio data from the user"""
403
+ if not self._session or self._closing or not self.session_ready:
404
+ return
405
+
406
+ if Modality.AUDIO not in self.config.modalities:
407
+ return
408
+
409
+ try:
410
+ audio_array = np.frombuffer(audio_data, dtype=np.int16)
411
+
412
+ if len(audio_array) % 2 == 0:
413
+ audio_array = audio_array.reshape(-1, 2)
414
+ audio_array = np.mean(audio_array, axis=1).astype(np.int16)
415
+
416
+ target_length = int(
417
+ len(audio_array) * self.target_sample_rate / self.input_sample_rate
418
+ )
419
+ resampled_float = signal.resample(
420
+ audio_array.astype(np.float32), target_length
421
+ )
422
+ resampled_int16 = np.clip(resampled_float, -32768, 32767).astype(np.int16)
423
+ resampled_bytes = resampled_int16.tobytes()
424
+
425
+ encoded_audio = base64.b64encode(resampled_bytes).decode("utf-8")
426
+
427
+ await self._session.connection.input_audio_buffer.append(
428
+ audio=encoded_audio
429
+ )
430
+
431
+ except Exception as e:
432
+ self.emit("error", f"Error processing audio input: {e}")
433
+
434
+ async def interrupt(self) -> None:
435
+ """Interrupt current response"""
436
+ if not self._session or self._closing:
437
+ return
438
+
439
+ try:
440
+ if self._session.connection:
441
+ await self._session.connection.response.cancel()
442
+
443
+ if self.audio_track and Modality.AUDIO in self.config.modalities:
444
+ self.audio_track.interrupt()
445
+
446
+ await realtime_metrics_collector.set_interrupted()
447
+
448
+ if self._agent_speaking:
449
+ await realtime_metrics_collector.set_agent_speech_end(timeout=1.0)
450
+ self._agent_speaking = False
451
+
452
+ except Exception as e:
453
+ self.emit("error", f"Interrupt error: {e}")
454
+
455
+ async def send_message(self, message: str) -> None:
456
+ """Send a text message to get audio response"""
457
+ retry_count = 0
458
+ max_retries = 5
459
+ while not self._session or not self.session_ready:
460
+ if retry_count >= max_retries:
461
+ raise RuntimeError(
462
+ "No active Azure Voice Live session after maximum retries"
463
+ )
464
+ logger.debug("No active session, waiting for connection...")
465
+ await asyncio.sleep(1)
466
+ retry_count += 1
467
+
468
+ try:
469
+ await self._session.connection.conversation.item.create(
470
+ item={
471
+ "type": "message",
472
+ "role": "assistant",
473
+ "content": [
474
+ {
475
+ "type": "text",
476
+ "text": f"Repeat the user's exact message back to them [DO NOT ADD ANYTHING ELSE]: {message}",
477
+ }
478
+ ],
479
+ }
480
+ )
481
+
482
+ await self._session.connection.response.create()
483
+
484
+ except Exception as e:
485
+ self.emit("error", f"Error sending message: {e}")
486
+
487
+ async def send_text_message(self, message: str) -> None:
488
+ """Send a text message for text-only communication"""
489
+ retry_count = 0
490
+ max_retries = 5
491
+ while not self._session or not self.session_ready:
492
+ if retry_count >= max_retries:
493
+ raise RuntimeError(
494
+ "No active Azure Voice Live session after maximum retries"
495
+ )
496
+ logger.debug("No active session, waiting for connection...")
497
+ await asyncio.sleep(1)
498
+ retry_count += 1
499
+
500
+ try:
501
+ await self._session.connection.conversation.item.create(
502
+ item={
503
+ "type": "message",
504
+ "role": "user",
505
+ "content": [{"type": "input_text", "text": message}],
506
+ }
507
+ )
508
+
509
+ await self._session.connection.response.create()
510
+
511
+ except Exception as e:
512
+ self.emit("error", f"Error sending text message: {e}")
513
+
514
+ async def _cleanup_session(self, session: AzureVoiceLiveSession) -> None:
515
+ """Clean up session resources"""
516
+ for task in session.tasks:
517
+ if not task.done():
518
+ task.cancel()
519
+
520
+ try:
521
+ if session.connection:
522
+ if hasattr(session.connection, "close"):
523
+ await session.connection.close()
524
+ except Exception as e:
525
+ self.emit("error", f"Error closing session: {e}")
526
+
527
+ async def aclose(self) -> None:
528
+ """Clean up all resources"""
529
+ if self._closing:
530
+ return
531
+
532
+ self._closing = True
533
+
534
+ if self._session:
535
+ await self._cleanup_session(self._session)
536
+ self._session = None
537
+
538
+ if hasattr(self.audio_track, "cleanup") and self.audio_track:
539
+ try:
540
+ await self.audio_track.cleanup()
541
+ except Exception as e:
542
+ self.emit("error", f"Error cleaning up audio track: {e}")
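Putting the pieces above together, a hedged construction sketch for the realtime model; the model id comes from the docstring example, the remaining values are illustrative, and credentials fall back to `AZURE_VOICE_LIVE_API_KEY` / `AZURE_VOICE_LIVE_ENDPOINT` (then `DefaultAzureCredential`) when not passed explicitly:

```python
from videosdk.plugins.azure import AzureVoiceLive, AzureVoiceLiveConfig
from azure.ai.voicelive.models import Modality

config = AzureVoiceLiveConfig(
    voice="en-US-AvaNeural",                  # Azure neural voice (or an OpenAI voice id)
    modalities=[Modality.TEXT, Modality.AUDIO],
    turn_detection_threshold=0.5,
    turn_detection_silence_duration_ms=500,
    temperature=0.7,                          # optional; only sent when not None
)

model = AzureVoiceLive(
    model="gpt-4o-realtime-preview",          # example model id from the docstring
    config=config,
)
# `await model.connect()` must run on the agent's event loop; for audio output the
# framework is expected to set `model.loop`, after which the plugin creates its audio track.
```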
@@ -0,0 +1,31 @@ videosdk_plugins_azure-0.0.58.dist-info/METADATA
1
+ Metadata-Version: 2.4
2
+ Name: videosdk-plugins-azure
3
+ Version: 0.0.58
4
+ Summary: VideoSDK Agent Framework plugin for azure
5
+ Author: videosdk
6
+ License-Expression: Apache-2.0
7
+ Keywords: ai,audio,azure,video,videosdk
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Topic :: Communications :: Conferencing
11
+ Classifier: Topic :: Multimedia :: Sound/Audio
12
+ Classifier: Topic :: Multimedia :: Video
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Requires-Python: >=3.11
15
+ Requires-Dist: azure-ai-voicelive>=1.0.0b1
16
+ Requires-Dist: azure-cognitiveservices-speech>=1.46.0
17
+ Requires-Dist: azure-core==1.35.0
18
+ Requires-Dist: azure-identity==1.22.0
19
+ Requires-Dist: httpx>=0.28.1
20
+ Requires-Dist: videosdk-agents>=0.0.58
21
+ Description-Content-Type: text/markdown
22
+
23
+ # VideoSDK Azure Plugin
24
+
25
+ Agent Framework plugin for Azure Voice Live (realtime), STT, and TTS services from Azure.
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install videosdk-plugins-azure
31
+ ```
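A usage sketch to complement the install step above; wiring into a videosdk-agents pipeline follows that framework's own documentation, and `AZURE_SPEECH_KEY` / `AZURE_SPEECH_REGION` are assumed to be set:

```python
from videosdk.plugins.azure import AzureSTT, AzureTTS, VoiceTuning

async def make_plugins():
    # AzureSTT captures the running event loop in __init__, so build it on the
    # same loop your agent session runs on.
    stt = AzureSTT(language="en-US", enable_phrase_list=True, phrase_list=["VideoSDK"])
    tts = AzureTTS(voice="en-US-EmmaNeural", tuning=VoiceTuning(rate="medium"))
    return stt, tts
# Hand stt/tts to your videosdk-agents pipeline as its STT/TTS providers.
```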
@@ -0,0 +1,8 @@ videosdk_plugins_azure-0.0.58.dist-info/RECORD
1
+ videosdk/plugins/azure/__init__.py,sha256=rpAVdGG4vJRSUIDZlase8-B_5CDVU5fgy4IEVG6iehA,250
2
+ videosdk/plugins/azure/stt.py,sha256=--1-9SXtxTEMtCW1jO85Y9IqIvDWSRQQSV163kWO6t4,10380
3
+ videosdk/plugins/azure/tts.py,sha256=5DHIVwMh2eGyQUVdpifdVcDUsgPQLMV07ihI2HIHmhc,11976
4
+ videosdk/plugins/azure/version.py,sha256=2OsZf8Q77CP8HEBmgzhI0p7cnKzucvTjVvRHnYtr0Vw,24
5
+ videosdk/plugins/azure/voice_live.py,sha256=Lhdc3TNPVT3n5IvJfmdsizfI-ymb7NBoNL3Ta6CTh1M,21353
6
+ videosdk_plugins_azure-0.0.58.dist-info/METADATA,sha256=KNfcKLeisQrZuwfYfUnmDB3ohx6q6S2FovykrJWPKno,981
7
+ videosdk_plugins_azure-0.0.58.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
8
+ videosdk_plugins_azure-0.0.58.dist-info/RECORD,,
@@ -0,0 +1,4 @@ videosdk_plugins_azure-0.0.58.dist-info/WHEEL
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any