srx-lib-azure 0.1.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,464 @@
1
+ import asyncio
2
+ import os
3
+ import subprocess
4
+ import tempfile
5
+ from pathlib import Path
6
+ from typing import Optional, Callable, AsyncGenerator
7
+
8
+ from loguru import logger
9
+
10
# Optional import - gracefully handle if azure-cognitiveservices-speech is not installed
try:
    import azure.cognitiveservices.speech as speechsdk
except ImportError:
    SPEECH_SDK_AVAILABLE = False
    logger.warning(
        "azure-cognitiveservices-speech not installed. "
        "Install with: pip install srx-lib-azure[speech]"
    )
else:
    # Reached only when the import above succeeded.
    SPEECH_SDK_AVAILABLE = True
20
+
21
+
22
class AzureSpeechService:
    """Azure Speech Service for audio transcription.

    Provides audio-to-text transcription using Azure Cognitive Services Speech SDK.
    Supports continuous recognition for longer audio files and language selection.

    Configuration can be passed explicitly via constructor or fallback to environment variables.
    Operations will error if SDK is not installed or required credentials are missing.

    Thread-safety note: the Speech SDK fires recognition callbacks on its own
    background thread, so all interaction with asyncio primitives (futures,
    queues) from those callbacks is marshalled onto the event loop via
    ``loop.call_soon_threadsafe``.
    """

    # Upper bound on how long a single transcription session may run.
    _TIMEOUT_SECONDS = 900  # 15 minutes

    def __init__(
        self,
        *,
        speech_key: Optional[str] = None,
        speech_region: Optional[str] = None,
        speech_endpoint: Optional[str] = None,
        warn_if_unconfigured: bool = False,
    ) -> None:
        """Initialize Azure Speech Service.

        Args:
            speech_key: Azure Speech API key (falls back to AZURE_SPEECH_KEY env var)
            speech_region: Azure region (falls back to AZURE_SPEECH_REGION env var)
            speech_endpoint: Optional custom endpoint (falls back to AZURE_SPEECH_ENDPOINT env var)
            warn_if_unconfigured: Whether to warn at initialization if not configured
        """
        self.speech_key = speech_key or os.getenv("AZURE_SPEECH_KEY")
        self.speech_region = speech_region or os.getenv("AZURE_SPEECH_REGION")
        self.speech_endpoint = speech_endpoint or os.getenv("AZURE_SPEECH_ENDPOINT")

        if warn_if_unconfigured and not self.speech_key:
            logger.warning(
                "Azure Speech credentials not configured; transcription operations may fail."
            )

    def _check_availability(self) -> None:
        """Raise RuntimeError if the Speech SDK or required credentials are missing."""
        if not SPEECH_SDK_AVAILABLE:
            raise RuntimeError(
                "azure-cognitiveservices-speech package not installed. "
                "Install with: pip install srx-lib-azure[speech]"
            )
        if not self.speech_key:
            raise RuntimeError(
                "Azure Speech credentials not configured. "
                "Provide speech_key or set AZURE_SPEECH_KEY environment variable."
            )
        if not self.speech_region and not self.speech_endpoint:
            raise RuntimeError(
                "Azure Speech region or endpoint not configured. "
                "Provide speech_region or speech_endpoint, or set AZURE_SPEECH_REGION environment variable."
            )

    def _preprocess_audio(self, input_path: str) -> str:
        """Convert audio to 16kHz mono WAV format for optimal Azure Speech processing.

        Args:
            input_path: Path to input audio file

        Returns:
            Path to preprocessed WAV file

        Raises:
            RuntimeError: If ffmpeg is not available or conversion fails
        """
        try:
            # Fail fast if ffmpeg is not on PATH.
            subprocess.run(
                ["ffmpeg", "-version"],
                capture_output=True,
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            raise RuntimeError(
                "ffmpeg not found. Please install ffmpeg for audio preprocessing."
            ) from e

        # Reserve a temporary WAV path (delete=False: the path must outlive this handle).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
            output_path = tf.name

        try:
            # Convert to 16kHz mono WAV — the format Azure Speech handles best.
            subprocess.run(
                [
                    "ffmpeg",
                    "-i",
                    input_path,
                    "-ar",
                    "16000",  # 16kHz sample rate
                    "-ac",
                    "1",  # Mono
                    "-y",  # Overwrite output file
                    output_path,
                ],
                capture_output=True,
                check=True,
            )
            logger.info(f"Preprocessed audio: {input_path} -> {output_path}")
            return output_path
        except subprocess.CalledProcessError as e:
            # Don't leave a half-written temp file behind on failure.
            if os.path.exists(output_path):
                os.unlink(output_path)
            raise RuntimeError(f"Audio preprocessing failed: {e.stderr.decode()}") from e

    def _build_recognizer(self, wav_path: str, language: str):
        """Create a configured SpeechRecognizer for *wav_path*.

        Shared by the blocking and streaming transcription methods so the
        endpoint-vs-region configuration logic lives in one place.
        """
        if self.speech_endpoint:
            speech_config = speechsdk.SpeechConfig(
                subscription=self.speech_key,
                endpoint=self.speech_endpoint,
            )
        else:
            speech_config = speechsdk.SpeechConfig(
                subscription=self.speech_key,
                region=self.speech_region,
            )
        audio_config = speechsdk.audio.AudioConfig(filename=wav_path)
        return speechsdk.SpeechRecognizer(
            speech_config=speech_config,
            audio_config=audio_config,
            language=language,
        )

    @staticmethod
    def _cleanup_temp(path: str) -> None:
        """Best-effort removal of a temporary file (never raises)."""
        if os.path.exists(path):
            try:
                os.unlink(path)
                logger.debug(f"Cleaned up temporary file: {path}")
            except Exception as e:
                logger.warning(f"Failed to clean up {path}: {e}")

    async def transcribe_audio_to_markdown(
        self,
        audio_path: str,
        language: str = "id-ID",
        preprocess: bool = True,
        on_recognizing: Optional[Callable[[str], None]] = None,
        on_recognized: Optional[Callable[[str], None]] = None,
    ) -> str:
        """Transcribe audio file to markdown-formatted text.

        Args:
            audio_path: Path to audio file (mp3, m4a, wav, etc.)
            language: BCP-47 language code (default: 'id-ID' for Indonesian)
                Common codes: 'en-US', 'id-ID', 'ms-MY', 'zh-CN', 'ja-JP'
            preprocess: Whether to preprocess audio to 16kHz mono WAV (recommended)
            on_recognizing: Optional callback for intermediate recognition results
            on_recognized: Optional callback for final recognition results

        Returns:
            Markdown-formatted transcription text

        Raises:
            RuntimeError: If SDK not available, credentials missing, or transcription fails
        """
        self._check_availability()

        # Preprocess audio if requested
        wav_path = audio_path
        cleanup_wav = False
        if preprocess:
            wav_path = self._preprocess_audio(audio_path)
            cleanup_wav = True

        try:
            recognizer = self._build_recognizer(wav_path, language)

            # Event-driven continuous recognition.
            loop = asyncio.get_running_loop()
            paragraphs: list[str] = []
            current: list[str] = []
            done: asyncio.Future = loop.create_future()

            # These two run on the loop thread (scheduled via call_soon_threadsafe),
            # so touching the future here is safe.
            def _resolve() -> None:
                if not done.done():
                    done.set_result(True)

            def _fail(exc: BaseException) -> None:
                if not done.done():
                    done.set_exception(exc)

            # NOTE: the handlers below execute on the Speech SDK's background
            # thread — never touch `done` directly from them.
            def recognizing_handler(evt):
                """Handle intermediate recognition results."""
                if evt.result.reason == speechsdk.ResultReason.RecognizingSpeech:
                    logger.debug(f"Recognizing: {evt.result.text}")
                    if on_recognizing:
                        on_recognizing(evt.result.text)

            def recognized_handler(evt):
                """Handle final recognition results."""
                if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    text = evt.result.text.strip()
                    if text:
                        current.append(text)
                        logger.debug(f"Recognized: {text}")
                        if on_recognized:
                            on_recognized(text)
                elif evt.result.reason == speechsdk.ResultReason.NoMatch:
                    logger.debug("No speech recognized")

            def session_stopped(evt):
                """Handle session stop."""
                logger.info("Session stopped")
                if current:
                    paragraphs.append(" ".join(current))
                loop.call_soon_threadsafe(_resolve)

            def canceled(evt):
                """Handle cancellation."""
                if evt.reason == speechsdk.CancellationReason.Error:
                    error_msg = f"Transcription error: {evt.error_details}"
                    logger.error(error_msg)
                    loop.call_soon_threadsafe(_fail, RuntimeError(error_msg))
                else:
                    logger.info("Transcription canceled")
                    loop.call_soon_threadsafe(_resolve)

            # Connect event handlers
            recognizer.recognizing.connect(recognizing_handler)
            recognizer.recognized.connect(recognized_handler)
            recognizer.session_stopped.connect(session_stopped)
            recognizer.canceled.connect(canceled)

            # Start continuous recognition
            logger.info(f"Starting transcription for {audio_path} (language: {language})")
            recognizer.start_continuous_recognition_async().get()

            try:
                # Wait for completion (max 15 minutes timeout)
                await asyncio.wait_for(done, timeout=self._TIMEOUT_SECONDS)
            except asyncio.TimeoutError:
                raise RuntimeError("Transcription timeout (15 minutes exceeded)")
            finally:
                # Always stop recognition, even on timeout/cancellation error,
                # so the SDK's background session doesn't keep running.
                recognizer.stop_continuous_recognition_async().get()

            # Format as markdown with bullet points
            if not paragraphs:
                logger.warning("No transcription results")
                return ""

            markdown = "\n\n".join(f"- {para}" for para in paragraphs)
            logger.info(f"Transcription completed: {len(paragraphs)} paragraphs")
            return markdown

        finally:
            # Clean up preprocessed WAV file
            if cleanup_wav:
                self._cleanup_temp(wav_path)

    async def transcribe_audio_bytes(
        self,
        audio_bytes: bytes,
        file_extension: str = ".mp3",
        language: str = "id-ID",
    ) -> str:
        """Transcribe audio from bytes to markdown-formatted text.

        Args:
            audio_bytes: Audio file content as bytes
            file_extension: File extension (for format detection)
            language: BCP-47 language code (default: 'id-ID' for Indonesian)

        Returns:
            Markdown-formatted transcription text

        Raises:
            RuntimeError: If SDK not available, credentials missing, or transcription fails
        """
        # Spill bytes to a temporary file so the SDK can read from a path.
        with tempfile.NamedTemporaryFile(
            suffix=file_extension,
            delete=False,
        ) as tf:
            tf.write(audio_bytes)
            temp_path = tf.name

        try:
            return await self.transcribe_audio_to_markdown(
                temp_path,
                language=language,
                preprocess=True,
            )
        finally:
            # Clean up temporary file
            self._cleanup_temp(temp_path)

    async def transcribe_audio_streaming(
        self,
        audio_path: str,
        language: str = "id-ID",
        preprocess: bool = True,
    ) -> AsyncGenerator[dict, None]:
        """Transcribe audio file with real-time streaming of intermediate and final results.

        Args:
            audio_path: Path to audio file (mp3, m4a, wav, etc.)
            language: BCP-47 language code (default: 'id-ID' for Indonesian)
            preprocess: Whether to preprocess audio to 16kHz mono WAV (recommended)

        Yields:
            Dict with keys:
                - type: "recognizing" | "recognized" | "completed" | "error"
                - text: The transcribed text (for recognizing/recognized types)
                - markdown: Full markdown content (for completed type)
                - error: Error message (for error type)

        Raises:
            RuntimeError: If SDK not available, credentials missing, or transcription fails
        """
        self._check_availability()

        # Preprocess audio if requested
        wav_path = audio_path
        cleanup_wav = False
        if preprocess:
            wav_path = self._preprocess_audio(audio_path)
            cleanup_wav = True

        try:
            recognizer = self._build_recognizer(wav_path, language)

            # Event-driven continuous recognition with a queue for streaming.
            loop = asyncio.get_running_loop()
            paragraphs: list[str] = []
            current: list[str] = []
            queue: asyncio.Queue = asyncio.Queue()
            done: asyncio.Future = loop.create_future()

            def _emit(event: dict) -> None:
                # SDK callbacks run on a background thread with no running
                # event loop, so asyncio.create_task would raise there;
                # hand the event to the loop thread instead.
                loop.call_soon_threadsafe(queue.put_nowait, event)

            # These two run on the loop thread (via call_soon_threadsafe).
            def _resolve() -> None:
                if not done.done():
                    done.set_result(True)

            def _fail(exc: BaseException) -> None:
                if not done.done():
                    done.set_exception(exc)

            def recognizing_handler(evt):
                """Handle intermediate recognition results."""
                if evt.result.reason == speechsdk.ResultReason.RecognizingSpeech:
                    logger.debug(f"Recognizing: {evt.result.text}")
                    _emit({"type": "recognizing", "text": evt.result.text})

            def recognized_handler(evt):
                """Handle final recognition results."""
                if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                    text = evt.result.text.strip()
                    if text:
                        current.append(text)
                        logger.debug(f"Recognized: {text}")
                        _emit({"type": "recognized", "text": text})
                elif evt.result.reason == speechsdk.ResultReason.NoMatch:
                    logger.debug("No speech recognized")

            def session_stopped(evt):
                """Handle session stop."""
                logger.info("Session stopped")
                if current:
                    paragraphs.append(" ".join(current))
                loop.call_soon_threadsafe(_resolve)

            def canceled(evt):
                """Handle cancellation."""
                if evt.reason == speechsdk.CancellationReason.Error:
                    error_msg = f"Transcription error: {evt.error_details}"
                    logger.error(error_msg)
                    _emit({"type": "error", "error": error_msg})
                    loop.call_soon_threadsafe(_fail, RuntimeError(error_msg))
                else:
                    logger.info("Transcription canceled")
                    loop.call_soon_threadsafe(_resolve)

            # Connect event handlers
            recognizer.recognizing.connect(recognizing_handler)
            recognizer.recognized.connect(recognized_handler)
            recognizer.session_stopped.connect(session_stopped)
            recognizer.canceled.connect(canceled)

            # Start continuous recognition
            logger.info(
                f"Starting streaming transcription for {audio_path} (language: {language})"
            )
            recognizer.start_continuous_recognition_async().get()

            try:
                # Stream results as they arrive; the short timeout lets us
                # re-check `done` periodically.
                while not done.done():
                    try:
                        event = await asyncio.wait_for(queue.get(), timeout=0.5)
                        yield event
                    except asyncio.TimeoutError:
                        continue

                # Wait for final completion (re-raises a cancellation error).
                await asyncio.wait_for(done, timeout=self._TIMEOUT_SECONDS)

                # Drain any events still queued after completion.
                while not queue.empty():
                    yield queue.get_nowait()

            except asyncio.TimeoutError:
                yield {"type": "error", "error": "Transcription timeout (15 minutes exceeded)"}
                raise RuntimeError("Transcription timeout (15 minutes exceeded)")
            finally:
                # Always stop recognition, even on timeout/cancellation error.
                recognizer.stop_continuous_recognition_async().get()

            # Format final markdown
            if paragraphs:
                markdown = "\n\n".join(f"- {para}" for para in paragraphs)
                logger.info(f"Streaming transcription completed: {len(paragraphs)} paragraphs")
                yield {"type": "completed", "markdown": markdown}
            else:
                logger.warning("No transcription results")
                yield {"type": "completed", "markdown": ""}

        finally:
            # Clean up preprocessed WAV file
            if cleanup_wav:
                self._cleanup_temp(wav_path)