audio-scribe 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

audio_scribe/cli.py DELETED
@@ -1,567 +0,0 @@
- #!/usr/bin/env python3
- """
- Audio Scribe
- -----------------
- A command-line script for transcribing audio files with speaker diarization
- using Whisper and Pyannote. The script uses a Hugging Face token for
- downloading Pyannote speaker-diarization models and displays a progress bar
- with resource usage while transcribing.
- """
-
- print("Initializing environment... Please wait while we load dependencies and models.")
- import sys
- sys.stdout.flush()
-
- import os
- import glob
- import wave
- import json
- import logging
- import warnings
- import argparse
- import readline
- from pathlib import Path
- from datetime import datetime
- from typing import Optional, Dict
- from dataclasses import dataclass
- import base64
-
- # Core dependencies
- from cryptography.fernet import Fernet
- from cryptography.hazmat.primitives import hashes
- from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
- import torch
- import whisper
- import importlib.metadata
- from importlib.metadata import PackageNotFoundError
- from pyannote.audio import Pipeline
-
- # Progress bar dependencies - imported via HAVE_PROGRESS_SUPPORT from __init__
- try:
-     from alive_progress import alive_bar
-     import psutil
-     import GPUtil
-     HAVE_PROGRESS_SUPPORT = True
- except ImportError:
-     HAVE_PROGRESS_SUPPORT = False
-
-
- # Configure logging
- LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
- logging.basicConfig(
-     level=logging.INFO,
-     format=LOG_FORMAT,
-     handlers=[
-         logging.StreamHandler(),
-         logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
-     ],
- )
- logger = logging.getLogger(__name__)
-
- # ---------- FILE PATH TAB-COMPLETION SNIPPET ----------
- def complete_path(text, state):
-     """
-     Return the 'state'-th completion for 'text'.
-     This function will be used by 'readline' to enable file path autocompletion.
-     """
-     # If the user typed a glob pattern (with * or ?)
-     if '*' in text or '?' in text:
-         matches = glob.glob(text)
-     else:
-         # Split off the directory name and partial file/directory name
-         directory, partial = os.path.split(text)
-         if not directory:
-             directory = '.'
-         try:
-             # List everything in 'directory' that starts with 'partial'
-             entries = os.listdir(directory)
-         except OSError:
-             # If directory doesn't exist or we lack permission, no matches
-             entries = []
-
-         matches = []
-         for entry in entries:
-             if entry.startswith(partial):
-                 full_path = os.path.join(directory, entry)
-                 # If it's a directory, add a trailing slash to indicate that
-                 if os.path.isdir(full_path) and not full_path.endswith(os.path.sep):
-                     full_path += os.path.sep
-                 matches.append(full_path)
-
-     # Sort matches to have a consistent order
-     matches.sort()
-
-     # If 'state' is beyond last match, return None
-     return matches[state] if state < len(matches) else None
-
-
- @dataclass
- class TranscriptionConfig:
-     """
-     Configuration settings for the transcription pipeline.
-     """
-     output_directory: Path
-     whisper_model: str = "base.en"
-     diarization_model: str = "pyannote/speaker-diarization-3.1"
-     temp_directory: Optional[Path] = None
-     device: Optional[str] = None
-
-     def __post_init__(self):
-         # Use CUDA if available, else fall back to CPU
-         self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
-         # Default temp directory inside the output directory
-         self.temp_directory = self.temp_directory or (self.output_directory / "temp")
-         # Ensure directories exist
-         self.temp_directory.mkdir(parents=True, exist_ok=True)
-         self.output_directory.mkdir(parents=True, exist_ok=True)
-
-
- class TokenManager:
-     """
-     Handles secure storage and retrieval of the Hugging Face authentication token.
-     """
-     def __init__(self):
-         # Store config in ~/.pyannote/config.json
-         self.config_dir = Path.home() / ".pyannote"
-         self.config_file = self.config_dir / "config.json"
-         self._initialize_config()
-
-     def _initialize_config(self) -> None:
-         """
-         Initialize configuration directory and file with secure permissions.
-         """
-         self.config_dir.mkdir(exist_ok=True)
-         if not self.config_file.exists():
-             self._save_config({})
-
-         # Set secure file and directory permissions on POSIX systems
-         if os.name == "posix":
-             os.chmod(self.config_dir, 0o700)
-             os.chmod(self.config_file, 0o600)
-
-     def _get_encryption_key(self) -> bytes:
-         """
-         Generate an encryption key from system-specific data.
-         """
-         salt = b"pyannote-audio-salt"
-         kdf = PBKDF2HMAC(
-             algorithm=hashes.SHA256(),
-             length=32,
-             salt=salt,
-             iterations=100000,
-         )
-         key = kdf.derive(str(Path.home()).encode())
-         return base64.urlsafe_b64encode(key)
-
-     def _save_config(self, config: dict) -> None:
-         """
-         Securely save configuration to file.
-         """
-         with open(self.config_file, "w", encoding="utf-8") as f:
-             json.dump(config, f)
-
-     def _load_config(self) -> dict:
-         """
-         Load configuration from file.
-         """
-         try:
-             with open(self.config_file, "r", encoding="utf-8") as f:
-                 return json.load(f)
-         except Exception:
-             return {}
-
-     def store_token(self, token: str) -> bool:
-         """
-         Securely store authentication token.
-         """
-         try:
-             fernet = Fernet(self._get_encryption_key())
-             encrypted_token = fernet.encrypt(token.encode())
-
-             config = self._load_config()
-             config["token"] = encrypted_token.decode()
-
-             self._save_config(config)
-             return True
-         except Exception as e:
-             logger.error(f"Failed to store token: {e}")
-             return False
-
-     def retrieve_token(self) -> Optional[str]:
-         """
-         Retrieve stored authentication token.
-         """
-         try:
-             config = self._load_config()
-             if "token" in config:
-                 fernet = Fernet(self._get_encryption_key())
-                 return fernet.decrypt(config["token"].encode()).decode()
-         except Exception as e:
-             logger.error(f"Failed to retrieve token: {e}")
-         return None
-
-     def delete_token(self) -> bool:
-         """
-         Delete stored authentication token.
-         """
-         try:
-             config = self._load_config()
-             if "token" in config:
-                 del config["token"]
-                 self._save_config(config)
-             return True
-         except Exception as e:
-             logger.error(f"Failed to delete token: {e}")
-             return False
-
-
- class DependencyManager:
-     """
-     Manages and verifies system dependencies using importlib.metadata.
-     """
-     REQUIRED_PACKAGES = {
-         "torch": None,
-         "pyannote.audio": None,
-         "openai-whisper": None,
-         "pytorch-lightning": None,
-         "keyring": None,
-     }
-
-     @classmethod
-     def verify_dependencies(cls) -> bool:
-         """
-         Verify all required dependencies are installed with correct versions
-         (if specified). Returns True if all are installed and correct, False otherwise.
-         """
-         missing = []
-         outdated = []
-
-         for package, required_version in cls.REQUIRED_PACKAGES.items():
-             try:
-                 installed_version = importlib.metadata.version(package)
-                 if required_version and installed_version != required_version:
-                     outdated.append(
-                         f"{package} (installed: {installed_version}, required: {required_version})"
-                     )
-             except PackageNotFoundError:
-                 missing.append(package)
-
-         if missing or outdated:
-             if missing:
-                 logger.error("Missing packages: %s", ", ".join(missing))
-             if outdated:
-                 logger.error("Outdated packages: %s", ", ".join(outdated))
-             logger.info(
-                 "Install required packages: pip install %s",
-                 " ".join(
-                     f"{pkg}=={ver}" if ver else pkg
-                     for pkg, ver in cls.REQUIRED_PACKAGES.items()
-                 ),
-             )
-             return False
-         return True
-
-
- class AudioProcessor:
-     """
-     Handles audio file processing and segmentation using the `wave` module.
-     """
-     def __init__(self, config: TranscriptionConfig):
-         self.config = config
-
-     def load_audio_segment(
-         self,
-         audio_path: Path,
-         start_time: float,
-         end_time: float,
-         output_path: Path,
-     ) -> bool:
-         """
-         Extract and save the audio segment from `start_time` to `end_time`.
-         """
-         try:
-             with wave.open(str(audio_path), "rb") as infile:
-                 params = infile.getparams()
-                 frame_rate = params.framerate
-                 start_frame = int(start_time * frame_rate)
-                 end_frame = min(int(end_time * frame_rate), infile.getnframes())
-
-                 infile.setpos(start_frame)
-                 frames = infile.readframes(end_frame - start_frame)
-
-                 with wave.open(str(output_path), "wb") as outfile:
-                     outfile.setparams(params)
-                     outfile.writeframes(frames)
-             return True
-         except Exception as e:
-             logger.error(f"Failed to process audio segment: {e}")
-             return False
-
-
- class TranscriptionPipeline:
-     """
-     Main pipeline for audio transcription (Whisper) and speaker diarization (Pyannote).
-     """
-     def __init__(self, config: TranscriptionConfig):
-         self.config = config
-         self.diarization_pipeline = None
-         self.whisper_model = None
-         self.token_manager = TokenManager()
-         self._running = False  # used for resource monitor thread
-
-     def initialize_models(self, auth_token: str) -> bool:
-         """
-         Initialize the Pyannote diarization pipeline and the Whisper model.
-         """
-         try:
-             # Load Whisper model (set download root to avoid clutter in home directory)
-             self.whisper_model = whisper.load_model(
-                 self.config.whisper_model,
-                 device=self.config.device,
-                 download_root=str(self.config.output_directory / "models"),
-             )
-
-             # Load Pyannote diarization pipeline
-             self.diarization_pipeline = Pipeline.from_pretrained(
-                 self.config.diarization_model, use_auth_token=auth_token
-             )
-             self.diarization_pipeline.to(torch.device(self.config.device))
-
-             if self.config.device == "cpu":
-                 warnings.warn("Running on CPU. GPU is recommended for better performance.")
-
-             return True
-         except Exception as e:
-             logger.error(f"Model initialization failed: {e}")
-             logger.error("Please ensure you have accepted the model conditions at:")
-             logger.error("  1. https://huggingface.co/pyannote/segmentation-3.0")
-             logger.error("  2. https://huggingface.co/pyannote/speaker-diarization-3.1")
-             return False
-
-     def _update_resources(self, bar):
-         """
-         Continuously update progress bar text with CPU/MEM/GPU usage, until self._running is False.
-         """
-         while self._running:
-             try:
-                 import time
-                 time.sleep(0.5)
-
-                 cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
-                 memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0
-
-                 if HAVE_PROGRESS_SUPPORT and GPUtil.getGPUs():
-                     gpus = GPUtil.getGPUs()
-                     gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
-                     gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
-                     gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
-                 else:
-                     gpu_usage_text = "N/A"
-
-                 resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
-                 bar.text(resource_text)
-             except Exception as e:
-                 logger.error(f"Resource monitoring error: {e}")
-
-     def process_file(self, audio_path: Path) -> bool:
-         """
-         Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback.
-         """
-         try:
-             logger.info("Starting audio processing...")
-             diarization = self.diarization_pipeline(str(audio_path))
-             segments = list(diarization.itertracks(yield_label=True))
-             total_segments = len(segments)
-
-             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-             output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
-             audio_processor = AudioProcessor(self.config)
-
-             if not HAVE_PROGRESS_SUPPORT:
-                 # No alive_progress, psutil, or GPUtil installed
-                 logger.info("Processing audio without progress bar (missing optional packages).")
-                 with output_file.open("w", encoding="utf-8") as f:
-                     for turn, _, speaker in segments:
-                         segment_path = (
-                             self.config.temp_directory
-                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
-                         )
-                         # Note: uses the local audio_processor created above (the original
-                         # referenced a nonexistent self.audio_processor attribute here).
-                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
-                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
-                             segment_path.unlink(missing_ok=True)
-
-                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
-                             f.write(line)
-                             logger.info(line.strip())
-                 return True
-             else:
-                 # Use a progress bar to track segment transcription
-                 from alive_progress import alive_bar
-                 import threading
-
-                 self._running = True
-                 with output_file.open("w", encoding="utf-8") as f, alive_bar(
-                     total_segments,
-                     title="Transcribing Audio",
-                     spinner="pulse",
-                     theme="classic",
-                     stats=False,
-                     elapsed=True,
-                     monitor=True,
-                 ) as bar:
-
-                     # Start a background thread for resource monitoring
-                     resource_thread = threading.Thread(target=self._update_resources, args=(bar,))
-                     resource_thread.start()
-
-                     for turn, _, speaker in segments:
-                         segment_path = (
-                             self.config.temp_directory
-                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
-                         )
-                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
-                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
-                             segment_path.unlink(missing_ok=True)
-
-                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
-                             f.write(line)
-                             logger.info(line.strip())
-
-                         # Update the progress bar
-                         bar()
-
-                     # Stop resource monitoring
-                     self._running = False
-                     resource_thread.join()
-
-                 logger.info(f"Transcription completed. Output saved to: {output_file}")
-                 return True
-
-         except Exception as e:
-             logger.error(f"Processing failed: {e}")
-             return False
-
-
- def get_token(token_manager: TokenManager) -> Optional[str]:
-     """
-     Get authentication token from storage or user input.
-     """
-     stored_token = token_manager.retrieve_token()
-     if stored_token:
-         choice = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
-         if choice == "y":
-             return stored_token
-
-     print("\nA HuggingFace token is required for speaker diarization.")
-     print("Get your token at: https://huggingface.co/settings/tokens")
-     print("\nEnsure you have accepted:")
-     print("  1. pyannote/segmentation-3.0 conditions")
-     print("  2. pyannote/speaker-diarization-3.1 conditions")
-
-     token = input("\nEnter HuggingFace token: ").strip()
-     if token:
-         choice = input("Save token for future use? (y/n): ").lower().strip()
-         if choice == "y":
-             if token_manager.store_token(token):
-                 print("Token saved successfully.")
-             else:
-                 print("Failed to save token. It will be used for this session only.")
-     return token if token else None
-
-
- def main():
-     parser = argparse.ArgumentParser(
-         description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
-     )
-     parser.add_argument(
-         "--audio",
-         type=Path,
-         help="Path to the audio file to transcribe."
-     )
-     parser.add_argument(
-         "--token",
-         help="HuggingFace API token. Overrides any saved token."
-     )
-     parser.add_argument(
-         "--output",
-         type=Path,
-         help="Path to the output directory for transcripts and temporary files.",
-     )
-     parser.add_argument(
-         "--delete-token",
-         action="store_true",
-         help="Delete any stored Hugging Face token and exit.",
-     )
-     parser.add_argument(
-         "--show-warnings",
-         action="store_true",
-         help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
-     )
-     parser.add_argument(
-         "--whisper-model",
-         default="base.en",
-         help="Specify the Whisper model to use (default: 'base.en').",
-     )
-     args = parser.parse_args()
-
-     # Manage user warnings
-     if not args.show_warnings:
-         warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
-         warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
-     else:
-         warnings.resetwarnings()
-
-     # Check dependencies
-     if not DependencyManager.verify_dependencies():
-         sys.exit(1)
-
-     # Initialize tab-completion for file paths (Unix-like only, or with pyreadline on Windows)
-     readline.set_completer_delims(' \t\n;')
-     readline.set_completer(complete_path)
-     readline.parse_and_bind("tab: complete")
-
-     # Initialize the token manager
-     token_manager = TokenManager()
-
-     # If user wants to delete the stored token, do so and exit
-     if args.delete_token:
-         success = token_manager.delete_token()
-         sys.exit(0 if success else 1)
-
-     # Prepare configuration
-     output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
-     config = TranscriptionConfig(
-         output_directory=output_dir,
-         whisper_model=args.whisper_model
-     )
-
-     # Initialize pipeline
-     pipeline = TranscriptionPipeline(config)
-     hf_token = args.token or get_token(token_manager)
-     if not hf_token:
-         logger.error("No Hugging Face token provided. Exiting.")
-         sys.exit(1)
-
-     # Initialize models
-     if not pipeline.initialize_models(hf_token):
-         logger.error("Failed to initialize pipeline. Exiting.")
-         sys.exit(1)
-
-     # Prompt user for audio file path if not passed in
-     audio_path = args.audio
-     while not audio_path or not audio_path.exists():
-         audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
-         audio_path = Path(audio_path_str)
-         if not audio_path.exists():
-             print(f"File '{audio_path}' not found. Please try again.")
-
-     print("Audio file path accepted. Preparing to process the audio...")
-     sys.stdout.flush()
-
-     # Process the audio file
-     if not pipeline.process_file(audio_path):
-         sys.exit(1)
-
-
- if __name__ == "__main__":
-     main()
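
For reference, the key handling in the deleted TokenManager can be exercised in isolation. Below is a minimal, standalone sketch (not part of the package) of the same PBKDF2-HMAC-SHA256 derivation and Fernet round trip; note that because the salt is fixed and the key material is the predictable home-directory path, this obfuscates the stored token rather than strongly protecting it. The token value is a placeholder.

    import base64
    from pathlib import Path

    from cryptography.fernet import Fernet
    from cryptography.hazmat.primitives import hashes
    from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

    def derive_key() -> bytes:
        # Same derivation as TokenManager._get_encryption_key above.
        kdf = PBKDF2HMAC(
            algorithm=hashes.SHA256(),
            length=32,
            salt=b"pyannote-audio-salt",
            iterations=100000,
        )
        return base64.urlsafe_b64encode(kdf.derive(str(Path.home()).encode()))

    fernet = Fernet(derive_key())
    ciphertext = fernet.encrypt(b"hf_example_token")  # placeholder token, stored in config.json
    assert fernet.decrypt(ciphertext) == b"hf_example_token"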
audio_scribe-0.1.1.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
- audio_scribe/__init__.py,sha256=19NLfiVus01TtbB1SFwJ3Q-vFvN9nLzNYGIZiNB45qM,587
- audio_scribe/cli.py,sha256=LToGAiCHHXDitsXBuMqKMHkH_HzSARX0C06-Ha74jKU,20287
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tests/test_audio_scribe_main.py,sha256=Jv5XixrhsK49RJBh6HlghnIxxDiCq52PUv0lf8ljpJY,15571
- audio_scribe-0.1.1.dist-info/METADATA,sha256=CBHEE3qzCnRWQ-7ljDGMbY8i5awoGYYDcgbPUH6Za-M,10455
- audio_scribe-0.1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- audio_scribe-0.1.1.dist-info/entry_points.txt,sha256=eaO9r_zAFnrWseKyJcBpGUHQq-P7NXBw5er8sZaPfFU,55
- audio_scribe-0.1.1.dist-info/top_level.txt,sha256=K08EDnZLtXcJJ9RxLnzDUz-AmnUo5vGRyYmS3wSirtE,19
- audio_scribe-0.1.1.dist-info/RECORD,,
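
Each RECORD row above has the form path,sha256=<digest>,size, where the digest is the unpadded urlsafe-base64 SHA-256 of the file, per the wheel spec. A small sketch to recompute one entry (assuming the 0.1.1 wheel has been unpacked into the current directory):

    import base64
    import hashlib
    from pathlib import Path

    def record_hash(path: str) -> str:
        # RECORD stores "sha256=" + urlsafe-base64(digest) with "=" padding stripped.
        digest = hashlib.sha256(Path(path).read_bytes()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

    # Should match the digest recorded for cli.py above.
    print(record_hash("audio_scribe/cli.py"))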
audio_scribe-0.1.1.dist-info/entry_points.txt DELETED
@@ -1,2 +0,0 @@
- [console_scripts]
- audio-scribe = audio_scribe.cli:main
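
The [console_scripts] entry mapped the audio-scribe command to audio_scribe.cli:main, so invoking the removed CLI programmatically was roughly equivalent to the sketch below (argument values are illustrative; this only works against 0.1.1, since cli.py is removed in 0.1.3):

    import sys
    from audio_scribe.cli import main

    # Mirror what the console-script wrapper does: set argv, then call main().
    sys.argv = ["audio-scribe", "--audio", "meeting.wav", "--output", "transcripts"]
    main()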
tests/__init__.py DELETED
File without changes