audio-scribe 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
audio_scribe/cli.py DELETED
@@ -1,567 +0,0 @@
- #!/usr/bin/env python3
- """
- Audio Scribe
- -----------------
- A command-line script for transcribing audio files with speaker diarization
- using Whisper and Pyannote. The script uses a Hugging Face token for
- downloading Pyannote speaker-diarization models and displays a progress bar
- with resource usage while transcribing.
- """
-
- print("Initializing environment... Please wait while we load dependencies and models.")
- import sys
- sys.stdout.flush()
-
- import os
- import glob
- import wave
- import json
- import logging
- import warnings
- import argparse
- import readline
- from pathlib import Path
- from datetime import datetime
- from typing import Optional, Dict
- from dataclasses import dataclass
- import base64
-
- # Core dependencies
- from cryptography.fernet import Fernet
- from cryptography.hazmat.primitives import hashes
- from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
- import torch
- import whisper
- import importlib.metadata
- from importlib.metadata import PackageNotFoundError
- from pyannote.audio import Pipeline
-
- # Progress bar dependencies - imported via HAVE_PROGRESS_SUPPORT from __init__
- try:
-     from alive_progress import alive_bar
-     import psutil
-     import GPUtil
-     HAVE_PROGRESS_SUPPORT = True
- except ImportError:
-     HAVE_PROGRESS_SUPPORT = False
-
-
- # Configure logging
- LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
- logging.basicConfig(
-     level=logging.INFO,
-     format=LOG_FORMAT,
-     handlers=[
-         logging.StreamHandler(),
-         logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
-     ],
- )
- logger = logging.getLogger(__name__)
-
- # ---------- FILE PATH TAB-COMPLETION SNIPPET ----------
- def complete_path(text, state):
-     """
-     Return the 'state'-th completion for 'text'.
-     This function will be used by 'readline' to enable file path autocompletion.
-     """
-     # If the user typed a glob pattern (with * or ?)
-     if '*' in text or '?' in text:
-         matches = glob.glob(text)
-     else:
-         # Split off the directory name and partial file/directory name
-         directory, partial = os.path.split(text)
-         if not directory:
-             directory = '.'
-         try:
-             # List everything in 'directory' that starts with 'partial'
-             entries = os.listdir(directory)
-         except OSError:
-             # If directory doesn't exist or we lack permission, no matches
-             entries = []
-
-         matches = []
-         for entry in entries:
-             if entry.startswith(partial):
-                 full_path = os.path.join(directory, entry)
-                 # If it's a directory, add a trailing slash to indicate that
-                 if os.path.isdir(full_path) and not full_path.endswith(os.path.sep):
-                     full_path += os.path.sep
-                 matches.append(full_path)
-
-     # Sort matches to have a consistent order
-     matches.sort()
-
-     # If 'state' is beyond last match, return None
-     return matches[state] if state < len(matches) else None
-
-
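For context, readline calls a completer like complete_path repeatedly with state = 0, 1, 2, ... until it returns None. A minimal sketch of that protocol, driven by hand outside readline (the "./sr" prefix is purely illustrative):

    # Hypothetical illustration: ask for successive completions the way readline does.
    state = 0
    while (match := complete_path("./sr", state)) is not None:
        print(match)   # directories come back with a trailing separator
        state += 1
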
- @dataclass
- class TranscriptionConfig:
-     """
-     Configuration settings for the transcription pipeline.
-     """
-     output_directory: Path
-     whisper_model: str = "base.en"
-     diarization_model: str = "pyannote/speaker-diarization-3.1"
-     temp_directory: Optional[Path] = None
-     device: Optional[str] = None
-
-     def __post_init__(self):
-         # Use CUDA if available, else fall back to CPU
-         self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
-         # Default temp directory inside the output directory
-         self.temp_directory = self.temp_directory or (self.output_directory / "temp")
-         # Ensure directories exist
-         self.temp_directory.mkdir(parents=True, exist_ok=True)
-         self.output_directory.mkdir(parents=True, exist_ok=True)
-
-
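A minimal sketch of how these defaults resolve when only an output directory is supplied (the path is illustrative):

    from pathlib import Path

    config = TranscriptionConfig(output_directory=Path("transcripts/20240101"))
    # After __post_init__:
    #   config.device          -> "cuda" if torch.cuda.is_available() else "cpu"
    #   config.temp_directory  -> Path("transcripts/20240101/temp"), created on disk
    #   config.whisper_model   -> "base.en"
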
- class TokenManager:
-     """
-     Handles secure storage and retrieval of the Hugging Face authentication token.
-     """
-     def __init__(self):
-         # Store config in ~/.pyannote/config.json
-         self.config_dir = Path.home() / ".pyannote"
-         self.config_file = self.config_dir / "config.json"
-         self._initialize_config()
-
-     def _initialize_config(self) -> None:
-         """
-         Initialize configuration directory and file with secure permissions.
-         """
-         self.config_dir.mkdir(exist_ok=True)
-         if not self.config_file.exists():
-             self._save_config({})
-
-         # Set secure file and directory permissions on POSIX systems
-         if os.name == "posix":
-             os.chmod(self.config_dir, 0o700)
-             os.chmod(self.config_file, 0o600)
-
-     def _get_encryption_key(self) -> bytes:
-         """
-         Generate an encryption key from system-specific data.
-         """
-         salt = b"pyannote-audio-salt"
-         kdf = PBKDF2HMAC(
-             algorithm=hashes.SHA256(),
-             length=32,
-             salt=salt,
-             iterations=100000,
-         )
-         key = kdf.derive(str(Path.home()).encode())
-         return base64.urlsafe_b64encode(key)
-
-     def _save_config(self, config: dict) -> None:
-         """
-         Securely save configuration to file.
-         """
-         with open(self.config_file, "w", encoding="utf-8") as f:
-             json.dump(config, f)
-
-     def _load_config(self) -> dict:
-         """
-         Load configuration from file.
-         """
-         try:
-             with open(self.config_file, "r", encoding="utf-8") as f:
-                 return json.load(f)
-         except Exception:
-             return {}
-
-     def store_token(self, token: str) -> bool:
-         """
-         Securely store authentication token.
-         """
-         try:
-             fernet = Fernet(self._get_encryption_key())
-             encrypted_token = fernet.encrypt(token.encode())
-
-             config = self._load_config()
-             config["token"] = encrypted_token.decode()
-
-             self._save_config(config)
-             return True
-         except Exception as e:
-             logger.error(f"Failed to store token: {e}")
-             return False
-
-     def retrieve_token(self) -> Optional[str]:
-         """
-         Retrieve stored authentication token.
-         """
-         try:
-             config = self._load_config()
-             if "token" in config:
-                 fernet = Fernet(self._get_encryption_key())
-                 return fernet.decrypt(config["token"].encode()).decode()
-         except Exception as e:
-             logger.error(f"Failed to retrieve token: {e}")
-         return None
-
-     def delete_token(self) -> bool:
-         """
-         Delete stored authentication token.
-         """
-         try:
-             config = self._load_config()
-             if "token" in config:
-                 del config["token"]
-                 self._save_config(config)
-             return True
-         except Exception as e:
-             logger.error(f"Failed to delete token: {e}")
-             return False
-
-
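For clarity, the scheme above derives a Fernet key from the user's home path with a fixed salt, so the stored token is tied to the machine/user rather than to a user-supplied secret. A minimal standalone sketch of the same round trip (the token value is a placeholder, not part of the package):

    import base64
    from pathlib import Path
    from cryptography.fernet import Fernet
    from cryptography.hazmat.primitives import hashes
    from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

    kdf = PBKDF2HMAC(algorithm=hashes.SHA256(), length=32,
                     salt=b"pyannote-audio-salt", iterations=100000)
    key = base64.urlsafe_b64encode(kdf.derive(str(Path.home()).encode()))

    fernet = Fernet(key)
    ciphertext = fernet.encrypt(b"hf_example_token")   # this is what ends up in config.json
    assert fernet.decrypt(ciphertext) == b"hf_example_token"
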
- class DependencyManager:
-     """
-     Manages and verifies system dependencies using importlib.metadata.
-     """
-     REQUIRED_PACKAGES = {
-         "torch": None,
-         "pyannote.audio": None,
-         "openai-whisper": None,
-         "pytorch-lightning": None,
-         "keyring": None,
-     }
-
-     @classmethod
-     def verify_dependencies(cls) -> bool:
-         """
-         Verify all required dependencies are installed with correct versions
-         (if specified). Returns True if all are installed and correct, False otherwise.
-         """
-         missing = []
-         outdated = []
-
-         for package, required_version in cls.REQUIRED_PACKAGES.items():
-             try:
-                 installed_version = importlib.metadata.version(package)
-                 if required_version and installed_version != required_version:
-                     outdated.append(
-                         f"{package} (installed: {installed_version}, required: {required_version})"
-                     )
-             except PackageNotFoundError:
-                 missing.append(package)
-
-         if missing or outdated:
-             if missing:
-                 logger.error("Missing packages: %s", ", ".join(missing))
-             if outdated:
-                 logger.error("Outdated packages: %s", ", ".join(outdated))
-             logger.info(
-                 "Install required packages: pip install %s",
-                 " ".join(
-                     f"{pkg}=={ver}" if ver else pkg
-                     for pkg, ver in cls.REQUIRED_PACKAGES.items()
-                 ),
-             )
-             return False
-         return True
-
-
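As background, importlib.metadata.version() returns the installed distribution's version string and raises PackageNotFoundError when the distribution is absent, which is exactly what the check above relies on; a minimal sketch:

    import importlib.metadata
    from importlib.metadata import PackageNotFoundError

    try:
        print(importlib.metadata.version("torch"))   # e.g. "2.1.0" when installed
    except PackageNotFoundError:
        print("torch is not installed")
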
- class AudioProcessor:
-     """
-     Handles audio file processing and segmentation using the `wave` module.
-     """
-     def __init__(self, config: TranscriptionConfig):
-         self.config = config
-
-     def load_audio_segment(
-         self,
-         audio_path: Path,
-         start_time: float,
-         end_time: float,
-         output_path: Path,
-     ) -> bool:
-         """
-         Extract and save the audio segment from `start_time` to `end_time`.
-         """
-         try:
-             with wave.open(str(audio_path), "rb") as infile:
-                 params = infile.getparams()
-                 frame_rate = params.framerate
-                 start_frame = int(start_time * frame_rate)
-                 end_frame = min(int(end_time * frame_rate), infile.getnframes())
-
-                 infile.setpos(start_frame)
-                 frames = infile.readframes(end_frame - start_frame)
-
-                 with wave.open(str(output_path), "wb") as outfile:
-                     outfile.setparams(params)
-                     outfile.writeframes(frames)
-             return True
-         except Exception as e:
-             logger.error(f"Failed to process audio segment: {e}")
-             return False
-
-
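The frame arithmetic above is simply seconds times sample rate: for a 16 kHz WAV, a 1.25 s to 2.00 s span maps to frames 20000 through 32000, so 12000 frames are copied. A minimal usage sketch (file names are illustrative):

    processor = AudioProcessor(config)
    ok = processor.load_audio_segment(
        audio_path=Path("meeting.wav"),    # e.g. a 16 kHz mono WAV
        start_time=1.25,                   # frame 20000 at 16 kHz
        end_time=2.00,                     # frame 32000 at 16 kHz
        output_path=config.temp_directory / "segment_SPEAKER_00_1.25_2.00.wav",
    )
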
- class TranscriptionPipeline:
-     """
-     Main pipeline for audio transcription (Whisper) and speaker diarization (Pyannote).
-     """
-     def __init__(self, config: TranscriptionConfig):
-         self.config = config
-         self.diarization_pipeline = None
-         self.whisper_model = None
-         self.token_manager = TokenManager()
-         self._running = False  # used for resource monitor thread
-
-     def initialize_models(self, auth_token: str) -> bool:
-         """
-         Initialize the Pyannote diarization pipeline and the Whisper model.
-         """
-         try:
-             # Load Whisper model (set download root to avoid clutter in home directory)
-             self.whisper_model = whisper.load_model(
-                 self.config.whisper_model,
-                 device=self.config.device,
-                 download_root=str(self.config.output_directory / "models"),
-             )
-
-             # Load Pyannote diarization pipeline
-             self.diarization_pipeline = Pipeline.from_pretrained(
-                 self.config.diarization_model, use_auth_token=auth_token
-             )
-             self.diarization_pipeline.to(torch.device(self.config.device))
-
-             if self.config.device == "cpu":
-                 warnings.warn("Running on CPU. GPU is recommended for better performance.")
-
-             return True
-         except Exception as e:
-             logger.error(f"Model initialization failed: {e}")
-             logger.error("Please ensure you have accepted the model conditions at:")
-             logger.error(" 1. https://huggingface.co/pyannote/segmentation-3.0")
-             logger.error(" 2. https://huggingface.co/pyannote/speaker-diarization-3.1")
-             return False
-
-     def _update_resources(self, bar):
-         """
-         Continuously update progress bar text with CPU/MEM/GPU usage, until self._running is False.
-         """
-         while self._running:
-             try:
-                 import time
-                 time.sleep(0.5)
-
-                 cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
-                 memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0
-
-                 if HAVE_PROGRESS_SUPPORT and GPUtil.getGPUs():
-                     gpus = GPUtil.getGPUs()
-                     gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
-                     gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
-                     gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
-                 else:
-                     gpu_usage_text = "N/A"
-
-                 resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
-                 bar.text(resource_text)
-             except Exception as e:
-                 logger.error(f"Resource monitoring error: {e}")
-
-     def process_file(self, audio_path: Path) -> bool:
-         """
-         Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback.
-         """
-         try:
-             logger.info("Starting audio processing...")
-             diarization = self.diarization_pipeline(str(audio_path))
-             segments = list(diarization.itertracks(yield_label=True))
-             total_segments = len(segments)
-
-             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-             output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
-             audio_processor = AudioProcessor(self.config)
-
-             if not HAVE_PROGRESS_SUPPORT:
-                 # No alive_progress, psutil, or GPUtil installed
-                 logger.info("Processing audio without progress bar (missing optional packages).")
-                 with output_file.open("w", encoding="utf-8") as f:
-                     for turn, _, speaker in segments:
-                         segment_path = (
-                             self.config.temp_directory
-                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
-                         )
-                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
-                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
-                             segment_path.unlink(missing_ok=True)
-
-                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
-                             f.write(line)
-                             logger.info(line.strip())
-                 return True
-             else:
-                 # Use a progress bar to track segment transcription
-                 from alive_progress import alive_bar
-                 import threading
-
-                 self._running = True
-                 with output_file.open("w", encoding="utf-8") as f, alive_bar(
-                     total_segments,
-                     title="Transcribing Audio",
-                     spinner="pulse",
-                     theme="classic",
-                     stats=False,
-                     elapsed=True,
-                     monitor=True,
-                 ) as bar:
-
-                     # Start a background thread for resource monitoring
-                     resource_thread = threading.Thread(target=self._update_resources, args=(bar,))
-                     resource_thread.start()
-
-                     for turn, _, speaker in segments:
-                         segment_path = (
-                             self.config.temp_directory
-                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
-                         )
-                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
-                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
-                             segment_path.unlink(missing_ok=True)
-
-                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
-                             f.write(line)
-                             logger.info(line.strip())
-
-                         # Update the progress bar
-                         bar()
-
-                     # Stop resource monitoring
-                     self._running = False
-                     resource_thread.join()
-
-                 logger.info(f"Transcription completed. Output saved to: {output_file}")
-                 return True
-
-         except Exception as e:
-             logger.error(f"Processing failed: {e}")
-             return False
-
-
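A brief note on the diarization output consumed above: the pyannote pipeline returns an Annotation, and itertracks(yield_label=True) yields (segment, track, label) triples, which is why each item unpacks into a turn with .start/.end plus a speaker label. A minimal sketch, assuming a pipeline already loaded as in initialize_models() and an illustrative file name:

    diarization = diarization_pipeline("meeting.wav")
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"{turn.start:.2f}s - {turn.end:.2f}s  {speaker}")   # e.g. 0.50s - 3.20s  SPEAKER_00
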
- def get_token(token_manager: TokenManager) -> Optional[str]:
-     """
-     Get authentication token from storage or user input.
-     """
-     stored_token = token_manager.retrieve_token()
-     if stored_token:
-         choice = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
-         if choice == "y":
-             return stored_token
-
-     print("\nA HuggingFace token is required for speaker diarization.")
-     print("Get your token at: https://huggingface.co/settings/tokens")
-     print("\nEnsure you have accepted:")
-     print(" 1. pyannote/segmentation-3.0 conditions")
-     print(" 2. pyannote/speaker-diarization-3.1 conditions")
-
-     token = input("\nEnter HuggingFace token: ").strip()
-     if token:
-         choice = input("Save token for future use? (y/n): ").lower().strip()
-         if choice == "y":
-             if token_manager.store_token(token):
-                 print("Token saved successfully.")
-             else:
-                 print("Failed to save token. It will be used for this session only.")
-     return token if token else None
-
-
- def main():
-     parser = argparse.ArgumentParser(
-         description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
-     )
-     parser.add_argument(
-         "--audio",
-         type=Path,
-         help="Path to the audio file to transcribe."
-     )
-     parser.add_argument(
-         "--token",
-         help="HuggingFace API token. Overrides any saved token."
-     )
-     parser.add_argument(
-         "--output",
-         type=Path,
-         help="Path to the output directory for transcripts and temporary files.",
-     )
-     parser.add_argument(
-         "--delete-token",
-         action="store_true",
-         help="Delete any stored Hugging Face token and exit.",
-     )
-     parser.add_argument(
-         "--show-warnings",
-         action="store_true",
-         help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
-     )
-     parser.add_argument(
-         "--whisper-model",
-         default="base.en",
-         help="Specify the Whisper model to use (default: 'base.en').",
-     )
-     args = parser.parse_args()
-
-     # Manage user warnings
-     if not args.show_warnings:
-         warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
-         warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
-     else:
-         warnings.resetwarnings()
-
-     # Check dependencies
-     if not DependencyManager.verify_dependencies():
-         sys.exit(1)
-
-     # Initialize tab-completion for file paths (Unix-like only, or with pyreadline on Windows)
-     readline.set_completer_delims(' \t\n;')
-     readline.set_completer(complete_path)
-     readline.parse_and_bind("tab: complete")
-
-     # Initialize the token manager
-     token_manager = TokenManager()
-
-     # If user wants to delete the stored token, do so and exit
-     if args.delete_token:
-         success = token_manager.delete_token()
-         sys.exit(0 if success else 1)
-
-     # Prepare configuration
-     output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
-     config = TranscriptionConfig(
-         output_directory=output_dir,
-         whisper_model=args.whisper_model
-     )
-
-     # Initialize pipeline
-     pipeline = TranscriptionPipeline(config)
-     hf_token = args.token or get_token(token_manager)
-     if not hf_token:
-         logger.error("No Hugging Face token provided. Exiting.")
-         sys.exit(1)
-
-     # Initialize models
-     if not pipeline.initialize_models(hf_token):
-         logger.error("Failed to initialize pipeline. Exiting.")
-         sys.exit(1)
-
-     # Prompt user for audio file path if not passed in
-     audio_path = args.audio
-     while not audio_path or not audio_path.exists():
-         audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
-         audio_path = Path(audio_path_str)
-         if not audio_path.exists():
-             print(f"File '{audio_path}' not found. Please try again.")
-
-     print("Audio file path accepted. Preparing to process the audio...")
-     sys.stdout.flush()
-
-     # Process the audio file
-     if not pipeline.process_file(audio_path):
-         sys.exit(1)
-
-
- if __name__ == "__main__":
-     main()
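Taken together, the same classes the CLI wires up can also be driven programmatically; a minimal sketch, with the token and file paths as placeholders rather than values from the package:

    from pathlib import Path
    from audio_scribe.cli import TranscriptionConfig, TranscriptionPipeline

    config = TranscriptionConfig(output_directory=Path("transcripts/demo"))
    pipeline = TranscriptionPipeline(config)
    if pipeline.initialize_models(auth_token="hf_xxx"):      # placeholder Hugging Face token
        pipeline.process_file(Path("meeting.wav"))           # writes transcript_<timestamp>.txt
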
audio_scribe-0.1.1.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
- audio_scribe/__init__.py,sha256=19NLfiVus01TtbB1SFwJ3Q-vFvN9nLzNYGIZiNB45qM,587
- audio_scribe/cli.py,sha256=LToGAiCHHXDitsXBuMqKMHkH_HzSARX0C06-Ha74jKU,20287
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tests/test_audio_scribe_main.py,sha256=Jv5XixrhsK49RJBh6HlghnIxxDiCq52PUv0lf8ljpJY,15571
- audio_scribe-0.1.1.dist-info/METADATA,sha256=CBHEE3qzCnRWQ-7ljDGMbY8i5awoGYYDcgbPUH6Za-M,10455
- audio_scribe-0.1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- audio_scribe-0.1.1.dist-info/entry_points.txt,sha256=eaO9r_zAFnrWseKyJcBpGUHQq-P7NXBw5er8sZaPfFU,55
- audio_scribe-0.1.1.dist-info/top_level.txt,sha256=K08EDnZLtXcJJ9RxLnzDUz-AmnUo5vGRyYmS3wSirtE,19
- audio_scribe-0.1.1.dist-info/RECORD,,
audio_scribe-0.1.1.dist-info/entry_points.txt DELETED
@@ -1,2 +0,0 @@
- [console_scripts]
- audio-scribe = audio_scribe.cli:main
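For reference, the [console_scripts] block above is what installs the audio-scribe command and routes it to main() in audio_scribe/cli.py, so in 0.1.1 an invocation such as "audio-scribe --audio meeting.wav --output transcripts --whisper-model base.en" (the audio file name is illustrative) would run the argparse-based flow shown in the cli.py hunk.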
tests/__init__.py DELETED
File without changes