audio_scribe-0.1.0-py3-none-any.whl

audio_scribe/__init__.py ADDED
File without changes
audio_scribe/cli.py ADDED
@@ -0,0 +1,567 @@
+ #!/usr/bin/env python3
+ """
+ Audio Scribe
+ -----------------
+ A command-line script for transcribing audio files with speaker diarization
+ using Whisper and Pyannote. The script uses a Hugging Face token for
+ downloading Pyannote speaker-diarization models and displays a progress bar
+ with resource usage while transcribing.
+ """
+
+ print("Initializing environment... Please wait while we load dependencies and models.")
+ import sys
+ sys.stdout.flush()
+
+ import os
+ import glob
+ import wave
+ import json
+ import logging
+ import warnings
+ import argparse
+ try:
+     import readline  # enables tab-completion of file paths on Unix/Linux
+     HAVE_READLINE = True
+ except ImportError:
+     # Stock Windows ships without readline; installing pyreadline3 provides it.
+     HAVE_READLINE = False
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Optional, Dict
+ from dataclasses import dataclass
+ import base64
+
+ from cryptography.fernet import Fernet
+ from cryptography.hazmat.primitives import hashes
+ from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
+
+ import torch
+ import whisper
+
+ import importlib.metadata
+ from importlib.metadata import PackageNotFoundError
+
+ from pyannote.audio import Pipeline
+
+ # Attempt to import optional packages for progress bar and resource monitoring
+ try:
+     from alive_progress import alive_bar
+     import psutil
+     import GPUtil
+     HAVE_PROGRESS_SUPPORT = True
+ except ImportError:
+     HAVE_PROGRESS_SUPPORT = False
+
+ # Configure logging
+ LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
+ logging.basicConfig(
+     level=logging.INFO,
+     format=LOG_FORMAT,
+     handlers=[
+         logging.StreamHandler(),
+         logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
+     ],
+ )
+ logger = logging.getLogger(__name__)
+
+ # ---------- FILE PATH TAB-COMPLETION SNIPPET ----------
+ def complete_path(text, state):
+     """
+     Return the 'state'-th completion for 'text'.
+     This function will be used by 'readline' to enable file path autocompletion.
+     """
+     # If the user typed a glob pattern (with * or ?)
+     if '*' in text or '?' in text:
+         matches = glob.glob(text)
+     else:
+         # Split off the directory name and partial file/directory name
+         directory, partial = os.path.split(text)
+         if not directory:
+             directory = '.'
+         try:
+             # List everything in 'directory' that starts with 'partial'
+             entries = os.listdir(directory)
+         except OSError:
+             # If directory doesn't exist or we lack permission, no matches
+             entries = []
+
+         matches = []
+         for entry in entries:
+             if entry.startswith(partial):
+                 full_path = os.path.join(directory, entry)
+                 # If it's a directory, add a trailing slash to indicate that
+                 if os.path.isdir(full_path) and not full_path.endswith(os.path.sep):
+                     full_path += os.path.sep
+                 matches.append(full_path)
+
+     # Sort matches to have a consistent order
+     matches.sort()
+
+     # If 'state' is beyond last match, return None
+     return matches[state] if state < len(matches) else None
+
+
+ @dataclass
+ class TranscriptionConfig:
+     """
+     Configuration settings for the transcription pipeline.
+     """
+     output_directory: Path
+     whisper_model: str = "base.en"
+     diarization_model: str = "pyannote/speaker-diarization-3.1"
+     temp_directory: Optional[Path] = None
+     device: Optional[str] = None
+
+     def __post_init__(self):
+         # Use CUDA if available, else fall back to CPU
+         self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
+         # Default temp directory inside the output directory
+         self.temp_directory = self.temp_directory or (self.output_directory / "temp")
+         # Ensure directories exist
+         self.temp_directory.mkdir(parents=True, exist_ok=True)
+         self.output_directory.mkdir(parents=True, exist_ok=True)
+
+
+ class TokenManager:
+     """
+     Handles secure storage and retrieval of the Hugging Face authentication token.
+     """
+     def __init__(self):
+         # Store config in ~/.pyannote/config.json
+         self.config_dir = Path.home() / ".pyannote"
+         self.config_file = self.config_dir / "config.json"
+         self._initialize_config()
+
+     def _initialize_config(self) -> None:
+         """
+         Initialize configuration directory and file with secure permissions.
+         """
+         self.config_dir.mkdir(exist_ok=True)
+         if not self.config_file.exists():
+             self._save_config({})
+
+         # Set secure file and directory permissions on POSIX systems
+         if os.name == "posix":
+             os.chmod(self.config_dir, 0o700)
+             os.chmod(self.config_file, 0o600)
+
+     def _get_encryption_key(self) -> bytes:
+         """
+         Generate an encryption key from system-specific data.
+         """
+         salt = b"pyannote-audio-salt"
+         kdf = PBKDF2HMAC(
+             algorithm=hashes.SHA256(),
+             length=32,
+             salt=salt,
+             iterations=100000,
+         )
+         key = kdf.derive(str(Path.home()).encode())
+         return base64.urlsafe_b64encode(key)
+
+     def _save_config(self, config: dict) -> None:
+         """
+         Securely save configuration to file.
+         """
+         with open(self.config_file, "w", encoding="utf-8") as f:
+             json.dump(config, f)
+
+     def _load_config(self) -> dict:
+         """
+         Load configuration from file.
+         """
+         try:
+             with open(self.config_file, "r", encoding="utf-8") as f:
+                 return json.load(f)
+         except Exception:
+             return {}
+
+     def store_token(self, token: str) -> bool:
+         """
+         Securely store authentication token.
+         """
+         try:
+             fernet = Fernet(self._get_encryption_key())
+             encrypted_token = fernet.encrypt(token.encode())
+
+             config = self._load_config()
+             config["token"] = encrypted_token.decode()
+
+             self._save_config(config)
+             return True
+         except Exception as e:
+             logger.error(f"Failed to store token: {e}")
+             return False
+
+     def retrieve_token(self) -> Optional[str]:
+         """
+         Retrieve stored authentication token.
+         """
+         try:
+             config = self._load_config()
+             if "token" in config:
+                 fernet = Fernet(self._get_encryption_key())
+                 return fernet.decrypt(config["token"].encode()).decode()
+         except Exception as e:
+             logger.error(f"Failed to retrieve token: {e}")
+         return None
+
+     def delete_token(self) -> bool:
+         """
+         Delete stored authentication token.
+         """
+         try:
+             config = self._load_config()
+             if "token" in config:
+                 del config["token"]
+                 self._save_config(config)
+             return True
+         except Exception as e:
+             logger.error(f"Failed to delete token: {e}")
+             return False
+
+
+ class DependencyManager:
+     """
+     Manages and verifies system dependencies using importlib.metadata.
+     """
+     REQUIRED_PACKAGES = {
+         "torch": None,
+         "pyannote.audio": None,
+         "openai-whisper": None,
+         "pytorch-lightning": None,
+         "keyring": None,
+     }
+
+     @classmethod
+     def verify_dependencies(cls) -> bool:
+         """
+         Verify all required dependencies are installed with correct versions
+         (if specified). Returns True if all are installed and correct, False otherwise.
+         """
+         missing = []
+         outdated = []
+
+         for package, required_version in cls.REQUIRED_PACKAGES.items():
+             try:
+                 installed_version = importlib.metadata.version(package)
+                 if required_version and installed_version != required_version:
+                     outdated.append(
+                         f"{package} (installed: {installed_version}, required: {required_version})"
+                     )
+             except PackageNotFoundError:
+                 missing.append(package)
+
+         if missing or outdated:
+             if missing:
+                 logger.error("Missing packages: %s", ", ".join(missing))
+             if outdated:
+                 logger.error("Outdated packages: %s", ", ".join(outdated))
+             logger.info(
+                 "Install required packages: pip install %s",
+                 " ".join(
+                     f"{pkg}=={ver}" if ver else pkg
+                     for pkg, ver in cls.REQUIRED_PACKAGES.items()
+                 ),
+             )
+             return False
+         return True
+
+
+ class AudioProcessor:
+     """
+     Handles audio file processing and segmentation using the `wave` module.
+     """
+     def __init__(self, config: TranscriptionConfig):
+         self.config = config
+
+     def load_audio_segment(
+         self,
+         audio_path: Path,
+         start_time: float,
+         end_time: float,
+         output_path: Path,
+     ) -> bool:
+         """
+         Extract and save the audio segment from `start_time` to `end_time`.
+         """
+         try:
+             with wave.open(str(audio_path), "rb") as infile:
+                 params = infile.getparams()
+                 frame_rate = params.framerate
+                 start_frame = int(start_time * frame_rate)
+                 end_frame = min(int(end_time * frame_rate), infile.getnframes())
+
+                 infile.setpos(start_frame)
+                 frames = infile.readframes(end_frame - start_frame)
+
+                 with wave.open(str(output_path), "wb") as outfile:
+                     outfile.setparams(params)
+                     outfile.writeframes(frames)
+             return True
+         except Exception as e:
+             logger.error(f"Failed to process audio segment: {e}")
+             return False
+
+
+ class TranscriptionPipeline:
+     """
+     Main pipeline for audio transcription (Whisper) and speaker diarization (Pyannote).
+     """
+     def __init__(self, config: TranscriptionConfig):
+         self.config = config
+         self.diarization_pipeline = None
+         self.whisper_model = None
+         self.token_manager = TokenManager()
+         self._running = False  # used for resource monitor thread
+
+     def initialize_models(self, auth_token: str) -> bool:
+         """
+         Initialize the Pyannote diarization pipeline and the Whisper model.
+         """
+         try:
+             # Load Whisper model (set download root to avoid clutter in home directory)
+             self.whisper_model = whisper.load_model(
+                 self.config.whisper_model,
+                 device=self.config.device,
+                 download_root=str(self.config.output_directory / "models"),
+             )
+
+             # Load Pyannote diarization pipeline
+             self.diarization_pipeline = Pipeline.from_pretrained(
+                 self.config.diarization_model, use_auth_token=auth_token
+             )
+             self.diarization_pipeline.to(torch.device(self.config.device))
+
+             if self.config.device == "cpu":
+                 warnings.warn("Running on CPU. GPU is recommended for better performance.")
+
+             return True
+         except Exception as e:
+             logger.error(f"Model initialization failed: {e}")
+             logger.error("Please ensure you have accepted the model conditions at:")
+             logger.error(" 1. https://huggingface.co/pyannote/segmentation-3.0")
+             logger.error(" 2. https://huggingface.co/pyannote/speaker-diarization-3.1")
+             return False
+
+     def _update_resources(self, bar):
+         """
+         Continuously update progress bar text with CPU/MEM/GPU usage, until self._running is False.
+         """
+         import time  # local import kept with the monitor loop
+         while self._running:
+             try:
+                 time.sleep(0.5)
+
+                 cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
+                 memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0
+
+                 if HAVE_PROGRESS_SUPPORT and GPUtil.getGPUs():
+                     gpus = GPUtil.getGPUs()
+                     gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
+                     gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
+                     gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
+                 else:
+                     gpu_usage_text = "N/A"
+
+                 resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
+                 bar.text(resource_text)
+             except Exception as e:
+                 logger.error(f"Resource monitoring error: {e}")
+
+     def process_file(self, audio_path: Path) -> bool:
+         """
+         Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback.
+         """
+         try:
+             logger.info("Starting audio processing...")
+             diarization = self.diarization_pipeline(str(audio_path))
+             segments = list(diarization.itertracks(yield_label=True))
+             total_segments = len(segments)
+
+             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+             output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
+             audio_processor = AudioProcessor(self.config)
+
+             if not HAVE_PROGRESS_SUPPORT:
+                 # No alive_progress, psutil, or GPUtil installed
+                 logger.info("Processing audio without progress bar (missing optional packages).")
+                 with output_file.open("w", encoding="utf-8") as f:
+                     for turn, _, speaker in segments:
+                         segment_path = (
+                             self.config.temp_directory
+                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
+                         )
+                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
+                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
+                             segment_path.unlink(missing_ok=True)
+
+                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
+                             f.write(line)
+                             logger.info(line.strip())
+             else:
+                 # Use a progress bar to track segment transcription
+                 from alive_progress import alive_bar
+                 import threading
+
+                 self._running = True
+                 with output_file.open("w", encoding="utf-8") as f, alive_bar(
+                     total_segments,
+                     title="Transcribing Audio",
+                     spinner="pulse",
+                     theme="classic",
+                     stats=False,
+                     elapsed=True,
+                     monitor=True,
+                 ) as bar:
+
+                     # Start a background thread for resource monitoring
+                     resource_thread = threading.Thread(target=self._update_resources, args=(bar,))
+                     resource_thread.start()
+
+                     for turn, _, speaker in segments:
+                         segment_path = (
+                             self.config.temp_directory
+                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
+                         )
+                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
+                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
+                             segment_path.unlink(missing_ok=True)
+
+                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
+                             f.write(line)
+                             logger.info(line.strip())
+
+                         # Update the progress bar
+                         bar()
+
+                 # Stop resource monitoring
+                 self._running = False
+                 resource_thread.join()
+
+             logger.info(f"Transcription completed. Output saved to: {output_file}")
+             return True
+
+         except Exception as e:
+             logger.error(f"Processing failed: {e}")
+             return False
+
+
+ def get_token(token_manager: TokenManager) -> Optional[str]:
+     """
+     Get authentication token from storage or user input.
+     """
+     stored_token = token_manager.retrieve_token()
+     if stored_token:
+         choice = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
+         if choice == "y":
+             return stored_token
+
+     print("\nA HuggingFace token is required for speaker diarization.")
+     print("Get your token at: https://huggingface.co/settings/tokens")
+     print("\nEnsure you have accepted:")
+     print(" 1. pyannote/segmentation-3.0 conditions")
+     print(" 2. pyannote/speaker-diarization-3.1 conditions")
+
+     token = input("\nEnter HuggingFace token: ").strip()
+     if token:
+         choice = input("Save token for future use? (y/n): ").lower().strip()
+         if choice == "y":
+             if token_manager.store_token(token):
+                 print("Token saved successfully.")
+             else:
+                 print("Failed to save token. It will be used for this session only.")
+     return token if token else None
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
+     )
+     parser.add_argument(
+         "--audio",
+         type=Path,
+         help="Path to the audio file to transcribe."
+     )
+     parser.add_argument(
+         "--token",
+         help="HuggingFace API token. Overrides any saved token."
+     )
+     parser.add_argument(
+         "--output",
+         type=Path,
+         help="Path to the output directory for transcripts and temporary files.",
+     )
+     parser.add_argument(
+         "--delete-token",
+         action="store_true",
+         help="Delete any stored Hugging Face token and exit.",
+     )
+     parser.add_argument(
+         "--show-warnings",
+         action="store_true",
+         help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
+     )
+     parser.add_argument(
+         "--whisper-model",
+         default="base.en",
+         help="Specify the Whisper model to use (default: 'base.en').",
+     )
+     args = parser.parse_args()
+
+     # Manage user warnings
+     if not args.show_warnings:
+         warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
+         warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
+     else:
+         warnings.resetwarnings()
+
+     # Check dependencies
+     if not DependencyManager.verify_dependencies():
+         sys.exit(1)
+
+     # Initialize tab-completion for file paths (Unix-like systems, or Windows with pyreadline3)
+     if HAVE_READLINE:
+         readline.set_completer_delims(' \t\n;')
+         readline.set_completer(complete_path)
+         readline.parse_and_bind("tab: complete")
+
+     # Initialize the token manager
+     token_manager = TokenManager()
+
+     # If user wants to delete the stored token, do so and exit
+     if args.delete_token:
+         success = token_manager.delete_token()
+         sys.exit(0 if success else 1)
+
+     # Prepare configuration
+     output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
+     config = TranscriptionConfig(
+         output_directory=output_dir,
+         whisper_model=args.whisper_model
+     )
+
+     # Initialize pipeline
+     pipeline = TranscriptionPipeline(config)
+     hf_token = args.token or get_token(token_manager)
+     if not hf_token:
+         logger.error("No Hugging Face token provided. Exiting.")
+         sys.exit(1)
+
+     # Initialize models
+     if not pipeline.initialize_models(hf_token):
+         logger.error("Failed to initialize pipeline. Exiting.")
+         sys.exit(1)
+
+     # Prompt user for audio file path if not passed in
+     audio_path = args.audio
+     while not audio_path or not audio_path.exists():
+         audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
+         audio_path = Path(audio_path_str)
+         if not audio_path.exists():
+             print(f"File '{audio_path}' not found. Please try again.")
+
+     print("Audio file path accepted. Preparing to process the audio...")
+     sys.stdout.flush()
+
+     # Process the audio file
+     if not pipeline.process_file(audio_path):
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
audio_scribe-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,273 @@
+ Metadata-Version: 2.2
+ Name: audio_scribe
+ Version: 0.1.0
+ Summary: A command-line tool for audio transcription with Whisper and Pyannote.
+ Home-page: https://gitlab.genomicops.cloud/genomicops/audio-scribe
+ Author: Gurasis Osahan
+ Author-email: contact@genomicops.com
+ License: Apache-2.0
+ Project-URL: Source, https://gitlab.genomicops.cloud/genomicops/audio-scribe
+ Project-URL: Tracker, https://gitlab.genomicops.cloud/genomicops/audio-scribe/-/issues
+ Keywords: whisper pyannote transcription audio diarization
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Topic :: Multimedia :: Sound/Audio
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: torch
+ Requires-Dist: openai-whisper
+ Requires-Dist: pyannote.audio
+ Requires-Dist: pytorch-lightning
+ Requires-Dist: keyring
+ Requires-Dist: cryptography
+ Requires-Dist: alive-progress
+ Requires-Dist: psutil
+ Requires-Dist: GPUtil
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: project-url
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Audio Scribe
+
+ **A command-line tool for audio transcription and speaker diarization using OpenAI Whisper and Pyannote**
+
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
+
+ ## Overview
+
+ **Audio Scribe** is a command-line tool that transcribes audio files with speaker diarization. Leveraging [OpenAI Whisper](https://github.com/openai/whisper) for transcription and [Pyannote Audio](https://github.com/pyannote/pyannote-audio) for speaker diarization, it converts audio into segmented text files, identifying each speaker turn. Key features include:
+
+ - **Progress Bar & Resource Monitoring**: See real-time CPU, memory, and GPU usage with a live progress bar.
+ - **Speaker Diarization**: Automatically separates speaker turns using Pyannote’s state-of-the-art models.
+ - **Tab-Completion for File Paths**: Easily navigate your file system when prompted for the audio path.
+ - **Secure Token Storage**: Encrypts and stores your Hugging Face token for private model downloads.
+ - **Customizable Whisper Models**: Defaults to `base.en`; specify `tiny`, `small`, `medium`, `large`, etc. as needed.
+
+ This repository is licensed under the [Apache License 2.0](#license).
+
+ ---
+
+ ## Table of Contents
+
+ - [Audio Scribe](#audio-scribe)
+   - [Overview](#overview)
+   - [Table of Contents](#table-of-contents)
+   - [Features](#features)
+   - [Installation](#installation)
+     - [Installing from PyPI](#installing-from-pypi)
+     - [Installing from GitLab](#installing-from-gitlab)
+   - [Quick Start](#quick-start)
+   - [Usage](#usage)
+   - [Dependencies](#dependencies)
+     - [Sample `requirements.txt`](#sample-requirementstxt)
+   - [Contributing](#contributing)
+   - [License](#license)
+
+ ---
+
+ ## Features
+
+ - **Whisper Transcription**
+   Utilizes [OpenAI Whisper](https://github.com/openai/whisper) to convert speech to text in multiple languages.
+ - **Pyannote Speaker Diarization**
+   Identifies different speakers and segments the transcript accordingly.
+ - **Progress Bar & Resource Usage**
+   Displays a live progress bar with CPU, memory, and GPU stats through [alive-progress](https://github.com/rsalmei/alive-progress), [psutil](https://pypi.org/project/psutil/), and [GPUtil](https://pypi.org/project/GPUtil/).
+ - **Tab-Completion**
+   Press **Tab** to autocomplete file paths on Unix-like systems (and on Windows with [pyreadline3](https://pypi.org/project/pyreadline3/)).
+ - **Secure Token Storage**
+   Saves your Hugging Face token via [cryptography](https://pypi.org/project/cryptography/) for model downloads (e.g., `pyannote/speaker-diarization-3.1`); see the sketch after this list.
+ - **Configurable Models**
+   Default is `base.en`, but you can specify any other Whisper model using `--whisper-model`.
+
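+ Under the hood, `TokenManager` derives a key from your home directory with PBKDF2 and encrypts the token with Fernet before writing it to `~/.pyannote/config.json`. A minimal sketch of the same scheme, assuming only the `cryptography` package (it mirrors the code in `audio_scribe/cli.py` but is not a public API):
+
+ ```python
+ import base64
+ from pathlib import Path
+ from cryptography.fernet import Fernet
+ from cryptography.hazmat.primitives import hashes
+ from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
+
+ # Derive a stable 32-byte key from a machine-specific value (the home path).
+ kdf = PBKDF2HMAC(algorithm=hashes.SHA256(), length=32,
+                  salt=b"pyannote-audio-salt", iterations=100000)
+ key = base64.urlsafe_b64encode(kdf.derive(str(Path.home()).encode()))
+
+ fernet = Fernet(key)
+ encrypted = fernet.encrypt(b"hf_example_token")  # placeholder token
+ assert fernet.decrypt(encrypted) == b"hf_example_token"
+ ```
+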
+ ---
+
+ ## Installation
+
+ ### Installing from PyPI
+
+ **Audio Scribe** is available on PyPI. You can install it with:
+
+ ```bash
+ pip install audio-scribe
+ ```
+
+ After installation, the **`audio-scribe`** command should be available in your terminal (depending on how your PATH is configured). Because the package ships no `__main__.py`, run the CLI module directly if you prefer module invocation:
+
+ ```bash
+ python -m audio_scribe.cli --audio path/to/yourfile.wav
+ ```
+
+ ### Installing from GitLab
+
+ To install the latest development version directly from GitLab:
+
+ ```bash
+ git clone https://gitlab.genomicops.cloud/genomicops/audio-scribe.git
+ cd audio-scribe
+ pip install -r requirements.txt
+ pip install .   # install the audio-scribe entry point itself
+ ```
+
+ This approach is particularly useful if you want the newest changes or plan to contribute.
+
+ ---
+
+ ## Quick Start
+
+ 1. **Obtain a Hugging Face Token**
+    - Create a token at [Hugging Face Settings](https://huggingface.co/settings/tokens).
+    - Accept the model conditions for `pyannote/segmentation-3.0` and `pyannote/speaker-diarization-3.1`.
+
+ 2. **Run the Command-Line Tool**
+    ```bash
+    audio-scribe --audio path/to/audio.wav
+    ```
+    > On the first run, you’ll be prompted for your Hugging Face token if you haven’t stored one yet.
+
+ 3. **Watch the Progress Bar**
+    - The tool displays a progress bar for each diarized speaker turn, along with real-time CPU, GPU, and memory usage.
+
+ ---
+
+ ## Usage
+
+ Below is a summary of the main command-line options:
+
+ ```
+ usage: audio-scribe [options]
+
+ Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar.
+
+ optional arguments:
+   --audio PATH           Path to the audio file to transcribe.
+   --token TOKEN          HuggingFace API token. Overrides any saved token.
+   --output PATH          Path to the output directory for transcripts and temporary files.
+   --delete-token         Delete any stored Hugging Face token and exit.
+   --show-warnings        Enable user warnings (e.g., from pyannote.audio). Disabled by default.
+   --whisper-model MODEL  Specify the Whisper model to use (default: 'base.en').
+ ```
+
+ **Examples:**
+
+ - **Basic Transcription**
+   ```bash
+   audio-scribe --audio meeting.wav
+   ```
+
+ - **Specify a Different Whisper Model**
+   ```bash
+   audio-scribe --audio webinar.mp3 --whisper-model small
+   ```
+
+ - **Delete a Stored Token**
+   ```bash
+   audio-scribe --delete-token
+   ```
+
+ - **Show Internal Warnings**
+   ```bash
+   audio-scribe --audio session.wav --show-warnings
+   ```
+
+ - **Tab-Completion**
+   ```bash
+   audio-scribe
+   # When prompted for an audio file path, press Tab to autocomplete
+   ```
+
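+ You can also drive the same pipeline from Python. A minimal sketch, assuming the module layout shown above (`TranscriptionConfig` and `TranscriptionPipeline` live in `audio_scribe.cli`; they are internal classes rather than a documented public API, so treat this as illustrative):
+
+ ```python
+ from pathlib import Path
+ from audio_scribe.cli import TranscriptionConfig, TranscriptionPipeline
+
+ # Output directory and Whisper model; the device (CUDA or CPU) is auto-detected.
+ config = TranscriptionConfig(
+     output_directory=Path("transcripts/demo"),
+     whisper_model="base.en",
+ )
+
+ pipeline = TranscriptionPipeline(config)
+ if pipeline.initialize_models(auth_token="hf_..."):  # your Hugging Face token
+     # Writes transcript_<timestamp>.txt into the output directory.
+     pipeline.process_file(Path("meeting.wav"))
+ ```
+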
+ ---
+
+ ## Dependencies
+
+ **Core Libraries**
+ - **Python 3.8+**
+ - [PyTorch](https://pytorch.org/)
+ - [openai-whisper](https://github.com/openai/whisper)
+ - [pyannote.audio](https://github.com/pyannote/pyannote-audio)
+ - [pytorch-lightning](https://pypi.org/project/pytorch-lightning/)
+ - [cryptography](https://pypi.org/project/cryptography/)
+ - [keyring](https://pypi.org/project/keyring/)
+
+ **Optional for Extended Functionality**
+ - [alive-progress](https://pypi.org/project/alive-progress/) – Real-time progress bar
+ - [psutil](https://pypi.org/project/psutil/) – CPU/memory usage
+ - [GPUtil](https://pypi.org/project/GPUtil/) – GPU usage
+ - [pyreadline3](https://pypi.org/project/pyreadline3/) (for Windows tab-completion)
+
+ ### Sample `requirements.txt`
+
+ Below is a typical `requirements.txt` you can place in your repository:
+
+ ```
+ torch>=1.9
+ openai-whisper
+ pyannote.audio
+ pytorch-lightning
+ cryptography
+ keyring
+ alive-progress
+ psutil
+ GPUtil
+ pyreadline3; sys_platform == "win32"
+ ```
+
+ > Note:
+ > - `pyreadline3` is appended with a [PEP 508 marker](https://peps.python.org/pep-0508/) (`; sys_platform == "win32"`) so it only installs on Windows.
+ > - For GPU support, ensure you install a compatible PyTorch version with CUDA (see the check below).
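+
+ If you are unsure whether your PyTorch build can see a GPU, a quick check (Audio Scribe makes the same `torch.cuda.is_available()` call to pick its device):
+
+ ```python
+ import torch
+
+ # True only with a CUDA-enabled PyTorch build and a visible GPU;
+ # Audio Scribe selects "cuda" in that case and falls back to "cpu" otherwise.
+ print(torch.cuda.is_available())
+ ```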
+
+ ---
+
+ ## Contributing
+
+ We welcome contributions to **Audio Scribe**!
+
+ 1. **Fork** the repository and clone your fork.
+ 2. **Create a new branch** for your feature or bugfix.
+ 3. **Implement your changes**, ensuring code is well-documented and follows best practices.
+ 4. **Open a pull request**, detailing the changes you’ve made.
+
+ Please read any available guidelines or templates in our repository (such as `CONTRIBUTING.md` or `CODE_OF_CONDUCT.md`) before submitting.
+
+ ---
+
+ ## License
+
+ This project is licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
+
+ ```
+ Copyright 2025 Gurasis Osahan
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ```
+
+ ---
+
+ **Thank you for using Audio Scribe!**
+ For questions or feedback, please open an issue on [GitLab](https://gitlab.genomicops.cloud/genomicops/audio-scribe/-/issues) or contact the maintainers.
audio_scribe-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ audio_scribe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ audio_scribe/cli.py,sha256=dabh7fe9wAEORwVIBd-V8FAzHBBzbkjnfMSR-wOywO8,20286
+ audio_scribe-0.1.0.dist-info/METADATA,sha256=BBx81TI9DPCYgsdKyBn2PWEJ9pJsnhqTUb8ZsWoS1Ps,9503
+ audio_scribe-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ audio_scribe-0.1.0.dist-info/entry_points.txt,sha256=eaO9r_zAFnrWseKyJcBpGUHQq-P7NXBw5er8sZaPfFU,55
+ audio_scribe-0.1.0.dist-info/top_level.txt,sha256=L1mltKt-5HrbTXPpAXwht8SXQCgcCceoqpCq4OCZRsk,13
+ audio_scribe-0.1.0.dist-info/RECORD,,
audio_scribe-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (75.8.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
audio_scribe-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ audio-scribe = audio_scribe.cli:main
audio_scribe-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ audio_scribe