audio_scribe-0.1.0-py3-none-any.whl
audio_scribe/__init__.py
ADDED
File without changes
audio_scribe/cli.py
ADDED
@@ -0,0 +1,567 @@
#!/usr/bin/env python3
"""
Audio Scribe
------------
A command-line script for transcribing audio files with speaker diarization
using Whisper and Pyannote. The script uses a Hugging Face token for
downloading Pyannote speaker-diarization models and displays a progress bar
with resource usage while transcribing.
"""

print("Initializing environment... Please wait while we load dependencies and models.")
import sys
sys.stdout.flush()

import os
import glob
import wave
import json
import time
import logging
import warnings
import argparse
import threading
from pathlib import Path
from datetime import datetime
from typing import Optional, Dict
from dataclasses import dataclass
import base64

try:
    import readline  # Tab-completion for file paths (Unix/Linux; via pyreadline3 on Windows)
except ImportError:
    readline = None

from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

import torch
import whisper

import importlib.metadata
from importlib.metadata import PackageNotFoundError

from pyannote.audio import Pipeline

# Attempt to import optional packages for progress bar and resource monitoring
try:
    from alive_progress import alive_bar
    import psutil
    import GPUtil
    HAVE_PROGRESS_SUPPORT = True
except ImportError:
    HAVE_PROGRESS_SUPPORT = False

# Configure logging
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
    ],
)
logger = logging.getLogger(__name__)


# ---------- FILE PATH TAB-COMPLETION SNIPPET ----------
def complete_path(text, state):
    """
    Return the 'state'-th completion for 'text'.
    This function is used by 'readline' to enable file path autocompletion.
    """
    # If the user typed a glob pattern (with * or ?)
    if "*" in text or "?" in text:
        matches = glob.glob(text)
    else:
        # Split off the directory name and partial file/directory name
        directory, partial = os.path.split(text)
        if not directory:
            directory = "."
        try:
            # List everything in 'directory' that starts with 'partial'
            entries = os.listdir(directory)
        except OSError:
            # If the directory doesn't exist or we lack permission, no matches
            entries = []

        matches = []
        for entry in entries:
            if entry.startswith(partial):
                full_path = os.path.join(directory, entry)
                # If it's a directory, add a trailing separator to indicate that
                if os.path.isdir(full_path) and not full_path.endswith(os.path.sep):
                    full_path += os.path.sep
                matches.append(full_path)

    # Sort matches to have a consistent order
    matches.sort()

    # If 'state' is beyond the last match, return None
    return matches[state] if state < len(matches) else None


@dataclass
class TranscriptionConfig:
    """
    Configuration settings for the transcription pipeline.
    """
    output_directory: Path
    whisper_model: str = "base.en"
    diarization_model: str = "pyannote/speaker-diarization-3.1"
    temp_directory: Optional[Path] = None
    device: Optional[str] = None

    def __post_init__(self):
        # Use CUDA if available, else fall back to CPU
        self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
        # Default temp directory inside the output directory
        self.temp_directory = self.temp_directory or (self.output_directory / "temp")
        # Ensure directories exist
        self.temp_directory.mkdir(parents=True, exist_ok=True)
        self.output_directory.mkdir(parents=True, exist_ok=True)


class TokenManager:
    """
    Handles secure storage and retrieval of the Hugging Face authentication token.
    """
    def __init__(self):
        # Store config in ~/.pyannote/config.json
        self.config_dir = Path.home() / ".pyannote"
        self.config_file = self.config_dir / "config.json"
        self._initialize_config()

    def _initialize_config(self) -> None:
        """
        Initialize the configuration directory and file with secure permissions.
        """
        self.config_dir.mkdir(exist_ok=True)
        if not self.config_file.exists():
            self._save_config({})

        # Set secure file and directory permissions on POSIX systems
        if os.name == "posix":
            os.chmod(self.config_dir, 0o700)
            os.chmod(self.config_file, 0o600)

    def _get_encryption_key(self) -> bytes:
        """
        Derive an encryption key from system-specific data (the user's home
        path), binding the stored token to the local account.
        """
        salt = b"pyannote-audio-salt"
        kdf = PBKDF2HMAC(
            algorithm=hashes.SHA256(),
            length=32,
            salt=salt,
            iterations=100000,
        )
        key = kdf.derive(str(Path.home()).encode())
        return base64.urlsafe_b64encode(key)

    def _save_config(self, config: dict) -> None:
        """
        Save configuration to file.
        """
        with open(self.config_file, "w", encoding="utf-8") as f:
            json.dump(config, f)

    def _load_config(self) -> dict:
        """
        Load configuration from file.
        """
        try:
            with open(self.config_file, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return {}

    def store_token(self, token: str) -> bool:
        """
        Securely store the authentication token.
        """
        try:
            fernet = Fernet(self._get_encryption_key())
            encrypted_token = fernet.encrypt(token.encode())

            config = self._load_config()
            config["token"] = encrypted_token.decode()

            self._save_config(config)
            return True
        except Exception as e:
            logger.error(f"Failed to store token: {e}")
            return False

    def retrieve_token(self) -> Optional[str]:
        """
        Retrieve the stored authentication token.
        """
        try:
            config = self._load_config()
            if "token" in config:
                fernet = Fernet(self._get_encryption_key())
                return fernet.decrypt(config["token"].encode()).decode()
        except Exception as e:
            logger.error(f"Failed to retrieve token: {e}")
        return None

    def delete_token(self) -> bool:
        """
        Delete the stored authentication token.
        """
        try:
            config = self._load_config()
            if "token" in config:
                del config["token"]
                self._save_config(config)
            return True
        except Exception as e:
            logger.error(f"Failed to delete token: {e}")
            return False


class DependencyManager:
    """
    Manages and verifies system dependencies using importlib.metadata.
    """
    REQUIRED_PACKAGES = {
        "torch": None,
        "pyannote.audio": None,
        "openai-whisper": None,
        "pytorch-lightning": None,
        "keyring": None,
    }

    @classmethod
    def verify_dependencies(cls) -> bool:
        """
        Verify that all required dependencies are installed, with correct versions
        where specified. Returns True if all are installed and correct, False otherwise.
        """
        missing = []
        outdated = []

        for package, required_version in cls.REQUIRED_PACKAGES.items():
            try:
                installed_version = importlib.metadata.version(package)
                if required_version and installed_version != required_version:
                    outdated.append(
                        f"{package} (installed: {installed_version}, required: {required_version})"
                    )
            except PackageNotFoundError:
                missing.append(package)

        if missing or outdated:
            if missing:
                logger.error("Missing packages: %s", ", ".join(missing))
            if outdated:
                logger.error("Outdated packages: %s", ", ".join(outdated))
            logger.info(
                "Install required packages: pip install %s",
                " ".join(
                    f"{pkg}=={ver}" if ver else pkg
                    for pkg, ver in cls.REQUIRED_PACKAGES.items()
                ),
            )
            return False
        return True


class AudioProcessor:
    """
    Handles audio file processing and segmentation using the `wave` module.
    """
    def __init__(self, config: TranscriptionConfig):
        self.config = config

    def load_audio_segment(
        self,
        audio_path: Path,
        start_time: float,
        end_time: float,
        output_path: Path,
    ) -> bool:
        """
        Extract and save the audio segment from `start_time` to `end_time`.
        """
        try:
            with wave.open(str(audio_path), "rb") as infile:
                params = infile.getparams()
                frame_rate = params.framerate
                start_frame = int(start_time * frame_rate)
                end_frame = min(int(end_time * frame_rate), infile.getnframes())

                infile.setpos(start_frame)
                frames = infile.readframes(end_frame - start_frame)

                with wave.open(str(output_path), "wb") as outfile:
                    outfile.setparams(params)
                    outfile.writeframes(frames)
            return True
        except Exception as e:
            logger.error(f"Failed to process audio segment: {e}")
            return False


class TranscriptionPipeline:
    """
    Main pipeline for audio transcription (Whisper) and speaker diarization (Pyannote).
    """
    def __init__(self, config: TranscriptionConfig):
        self.config = config
        self.diarization_pipeline = None
        self.whisper_model = None
        self.token_manager = TokenManager()
        self._running = False  # used by the resource-monitor thread

    def initialize_models(self, auth_token: str) -> bool:
        """
        Initialize the Pyannote diarization pipeline and the Whisper model.
        """
        try:
            # Load Whisper model (set download root to avoid clutter in the home directory)
            self.whisper_model = whisper.load_model(
                self.config.whisper_model,
                device=self.config.device,
                download_root=str(self.config.output_directory / "models"),
            )

            # Load Pyannote diarization pipeline
            self.diarization_pipeline = Pipeline.from_pretrained(
                self.config.diarization_model, use_auth_token=auth_token
            )
            self.diarization_pipeline.to(torch.device(self.config.device))

            if self.config.device == "cpu":
                warnings.warn("Running on CPU. GPU is recommended for better performance.")

            return True
        except Exception as e:
            logger.error(f"Model initialization failed: {e}")
            logger.error("Please ensure you have accepted the model conditions at:")
            logger.error(" 1. https://huggingface.co/pyannote/segmentation-3.0")
            logger.error(" 2. https://huggingface.co/pyannote/speaker-diarization-3.1")
            return False

    def _update_resources(self, bar):
        """
        Continuously update the progress bar text with CPU/MEM/GPU usage
        until self._running is False.
        """
        while self._running:
            try:
                time.sleep(0.5)

                cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
                memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0

                if HAVE_PROGRESS_SUPPORT and GPUtil.getGPUs():
                    gpus = GPUtil.getGPUs()
                    gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
                    gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
                    gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
                else:
                    gpu_usage_text = "N/A"

                resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
                bar.text(resource_text)
            except Exception as e:
                logger.error(f"Resource monitoring error: {e}")

    def process_file(self, audio_path: Path) -> bool:
        """
        Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback.
        """
        try:
            logger.info("Starting audio processing...")
            diarization = self.diarization_pipeline(str(audio_path))
            segments = list(diarization.itertracks(yield_label=True))
            total_segments = len(segments)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
            audio_processor = AudioProcessor(self.config)

            if not HAVE_PROGRESS_SUPPORT:
                # No alive_progress, psutil, or GPUtil installed
                logger.info("Processing audio without progress bar (missing optional packages).")
                with output_file.open("w", encoding="utf-8") as f:
                    for turn, _, speaker in segments:
                        segment_path = (
                            self.config.temp_directory
                            / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
                        )
                        if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
                            transcription = self.whisper_model.transcribe(str(segment_path))["text"]
                            segment_path.unlink(missing_ok=True)

                            line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
                            f.write(line)
                            logger.info(line.strip())
            else:
                # Use a progress bar to track segment transcription
                self._running = True
                with output_file.open("w", encoding="utf-8") as f, alive_bar(
                    total_segments,
                    title="Transcribing Audio",
                    spinner="pulse",
                    theme="classic",
                    stats=False,
                    elapsed=True,
                    monitor=True,
                ) as bar:

                    # Start a background thread for resource monitoring
                    resource_thread = threading.Thread(
                        target=self._update_resources, args=(bar,), daemon=True
                    )
                    resource_thread.start()

                    try:
                        for turn, _, speaker in segments:
                            segment_path = (
                                self.config.temp_directory
                                / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
                            )
                            if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
                                transcription = self.whisper_model.transcribe(str(segment_path))["text"]
                                segment_path.unlink(missing_ok=True)

                                line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
                                f.write(line)
                                logger.info(line.strip())

                            # Update the progress bar
                            bar()
                    finally:
                        # Stop resource monitoring even if a segment fails
                        self._running = False
                        resource_thread.join()

            logger.info(f"Transcription completed. Output saved to: {output_file}")
            return True

        except Exception as e:
            logger.error(f"Processing failed: {e}")
            return False


def get_token(token_manager: TokenManager) -> Optional[str]:
    """
    Get the authentication token from storage or user input.
    """
    stored_token = token_manager.retrieve_token()
    if stored_token:
        choice = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
        if choice == "y":
            return stored_token

    print("\nA Hugging Face token is required for speaker diarization.")
    print("Get your token at: https://huggingface.co/settings/tokens")
    print("\nEnsure you have accepted:")
    print(" 1. pyannote/segmentation-3.0 conditions")
    print(" 2. pyannote/speaker-diarization-3.1 conditions")

    token = input("\nEnter Hugging Face token: ").strip()
    if token:
        choice = input("Save token for future use? (y/n): ").lower().strip()
        if choice == "y":
            if token_manager.store_token(token):
                print("Token saved successfully.")
            else:
                print("Failed to save token. It will be used for this session only.")
    return token if token else None


def main():
    parser = argparse.ArgumentParser(
        description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
    )
    parser.add_argument(
        "--audio",
        type=Path,
        help="Path to the audio file to transcribe."
    )
    parser.add_argument(
        "--token",
        help="HuggingFace API token. Overrides any saved token."
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Path to the output directory for transcripts and temporary files.",
    )
    parser.add_argument(
        "--delete-token",
        action="store_true",
        help="Delete any stored Hugging Face token and exit.",
    )
    parser.add_argument(
        "--show-warnings",
        action="store_true",
        help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
    )
    parser.add_argument(
        "--whisper-model",
        default="base.en",
        help="Specify the Whisper model to use (default: 'base.en').",
    )
    args = parser.parse_args()

    # Manage user warnings
    if not args.show_warnings:
        warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
        warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
    else:
        warnings.resetwarnings()

    # Check dependencies
    if not DependencyManager.verify_dependencies():
        sys.exit(1)

    # Initialize tab-completion for file paths (Unix-like systems, or Windows with pyreadline3)
    if readline is not None:
        readline.set_completer_delims(" \t\n;")
        readline.set_completer(complete_path)
        readline.parse_and_bind("tab: complete")

    # Initialize the token manager
    token_manager = TokenManager()

    # If the user wants to delete the stored token, do so and exit
    if args.delete_token:
        success = token_manager.delete_token()
        sys.exit(0 if success else 1)

    # Prepare configuration
    output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
    config = TranscriptionConfig(
        output_directory=output_dir,
        whisper_model=args.whisper_model,
    )

    # Initialize pipeline
    pipeline = TranscriptionPipeline(config)
    hf_token = args.token or get_token(token_manager)
    if not hf_token:
        logger.error("No Hugging Face token provided. Exiting.")
        sys.exit(1)

    # Initialize models
    if not pipeline.initialize_models(hf_token):
        logger.error("Failed to initialize pipeline. Exiting.")
        sys.exit(1)

    # Prompt the user for an audio file path if one was not passed in
    audio_path = args.audio
    while not audio_path or not audio_path.exists():
        audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
        audio_path = Path(audio_path_str)
        if not audio_path.exists():
            print(f"File '{audio_path}' not found. Please try again.")

    print("Audio file path accepted. Preparing to process the audio...")
    sys.stdout.flush()

    # Process the audio file
    if not pipeline.process_file(audio_path):
        sys.exit(1)


if __name__ == "__main__":
    main()
audio_scribe-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,273 @@
Metadata-Version: 2.2
Name: audio_scribe
Version: 0.1.0
Summary: A command-line tool for audio transcription with Whisper and Pyannote.
Home-page: https://gitlab.genomicops.cloud/genomicops/audio-scribe
Author: Gurasis Osahan
Author-email: contact@genomicops.com
License: Apache-2.0
Project-URL: Source, https://gitlab.genomicops.cloud/genomicops/audio-scribe
Project-URL: Tracker, https://gitlab.genomicops.cloud/genomicops/audio-scribe/-/issues
Keywords: whisper pyannote transcription audio diarization
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: Topic :: Multimedia :: Sound/Audio
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Operating System :: OS Independent
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: torch
Requires-Dist: openai-whisper
Requires-Dist: pyannote.audio
Requires-Dist: pytorch-lightning
Requires-Dist: keyring
Requires-Dist: cryptography
Requires-Dist: alive-progress
Requires-Dist: psutil
Requires-Dist: GPUtil
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: keywords
Dynamic: license
Dynamic: project-url
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# Audio Scribe

**A Command-Line Tool for Audio Transcription and Speaker Diarization Using OpenAI Whisper and Pyannote**

[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)

## Overview

**Audio Scribe** is a command-line tool that transcribes audio files with speaker diarization. It uses [OpenAI Whisper](https://github.com/openai/whisper) for transcription and [Pyannote Audio](https://github.com/pyannote/pyannote-audio) for speaker diarization, converting audio into segmented transcripts that identify each speaker turn (a minimal sketch follows the feature list below). Key features include:

- **Progress Bar & Resource Monitoring**: See real-time CPU, memory, and GPU usage alongside a live progress bar.
- **Speaker Diarization**: Automatically separates speaker turns using Pyannote's state-of-the-art models.
- **Tab-Completion for File Paths**: Easily navigate your file system when prompted for the audio path.
- **Secure Token Storage**: Encrypts and stores your Hugging Face token for gated model downloads.
- **Customizable Whisper Models**: Defaults to `base.en`; specify `tiny`, `small`, `medium`, `large`, etc.
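
Under the hood, the tool runs Pyannote first to find speaker turns, then feeds the audio to Whisper. Below is a minimal sketch of those two building blocks (the file name and token are placeholders; the CLI additionally slices the WAV so each turn is transcribed separately):

```python
import whisper
from pyannote.audio import Pipeline

HF_TOKEN = "hf_..."  # placeholder: your Hugging Face token

# Load the same defaults the CLI uses.
model = whisper.load_model("base.en")
diarizer = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN
)

# Transcribe the full recording once...
text = model.transcribe("meeting.wav")["text"]
print(text.strip())

# ...and list the diarized speaker turns.
for turn, _, speaker in diarizer("meeting.wav").itertracks(yield_label=True):
    print(f"[{turn.start:.2f}s - {turn.end:.2f}s] {speaker}")
```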

This repository is licensed under the [Apache License 2.0](#license).

---

## Table of Contents

- [Audio Scribe](#audio-scribe)
  - [Overview](#overview)
  - [Table of Contents](#table-of-contents)
  - [Features](#features)
  - [Installation](#installation)
    - [Installing from PyPI](#installing-from-pypi)
    - [Installing from GitLab](#installing-from-gitlab)
  - [Quick Start](#quick-start)
  - [Usage](#usage)
  - [Dependencies](#dependencies)
    - [Sample `requirements.txt`](#sample-requirementstxt)
  - [Contributing](#contributing)
  - [License](#license)

---

## Features

- **Whisper Transcription**
  Utilizes [OpenAI Whisper](https://github.com/openai/whisper) to convert speech to text in multiple languages.
- **Pyannote Speaker Diarization**
  Identifies different speakers and segments the transcript accordingly.
- **Progress Bar & Resource Usage**
  Displays a live progress bar with CPU, memory, and GPU stats through [alive-progress](https://github.com/rsalmei/alive-progress), [psutil](https://pypi.org/project/psutil/), and [GPUtil](https://pypi.org/project/GPUtil/).
- **Tab-Completion**
  Press **Tab** to autocomplete file paths on Unix-like systems (and on Windows with [pyreadline3](https://pypi.org/project/pyreadline3/)).
- **Secure Token Storage**
  Saves your Hugging Face token, encrypted via [cryptography](https://pypi.org/project/cryptography/), for gated model downloads (e.g., `pyannote/speaker-diarization-3.1`); see the sketch after this list.
- **Configurable Models**
  Defaults to `base.en`, but you can specify any other Whisper model with `--whisper-model`.
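
The token storage sketched below mirrors the scheme in `cli.py`: a Fernet key is derived with PBKDF2 from the user's home path, so the encrypted token in `~/.pyannote/config.json` is bound to the local account rather than protected by a user-chosen secret:

```python
import base64
from pathlib import Path
from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

def encryption_key() -> bytes:
    # Derive a stable key from system-specific data (the home path),
    # the same derivation the CLI's TokenManager uses.
    kdf = PBKDF2HMAC(
        algorithm=hashes.SHA256(),
        length=32,
        salt=b"pyannote-audio-salt",
        iterations=100000,
    )
    return base64.urlsafe_b64encode(kdf.derive(str(Path.home()).encode()))

fernet = Fernet(encryption_key())
encrypted = fernet.encrypt(b"hf_example_token")  # stored in ~/.pyannote/config.json
print(fernet.decrypt(encrypted).decode())        # round-trips the token
```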

---

## Installation

### Installing from PyPI

**Audio Scribe** is available on PyPI. You can install it with:

```bash
pip install audio-scribe
```

After installation, the **`audio-scribe`** command should be available in your terminal (depending on how your PATH is configured). You can also run it as a Python module:

```bash
python -m audio_scribe --audio path/to/yourfile.wav
```

### Installing from GitLab

To install the latest development version directly from GitLab:

```bash
git clone https://gitlab.genomicops.cloud/genomicops/audio-scribe.git
cd audio-scribe
pip install .
```

This approach is particularly useful if you want the newest changes or plan to contribute.

---

## Quick Start

1. **Obtain a Hugging Face Token**
   - Create a token at [Hugging Face Settings](https://huggingface.co/settings/tokens).
   - Accept the model conditions for `pyannote/segmentation-3.0` and `pyannote/speaker-diarization-3.1`.

2. **Run the Command-Line Tool**
   ```bash
   audio-scribe --audio path/to/audio.wav
   ```
   > On the first run, you’ll be prompted for your Hugging Face token if you haven’t stored one yet.

3. **Watch the Progress Bar**
   - The tool displays a progress bar for each diarized speaker turn, along with real-time CPU, GPU, and memory usage.

---

## Usage

Below is a summary of the main command-line options:

```
usage: audio-scribe [options]

Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar.

optional arguments:
  --audio PATH           Path to the audio file to transcribe.
  --token TOKEN          HuggingFace API token. Overrides any saved token.
  --output PATH          Path to the output directory for transcripts and temporary files.
  --delete-token         Delete any stored Hugging Face token and exit.
  --show-warnings        Enable user warnings (e.g., from pyannote.audio). Disabled by default.
  --whisper-model MODEL  Specify the Whisper model to use (default: 'base.en').
```

**Examples:**

- **Basic Transcription**
  ```bash
  audio-scribe --audio meeting.wav
  ```

- **Specify a Different Whisper Model**
  ```bash
  audio-scribe --audio webinar.wav --whisper-model small
  ```

- **Delete a Stored Token**
  ```bash
  audio-scribe --delete-token
  ```

- **Show Internal Warnings**
  ```bash
  audio-scribe --audio session.wav --show-warnings
  ```

- **Tab-Completion**
  ```bash
  audio-scribe
  # When prompted for an audio file path, press Tab to autocomplete
  ```

---

## Dependencies

**Core Libraries**
- **Python 3.8+**
- [PyTorch](https://pytorch.org/)
- [openai-whisper](https://github.com/openai/whisper)
- [pyannote.audio](https://github.com/pyannote/pyannote-audio)
- [pytorch-lightning](https://pypi.org/project/pytorch-lightning/)
- [cryptography](https://pypi.org/project/cryptography/)
- [keyring](https://pypi.org/project/keyring/)

**Optional for Extended Functionality**
- [alive-progress](https://pypi.org/project/alive-progress/) – Real-time progress bar
- [psutil](https://pypi.org/project/psutil/) – CPU/memory usage
- [GPUtil](https://pypi.org/project/GPUtil/) – GPU usage
- [pyreadline3](https://pypi.org/project/pyreadline3/) – Tab-completion on Windows
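
These are soft dependencies: the CLI probes for them at import time and falls back to plain log output when they are absent. The guard is the same pattern used in `cli.py`:

```python
try:
    from alive_progress import alive_bar
    import psutil
    import GPUtil
    HAVE_PROGRESS_SUPPORT = True
except ImportError:
    # Transcription still works; only the progress bar and
    # resource statistics are disabled.
    HAVE_PROGRESS_SUPPORT = False
```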

### Sample `requirements.txt`

Below is a typical `requirements.txt` you can place in your repository:

```
torch>=1.9
openai-whisper
pyannote.audio
pytorch-lightning
cryptography
keyring
alive-progress
psutil
GPUtil
pyreadline3; sys_platform == "win32"
```

> Note:
> - `pyreadline3` carries a [PEP 508 marker](https://peps.python.org/pep-0508/) (`; sys_platform == "win32"`) so it only installs on Windows.
> - For GPU support, ensure you install a PyTorch build with a compatible CUDA version.

---

## Contributing

We welcome contributions to **Audio Scribe**!

1. **Fork** the repository and clone your fork.
2. **Create a new branch** for your feature or bugfix.
3. **Implement your changes**, ensuring code is well documented and follows best practices.
4. **Open a merge request**, detailing the changes you’ve made.

Please read any available guidelines or templates in our repository (such as `CONTRIBUTING.md` or `CODE_OF_CONDUCT.md`) before submitting.

---

## License

This project is licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).

```
Copyright 2025 Gurasis Osahan

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

---

**Thank you for using Audio Scribe!**
For questions or feedback, please open an issue on [GitLab](https://gitlab.genomicops.cloud/genomicops/audio-scribe/-/issues) or contact the maintainers.
audio_scribe-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
audio_scribe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
audio_scribe/cli.py,sha256=dabh7fe9wAEORwVIBd-V8FAzHBBzbkjnfMSR-wOywO8,20286
audio_scribe-0.1.0.dist-info/METADATA,sha256=BBx81TI9DPCYgsdKyBn2PWEJ9pJsnhqTUb8ZsWoS1Ps,9503
audio_scribe-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
audio_scribe-0.1.0.dist-info/entry_points.txt,sha256=eaO9r_zAFnrWseKyJcBpGUHQq-P7NXBw5er8sZaPfFU,55
audio_scribe-0.1.0.dist-info/top_level.txt,sha256=L1mltKt-5HrbTXPpAXwht8SXQCgcCceoqpCq4OCZRsk,13
audio_scribe-0.1.0.dist-info/RECORD,,
audio_scribe-0.1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
audio_scribe