audio-scribe 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- audio_scribe/__init__.py +24 -0
- audio_scribe/auth.py +119 -0
- audio_scribe/config.py +24 -0
- audio_scribe/models.py +196 -0
- audio_scribe/transcriber.py +131 -0
- audio_scribe/utils.py +93 -0
- {audio_scribe-0.1.0.dist-info → audio_scribe-0.1.2.dist-info}/METADATA +9 -2
- audio_scribe-0.1.2.dist-info/RECORD +11 -0
- audio_scribe-0.1.2.dist-info/entry_points.txt +2 -0
- audio_scribe/cli.py +0 -567
- audio_scribe-0.1.0.dist-info/RECORD +0 -7
- audio_scribe-0.1.0.dist-info/entry_points.txt +0 -2
- {audio_scribe-0.1.0.dist-info → audio_scribe-0.1.2.dist-info}/WHEEL +0 -0
- {audio_scribe-0.1.0.dist-info → audio_scribe-0.1.2.dist-info}/top_level.txt +0 -0
audio_scribe/__init__.py
CHANGED
@@ -0,0 +1,24 @@
+"""
+Audio Scribe
+-----------------
+A Python package for transcribing audio files with speaker diarization
+using Whisper and Pyannote.
+"""
+
+from .transcriber import main
+from .models import TranscriptionPipeline, AudioProcessor
+from .config import TranscriptionConfig
+from .auth import TokenManager
+from .utils import DependencyManager, complete_path
+
+__version__ = "0.1.2"
+
+__all__ = [
+    "main",
+    "TranscriptionPipeline",
+    "TranscriptionConfig",
+    "AudioProcessor",
+    "TokenManager",
+    "DependencyManager",
+    "complete_path",
+]
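With 0.1.2 the package is split into modules and re-exports its public names here, so downstream code can import everything from the top level. A minimal sketch of consuming this API (names taken from __all__ above; the 0.1.0 wheel shipped all of this in a single audio_scribe/cli.py instead):

from pathlib import Path
from audio_scribe import TranscriptionConfig, TranscriptionPipeline

config = TranscriptionConfig(output_directory=Path("transcripts"))
pipeline = TranscriptionPipeline(config)
# pipeline.initialize_models(...) and pipeline.process_file(...) follow,
# as defined in models.py below.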
audio_scribe/auth.py
ADDED
@@ -0,0 +1,119 @@
+"""Authentication and token management for Audio Scribe."""
+
+import os
+import json
+import base64
+import logging
+from pathlib import Path
+from typing import Optional
+from cryptography.fernet import Fernet
+from cryptography.hazmat.primitives import hashes
+from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
+
+logger = logging.getLogger(__name__)
+
+class TokenManager:
+    """Handles secure storage and retrieval of the Hugging Face authentication token."""
+    def __init__(self):
+        # Store config in ~/.pyannote/config.json
+        self.config_dir = Path.home() / ".pyannote"
+        self.config_file = self.config_dir / "config.json"
+        self._initialize_config()
+
+    def _initialize_config(self) -> None:
+        """Initialize configuration directory and file with secure permissions."""
+        self.config_dir.mkdir(exist_ok=True)
+        if not self.config_file.exists():
+            self._save_config({})
+
+        # Set secure file and directory permissions on POSIX systems
+        if os.name == "posix":
+            os.chmod(self.config_dir, 0o700)
+            os.chmod(self.config_file, 0o600)
+
+    def _get_encryption_key(self) -> bytes:
+        """Generate an encryption key from system-specific data."""
+        salt = b"pyannote-audio-salt"
+        kdf = PBKDF2HMAC(
+            algorithm=hashes.SHA256(),
+            length=32,
+            salt=salt,
+            iterations=100000,
+        )
+        key = kdf.derive(str(Path.home()).encode())
+        return base64.urlsafe_b64encode(key)
+
+    def _save_config(self, config: dict) -> None:
+        """Securely save configuration to file."""
+        with open(self.config_file, "w", encoding="utf-8") as f:
+            json.dump(config, f)
+
+    def _load_config(self) -> dict:
+        """Load configuration from file."""
+        try:
+            with open(self.config_file, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except Exception:
+            return {}
+
+    def store_token(self, token: str) -> bool:
+        """Securely store authentication token."""
+        try:
+            fernet = Fernet(self._get_encryption_key())
+            encrypted_token = fernet.encrypt(token.encode())
+
+            config = self._load_config()
+            config["token"] = encrypted_token.decode()
+
+            self._save_config(config)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to store token: {e}")
+            return False
+
+    def retrieve_token(self) -> Optional[str]:
+        """Retrieve stored authentication token."""
+        try:
+            config = self._load_config()
+            if "token" in config:
+                fernet = Fernet(self._get_encryption_key())
+                return fernet.decrypt(config["token"].encode()).decode()
+        except Exception as e:
+            logger.error(f"Failed to retrieve token: {e}")
+        return None
+
+    def delete_token(self) -> bool:
+        """Delete stored authentication token."""
+        try:
+            config = self._load_config()
+            if "token" in config:
+                del config["token"]
+                self._save_config(config)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to delete token: {e}")
+            return False
+
+def get_token(token_manager: TokenManager) -> Optional[str]:
+    """Get authentication token from storage or user input."""
+    stored_token = token_manager.retrieve_token()
+    if stored_token:
+        choice = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
+        if choice == "y":
+            return stored_token
+
+    print("\nA HuggingFace token is required for speaker diarization.")
+    print("Get your token at: https://huggingface.co/settings/tokens")
+    print("\nEnsure you have accepted:")
+    print(" 1. pyannote/segmentation-3.0 conditions")
+    print(" 2. pyannote/speaker-diarization-3.1 conditions")
+
+    token = input("\nEnter HuggingFace token: ").strip()
+    if token:
+        choice = input("Save token for future use? (y/n): ").lower().strip()
+        if choice == "y":
+            if token_manager.store_token(token):
+                print("Token saved successfully.")
+            else:
+                print("Failed to save token. It will be used for this session only.")
+    return token if token else None
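In short, TokenManager derives a Fernet key from the user's home-directory path via PBKDF2-HMAC-SHA256 and stores the encrypted Hugging Face token in ~/.pyannote/config.json; get_token prefers the stored token and falls back to an interactive prompt. A minimal non-interactive sketch (the token value is a placeholder):

from audio_scribe.auth import TokenManager

tm = TokenManager()                         # creates ~/.pyannote/config.json (0600 on POSIX)
tm.store_token("hf_example")                # placeholder value; encrypted before being written
assert tm.retrieve_token() == "hf_example"  # decrypted with the same home-path-derived key
tm.delete_token()                           # drops the "token" entry and rewrites the file

Note that the key is derived from Path.home() alone, so any process running as the same user can reproduce it; this protects the token at rest from casual reads rather than from a determined local attacker.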
audio_scribe/config.py
ADDED
@@ -0,0 +1,24 @@
+"""Configuration management for Audio Scribe."""
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+import torch
+
+@dataclass
+class TranscriptionConfig:
+    """Configuration settings for the transcription pipeline."""
+    output_directory: Path
+    whisper_model: str = "base.en"
+    diarization_model: str = "pyannote/speaker-diarization-3.1"
+    temp_directory: Optional[Path] = None
+    device: Optional[str] = None
+
+    def __post_init__(self):
+        # Use CUDA if available, else fall back to CPU
+        self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
+        # Default temp directory inside the output directory
+        self.temp_directory = self.temp_directory or (self.output_directory / "temp")
+        # Ensure directories exist
+        self.temp_directory.mkdir(parents=True, exist_ok=True)
+        self.output_directory.mkdir(parents=True, exist_ok=True)
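Since __post_init__ resolves the device, defaults the temp directory, and creates both directories, constructing the dataclass is all the setup required. A short sketch:

from pathlib import Path
from audio_scribe.config import TranscriptionConfig

config = TranscriptionConfig(output_directory=Path("transcripts/demo"))
print(config.device)          # "cuda" when torch.cuda.is_available(), otherwise "cpu"
print(config.temp_directory)  # transcripts/demo/temp, created during construction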
audio_scribe/models.py
ADDED
@@ -0,0 +1,196 @@
+"""Model handling and audio processing for Audio Scribe."""
+
+import wave
+import torch
+import whisper
+import logging
+import warnings
+import threading
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+from pyannote.audio import Pipeline
+
+from .config import TranscriptionConfig
+from .auth import TokenManager
+
+logger = logging.getLogger(__name__)
+
+try:
+    from alive_progress import alive_bar
+    import psutil
+    import GPUtil
+    HAVE_PROGRESS_SUPPORT = True
+except ImportError:
+    HAVE_PROGRESS_SUPPORT = False
+
+
+class AudioProcessor:
+    """Handles audio file processing and segmentation."""
+
+    def __init__(self, config: TranscriptionConfig):
+        self.config = config
+
+    def load_audio_segment(
+        self,
+        audio_path: Path,
+        start_time: float,
+        end_time: float,
+        output_path: Path,
+    ) -> bool:
+        """Extract and save the audio segment from start_time to end_time."""
+        try:
+            with wave.open(str(audio_path), "rb") as infile:
+                params = infile.getparams()
+                frame_rate = params.framerate
+                start_frame = int(start_time * frame_rate)
+                end_frame = min(int(end_time * frame_rate), infile.getnframes())
+
+                infile.setpos(start_frame)
+                frames = infile.readframes(end_frame - start_frame)
+
+                with wave.open(str(output_path), "wb") as outfile:
+                    outfile.setparams(params)
+                    outfile.writeframes(frames)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to process audio segment: {e}")
+            return False
+
+
+class TranscriptionPipeline:
+    """Main pipeline for audio transcription and speaker diarization."""
+
+    def __init__(self, config: TranscriptionConfig):
+        self.config = config
+        self.diarization_pipeline = None
+        self.whisper_model = None
+        self.token_manager = TokenManager()
+        self._running = False  # used for resource monitor thread
+
+    def initialize_models(self, auth_token: str) -> bool:
+        """Initialize the Pyannote diarization pipeline and Whisper model."""
+        try:
+            # Load Whisper model
+            self.whisper_model = whisper.load_model(
+                self.config.whisper_model,
+                device=self.config.device,
+                download_root=str(self.config.output_directory / "models"),
+            )
+
+            # Load Pyannote diarization pipeline
+            self.diarization_pipeline = Pipeline.from_pretrained(
+                self.config.diarization_model,
+                use_auth_token=auth_token
+            )
+            self.diarization_pipeline.to(torch.device(self.config.device))
+
+            if self.config.device == "cpu":
+                warnings.warn("Running on CPU. GPU is recommended for better performance.")
+
+            return True
+        except Exception as e:
+            logger.error(f"Model initialization failed: {e}")
+            logger.error("Please ensure you have accepted the model conditions at:")
+            logger.error(" 1. https://huggingface.co/pyannote/segmentation-3.0")
+            logger.error(" 2. https://huggingface.co/pyannote/speaker-diarization-3.1")
+            return False
+
+    def _update_resources(self, bar):
+        """Update progress bar with resource usage information."""
+        while self._running:
+            try:
+                import time
+                time.sleep(0.5)
+
+                cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
+                memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0
+
+                if HAVE_PROGRESS_SUPPORT and GPUtil.getGPUs():
+                    gpus = GPUtil.getGPUs()
+                    gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
+                    gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
+                    gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
+                else:
+                    gpu_usage_text = "N/A"
+
+                resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
+                bar.text(resource_text)
+            except Exception as e:
+                logger.error(f"Resource monitoring error: {e}")
+
+    def process_file(self, audio_path: Path) -> bool:
+        """Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback."""
+        try:
+            logger.info("Starting audio processing...")
+            diarization = self.diarization_pipeline(str(audio_path))
+            segments = list(diarization.itertracks(yield_label=True))
+            total_segments = len(segments)
+
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
+            audio_processor = AudioProcessor(self.config)
+
+            if not HAVE_PROGRESS_SUPPORT:
+                # No alive_progress, psutil, or GPUtil installed
+                logger.info("Processing audio without progress bar (missing optional packages).")
+                with output_file.open("w", encoding="utf-8") as f:
+                    for turn, _, speaker in segments:
+                        segment_path = (
+                            self.config.temp_directory
+                            / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
+                        )
+                        if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
+                            transcription = self.whisper_model.transcribe(str(segment_path))["text"]
+                            segment_path.unlink(missing_ok=True)
+
+                            line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
+                            f.write(line)
+                            logger.info(line.strip())
+                return True
+            else:
+                # Use a progress bar to track segment transcription
+                from alive_progress import alive_bar
+                import threading
+
+                self._running = True
+                with output_file.open("w", encoding="utf-8") as f, alive_bar(
+                    total_segments,
+                    title="Transcribing Audio",
+                    spinner="pulse",
+                    theme="classic",
+                    stats=False,
+                    elapsed=True,
+                    monitor=True,
+                ) as bar:
+
+                    # Start a background thread for resource monitoring
+                    resource_thread = threading.Thread(target=self._update_resources, args=(bar,))
+                    resource_thread.start()
+
+                    for turn, _, speaker in segments:
+                        segment_path = (
+                            self.config.temp_directory
+                            / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
+                        )
+                        if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
+                            transcription = self.whisper_model.transcribe(str(segment_path))["text"]
+                            segment_path.unlink(missing_ok=True)
+
+                            line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
+                            f.write(line)
+                            logger.info(line.strip())
+
+                        # Update the progress bar
+                        bar()
+
+                # Stop resource monitoring
+                self._running = False
+                resource_thread.join()
+
+                logger.info(f"Transcription completed. Output saved to: {output_file}")
+                return True
+
+        except Exception as e:
+            logger.error(f"Processing failed: {e}")
+            return False
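End to end, TranscriptionPipeline diarizes the whole file with Pyannote, cuts each speaker turn into a temporary WAV via AudioProcessor, transcribes the chunk with Whisper, and appends one "[start - end] Speaker X: text" line to the transcript. A minimal programmatic sketch (the token string is a placeholder, and the input should be a WAV file, since AudioProcessor relies on the stdlib wave module):

from pathlib import Path
from audio_scribe.config import TranscriptionConfig
from audio_scribe.models import TranscriptionPipeline

config = TranscriptionConfig(output_directory=Path("transcripts/meeting"))
pipeline = TranscriptionPipeline(config)

if pipeline.initialize_models("hf_example_token"):  # placeholder Hugging Face token
    pipeline.process_file(Path("meeting.wav"))      # writes transcript_<timestamp>.txt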
audio_scribe/transcriber.py
ADDED
@@ -0,0 +1,131 @@
+"""
+Main entry point for Audio Scribe transcription tool.
+Handles CLI interface and orchestrates the transcription process.
+"""
+
+import sys
+import logging
+import warnings
+import argparse
+import readline
+from pathlib import Path
+from datetime import datetime
+
+from .config import TranscriptionConfig
+from .models import TranscriptionPipeline
+from .auth import TokenManager, get_token
+from .utils import DependencyManager, complete_path
+
+# Configure logging
+LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
+logging.basicConfig(
+    level=logging.INFO,
+    format=LOG_FORMAT,
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
+    ],
+)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Main entry point for the Audio Scribe CLI."""
+    print("Initializing environment... Please wait while we load dependencies and models.")
+    sys.stdout.flush()
+
+    parser = argparse.ArgumentParser(
+        description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
+    )
+    parser.add_argument(
+        "--audio",
+        type=Path,
+        help="Path to the audio file to transcribe."
+    )
+    parser.add_argument(
+        "--token",
+        help="HuggingFace API token. Overrides any saved token."
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        help="Path to the output directory for transcripts and temporary files.",
+    )
+    parser.add_argument(
+        "--delete-token",
+        action="store_true",
+        help="Delete any stored Hugging Face token and exit.",
+    )
+    parser.add_argument(
+        "--show-warnings",
+        action="store_true",
+        help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
+    )
+    parser.add_argument(
+        "--whisper-model",
+        default="base.en",
+        help="Specify the Whisper model to use (default: 'base.en').",
+    )
+    args = parser.parse_args()
+
+    # Manage user warnings
+    if not args.show_warnings:
+        warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
+        warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
+    else:
+        warnings.resetwarnings()
+
+    # Check dependencies
+    if not DependencyManager.verify_dependencies():
+        sys.exit(1)
+
+    # Initialize tab-completion for file paths
+    readline.set_completer_delims(' \t\n;')
+    readline.set_completer(complete_path)
+    readline.parse_and_bind("tab: complete")
+
+    # Initialize the token manager
+    token_manager = TokenManager()
+
+    # If user wants to delete the stored token, do so and exit
+    if args.delete_token:
+        success = token_manager.delete_token()
+        sys.exit(0 if success else 1)
+
+    # Prepare configuration
+    output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
+    config = TranscriptionConfig(
+        output_directory=output_dir,
+        whisper_model=args.whisper_model
+    )
+
+    # Initialize pipeline
+    pipeline = TranscriptionPipeline(config)
+    hf_token = args.token or get_token(token_manager)
+    if not hf_token:
+        logger.error("No Hugging Face token provided. Exiting.")
+        sys.exit(1)
+
+    # Initialize models
+    if not pipeline.initialize_models(hf_token):
+        logger.error("Failed to initialize pipeline. Exiting.")
+        sys.exit(1)
+
+    # Prompt user for audio file path if not passed in
+    audio_path = args.audio
+    while not audio_path or not audio_path.exists():
+        audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
+        audio_path = Path(audio_path_str)
+        if not audio_path.exists():
+            print(f"File '{audio_path}' not found. Please try again.")
+
+    print("Audio file path accepted. Preparing to process the audio...")
+    sys.stdout.flush()
+
+    # Process the audio file
+    if not pipeline.process_file(audio_path):
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
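Assuming the console script declared in entry_points.txt is named audio-scribe (its contents are not shown in this diff), typical invocations of this main() would look like:

audio-scribe --audio meeting.wav --output transcripts --whisper-model base.en
audio-scribe --delete-token     # remove the stored Hugging Face token and exit
audio-scribe --show-warnings    # re-enable the pyannote/whisper warnings filtered above

When --audio is omitted, main() falls back to the readline-backed interactive prompt with tab completion.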
audio_scribe/utils.py
ADDED
@@ -0,0 +1,93 @@
+"""Utility functions and classes for Audio Scribe."""
+
+import os
+import glob
+import logging
+import importlib.metadata
+from importlib.metadata import PackageNotFoundError
+
+logger = logging.getLogger(__name__)
+
+def complete_path(text, state):
+    """
+    Return the 'state'-th completion for 'text'.
+    This function will be used by 'readline' to enable file path autocompletion.
+    """
+    # If the user typed a glob pattern (with * or ?)
+    if '*' in text or '?' in text:
+        matches = sorted(glob.glob(text))
+    else:
+        # Split off the directory name and partial file/directory name
+        directory, partial = os.path.split(text)
+        if not directory:
+            directory = '.'
+        try:
+            # List everything in 'directory' that starts with 'partial'
+            entries = sorted(os.listdir(directory))
+        except OSError:
+            # If directory doesn't exist or we lack permission, no matches
+            entries = []
+
+        matches = []
+        for entry in entries:
+            if entry.startswith(partial):
+                if directory == '.':
+                    # Don't prefix current directory paths
+                    full_path = entry
+                else:
+                    # Keep the directory prefix for subdirectories
+                    full_path = os.path.join(directory, entry)
+
+                # If it's a directory, add a trailing slash to indicate that
+                if os.path.isdir(full_path) and not full_path.endswith(os.path.sep):
+                    full_path += os.path.sep
+                matches.append(full_path)
+
+    # If 'state' is beyond last match, return None
+    return matches[state] if state < len(matches) else None
+
+
+class DependencyManager:
+    """Manages and verifies system dependencies."""
+
+    REQUIRED_PACKAGES = {
+        "torch": None,
+        "pyannote.audio": None,
+        "openai-whisper": None,
+        "pytorch-lightning": None,
+        "keyring": None,
+    }
+
+    @classmethod
+    def verify_dependencies(cls) -> bool:
+        """
+        Verify all required dependencies are installed with correct versions.
+        Returns True if all are installed and correct, False otherwise.
+        """
+        missing = []
+        outdated = []
+
+        for package, required_version in cls.REQUIRED_PACKAGES.items():
+            try:
+                installed_version = importlib.metadata.version(package)
+                if required_version and installed_version != required_version:
+                    outdated.append(
+                        f"{package} (installed: {installed_version}, required: {required_version})"
+                    )
+            except PackageNotFoundError:
+                missing.append(package)
+
+        if missing or outdated:
+            if missing:
+                logger.error("Missing packages: %s", ", ".join(missing))
+            if outdated:
+                logger.error("Outdated packages: %s", ", ".join(outdated))
+            logger.info(
+                "Install required packages: pip install %s",
+                " ".join(
+                    f"{pkg}=={ver}" if ver else pkg
+                    for pkg, ver in cls.REQUIRED_PACKAGES.items()
+                ),
+            )
+            return False
+        return True
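complete_path follows the standard readline completer protocol: readline calls it repeatedly with state = 0, 1, 2, ... until it returns None. A small sketch of wiring it up (mirroring transcriber.main above) and of calling it directly; the "./au" prefix is hypothetical:

import readline
from audio_scribe.utils import complete_path, DependencyManager

readline.set_completer_delims(' \t\n;')
readline.set_completer(complete_path)
readline.parse_and_bind("tab: complete")

# Direct use: enumerate completions for a prefix until None is returned.
state = 0
while (match := complete_path("./au", state)) is not None:
    print(match)
    state += 1

DependencyManager.verify_dependencies()  # logs missing/outdated packages and returns a bool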
{audio_scribe-0.1.0.dist-info → audio_scribe-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: audio_scribe
-Version: 0.1.0
+Version: 0.1.2
 Summary: A command-line tool for audio transcription with Whisper and Pyannote.
 Home-page: https://gitlab.genomicops.cloud/genomicops/audio-scribe
 Author: Gurasis Osahan
@@ -46,9 +46,16 @@ Dynamic: summary
 
 # Audio Scribe
 
-**A Command-Line Tool for Audio Transcription
+**A Command-Line Tool for Audio Transcription and Speaker Diarization Using OpenAI Whisper and Pyannote**
 
 [](LICENSE)
+
+[](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main)
+[](https://badge.fury.io/py/audio-scribe)
+[](https://pypi.org/project/audio-scribe/)
+[](https://pypi.org/project/audio-scribe/)
+[](https://pypi.org/project/audio-scribe/)
+<!-- [](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main) -->
 
 ## Overview
 
audio_scribe-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+audio_scribe/__init__.py,sha256=FH63cMQOjdp5Kp9JTAl-uaQbtEVh-lKneUJRKl5hEXc,543
+audio_scribe/auth.py,sha256=YD9ElcMtFIMMYW26XZqMCzpYjOsmXkS5-TC2hTmCOEw,4351
+audio_scribe/config.py,sha256=umD9-QBfi4e5RZG33lCOpdLBBbriG0LFyyDwvgHlSlQ,935
+audio_scribe/models.py,sha256=Z5eJJf7rxq6k60fJMfVW98jwB9MDT7JxKBVvFmXZN-Q,7971
+audio_scribe/transcriber.py,sha256=xMWt50QmNXeLhpTZhJlLtmJSzeOcRSWKtYRMJghjUnI,4026
+audio_scribe/utils.py,sha256=iKt0ZZKF_Jmo7WNKJxldOHlwo__afEWuYWi_ckNd9gU,3278
+audio_scribe-0.1.2.dist-info/METADATA,sha256=sKvgdqqlHZ9fqlwJu-sXCDdpqAFKVBsis8BHq0Q6yyU,10355
+audio_scribe-0.1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+audio_scribe-0.1.2.dist-info/entry_points.txt,sha256=Bj7Co8Er22Ux59Vs2_S63ds2bnwDURvhHYNXVviZdPM,63
+audio_scribe-0.1.2.dist-info/top_level.txt,sha256=L1mltKt-5HrbTXPpAXwht8SXQCgcCceoqpCq4OCZRsk,13
+audio_scribe-0.1.2.dist-info/RECORD,,
audio_scribe/cli.py
DELETED
@@ -1,567 +0,0 @@
-#!/usr/bin/env python3
-"""
-Audio Scribe
------------------
-A command-line script for transcribing audio files with speaker diarization
-using Whisper and Pyannote. The script uses a Hugging Face token for
-downloading Pyannote speaker-diarization models and displays a progress bar
-with resource usage while transcribing.
-"""
-
-print("Initializing environment... Please wait while we load dependencies and models.")
-import sys
-sys.stdout.flush()
-
-import os
-import glob
-import wave
-import json
-import logging
-import warnings
-import argparse
-import readline  # <--- For enabling tab-completion on Unix/Linux
-from pathlib import Path
-from datetime import datetime
-from typing import Optional, Dict
-from dataclasses import dataclass
-import base64
-
-from cryptography.fernet import Fernet
-from cryptography.hazmat.primitives import hashes
-from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
-
-import torch
-import whisper
-
-import importlib.metadata
-from importlib.metadata import PackageNotFoundError
-
-from pyannote.audio import Pipeline
-
-# Attempt to import optional packages for progress bar and resource monitoring
-try:
-    from alive_progress import alive_bar
-    import psutil
-    import GPUtil
-    HAVE_PROGRESS_SUPPORT = True
-except ImportError:
-    HAVE_PROGRESS_SUPPORT = False
-
-# Configure logging
-LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
-logging.basicConfig(
-    level=logging.INFO,
-    format=LOG_FORMAT,
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
-    ],
-)
-logger = logging.getLogger(__name__)
-
-# ---------- FILE PATH TAB-COMPLETION SNIPPET ----------
-def complete_path(text, state):
-    """
-    Return the 'state'-th completion for 'text'.
-    This function will be used by 'readline' to enable file path autocompletion.
-    """
-    # If the user typed a glob pattern (with * or ?)
-    if '*' in text or '?' in text:
-        matches = glob.glob(text)
-    else:
-        # Split off the directory name and partial file/directory name
-        directory, partial = os.path.split(text)
-        if not directory:
-            directory = '.'
-        try:
-            # List everything in 'directory' that starts with 'partial'
-            entries = os.listdir(directory)
-        except OSError:
-            # If directory doesn't exist or we lack permission, no matches
-            entries = []
-
-        matches = []
-        for entry in entries:
-            if entry.startswith(partial):
-                full_path = os.path.join(directory, entry)
-                # If it's a directory, add a trailing slash to indicate that
-                if os.path.isdir(full_path) and not full_path.endswith(os.path.sep):
-                    full_path += os.path.sep
-                matches.append(full_path)
-
-    # Sort matches to have a consistent order
-    matches.sort()
-
-    # If 'state' is beyond last match, return None
-    return matches[state] if state < len(matches) else None
-
-
-@dataclass
-class TranscriptionConfig:
-    """
-    Configuration settings for the transcription pipeline.
-    """
-    output_directory: Path
-    whisper_model: str = "base.en"
-    diarization_model: str = "pyannote/speaker-diarization-3.1"
-    temp_directory: Optional[Path] = None
-    device: Optional[str] = None
-
-    def __post_init__(self):
-        # Use CUDA if available, else fall back to CPU
-        self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
-        # Default temp directory inside the output directory
-        self.temp_directory = self.temp_directory or (self.output_directory / "temp")
-        # Ensure directories exist
-        self.temp_directory.mkdir(parents=True, exist_ok=True)
-        self.output_directory.mkdir(parents=True, exist_ok=True)
-
-
-class TokenManager:
-    """
-    Handles secure storage and retrieval of the Hugging Face authentication token.
-    """
-    def __init__(self):
-        # Store config in ~/.pyannote/config.json
-        self.config_dir = Path.home() / ".pyannote"
-        self.config_file = self.config_dir / "config.json"
-        self._initialize_config()
-
-    def _initialize_config(self) -> None:
-        """
-        Initialize configuration directory and file with secure permissions.
-        """
-        self.config_dir.mkdir(exist_ok=True)
-        if not self.config_file.exists():
-            self._save_config({})
-
-        # Set secure file and directory permissions on POSIX systems
-        if os.name == "posix":
-            os.chmod(self.config_dir, 0o700)
-            os.chmod(self.config_file, 0o600)
-
-    def _get_encryption_key(self) -> bytes:
-        """
-        Generate an encryption key from system-specific data.
-        """
-        salt = b"pyannote-audio-salt"
-        kdf = PBKDF2HMAC(
-            algorithm=hashes.SHA256(),
-            length=32,
-            salt=salt,
-            iterations=100000,
-        )
-        key = kdf.derive(str(Path.home()).encode())
-        return base64.urlsafe_b64encode(key)
-
-    def _save_config(self, config: dict) -> None:
-        """
-        Securely save configuration to file.
-        """
-        with open(self.config_file, "w", encoding="utf-8") as f:
-            json.dump(config, f)
-
-    def _load_config(self) -> dict:
-        """
-        Load configuration from file.
-        """
-        try:
-            with open(self.config_file, "r", encoding="utf-8") as f:
-                return json.load(f)
-        except Exception:
-            return {}
-
-    def store_token(self, token: str) -> bool:
-        """
-        Securely store authentication token.
-        """
-        try:
-            fernet = Fernet(self._get_encryption_key())
-            encrypted_token = fernet.encrypt(token.encode())
-
-            config = self._load_config()
-            config["token"] = encrypted_token.decode()
-
-            self._save_config(config)
-            return True
-        except Exception as e:
-            logger.error(f"Failed to store token: {e}")
-            return False
-
-    def retrieve_token(self) -> Optional[str]:
-        """
-        Retrieve stored authentication token.
-        """
-        try:
-            config = self._load_config()
-            if "token" in config:
-                fernet = Fernet(self._get_encryption_key())
-                return fernet.decrypt(config["token"].encode()).decode()
-        except Exception as e:
-            logger.error(f"Failed to retrieve token: {e}")
-        return None
-
-    def delete_token(self) -> bool:
-        """
-        Delete stored authentication token.
-        """
-        try:
-            config = self._load_config()
-            if "token" in config:
-                del config["token"]
-                self._save_config(config)
-            return True
-        except Exception as e:
-            logger.error(f"Failed to delete token: {e}")
-            return False
-
-
-class DependencyManager:
-    """
-    Manages and verifies system dependencies using importlib.metadata.
-    """
-    REQUIRED_PACKAGES = {
-        "torch": None,
-        "pyannote.audio": None,
-        "openai-whisper": None,
-        "pytorch-lightning": None,
-        "keyring": None,
-    }
-
-    @classmethod
-    def verify_dependencies(cls) -> bool:
-        """
-        Verify all required dependencies are installed with correct versions
-        (if specified). Returns True if all are installed and correct, False otherwise.
-        """
-        missing = []
-        outdated = []
-
-        for package, required_version in cls.REQUIRED_PACKAGES.items():
-            try:
-                installed_version = importlib.metadata.version(package)
-                if required_version and installed_version != required_version:
-                    outdated.append(
-                        f"{package} (installed: {installed_version}, required: {required_version})"
-                    )
-            except PackageNotFoundError:
-                missing.append(package)
-
-        if missing or outdated:
-            if missing:
-                logger.error("Missing packages: %s", ", ".join(missing))
-            if outdated:
-                logger.error("Outdated packages: %s", ", ".join(outdated))
-            logger.info(
-                "Install required packages: pip install %s",
-                " ".join(
-                    f"{pkg}=={ver}" if ver else pkg
-                    for pkg, ver in cls.REQUIRED_PACKAGES.items()
-                ),
-            )
-            return False
-        return True
-
-
-class AudioProcessor:
-    """
-    Handles audio file processing and segmentation using the `wave` module.
-    """
-    def __init__(self, config: TranscriptionConfig):
-        self.config = config
-
-    def load_audio_segment(
-        self,
-        audio_path: Path,
-        start_time: float,
-        end_time: float,
-        output_path: Path,
-    ) -> bool:
-        """
-        Extract and save the audio segment from `start_time` to `end_time`.
-        """
-        try:
-            with wave.open(str(audio_path), "rb") as infile:
-                params = infile.getparams()
-                frame_rate = params.framerate
-                start_frame = int(start_time * frame_rate)
-                end_frame = min(int(end_time * frame_rate), infile.getnframes())
-
-                infile.setpos(start_frame)
-                frames = infile.readframes(end_frame - start_frame)
-
-                with wave.open(str(output_path), "wb") as outfile:
-                    outfile.setparams(params)
-                    outfile.writeframes(frames)
-            return True
-        except Exception as e:
-            logger.error(f"Failed to process audio segment: {e}")
-            return False
-
-
-class TranscriptionPipeline:
-    """
-    Main pipeline for audio transcription (Whisper) and speaker diarization (Pyannote).
-    """
-    def __init__(self, config: TranscriptionConfig):
-        self.config = config
-        self.diarization_pipeline = None
-        self.whisper_model = None
-        self.token_manager = TokenManager()
-        self._running = False  # used for resource monitor thread
-
-    def initialize_models(self, auth_token: str) -> bool:
-        """
-        Initialize the Pyannote diarization pipeline and the Whisper model.
-        """
-        try:
-            # Load Whisper model (set download root to avoid clutter in home directory)
-            self.whisper_model = whisper.load_model(
-                self.config.whisper_model,
-                device=self.config.device,
-                download_root=str(self.config.output_directory / "models"),
-            )
-
-            # Load Pyannote diarization pipeline
-            self.diarization_pipeline = Pipeline.from_pretrained(
-                self.config.diarization_model, use_auth_token=auth_token
-            )
-            self.diarization_pipeline.to(torch.device(self.config.device))
-
-            if self.config.device == "cpu":
-                warnings.warn("Running on CPU. GPU is recommended for better performance.")
-
-            return True
-        except Exception as e:
-            logger.error(f"Model initialization failed: {e}")
-            logger.error("Please ensure you have accepted the model conditions at:")
-            logger.error(" 1. https://huggingface.co/pyannote/segmentation-3.0")
-            logger.error(" 2. https://huggingface.co/pyannote/speaker-diarization-3.1")
-            return False
-
-    def _update_resources(self, bar):
-        """
-        Continuously update progress bar text with CPU/MEM/GPU usage, until self._running is False.
-        """
-        while self._running:
-            try:
-                import time
-                time.sleep(0.5)
-
-                cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
-                memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0
-
-                if HAVE_PROGRESS_SUPPORT and GPUtil.getGPUs():
-                    gpus = GPUtil.getGPUs()
-                    gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
-                    gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
-                    gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
-                else:
-                    gpu_usage_text = "N/A"
-
-                resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
-                bar.text(resource_text)
-            except Exception as e:
-                logger.error(f"Resource monitoring error: {e}")
-
-    def process_file(self, audio_path: Path) -> bool:
-        """
-        Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback.
-        """
-        try:
-            logger.info("Starting audio processing...")
-            diarization = self.diarization_pipeline(str(audio_path))
-            segments = list(diarization.itertracks(yield_label=True))
-            total_segments = len(segments)
-
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
-            audio_processor = AudioProcessor(self.config)
-
-            if not HAVE_PROGRESS_SUPPORT:
-                # No alive_progress, psutil, or GPUtil installed
-                logger.info("Processing audio without progress bar (missing optional packages).")
-                with output_file.open("w", encoding="utf-8") as f:
-                    for turn, _, speaker in segments:
-                        segment_path = (
-                            self.config.temp_directory
-                            / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
-                        )
-                        if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
-                            transcription = self.whisper_model.transcribe(str(segment_path))["text"]
-                            segment_path.unlink(missing_ok=True)
-
-                            line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
-                            f.write(line)
-                            logger.info(line.strip())
-            else:
-                # Use a progress bar to track segment transcription
-                from alive_progress import alive_bar
-                import threading
-
-                self._running = True
-                with output_file.open("w", encoding="utf-8") as f, alive_bar(
-                    total_segments,
-                    title="Transcribing Audio",
-                    spinner="pulse",
-                    theme="classic",
-                    stats=False,
-                    elapsed=True,
-                    monitor=True,
-                ) as bar:
-
-                    # Start a background thread for resource monitoring
-                    resource_thread = threading.Thread(target=self._update_resources, args=(bar,))
-                    resource_thread.start()
-
-                    for turn, _, speaker in segments:
-                        segment_path = (
-                            self.config.temp_directory
-                            / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
-                        )
-                        if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
-                            transcription = self.whisper_model.transcribe(str(segment_path))["text"]
-                            segment_path.unlink(missing_ok=True)
-
-                            line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
-                            f.write(line)
-                            logger.info(line.strip())
-
-                        # Update the progress bar
-                        bar()
-
-                # Stop resource monitoring
-                self._running = False
-                resource_thread.join()
-
-            logger.info(f"Transcription completed. Output saved to: {output_file}")
-            return True
-
-        except Exception as e:
-            logger.error(f"Processing failed: {e}")
-            return False
-
-
-def get_token(token_manager: TokenManager) -> Optional[str]:
-    """
-    Get authentication token from storage or user input.
-    """
-    stored_token = token_manager.retrieve_token()
-    if stored_token:
-        choice = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
-        if choice == "y":
-            return stored_token
-
-    print("\nA HuggingFace token is required for speaker diarization.")
-    print("Get your token at: https://huggingface.co/settings/tokens")
-    print("\nEnsure you have accepted:")
-    print(" 1. pyannote/segmentation-3.0 conditions")
-    print(" 2. pyannote/speaker-diarization-3.1 conditions")
-
-    token = input("\nEnter HuggingFace token: ").strip()
-    if token:
-        choice = input("Save token for future use? (y/n): ").lower().strip()
-        if choice == "y":
-            if token_manager.store_token(token):
-                print("Token saved successfully.")
-            else:
-                print("Failed to save token. It will be used for this session only.")
-    return token if token else None
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
-    )
-    parser.add_argument(
-        "--audio",
-        type=Path,
-        help="Path to the audio file to transcribe."
-    )
-    parser.add_argument(
-        "--token",
-        help="HuggingFace API token. Overrides any saved token."
-    )
-    parser.add_argument(
-        "--output",
-        type=Path,
-        help="Path to the output directory for transcripts and temporary files.",
-    )
-    parser.add_argument(
-        "--delete-token",
-        action="store_true",
-        help="Delete any stored Hugging Face token and exit.",
-    )
-    parser.add_argument(
-        "--show-warnings",
-        action="store_true",
-        help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
-    )
-    parser.add_argument(
-        "--whisper-model",
-        default="base.en",
-        help="Specify the Whisper model to use (default: 'base.en').",
-    )
-    args = parser.parse_args()
-
-    # Manage user warnings
-    if not args.show_warnings:
-        warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
-        warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
-    else:
-        warnings.resetwarnings()
-
-    # Check dependencies
-    if not DependencyManager.verify_dependencies():
-        sys.exit(1)
-
-    # Initialize tab-completion for file paths (Unix-like only, or with pyreadline on Windows)
-    readline.set_completer_delims(' \t\n;')
-    readline.set_completer(complete_path)
-    readline.parse_and_bind("tab: complete")
-
-    # Initialize the token manager
-    token_manager = TokenManager()
-
-    # If user wants to delete the stored token, do so and exit
-    if args.delete_token:
-        success = token_manager.delete_token()
-        sys.exit(0 if success else 1)
-
-    # Prepare configuration
-    output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
-    config = TranscriptionConfig(
-        output_directory=output_dir,
-        whisper_model=args.whisper_model
-    )
-
-    # Initialize pipeline
-    pipeline = TranscriptionPipeline(config)
-    hf_token = args.token or get_token(token_manager)
-    if not hf_token:
-        logger.error("No Hugging Face token provided. Exiting.")
-        sys.exit(1)
-
-    # Initialize models
-    if not pipeline.initialize_models(hf_token):
-        logger.error("Failed to initialize pipeline. Exiting.")
-        sys.exit(1)
-
-    # Prompt user for audio file path if not passed in
-    audio_path = args.audio
-    while not audio_path or not audio_path.exists():
-        audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
-        audio_path = Path(audio_path_str)
-        if not audio_path.exists():
-            print(f"File '{audio_path}' not found. Please try again.")
-
-    print("Audio file path accepted. Preparing to process the audio...")
-    sys.stdout.flush()
-
-    # Process the audio file
-    if not pipeline.process_file(audio_path):
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
audio_scribe-0.1.0.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-audio_scribe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-audio_scribe/cli.py,sha256=dabh7fe9wAEORwVIBd-V8FAzHBBzbkjnfMSR-wOywO8,20286
-audio_scribe-0.1.0.dist-info/METADATA,sha256=BBx81TI9DPCYgsdKyBn2PWEJ9pJsnhqTUb8ZsWoS1Ps,9503
-audio_scribe-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-audio_scribe-0.1.0.dist-info/entry_points.txt,sha256=eaO9r_zAFnrWseKyJcBpGUHQq-P7NXBw5er8sZaPfFU,55
-audio_scribe-0.1.0.dist-info/top_level.txt,sha256=L1mltKt-5HrbTXPpAXwht8SXQCgcCceoqpCq4OCZRsk,13
-audio_scribe-0.1.0.dist-info/RECORD,,
File without changes
|
File without changes
|