audio-scribe 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
audio_scribe/__init__.py CHANGED
@@ -0,0 +1,24 @@
1
+ """
2
+ Audio Scribe
3
+ -----------------
4
+ A Python package for transcribing audio files with speaker diarization
5
+ using Whisper and Pyannote.
6
+ """
7
+
8
+ from .transcriber import main
9
+ from .models import TranscriptionPipeline, AudioProcessor
10
+ from .config import TranscriptionConfig
11
+ from .auth import TokenManager
12
+ from .utils import DependencyManager, complete_path
13
+
14
+ __version__ = "0.1.2"
15
+
16
+ __all__ = [
17
+ "main",
18
+ "TranscriptionPipeline",
19
+ "TranscriptionConfig",
20
+ "AudioProcessor",
21
+ "TokenManager",
22
+ "DependencyManager",
23
+ "complete_path",
24
+ ]
audio_scribe/auth.py ADDED
@@ -0,0 +1,119 @@
1
+ """Authentication and token management for Audio Scribe."""
2
+
3
+ import os
4
+ import json
5
+ import base64
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import Optional
9
+ from cryptography.fernet import Fernet
10
+ from cryptography.hazmat.primitives import hashes
11
+ from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
class TokenManager:
    """Store, retrieve, and delete the Hugging Face token in ``~/.pyannote/config.json``.

    The token is Fernet-encrypted before it is written to disk.

    NOTE(review): the Fernet key is derived from the home-directory path and a
    constant salt, both of which are known to anyone who can read the config
    file. The encryption is therefore obfuscation only; the restrictive file
    permissions are the real safeguard. The derivation is left unchanged so
    tokens that are already stored stay readable.
    """

    def __init__(self):
        # Store config in ~/.pyannote/config.json
        self.config_dir = Path.home() / ".pyannote"
        self.config_file = self.config_dir / "config.json"
        self._initialize_config()

    def _initialize_config(self) -> None:
        """Create the config directory and file if missing and restrict their permissions."""
        self.config_dir.mkdir(parents=True, exist_ok=True)
        if not self.config_file.exists():
            self._save_config({})

        # Also tighten a directory/file that already existed (POSIX only).
        if os.name == "posix":
            os.chmod(self.config_dir, 0o700)
            os.chmod(self.config_file, 0o600)

    def _get_encryption_key(self) -> bytes:
        """Derive the urlsafe-base64 Fernet key from the home-directory path."""
        salt = b"pyannote-audio-salt"
        kdf = PBKDF2HMAC(
            algorithm=hashes.SHA256(),
            length=32,
            salt=salt,
            iterations=100000,
        )
        key = kdf.derive(str(Path.home()).encode())
        return base64.urlsafe_b64encode(key)

    def _save_config(self, config: dict) -> None:
        """Write ``config`` as JSON; a newly created file is owner read/write only."""
        # Passing the mode to os.open means the file is never briefly
        # world-readable between creation and a later chmod.
        flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        fd = os.open(self.config_file, flags, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(config, f)

    def _load_config(self) -> dict:
        """Return the stored configuration, or ``{}`` if it is missing, unreadable, or malformed."""
        try:
            with open(self.config_file, "r", encoding="utf-8") as f:
                config = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return {}
        # Callers index the result by key, so anything but a JSON object is unusable.
        return config if isinstance(config, dict) else {}

    def store_token(self, token: str) -> bool:
        """Encrypt ``token`` and save it; return True on success, False on any failure."""
        try:
            fernet = Fernet(self._get_encryption_key())
            encrypted_token = fernet.encrypt(token.encode())

            config = self._load_config()
            config["token"] = encrypted_token.decode()

            self._save_config(config)
            return True
        except Exception as e:
            # Deliberate best-effort boundary: callers only need a yes/no answer.
            logger.error("Failed to store token: %s", e)
            return False

    def retrieve_token(self) -> Optional[str]:
        """Return the decrypted stored token, or None if none is stored or it cannot be decrypted."""
        try:
            config = self._load_config()
            if "token" in config:
                fernet = Fernet(self._get_encryption_key())
                return fernet.decrypt(config["token"].encode()).decode()
        except Exception as e:
            logger.error("Failed to retrieve token: %s", e)
        return None

    def delete_token(self) -> bool:
        """Remove the stored token if present; return False only if the config could not be rewritten."""
        try:
            config = self._load_config()
            if "token" in config:
                del config["token"]
                self._save_config(config)
            return True
        except Exception as e:
            logger.error("Failed to delete token: %s", e)
            return False
96
+
97
def get_token(token_manager: TokenManager) -> Optional[str]:
    """Return a Hugging Face token, offering the stored one first and prompting otherwise.

    If a stored token exists the user is asked whether to reuse it. Otherwise
    (or if they decline) instructions are printed and a token is read from
    standard input; the user may choose to have it saved via
    ``token_manager.store_token``. Returns None when nothing is entered.
    """
    saved = token_manager.retrieve_token()
    if saved:
        reuse = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
        if reuse == "y":
            return saved

    for message in (
        "\nA HuggingFace token is required for speaker diarization.",
        "Get your token at: https://huggingface.co/settings/tokens",
        "\nEnsure you have accepted:",
        " 1. pyannote/segmentation-3.0 conditions",
        " 2. pyannote/speaker-diarization-3.1 conditions",
    ):
        print(message)

    entered = input("\nEnter HuggingFace token: ").strip()
    if not entered:
        return None

    wants_save = input("Save token for future use? (y/n): ").lower().strip() == "y"
    if wants_save:
        if token_manager.store_token(entered):
            print("Token saved successfully.")
        else:
            print("Failed to save token. It will be used for this session only.")
    return entered
audio_scribe/config.py ADDED
@@ -0,0 +1,24 @@
1
+ """Configuration management for Audio Scribe."""
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Optional
6
+ import torch
7
+
8
@dataclass
class TranscriptionConfig:
    """Configuration settings for the transcription pipeline.

    Creating an instance has a side effect: the output and temp directories
    are created on disk if they do not exist yet.
    """
    # Directory that receives transcripts; a str is accepted and converted to Path.
    output_directory: Path
    # Whisper model name handed to whisper.load_model.
    whisper_model: str = "base.en"
    # Pyannote pipeline identifier handed to Pipeline.from_pretrained.
    diarization_model: str = "pyannote/speaker-diarization-3.1"
    # Scratch directory for per-segment WAV files; defaults to <output_directory>/temp.
    temp_directory: Optional[Path] = None
    # "cuda" or "cpu"; picked automatically when left as None.
    device: Optional[str] = None

    def __post_init__(self):
        # Coerce to Path so callers passing plain strings do not crash on "/" or .mkdir
        self.output_directory = Path(self.output_directory)
        # Use CUDA if available, else fall back to CPU
        self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
        # Default temp directory inside the output directory
        if self.temp_directory:
            self.temp_directory = Path(self.temp_directory)
        else:
            self.temp_directory = self.output_directory / "temp"
        # Ensure directories exist
        self.output_directory.mkdir(parents=True, exist_ok=True)
        self.temp_directory.mkdir(parents=True, exist_ok=True)
audio_scribe/models.py ADDED
@@ -0,0 +1,196 @@
1
+ """Model handling and audio processing for Audio Scribe."""
2
+
3
+ import wave
4
+ import torch
5
+ import whisper
6
+ import logging
7
+ import warnings
8
+ import threading
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Optional
12
+ from pyannote.audio import Pipeline
13
+
14
+ from .config import TranscriptionConfig
15
+ from .auth import TokenManager
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ try:
20
+ from alive_progress import alive_bar
21
+ import psutil
22
+ import GPUtil
23
+ HAVE_PROGRESS_SUPPORT = True
24
+ except ImportError:
25
+ HAVE_PROGRESS_SUPPORT = False
26
+
27
+
28
class AudioProcessor:
    """Extracts time ranges from WAV files with the standard-library ``wave`` module."""

    def __init__(self, config: TranscriptionConfig):
        self.config = config

    def load_audio_segment(
        self,
        audio_path: Path,
        start_time: float,
        end_time: float,
        output_path: Path,
    ) -> bool:
        """Copy the audio between ``start_time`` and ``end_time`` (seconds) into ``output_path``.

        Both times are clamped to the bounds of the input file. Returns True
        when a non-empty segment was written, and False when the clamped range
        is empty or the file could not be read or written.
        """
        try:
            with wave.open(str(audio_path), "rb") as infile:
                params = infile.getparams()
                frame_rate = params.framerate
                total_frames = infile.getnframes()
                # Clamp both ends: setpos() rejects positions outside the file,
                # and a negative count makes readframes() return everything
                # that remains instead of nothing.
                start_frame = min(max(int(start_time * frame_rate), 0), total_frames)
                end_frame = min(max(int(end_time * frame_rate), 0), total_frames)
                if end_frame <= start_frame:
                    logger.error(
                        "Failed to process audio segment: empty range %.2fs - %.2fs",
                        start_time,
                        end_time,
                    )
                    return False

                infile.setpos(start_frame)
                frames = infile.readframes(end_frame - start_frame)

            with wave.open(str(output_path), "wb") as outfile:
                outfile.setparams(params)
                outfile.writeframes(frames)
            return True
        except Exception as e:
            # Best-effort boundary: callers treat False as "skip this segment".
            logger.error("Failed to process audio segment: %s", e)
            return False
59
+
60
+
61
class TranscriptionPipeline:
    """Main pipeline for audio transcription (Whisper) and speaker diarization (Pyannote)."""

    def __init__(self, config: TranscriptionConfig):
        self.config = config
        self.diarization_pipeline = None
        self.whisper_model = None
        self.token_manager = TokenManager()
        self._running = False  # polled by the resource-monitor thread

    def initialize_models(self, auth_token: str) -> bool:
        """Load the Whisper model and the Pyannote diarization pipeline.

        Returns True on success. On any failure the error is logged together
        with a reminder about the Hugging Face model conditions, and False is
        returned.
        """
        try:
            # Load Whisper model; weights are downloaded below the output directory
            self.whisper_model = whisper.load_model(
                self.config.whisper_model,
                device=self.config.device,
                download_root=str(self.config.output_directory / "models"),
            )

            # Load Pyannote diarization pipeline
            self.diarization_pipeline = Pipeline.from_pretrained(
                self.config.diarization_model,
                use_auth_token=auth_token,
            )
            self.diarization_pipeline.to(torch.device(self.config.device))

            if self.config.device == "cpu":
                warnings.warn("Running on CPU. GPU is recommended for better performance.")

            return True
        except Exception as e:
            logger.error("Model initialization failed: %s", e)
            logger.error("Please ensure you have accepted the model conditions at:")
            logger.error(" 1. https://huggingface.co/pyannote/segmentation-3.0")
            logger.error(" 2. https://huggingface.co/pyannote/speaker-diarization-3.1")
            return False

    def _update_resources(self, bar):
        """Refresh the progress-bar text with CPU/MEM/GPU usage until ``self._running`` is False."""
        import time  # local import: this module does not import time at file level

        while self._running:
            try:
                time.sleep(0.5)

                if HAVE_PROGRESS_SUPPORT:
                    cpu_usage = psutil.cpu_percent(interval=None)
                    memory_usage = psutil.virtual_memory().percent
                    gpus = GPUtil.getGPUs()
                else:
                    cpu_usage = memory_usage = 0
                    gpus = []

                if gpus:
                    gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
                    gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
                    gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
                else:
                    gpu_usage_text = "N/A"

                bar.text(f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}")
            except Exception as e:
                logger.error("Resource monitoring error: %s", e)

    def _transcribe_segment(self, audio_processor, audio_path, turn, speaker, out_file) -> None:
        """Extract one diarization turn, transcribe it, and append the line to ``out_file``."""
        segment_path = (
            self.config.temp_directory
            / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
        )
        try:
            if not audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
                # Skip the turn instead of reusing text from an earlier segment.
                logger.warning(
                    "Skipping segment %.2fs - %.2fs (speaker %s): audio could not be extracted",
                    turn.start,
                    turn.end,
                    speaker,
                )
                return
            transcription = self.whisper_model.transcribe(str(segment_path))["text"]
        finally:
            # Remove the temporary WAV even when transcription raises.
            segment_path.unlink(missing_ok=True)

        line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
        out_file.write(line)
        logger.info(line.strip())

    def process_file(self, audio_path: Path) -> bool:
        """Diarize ``audio_path``, transcribe each speaker turn, and write a timestamped transcript.

        The transcript is written to ``transcript_<timestamp>.txt`` in the
        configured output directory. A progress bar with resource usage is
        shown when the optional packages are installed. Returns True on
        success; any failure is logged and False is returned.
        """
        try:
            if self.diarization_pipeline is None or self.whisper_model is None:
                logger.error("Processing failed: models are not initialized; call initialize_models() first.")
                return False

            logger.info("Starting audio processing...")
            diarization = self.diarization_pipeline(str(audio_path))
            segments = list(diarization.itertracks(yield_label=True))
            total_segments = len(segments)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
            audio_processor = AudioProcessor(self.config)

            if not HAVE_PROGRESS_SUPPORT:
                # No alive_progress, psutil, or GPUtil installed
                logger.info("Processing audio without progress bar (missing optional packages).")
                with output_file.open("w", encoding="utf-8") as f:
                    for turn, _, speaker in segments:
                        self._transcribe_segment(audio_processor, audio_path, turn, speaker, f)
            else:
                # Use a progress bar to track segment transcription
                from alive_progress import alive_bar

                with output_file.open("w", encoding="utf-8") as f, alive_bar(
                    total_segments,
                    title="Transcribing Audio",
                    spinner="pulse",
                    theme="classic",
                    stats=False,
                    elapsed=True,
                    monitor=True,
                ) as bar:
                    # Start a background thread for resource monitoring; daemon
                    # so it can never keep the interpreter alive on its own.
                    self._running = True
                    resource_thread = threading.Thread(
                        target=self._update_resources, args=(bar,), daemon=True
                    )
                    resource_thread.start()
                    try:
                        for turn, _, speaker in segments:
                            self._transcribe_segment(audio_processor, audio_path, turn, speaker, f)
                            # Update the progress bar
                            bar()
                    finally:
                        # Stop resource monitoring even if a segment raised.
                        self._running = False
                        resource_thread.join()

            logger.info("Transcription completed. Output saved to: %s", output_file)
            return True

        except Exception as e:
            logger.error("Processing failed: %s", e)
            return False
@@ -0,0 +1,131 @@
1
+ """
2
+ Main entry point for Audio Scribe transcription tool.
3
+ Handles CLI interface and orchestrates the transcription process.
4
+ """
5
+
6
+ import sys
7
+ import logging
8
+ import warnings
9
+ import argparse
10
+ import readline
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+
14
+ from .config import TranscriptionConfig
15
+ from .models import TranscriptionPipeline
16
+ from .auth import TokenManager, get_token
17
+ from .utils import DependencyManager, complete_path
18
+
19
# Configure logging: INFO and above go to the console and are appended to
# transcription.log in the current working directory.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
_log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
]
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, handlers=_log_handlers)
logger = logging.getLogger(__name__)
30
+
31
+
32
def _build_parser() -> argparse.ArgumentParser:
    """Return the argument parser for the Audio Scribe command line."""
    parser = argparse.ArgumentParser(
        description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
    )
    parser.add_argument(
        "--audio",
        type=Path,
        help="Path to the audio file to transcribe.",
    )
    parser.add_argument(
        "--token",
        help="HuggingFace API token. Overrides any saved token.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Path to the output directory for transcripts and temporary files.",
    )
    parser.add_argument(
        "--delete-token",
        action="store_true",
        help="Delete any stored Hugging Face token and exit.",
    )
    parser.add_argument(
        "--show-warnings",
        action="store_true",
        help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
    )
    parser.add_argument(
        "--whisper-model",
        default="base.en",
        help="Specify the Whisper model to use (default: 'base.en').",
    )
    return parser


def _prompt_for_audio_path(audio_path):
    """Return ``audio_path`` if it is an existing file, otherwise prompt until one is entered."""
    # is_file() rather than exists(): an empty answer becomes Path("."), which
    # exists as a directory and would otherwise be accepted as the audio file.
    while audio_path is None or not audio_path.is_file():
        audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
        if not audio_path_str:
            print("No path entered. Please try again.")
            audio_path = None
            continue
        audio_path = Path(audio_path_str).expanduser()
        if not audio_path.is_file():
            print(f"File '{audio_path}' not found. Please try again.")
    return audio_path


def main():
    """Main entry point for the Audio Scribe CLI.

    Parses the command line, verifies dependencies, obtains a Hugging Face
    token, loads the models, and transcribes one audio file. Exits with status
    1 on any failure, and with 0 or 1 after handling ``--delete-token``.
    """
    print("Initializing environment... Please wait while we load dependencies and models.")
    sys.stdout.flush()

    args = _build_parser().parse_args()

    # Manage user warnings
    if not args.show_warnings:
        warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
        warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
    else:
        warnings.resetwarnings()

    # Check dependencies
    if not DependencyManager.verify_dependencies():
        sys.exit(1)

    # Initialize tab-completion for file paths
    readline.set_completer_delims(' \t\n;')
    readline.set_completer(complete_path)
    readline.parse_and_bind("tab: complete")

    # Initialize the token manager
    token_manager = TokenManager()

    # If user wants to delete the stored token, do so and exit
    if args.delete_token:
        success = token_manager.delete_token()
        sys.exit(0 if success else 1)

    # Prepare configuration
    output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
    config = TranscriptionConfig(
        output_directory=output_dir,
        whisper_model=args.whisper_model,
    )

    # Initialize pipeline
    pipeline = TranscriptionPipeline(config)
    hf_token = args.token or get_token(token_manager)
    if not hf_token:
        logger.error("No Hugging Face token provided. Exiting.")
        sys.exit(1)

    # Initialize models
    if not pipeline.initialize_models(hf_token):
        logger.error("Failed to initialize pipeline. Exiting.")
        sys.exit(1)

    # Prompt user for audio file path if not passed in (or not a usable file)
    audio_path = _prompt_for_audio_path(args.audio)

    print("Audio file path accepted. Preparing to process the audio...")
    sys.stdout.flush()

    # Process the audio file
    if not pipeline.process_file(audio_path):
        sys.exit(1)


if __name__ == "__main__":
    main()
audio_scribe/utils.py ADDED
@@ -0,0 +1,93 @@
1
+ """Utility functions and classes for Audio Scribe."""
2
+
3
+ import os
4
+ import glob
5
+ import logging
6
+ import importlib.metadata
7
+ from importlib.metadata import PackageNotFoundError
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
def complete_path(text, state):
    """Return the ``state``-th file-path completion for ``text``, or None when there are no more.

    Used as a ``readline`` completer. A leading ``~`` is expanded to the home
    directory. Text containing ``*`` or ``?`` is treated as a glob pattern;
    otherwise entries of the typed directory that start with the typed prefix
    are offered, with a trailing separator on directories.
    """
    # Expand "~" first: os.listdir("~") fails, so home-relative paths could
    # otherwise never be completed.
    if text.startswith("~"):
        text = os.path.expanduser(text)

    # If the user typed a glob pattern (with * or ?)
    if "*" in text or "?" in text:
        matches = sorted(glob.glob(text))
    else:
        # Split off the directory name and partial file/directory name
        directory, partial = os.path.split(text)
        directory = directory or "."
        try:
            entries = sorted(os.listdir(directory))
        except OSError:
            # If directory doesn't exist or we lack permission, no matches
            entries = []

        matches = []
        for entry in entries:
            if not entry.startswith(partial):
                continue
            # Don't prefix current-directory paths; keep the prefix otherwise
            full_path = entry if directory == "." else os.path.join(directory, entry)
            # If it's a directory, add a trailing slash to indicate that
            if os.path.isdir(full_path) and not full_path.endswith(os.path.sep):
                full_path += os.path.sep
            matches.append(full_path)

    # If 'state' is beyond last match, return None
    return matches[state] if state < len(matches) else None
48
+
49
+
50
class DependencyManager:
    """Checks that the packages Audio Scribe needs are installed."""

    # Distribution name -> exact required version, or None when any version is accepted.
    REQUIRED_PACKAGES = {
        "torch": None,
        "pyannote.audio": None,
        "openai-whisper": None,
        "pytorch-lightning": None,
        "keyring": None,
    }

    @classmethod
    def verify_dependencies(cls) -> bool:
        """
        Check every entry of REQUIRED_PACKAGES against the installed distributions.
        Logs what is missing or has the wrong version plus an install hint,
        and returns True only when nothing is wrong.
        """
        not_installed = []
        wrong_version = []

        for name, wanted in cls.REQUIRED_PACKAGES.items():
            try:
                found = importlib.metadata.version(name)
            except PackageNotFoundError:
                not_installed.append(name)
                continue
            if wanted and found != wanted:
                wrong_version.append(f"{name} (installed: {found}, required: {wanted})")

        if not (not_installed or wrong_version):
            return True

        if not_installed:
            logger.error("Missing packages: %s", ", ".join(not_installed))
        if wrong_version:
            logger.error("Outdated packages: %s", ", ".join(wrong_version))

        install_args = [
            f"{pkg}=={ver}" if ver else pkg
            for pkg, ver in cls.REQUIRED_PACKAGES.items()
        ]
        logger.info("Install required packages: pip install %s", " ".join(install_args))
        return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: audio_scribe
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: A command-line tool for audio transcription with Whisper and Pyannote.
5
5
  Home-page: https://gitlab.genomicops.cloud/genomicops/audio-scribe
6
6
  Author: Gurasis Osahan
@@ -46,9 +46,16 @@ Dynamic: summary
46
46
 
47
47
  # Audio Scribe
48
48
 
49
- **A Command-Line Tool for Audio Transcription (Audio Scribe) and Speaker Diarization Using OpenAI Whisper and Pyannote**
49
+ **A Command-Line Tool for Audio Transcription and Speaker Diarization Using OpenAI Whisper and Pyannote**
50
50
 
51
51
  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
52
+ ![Coverage](https://img.shields.io/badge/coverage-94.3%25-brightgreen)
53
+ [![Pipeline Status](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/pipeline.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main)
54
+ [![PyPI Version](https://badge.fury.io/py/audio-scribe.svg)](https://badge.fury.io/py/audio-scribe)
55
+ [![Python Versions](https://img.shields.io/pypi/pyversions/audio-scribe)](https://pypi.org/project/audio-scribe/)
56
+ [![PyPI Downloads](https://img.shields.io/pypi/dm/audio-scribe)](https://pypi.org/project/audio-scribe/)
57
+ [![PyPI License](https://img.shields.io/pypi/l/audio-scribe)](https://pypi.org/project/audio-scribe/)
58
+ <!-- [![Coverage Report](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/coverage.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main) -->
52
59
 
53
60
  ## Overview
54
61
 
@@ -0,0 +1,11 @@
1
+ audio_scribe/__init__.py,sha256=FH63cMQOjdp5Kp9JTAl-uaQbtEVh-lKneUJRKl5hEXc,543
2
+ audio_scribe/auth.py,sha256=YD9ElcMtFIMMYW26XZqMCzpYjOsmXkS5-TC2hTmCOEw,4351
3
+ audio_scribe/config.py,sha256=umD9-QBfi4e5RZG33lCOpdLBBbriG0LFyyDwvgHlSlQ,935
4
+ audio_scribe/models.py,sha256=Z5eJJf7rxq6k60fJMfVW98jwB9MDT7JxKBVvFmXZN-Q,7971
5
+ audio_scribe/transcriber.py,sha256=xMWt50QmNXeLhpTZhJlLtmJSzeOcRSWKtYRMJghjUnI,4026
6
+ audio_scribe/utils.py,sha256=iKt0ZZKF_Jmo7WNKJxldOHlwo__afEWuYWi_ckNd9gU,3278
7
+ audio_scribe-0.1.2.dist-info/METADATA,sha256=sKvgdqqlHZ9fqlwJu-sXCDdpqAFKVBsis8BHq0Q6yyU,10355
8
+ audio_scribe-0.1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
9
+ audio_scribe-0.1.2.dist-info/entry_points.txt,sha256=Bj7Co8Er22Ux59Vs2_S63ds2bnwDURvhHYNXVviZdPM,63
10
+ audio_scribe-0.1.2.dist-info/top_level.txt,sha256=L1mltKt-5HrbTXPpAXwht8SXQCgcCceoqpCq4OCZRsk,13
11
+ audio_scribe-0.1.2.dist-info/RECORD,,
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ audio-scribe = audio_scribe.transcriber:main
audio_scribe/cli.py DELETED
@@ -1,567 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Audio Scribe
4
- -----------------
5
- A command-line script for transcribing audio files with speaker diarization
6
- using Whisper and Pyannote. The script uses a Hugging Face token for
7
- downloading Pyannote speaker-diarization models and displays a progress bar
8
- with resource usage while transcribing.
9
- """
10
-
11
- print("Initializing environment... Please wait while we load dependencies and models.")
12
- import sys
13
- sys.stdout.flush()
14
-
15
- import os
16
- import glob
17
- import wave
18
- import json
19
- import logging
20
- import warnings
21
- import argparse
22
- import readline # <--- For enabling tab-completion on Unix/Linux
23
- from pathlib import Path
24
- from datetime import datetime
25
- from typing import Optional, Dict
26
- from dataclasses import dataclass
27
- import base64
28
-
29
- from cryptography.fernet import Fernet
30
- from cryptography.hazmat.primitives import hashes
31
- from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
32
-
33
- import torch
34
- import whisper
35
-
36
- import importlib.metadata
37
- from importlib.metadata import PackageNotFoundError
38
-
39
- from pyannote.audio import Pipeline
40
-
41
- # Attempt to import optional packages for progress bar and resource monitoring
42
- try:
43
- from alive_progress import alive_bar
44
- import psutil
45
- import GPUtil
46
- HAVE_PROGRESS_SUPPORT = True
47
- except ImportError:
48
- HAVE_PROGRESS_SUPPORT = False
49
-
50
# Configure logging: INFO and above go to the console and are appended to
# transcription.log in the current working directory.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
_log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
]
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, handlers=_log_handlers)
logger = logging.getLogger(__name__)
61
-
62
- # ---------- FILE PATH TAB-COMPLETION SNIPPET ----------
63
def complete_path(text, state):
    """Return the ``state``-th file-path completion for ``text``, or None when there are no more.

    Used as a ``readline`` completer. Text containing ``*`` or ``?`` is
    expanded as a glob pattern; otherwise the entries of the typed directory
    that start with the typed prefix are offered, joined to that directory and
    with a trailing separator on directories. Candidates are sorted.
    """
    if "*" in text or "?" in text:
        # Glob pattern typed by the user
        candidates = glob.glob(text)
    else:
        parent, prefix = os.path.split(text)
        parent = parent or "."
        try:
            names = os.listdir(parent)
        except OSError:
            # Directory missing or unreadable: nothing to offer
            names = []

        candidates = []
        for name in names:
            if not name.startswith(prefix):
                continue
            candidate = os.path.join(parent, name)
            # Mark directories with a trailing separator
            if os.path.isdir(candidate) and not candidate.endswith(os.path.sep):
                candidate += os.path.sep
            candidates.append(candidate)

    # Sort so repeated calls with increasing state see a stable order
    ordered = sorted(candidates)
    return ordered[state] if state < len(ordered) else None
97
-
98
-
99
@dataclass
class TranscriptionConfig:
    """Configuration settings for the transcription pipeline.

    Creating an instance has a side effect: the output and temp directories
    are created on disk if they do not exist yet.
    """
    # Directory that receives transcripts; a str is accepted and converted to Path.
    output_directory: Path
    # Whisper model name handed to whisper.load_model.
    whisper_model: str = "base.en"
    # Pyannote pipeline identifier handed to Pipeline.from_pretrained.
    diarization_model: str = "pyannote/speaker-diarization-3.1"
    # Scratch directory for per-segment WAV files; defaults to <output_directory>/temp.
    temp_directory: Optional[Path] = None
    # "cuda" or "cpu"; picked automatically when left as None.
    device: Optional[str] = None

    def __post_init__(self):
        # Coerce to Path so callers passing plain strings do not crash on "/" or .mkdir
        self.output_directory = Path(self.output_directory)
        # Use CUDA if available, else fall back to CPU
        self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
        # Default temp directory inside the output directory
        if self.temp_directory:
            self.temp_directory = Path(self.temp_directory)
        else:
            self.temp_directory = self.output_directory / "temp"
        # Ensure directories exist
        self.output_directory.mkdir(parents=True, exist_ok=True)
        self.temp_directory.mkdir(parents=True, exist_ok=True)
118
-
119
-
120
class TokenManager:
    """
    Store, retrieve, and delete the Hugging Face token in ``~/.pyannote/config.json``.

    The token is Fernet-encrypted before it is written to disk.

    NOTE(review): the Fernet key is derived from the home-directory path and a
    constant salt, both of which are known to anyone who can read the config
    file. The encryption is therefore obfuscation only; the restrictive file
    permissions are the real safeguard. The derivation is left unchanged so
    tokens that are already stored stay readable.
    """

    def __init__(self):
        # Store config in ~/.pyannote/config.json
        self.config_dir = Path.home() / ".pyannote"
        self.config_file = self.config_dir / "config.json"
        self._initialize_config()

    def _initialize_config(self) -> None:
        """
        Create the config directory and file if missing and restrict their permissions.
        """
        self.config_dir.mkdir(parents=True, exist_ok=True)
        if not self.config_file.exists():
            self._save_config({})

        # Also tighten a directory/file that already existed (POSIX only).
        if os.name == "posix":
            os.chmod(self.config_dir, 0o700)
            os.chmod(self.config_file, 0o600)

    def _get_encryption_key(self) -> bytes:
        """
        Derive the urlsafe-base64 Fernet key from the home-directory path.
        """
        salt = b"pyannote-audio-salt"
        kdf = PBKDF2HMAC(
            algorithm=hashes.SHA256(),
            length=32,
            salt=salt,
            iterations=100000,
        )
        key = kdf.derive(str(Path.home()).encode())
        return base64.urlsafe_b64encode(key)

    def _save_config(self, config: dict) -> None:
        """
        Write ``config`` as JSON; a newly created file is owner read/write only.
        """
        # Passing the mode to os.open means the file is never briefly
        # world-readable between creation and a later chmod.
        flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        fd = os.open(self.config_file, flags, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(config, f)

    def _load_config(self) -> dict:
        """
        Return the stored configuration, or ``{}`` if it is missing, unreadable, or malformed.
        """
        try:
            with open(self.config_file, "r", encoding="utf-8") as f:
                config = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return {}
        # Callers index the result by key, so anything but a JSON object is unusable.
        return config if isinstance(config, dict) else {}

    def store_token(self, token: str) -> bool:
        """
        Encrypt ``token`` and save it; return True on success, False on any failure.
        """
        try:
            fernet = Fernet(self._get_encryption_key())
            encrypted_token = fernet.encrypt(token.encode())

            config = self._load_config()
            config["token"] = encrypted_token.decode()

            self._save_config(config)
            return True
        except Exception as e:
            # Deliberate best-effort boundary: callers only need a yes/no answer.
            logger.error("Failed to store token: %s", e)
            return False

    def retrieve_token(self) -> Optional[str]:
        """
        Return the decrypted stored token, or None if none is stored or it cannot be decrypted.
        """
        try:
            config = self._load_config()
            if "token" in config:
                fernet = Fernet(self._get_encryption_key())
                return fernet.decrypt(config["token"].encode()).decode()
        except Exception as e:
            logger.error("Failed to retrieve token: %s", e)
        return None

    def delete_token(self) -> bool:
        """
        Remove the stored token if present; return False only if the config could not be rewritten.
        """
        try:
            config = self._load_config()
            if "token" in config:
                del config["token"]
                self._save_config(config)
            return True
        except Exception as e:
            logger.error("Failed to delete token: %s", e)
            return False
217
-
218
-
219
class DependencyManager:
    """
    Checks, via importlib.metadata, that the packages Audio Scribe needs are installed.
    """

    # Distribution name -> exact required version, or None when any version is accepted.
    REQUIRED_PACKAGES = {
        "torch": None,
        "pyannote.audio": None,
        "openai-whisper": None,
        "pytorch-lightning": None,
        "keyring": None,
    }

    @classmethod
    def verify_dependencies(cls) -> bool:
        """
        Check every entry of REQUIRED_PACKAGES against the installed distributions.
        Logs what is missing or has the wrong version plus an install hint,
        and returns True only when nothing is wrong.
        """
        not_installed = []
        wrong_version = []

        for name, wanted in cls.REQUIRED_PACKAGES.items():
            try:
                found = importlib.metadata.version(name)
            except PackageNotFoundError:
                not_installed.append(name)
                continue
            if wanted and found != wanted:
                wrong_version.append(f"{name} (installed: {found}, required: {wanted})")

        if not (not_installed or wrong_version):
            return True

        if not_installed:
            logger.error("Missing packages: %s", ", ".join(not_installed))
        if wrong_version:
            logger.error("Outdated packages: %s", ", ".join(wrong_version))

        install_args = [
            f"{pkg}=={ver}" if ver else pkg
            for pkg, ver in cls.REQUIRED_PACKAGES.items()
        ]
        logger.info("Install required packages: pip install %s", " ".join(install_args))
        return False
264
-
265
-
266
class AudioProcessor:
    """
    Extracts time ranges from WAV files with the standard-library ``wave`` module.
    """

    def __init__(self, config: TranscriptionConfig):
        self.config = config

    def load_audio_segment(
        self,
        audio_path: Path,
        start_time: float,
        end_time: float,
        output_path: Path,
    ) -> bool:
        """
        Copy the audio between ``start_time`` and ``end_time`` (seconds) into ``output_path``.

        Both times are clamped to the bounds of the input file. Returns True
        when a non-empty segment was written, and False when the clamped range
        is empty or the file could not be read or written.
        """
        try:
            with wave.open(str(audio_path), "rb") as infile:
                params = infile.getparams()
                frame_rate = params.framerate
                total_frames = infile.getnframes()
                # Clamp both ends: setpos() rejects positions outside the file,
                # and a negative count makes readframes() return everything
                # that remains instead of nothing.
                start_frame = min(max(int(start_time * frame_rate), 0), total_frames)
                end_frame = min(max(int(end_time * frame_rate), 0), total_frames)
                if end_frame <= start_frame:
                    logger.error(
                        "Failed to process audio segment: empty range %.2fs - %.2fs",
                        start_time,
                        end_time,
                    )
                    return False

                infile.setpos(start_frame)
                frames = infile.readframes(end_frame - start_frame)

            with wave.open(str(output_path), "wb") as outfile:
                outfile.setparams(params)
                outfile.writeframes(frames)
            return True
        except Exception as e:
            # Best-effort boundary: callers treat False as "skip this segment".
            logger.error("Failed to process audio segment: %s", e)
            return False
300
-
301
-
302
class TranscriptionPipeline:
    """
    Main pipeline for audio transcription (Whisper) and speaker diarization (Pyannote).

    Typical use: construct with a configuration, call `initialize_models` with a
    Hugging Face token, then call `process_file` for an audio file.
    """

    def __init__(self, config: "TranscriptionConfig"):
        self.config = config
        self.diarization_pipeline = None  # set by initialize_models
        self.whisper_model = None  # set by initialize_models
        self.token_manager = TokenManager()
        self._running = False  # used for resource monitor thread

    def initialize_models(self, auth_token: str) -> bool:
        """
        Initialize the Pyannote diarization pipeline and the Whisper model.

        Args:
            auth_token: Hugging Face token passed to `Pipeline.from_pretrained`.

        Returns:
            True if both models loaded, False otherwise (the error is logged
            along with the model-condition pages that must be accepted).
        """
        try:
            # Load Whisper model (set download root to avoid clutter in home directory)
            self.whisper_model = whisper.load_model(
                self.config.whisper_model,
                device=self.config.device,
                download_root=str(self.config.output_directory / "models"),
            )

            # Load Pyannote diarization pipeline
            self.diarization_pipeline = Pipeline.from_pretrained(
                self.config.diarization_model, use_auth_token=auth_token
            )
            self.diarization_pipeline.to(torch.device(self.config.device))

            if self.config.device == "cpu":
                warnings.warn("Running on CPU. GPU is recommended for better performance.")

            return True
        except Exception as e:
            logger.error("Model initialization failed: %s", e)
            logger.error("Please ensure you have accepted the model conditions at:")
            logger.error("  1. https://huggingface.co/pyannote/segmentation-3.0")
            logger.error("  2. https://huggingface.co/pyannote/speaker-diarization-3.1")
            return False

    def _update_resources(self, bar):
        """
        Continuously update progress bar text with CPU/MEM/GPU usage, until self._running is False.

        Intended to run in a background thread; polls roughly every 0.5 seconds.
        Errors are logged and polling continues.
        """
        import time

        while self._running:
            try:
                time.sleep(0.5)

                cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
                memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0

                # Query the GPU list once per tick; only the first GPU is reported.
                gpus = GPUtil.getGPUs() if HAVE_PROGRESS_SUPPORT else []
                if gpus:
                    gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
                    gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
                    gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
                else:
                    gpu_usage_text = "N/A"

                resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
                bar.text(resource_text)
            except Exception as e:
                logger.error("Resource monitoring error: %s", e)

    def _transcribe_segments(self, audio_path, segments, audio_processor, out, on_segment_done=None):
        """
        Transcribe each diarized segment and append one line per segment to `out`.

        Args:
            audio_path: Source audio file the segments are cut from.
            segments: Iterable of (turn, track, speaker) tuples from diarization.
            audio_processor: Object providing `load_audio_segment`.
            out: Open text file receiving the transcript lines.
            on_segment_done: Optional zero-argument callable invoked once per
                segment (used to advance the progress bar).
        """
        for turn, _, speaker in segments:
            segment_path = (
                self.config.temp_directory
                / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
            )
            try:
                if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
                    transcription = self.whisper_model.transcribe(str(segment_path))["text"]

                    line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
                    out.write(line)
                    logger.info(line.strip())
            finally:
                # Always remove the temporary segment, even if transcription raised.
                segment_path.unlink(missing_ok=True)

            # Advance once per segment so the bar reaches its declared total.
            if on_segment_done is not None:
                on_segment_done()

    def process_file(self, audio_path: Path) -> bool:
        """
        Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback.

        The transcript is written to `transcript_<timestamp>.txt` inside the
        configured output directory.

        Args:
            audio_path: Audio file to process.

        Returns:
            True on success, False if any step raised (the error is logged).
        """
        try:
            logger.info("Starting audio processing...")
            diarization = self.diarization_pipeline(str(audio_path))
            segments = list(diarization.itertracks(yield_label=True))
            total_segments = len(segments)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
            audio_processor = AudioProcessor(self.config)

            if not HAVE_PROGRESS_SUPPORT:
                # No alive_progress, psutil, or GPUtil installed
                logger.info("Processing audio without progress bar (missing optional packages).")
                with output_file.open("w", encoding="utf-8") as f:
                    self._transcribe_segments(audio_path, segments, audio_processor, f)
            else:
                # Use a progress bar to track segment transcription
                from alive_progress import alive_bar
                import threading

                with output_file.open("w", encoding="utf-8") as f, alive_bar(
                    total_segments,
                    title="Transcribing Audio",
                    spinner="pulse",
                    theme="classic",
                    stats=False,
                    elapsed=True,
                    monitor=True,
                ) as bar:

                    # Start a background thread for resource monitoring
                    resource_thread = threading.Thread(target=self._update_resources, args=(bar,))
                    self._running = True
                    resource_thread.start()
                    try:
                        self._transcribe_segments(
                            audio_path, segments, audio_processor, f, on_segment_done=bar
                        )
                    finally:
                        # Stop resource monitoring even when transcription fails;
                        # otherwise the monitor thread would keep running and
                        # keep the process alive. Join while the bar is still open.
                        self._running = False
                        resource_thread.join()

            logger.info("Transcription completed. Output saved to: %s", output_file)
            return True

        except Exception as e:
            logger.error("Processing failed: %s", e)
            return False
443
-
444
-
445
def get_token(token_manager: TokenManager) -> Optional[str]:
    """
    Obtain a Hugging Face token interactively.

    If the token manager already holds a token, the user is asked whether to
    reuse it. Otherwise, or if they decline, the user is prompted to enter a
    token and may choose to have it stored.

    Args:
        token_manager: Object providing `retrieve_token()` and `store_token()`.

    Returns:
        The chosen token, or None if the user entered nothing.
    """
    saved = token_manager.retrieve_token()
    if saved:
        reuse = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
        if reuse == "y":
            return saved

    for message in (
        "\nA HuggingFace token is required for speaker diarization.",
        "Get your token at: https://huggingface.co/settings/tokens",
        "\nEnsure you have accepted:",
        "  1. pyannote/segmentation-3.0 conditions",
        "  2. pyannote/speaker-diarization-3.1 conditions",
    ):
        print(message)

    entered = input("\nEnter HuggingFace token: ").strip()
    if not entered:
        return None

    wants_save = input("Save token for future use? (y/n): ").lower().strip() == "y"
    if wants_save:
        saved_ok = token_manager.store_token(entered)
        print(
            "Token saved successfully."
            if saved_ok
            else "Failed to save token. It will be used for this session only."
        )
    return entered
470
-
471
-
472
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the transcription CLI."""
    parser = argparse.ArgumentParser(
        description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
    )
    parser.add_argument(
        "--audio",
        type=Path,
        help="Path to the audio file to transcribe."
    )
    parser.add_argument(
        "--token",
        help="HuggingFace API token. Overrides any saved token."
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Path to the output directory for transcripts and temporary files.",
    )
    parser.add_argument(
        "--delete-token",
        action="store_true",
        help="Delete any stored Hugging Face token and exit.",
    )
    parser.add_argument(
        "--show-warnings",
        action="store_true",
        help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
    )
    parser.add_argument(
        "--whisper-model",
        default="base.en",
        help="Specify the Whisper model to use (default: 'base.en').",
    )
    return parser


def main():
    """
    Command-line entry point.

    Parses arguments, verifies dependencies, resolves the Hugging Face token,
    loads the models, then transcribes one audio file. Exits with status 1 on
    any failure. With ``--delete-token`` it only deletes the stored token and
    exits.
    """
    args = _build_arg_parser().parse_args()

    # Manage user warnings
    if not args.show_warnings:
        warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
        warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
    else:
        warnings.resetwarnings()

    # Check dependencies
    if not DependencyManager.verify_dependencies():
        sys.exit(1)

    # Initialize tab-completion for file paths (Unix-like only, or with pyreadline on Windows)
    readline.set_completer_delims(' \t\n;')
    readline.set_completer(complete_path)
    readline.parse_and_bind("tab: complete")

    # Initialize the token manager
    token_manager = TokenManager()

    # If user wants to delete the stored token, do so and exit
    if args.delete_token:
        success = token_manager.delete_token()
        sys.exit(0 if success else 1)

    # Prepare configuration
    output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
    config = TranscriptionConfig(
        output_directory=output_dir,
        whisper_model=args.whisper_model
    )

    # Initialize pipeline
    pipeline = TranscriptionPipeline(config)
    hf_token = args.token or get_token(token_manager)
    if not hf_token:
        logger.error("No Hugging Face token provided. Exiting.")
        sys.exit(1)

    # Initialize models
    if not pipeline.initialize_models(hf_token):
        logger.error("Failed to initialize pipeline. Exiting.")
        sys.exit(1)

    # Prompt user for audio file path if not passed in.
    # `is_file()` is used instead of `exists()`: an empty answer becomes
    # Path("") == Path("."), which exists, so a bare Enter (or any directory)
    # would otherwise be accepted as the audio file.
    audio_path = args.audio
    if audio_path is not None and not audio_path.is_file():
        print(f"File '{audio_path}' not found. Please try again.")
    while audio_path is None or not audio_path.is_file():
        audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
        if not audio_path_str:
            # Nothing entered; ask again.
            continue
        audio_path = Path(audio_path_str)
        if not audio_path.is_file():
            print(f"File '{audio_path}' not found. Please try again.")

    print("Audio file path accepted. Preparing to process the audio...")
    sys.stdout.flush()

    # Process the audio file
    if not pipeline.process_file(audio_path):
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -1,7 +0,0 @@
1
- audio_scribe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- audio_scribe/cli.py,sha256=dabh7fe9wAEORwVIBd-V8FAzHBBzbkjnfMSR-wOywO8,20286
3
- audio_scribe-0.1.0.dist-info/METADATA,sha256=BBx81TI9DPCYgsdKyBn2PWEJ9pJsnhqTUb8ZsWoS1Ps,9503
4
- audio_scribe-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- audio_scribe-0.1.0.dist-info/entry_points.txt,sha256=eaO9r_zAFnrWseKyJcBpGUHQq-P7NXBw5er8sZaPfFU,55
6
- audio_scribe-0.1.0.dist-info/top_level.txt,sha256=L1mltKt-5HrbTXPpAXwht8SXQCgcCceoqpCq4OCZRsk,13
7
- audio_scribe-0.1.0.dist-info/RECORD,,
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- audio-scribe = audio_scribe.cli:main