audio_scribe-0.1.0-py3-none-any.whl → audio_scribe-0.1.2-py3-none-any.whl

audio_scribe/__init__.py CHANGED
@@ -0,0 +1,24 @@
+ """
+ Audio Scribe
+ -----------------
+ A Python package for transcribing audio files with speaker diarization
+ using Whisper and Pyannote.
+ """
+
+ from .transcriber import main
+ from .models import TranscriptionPipeline, AudioProcessor
+ from .config import TranscriptionConfig
+ from .auth import TokenManager
+ from .utils import DependencyManager, complete_path
+
+ __version__ = "0.1.2"
+
+ __all__ = [
+     "main",
+     "TranscriptionPipeline",
+     "TranscriptionConfig",
+     "AudioProcessor",
+     "TokenManager",
+     "DependencyManager",
+     "complete_path",
+ ]
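
A quick orientation sketch (not part of the diff): after `pip install audio-scribe==0.1.2`, the names re-exported above are importable from the package root.

```python
# Minimal sketch: inspect the public surface re-exported by __init__.py.
import audio_scribe

print(audio_scribe.__version__)  # "0.1.2"
print(audio_scribe.__all__)      # names re-exported from the new submodules
```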
audio_scribe/auth.py ADDED
@@ -0,0 +1,119 @@
+ """Authentication and token management for Audio Scribe."""
+
+ import os
+ import json
+ import base64
+ import logging
+ from pathlib import Path
+ from typing import Optional
+ from cryptography.fernet import Fernet
+ from cryptography.hazmat.primitives import hashes
+ from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
+
+ logger = logging.getLogger(__name__)
+
+ class TokenManager:
+     """Handles secure storage and retrieval of the Hugging Face authentication token."""
+     def __init__(self):
+         # Store config in ~/.pyannote/config.json
+         self.config_dir = Path.home() / ".pyannote"
+         self.config_file = self.config_dir / "config.json"
+         self._initialize_config()
+
+     def _initialize_config(self) -> None:
+         """Initialize configuration directory and file with secure permissions."""
+         self.config_dir.mkdir(exist_ok=True)
+         if not self.config_file.exists():
+             self._save_config({})
+
+         # Set secure file and directory permissions on POSIX systems
+         if os.name == "posix":
+             os.chmod(self.config_dir, 0o700)
+             os.chmod(self.config_file, 0o600)
+
+     def _get_encryption_key(self) -> bytes:
+         """Generate an encryption key from system-specific data."""
+         salt = b"pyannote-audio-salt"
+         kdf = PBKDF2HMAC(
+             algorithm=hashes.SHA256(),
+             length=32,
+             salt=salt,
+             iterations=100000,
+         )
+         key = kdf.derive(str(Path.home()).encode())
+         return base64.urlsafe_b64encode(key)
+
+     def _save_config(self, config: dict) -> None:
+         """Securely save configuration to file."""
+         with open(self.config_file, "w", encoding="utf-8") as f:
+             json.dump(config, f)
+
+     def _load_config(self) -> dict:
+         """Load configuration from file."""
+         try:
+             with open(self.config_file, "r", encoding="utf-8") as f:
+                 return json.load(f)
+         except Exception:
+             return {}
+
+     def store_token(self, token: str) -> bool:
+         """Securely store authentication token."""
+         try:
+             fernet = Fernet(self._get_encryption_key())
+             encrypted_token = fernet.encrypt(token.encode())
+
+             config = self._load_config()
+             config["token"] = encrypted_token.decode()
+
+             self._save_config(config)
+             return True
+         except Exception as e:
+             logger.error(f"Failed to store token: {e}")
+             return False
+
+     def retrieve_token(self) -> Optional[str]:
+         """Retrieve stored authentication token."""
+         try:
+             config = self._load_config()
+             if "token" in config:
+                 fernet = Fernet(self._get_encryption_key())
+                 return fernet.decrypt(config["token"].encode()).decode()
+         except Exception as e:
+             logger.error(f"Failed to retrieve token: {e}")
+         return None
+
+     def delete_token(self) -> bool:
+         """Delete stored authentication token."""
+         try:
+             config = self._load_config()
+             if "token" in config:
+                 del config["token"]
+                 self._save_config(config)
+             return True
+         except Exception as e:
+             logger.error(f"Failed to delete token: {e}")
+             return False
+
+ def get_token(token_manager: TokenManager) -> Optional[str]:
+     """Get authentication token from storage or user input."""
+     stored_token = token_manager.retrieve_token()
+     if stored_token:
+         choice = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
+         if choice == "y":
+             return stored_token
+
+     print("\nA HuggingFace token is required for speaker diarization.")
+     print("Get your token at: https://huggingface.co/settings/tokens")
+     print("\nEnsure you have accepted:")
+     print(" 1. pyannote/segmentation-3.0 conditions")
+     print(" 2. pyannote/speaker-diarization-3.1 conditions")
+
+     token = input("\nEnter HuggingFace token: ").strip()
+     if token:
+         choice = input("Save token for future use? (y/n): ").lower().strip()
+         if choice == "y":
+             if token_manager.store_token(token):
+                 print("Token saved successfully.")
+             else:
+                 print("Failed to save token. It will be used for this session only.")
+     return token if token else None
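
To illustrate the round trip implemented above, here is a minimal sketch; the token value is a placeholder. Note that the Fernet key is derived via PBKDF2-SHA256 from the user's home directory path, so a stored token only decrypts for the same user on the same machine.

```python
# Minimal sketch of the TokenManager round trip; "hf_example" is a placeholder.
from audio_scribe.auth import TokenManager

tm = TokenManager()                    # ensures ~/.pyannote/config.json exists (0600 on POSIX)
tm.store_token("hf_example")           # encrypted with a key derived from Path.home()
assert tm.retrieve_token() == "hf_example"
tm.delete_token()                      # drops the "token" entry again
```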
audio_scribe/config.py ADDED
@@ -0,0 +1,24 @@
+ """Configuration management for Audio Scribe."""
+
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Optional
+ import torch
+
+ @dataclass
+ class TranscriptionConfig:
+     """Configuration settings for the transcription pipeline."""
+     output_directory: Path
+     whisper_model: str = "base.en"
+     diarization_model: str = "pyannote/speaker-diarization-3.1"
+     temp_directory: Optional[Path] = None
+     device: Optional[str] = None
+
+     def __post_init__(self):
+         # Use CUDA if available, else fall back to CPU
+         self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
+         # Default temp directory inside the output directory
+         self.temp_directory = self.temp_directory or (self.output_directory / "temp")
+         # Ensure directories exist
+         self.temp_directory.mkdir(parents=True, exist_ok=True)
+         self.output_directory.mkdir(parents=True, exist_ok=True)
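
For reference, a short sketch of what `__post_init__` resolves; the output path is illustrative.

```python
from pathlib import Path

from audio_scribe.config import TranscriptionConfig

cfg = TranscriptionConfig(output_directory=Path("transcripts/demo"))  # illustrative path
print(cfg.device)          # "cuda" if torch.cuda.is_available() else "cpu"
print(cfg.temp_directory)  # transcripts/demo/temp, created on construction
```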
audio_scribe/models.py ADDED
@@ -0,0 +1,196 @@
+ """Model handling and audio processing for Audio Scribe."""
+
+ import wave
+ import torch
+ import whisper
+ import logging
+ import warnings
+ import threading
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Optional
+ from pyannote.audio import Pipeline
+
+ from .config import TranscriptionConfig
+ from .auth import TokenManager
+
+ logger = logging.getLogger(__name__)
+
+ try:
+     from alive_progress import alive_bar
+     import psutil
+     import GPUtil
+     HAVE_PROGRESS_SUPPORT = True
+ except ImportError:
+     HAVE_PROGRESS_SUPPORT = False
+
+
+ class AudioProcessor:
+     """Handles audio file processing and segmentation."""
+
+     def __init__(self, config: TranscriptionConfig):
+         self.config = config
+
+     def load_audio_segment(
+         self,
+         audio_path: Path,
+         start_time: float,
+         end_time: float,
+         output_path: Path,
+     ) -> bool:
+         """Extract and save the audio segment from start_time to end_time."""
+         try:
+             with wave.open(str(audio_path), "rb") as infile:
+                 params = infile.getparams()
+                 frame_rate = params.framerate
+                 start_frame = int(start_time * frame_rate)
+                 end_frame = min(int(end_time * frame_rate), infile.getnframes())
+
+                 infile.setpos(start_frame)
+                 frames = infile.readframes(end_frame - start_frame)
+
+                 with wave.open(str(output_path), "wb") as outfile:
+                     outfile.setparams(params)
+                     outfile.writeframes(frames)
+             return True
+         except Exception as e:
+             logger.error(f"Failed to process audio segment: {e}")
+             return False
+
+
+ class TranscriptionPipeline:
+     """Main pipeline for audio transcription and speaker diarization."""
+
+     def __init__(self, config: TranscriptionConfig):
+         self.config = config
+         self.diarization_pipeline = None
+         self.whisper_model = None
+         self.token_manager = TokenManager()
+         self._running = False  # used for resource monitor thread
+
+     def initialize_models(self, auth_token: str) -> bool:
+         """Initialize the Pyannote diarization pipeline and Whisper model."""
+         try:
+             # Load Whisper model
+             self.whisper_model = whisper.load_model(
+                 self.config.whisper_model,
+                 device=self.config.device,
+                 download_root=str(self.config.output_directory / "models"),
+             )
+
+             # Load Pyannote diarization pipeline
+             self.diarization_pipeline = Pipeline.from_pretrained(
+                 self.config.diarization_model,
+                 use_auth_token=auth_token
+             )
+             self.diarization_pipeline.to(torch.device(self.config.device))
+
+             if self.config.device == "cpu":
+                 warnings.warn("Running on CPU. GPU is recommended for better performance.")
+
+             return True
+         except Exception as e:
+             logger.error(f"Model initialization failed: {e}")
+             logger.error("Please ensure you have accepted the model conditions at:")
+             logger.error(" 1. https://huggingface.co/pyannote/segmentation-3.0")
+             logger.error(" 2. https://huggingface.co/pyannote/speaker-diarization-3.1")
+             return False
+
+     def _update_resources(self, bar):
+         """Update progress bar with resource usage information."""
+         while self._running:
+             try:
+                 import time
+                 time.sleep(0.5)
+
+                 cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
+                 memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0
+
+                 if HAVE_PROGRESS_SUPPORT and GPUtil.getGPUs():
+                     gpus = GPUtil.getGPUs()
+                     gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
+                     gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
+                     gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
+                 else:
+                     gpu_usage_text = "N/A"
+
+                 resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
+                 bar.text(resource_text)
+             except Exception as e:
+                 logger.error(f"Resource monitoring error: {e}")
+
+     def process_file(self, audio_path: Path) -> bool:
+         """Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback."""
+         try:
+             logger.info("Starting audio processing...")
+             diarization = self.diarization_pipeline(str(audio_path))
+             segments = list(diarization.itertracks(yield_label=True))
+             total_segments = len(segments)
+
+             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+             output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
+             audio_processor = AudioProcessor(self.config)
+
+             if not HAVE_PROGRESS_SUPPORT:
+                 # No alive_progress, psutil, or GPUtil installed
+                 logger.info("Processing audio without progress bar (missing optional packages).")
+                 with output_file.open("w", encoding="utf-8") as f:
+                     for turn, _, speaker in segments:
+                         segment_path = (
+                             self.config.temp_directory
+                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
+                         )
+                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
+                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
+                             segment_path.unlink(missing_ok=True)
+
+                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
+                             f.write(line)
+                             logger.info(line.strip())
+                 return True
+             else:
+                 # Use a progress bar to track segment transcription
+                 from alive_progress import alive_bar
+                 import threading
+
+                 self._running = True
+                 with output_file.open("w", encoding="utf-8") as f, alive_bar(
+                     total_segments,
+                     title="Transcribing Audio",
+                     spinner="pulse",
+                     theme="classic",
+                     stats=False,
+                     elapsed=True,
+                     monitor=True,
+                 ) as bar:
+
+                     # Start a background thread for resource monitoring
+                     resource_thread = threading.Thread(target=self._update_resources, args=(bar,))
+                     resource_thread.start()
+
+                     for turn, _, speaker in segments:
+                         segment_path = (
+                             self.config.temp_directory
+                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
+                         )
+                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
+                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
+                             segment_path.unlink(missing_ok=True)
+
+                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
+                             f.write(line)
+                             logger.info(line.strip())
+
+                         # Update the progress bar
+                         bar()
+
+                     # Stop resource monitoring
+                     self._running = False
+                     resource_thread.join()
+
+             logger.info(f"Transcription completed. Output saved to: {output_file}")
+             return True
+
+         except Exception as e:
+             logger.error(f"Processing failed: {e}")
+             return False
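
End to end, the two classes above compose as sketched below; `meeting.wav` and the token value are placeholders, and the input must be a PCM WAV file since segmentation goes through the standard-library `wave` module.

```python
from pathlib import Path

from audio_scribe.config import TranscriptionConfig
from audio_scribe.models import TranscriptionPipeline

config = TranscriptionConfig(output_directory=Path("transcripts"))
pipeline = TranscriptionPipeline(config)

# "hf_..." stands in for a real Hugging Face token with access to the
# gated pyannote models; "meeting.wav" is an assumed input file.
if pipeline.initialize_models("hf_..."):
    pipeline.process_file(Path("meeting.wav"))  # writes transcript_<timestamp>.txt
```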
audio_scribe/transcriber.py ADDED
@@ -0,0 +1,131 @@
+ """
+ Main entry point for Audio Scribe transcription tool.
+ Handles CLI interface and orchestrates the transcription process.
+ """
+
+ import sys
+ import logging
+ import warnings
+ import argparse
+ import readline
+ from pathlib import Path
+ from datetime import datetime
+
+ from .config import TranscriptionConfig
+ from .models import TranscriptionPipeline
+ from .auth import TokenManager, get_token
+ from .utils import DependencyManager, complete_path
+
+ # Configure logging
+ LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
+ logging.basicConfig(
+     level=logging.INFO,
+     format=LOG_FORMAT,
+     handlers=[
+         logging.StreamHandler(),
+         logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
+     ],
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def main():
+     """Main entry point for the Audio Scribe CLI."""
+     print("Initializing environment... Please wait while we load dependencies and models.")
+     sys.stdout.flush()
+
+     parser = argparse.ArgumentParser(
+         description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
+     )
+     parser.add_argument(
+         "--audio",
+         type=Path,
+         help="Path to the audio file to transcribe."
+     )
+     parser.add_argument(
+         "--token",
+         help="HuggingFace API token. Overrides any saved token."
+     )
+     parser.add_argument(
+         "--output",
+         type=Path,
+         help="Path to the output directory for transcripts and temporary files.",
+     )
+     parser.add_argument(
+         "--delete-token",
+         action="store_true",
+         help="Delete any stored Hugging Face token and exit.",
+     )
+     parser.add_argument(
+         "--show-warnings",
+         action="store_true",
+         help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
+     )
+     parser.add_argument(
+         "--whisper-model",
+         default="base.en",
+         help="Specify the Whisper model to use (default: 'base.en').",
+     )
+     args = parser.parse_args()
+
+     # Manage user warnings
+     if not args.show_warnings:
+         warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
+         warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
+     else:
+         warnings.resetwarnings()
+
+     # Check dependencies
+     if not DependencyManager.verify_dependencies():
+         sys.exit(1)
+
+     # Initialize tab-completion for file paths
+     readline.set_completer_delims(' \t\n;')
+     readline.set_completer(complete_path)
+     readline.parse_and_bind("tab: complete")
+
+     # Initialize the token manager
+     token_manager = TokenManager()
+
+     # If user wants to delete the stored token, do so and exit
+     if args.delete_token:
+         success = token_manager.delete_token()
+         sys.exit(0 if success else 1)
+
+     # Prepare configuration
+     output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
+     config = TranscriptionConfig(
+         output_directory=output_dir,
+         whisper_model=args.whisper_model
+     )
+
+     # Initialize pipeline
+     pipeline = TranscriptionPipeline(config)
+     hf_token = args.token or get_token(token_manager)
+     if not hf_token:
+         logger.error("No Hugging Face token provided. Exiting.")
+         sys.exit(1)
+
+     # Initialize models
+     if not pipeline.initialize_models(hf_token):
+         logger.error("Failed to initialize pipeline. Exiting.")
+         sys.exit(1)
+
+     # Prompt user for audio file path if not passed in
+     audio_path = args.audio
+     while not audio_path or not audio_path.exists():
+         audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
+         audio_path = Path(audio_path_str)
+         if not audio_path.exists():
+             print(f"File '{audio_path}' not found. Please try again.")
+
+     print("Audio file path accepted. Preparing to process the audio...")
+     sys.stdout.flush()
+
+     # Process the audio file
+     if not pipeline.process_file(audio_path):
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
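
The console script `audio-scribe` (see entry_points.txt below) resolves to this `main`. Here is a hedged sketch of driving it programmatically, with illustrative argument values; note the module imports `readline` at top level, so on Windows a readline substitute such as pyreadline3 would be needed.

```python
import sys

from audio_scribe.transcriber import main

# Equivalent to the shell invocation:
#   audio-scribe --audio meeting.wav --output transcripts
sys.argv = ["audio-scribe", "--audio", "meeting.wav", "--output", "transcripts"]
main()  # argparse reads sys.argv[1:]
```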
audio_scribe/utils.py ADDED
@@ -0,0 +1,93 @@
+ """Utility functions and classes for Audio Scribe."""
+
+ import os
+ import glob
+ import logging
+ import importlib.metadata
+ from importlib.metadata import PackageNotFoundError
+
+ logger = logging.getLogger(__name__)
+
+ def complete_path(text, state):
+     """
+     Return the 'state'-th completion for 'text'.
+     This function will be used by 'readline' to enable file path autocompletion.
+     """
+     # If the user typed a glob pattern (with * or ?)
+     if '*' in text or '?' in text:
+         matches = sorted(glob.glob(text))
+     else:
+         # Split off the directory name and partial file/directory name
+         directory, partial = os.path.split(text)
+         if not directory:
+             directory = '.'
+         try:
+             # List everything in 'directory' that starts with 'partial'
+             entries = sorted(os.listdir(directory))
+         except OSError:
+             # If directory doesn't exist or we lack permission, no matches
+             entries = []
+
+         matches = []
+         for entry in entries:
+             if entry.startswith(partial):
+                 if directory == '.':
+                     # Don't prefix current directory paths
+                     full_path = entry
+                 else:
+                     # Keep the directory prefix for subdirectories
+                     full_path = os.path.join(directory, entry)
+
+                 # If it's a directory, add a trailing slash to indicate that
+                 if os.path.isdir(full_path) and not full_path.endswith(os.path.sep):
+                     full_path += os.path.sep
+                 matches.append(full_path)
+
+     # If 'state' is beyond last match, return None
+     return matches[state] if state < len(matches) else None
+
+
+ class DependencyManager:
+     """Manages and verifies system dependencies."""
+
+     REQUIRED_PACKAGES = {
+         "torch": None,
+         "pyannote.audio": None,
+         "openai-whisper": None,
+         "pytorch-lightning": None,
+         "keyring": None,
+     }
+
+     @classmethod
+     def verify_dependencies(cls) -> bool:
+         """
+         Verify all required dependencies are installed with correct versions.
+         Returns True if all are installed and correct, False otherwise.
+         """
+         missing = []
+         outdated = []
+
+         for package, required_version in cls.REQUIRED_PACKAGES.items():
+             try:
+                 installed_version = importlib.metadata.version(package)
+                 if required_version and installed_version != required_version:
+                     outdated.append(
+                         f"{package} (installed: {installed_version}, required: {required_version})"
+                     )
+             except PackageNotFoundError:
+                 missing.append(package)
+
+         if missing or outdated:
+             if missing:
+                 logger.error("Missing packages: %s", ", ".join(missing))
+             if outdated:
+                 logger.error("Outdated packages: %s", ", ".join(outdated))
+             logger.info(
+                 "Install required packages: pip install %s",
+                 " ".join(
+                     f"{pkg}=={ver}" if ver else pkg
+                     for pkg, ver in cls.REQUIRED_PACKAGES.items()
+                 ),
+             )
+             return False
+         return True
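
A small sketch of how these utilities are meant to be wired together, mirroring what `transcriber.main` does.

```python
import readline

from audio_scribe.utils import DependencyManager, complete_path

# Fail fast if torch, pyannote.audio, openai-whisper, etc. are not installed.
if DependencyManager.verify_dependencies():
    # Same readline wiring the CLI uses for path autocompletion.
    readline.set_completer_delims(" \t\n;")
    readline.set_completer(complete_path)
    readline.parse_and_bind("tab: complete")
    path = input("Audio file: ")  # Tab now completes filesystem paths
```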
{audio_scribe-0.1.0.dist-info → audio_scribe-0.1.2.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: audio_scribe
- Version: 0.1.0
+ Version: 0.1.2
  Summary: A command-line tool for audio transcription with Whisper and Pyannote.
  Home-page: https://gitlab.genomicops.cloud/genomicops/audio-scribe
  Author: Gurasis Osahan
@@ -46,9 +46,16 @@ Dynamic: summary

  # Audio Scribe

- **A Command-Line Tool for Audio Transcription (Audio Scribe) and Speaker Diarization Using OpenAI Whisper and Pyannote**
+ **A Command-Line Tool for Audio Transcription and Speaker Diarization Using OpenAI Whisper and Pyannote**

  [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
+ ![Coverage](https://img.shields.io/badge/coverage-94.3%25-brightgreen)
+ [![Pipeline Status](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/pipeline.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main)
+ [![PyPI Version](https://badge.fury.io/py/audio-scribe.svg)](https://badge.fury.io/py/audio-scribe)
+ [![Python Versions](https://img.shields.io/pypi/pyversions/audio-scribe)](https://pypi.org/project/audio-scribe/)
+ [![PyPI Downloads](https://img.shields.io/pypi/dm/audio-scribe)](https://pypi.org/project/audio-scribe/)
+ [![PyPI License](https://img.shields.io/pypi/l/audio-scribe)](https://pypi.org/project/audio-scribe/)
+ <!-- [![Coverage Report](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/coverage.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main) -->

  ## Overview

audio_scribe-0.1.2.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ audio_scribe/__init__.py,sha256=FH63cMQOjdp5Kp9JTAl-uaQbtEVh-lKneUJRKl5hEXc,543
+ audio_scribe/auth.py,sha256=YD9ElcMtFIMMYW26XZqMCzpYjOsmXkS5-TC2hTmCOEw,4351
+ audio_scribe/config.py,sha256=umD9-QBfi4e5RZG33lCOpdLBBbriG0LFyyDwvgHlSlQ,935
+ audio_scribe/models.py,sha256=Z5eJJf7rxq6k60fJMfVW98jwB9MDT7JxKBVvFmXZN-Q,7971
+ audio_scribe/transcriber.py,sha256=xMWt50QmNXeLhpTZhJlLtmJSzeOcRSWKtYRMJghjUnI,4026
+ audio_scribe/utils.py,sha256=iKt0ZZKF_Jmo7WNKJxldOHlwo__afEWuYWi_ckNd9gU,3278
+ audio_scribe-0.1.2.dist-info/METADATA,sha256=sKvgdqqlHZ9fqlwJu-sXCDdpqAFKVBsis8BHq0Q6yyU,10355
+ audio_scribe-0.1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ audio_scribe-0.1.2.dist-info/entry_points.txt,sha256=Bj7Co8Er22Ux59Vs2_S63ds2bnwDURvhHYNXVviZdPM,63
+ audio_scribe-0.1.2.dist-info/top_level.txt,sha256=L1mltKt-5HrbTXPpAXwht8SXQCgcCceoqpCq4OCZRsk,13
+ audio_scribe-0.1.2.dist-info/RECORD,,
audio_scribe-0.1.2.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ audio-scribe = audio_scribe.transcriber:main
audio_scribe/cli.py DELETED
@@ -1,567 +0,0 @@
- #!/usr/bin/env python3
- """
- Audio Scribe
- -----------------
- A command-line script for transcribing audio files with speaker diarization
- using Whisper and Pyannote. The script uses a Hugging Face token for
- downloading Pyannote speaker-diarization models and displays a progress bar
- with resource usage while transcribing.
- """
-
- print("Initializing environment... Please wait while we load dependencies and models.")
- import sys
- sys.stdout.flush()
-
- import os
- import glob
- import wave
- import json
- import logging
- import warnings
- import argparse
- import readline  # <--- For enabling tab-completion on Unix/Linux
- from pathlib import Path
- from datetime import datetime
- from typing import Optional, Dict
- from dataclasses import dataclass
- import base64
-
- from cryptography.fernet import Fernet
- from cryptography.hazmat.primitives import hashes
- from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
-
- import torch
- import whisper
-
- import importlib.metadata
- from importlib.metadata import PackageNotFoundError
-
- from pyannote.audio import Pipeline
-
- # Attempt to import optional packages for progress bar and resource monitoring
- try:
-     from alive_progress import alive_bar
-     import psutil
-     import GPUtil
-     HAVE_PROGRESS_SUPPORT = True
- except ImportError:
-     HAVE_PROGRESS_SUPPORT = False
-
- # Configure logging
- LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
- logging.basicConfig(
-     level=logging.INFO,
-     format=LOG_FORMAT,
-     handlers=[
-         logging.StreamHandler(),
-         logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
-     ],
- )
- logger = logging.getLogger(__name__)
-
- # ---------- FILE PATH TAB-COMPLETION SNIPPET ----------
- def complete_path(text, state):
-     """
-     Return the 'state'-th completion for 'text'.
-     This function will be used by 'readline' to enable file path autocompletion.
-     """
-     # If the user typed a glob pattern (with * or ?)
-     if '*' in text or '?' in text:
-         matches = glob.glob(text)
-     else:
-         # Split off the directory name and partial file/directory name
-         directory, partial = os.path.split(text)
-         if not directory:
-             directory = '.'
-         try:
-             # List everything in 'directory' that starts with 'partial'
-             entries = os.listdir(directory)
-         except OSError:
-             # If directory doesn't exist or we lack permission, no matches
-             entries = []
-
-         matches = []
-         for entry in entries:
-             if entry.startswith(partial):
-                 full_path = os.path.join(directory, entry)
-                 # If it's a directory, add a trailing slash to indicate that
-                 if os.path.isdir(full_path) and not full_path.endswith(os.path.sep):
-                     full_path += os.path.sep
-                 matches.append(full_path)
-
-     # Sort matches to have a consistent order
-     matches.sort()
-
-     # If 'state' is beyond last match, return None
-     return matches[state] if state < len(matches) else None
-
-
- @dataclass
- class TranscriptionConfig:
-     """
-     Configuration settings for the transcription pipeline.
-     """
-     output_directory: Path
-     whisper_model: str = "base.en"
-     diarization_model: str = "pyannote/speaker-diarization-3.1"
-     temp_directory: Optional[Path] = None
-     device: Optional[str] = None
-
-     def __post_init__(self):
-         # Use CUDA if available, else fall back to CPU
-         self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
-         # Default temp directory inside the output directory
-         self.temp_directory = self.temp_directory or (self.output_directory / "temp")
-         # Ensure directories exist
-         self.temp_directory.mkdir(parents=True, exist_ok=True)
-         self.output_directory.mkdir(parents=True, exist_ok=True)
-
-
- class TokenManager:
-     """
-     Handles secure storage and retrieval of the Hugging Face authentication token.
-     """
-     def __init__(self):
-         # Store config in ~/.pyannote/config.json
-         self.config_dir = Path.home() / ".pyannote"
-         self.config_file = self.config_dir / "config.json"
-         self._initialize_config()
-
-     def _initialize_config(self) -> None:
-         """
-         Initialize configuration directory and file with secure permissions.
-         """
-         self.config_dir.mkdir(exist_ok=True)
-         if not self.config_file.exists():
-             self._save_config({})
-
-         # Set secure file and directory permissions on POSIX systems
-         if os.name == "posix":
-             os.chmod(self.config_dir, 0o700)
-             os.chmod(self.config_file, 0o600)
-
-     def _get_encryption_key(self) -> bytes:
-         """
-         Generate an encryption key from system-specific data.
-         """
-         salt = b"pyannote-audio-salt"
-         kdf = PBKDF2HMAC(
-             algorithm=hashes.SHA256(),
-             length=32,
-             salt=salt,
-             iterations=100000,
-         )
-         key = kdf.derive(str(Path.home()).encode())
-         return base64.urlsafe_b64encode(key)
-
-     def _save_config(self, config: dict) -> None:
-         """
-         Securely save configuration to file.
-         """
-         with open(self.config_file, "w", encoding="utf-8") as f:
-             json.dump(config, f)
-
-     def _load_config(self) -> dict:
-         """
-         Load configuration from file.
-         """
-         try:
-             with open(self.config_file, "r", encoding="utf-8") as f:
-                 return json.load(f)
-         except Exception:
-             return {}
-
-     def store_token(self, token: str) -> bool:
-         """
-         Securely store authentication token.
-         """
-         try:
-             fernet = Fernet(self._get_encryption_key())
-             encrypted_token = fernet.encrypt(token.encode())
-
-             config = self._load_config()
-             config["token"] = encrypted_token.decode()
-
-             self._save_config(config)
-             return True
-         except Exception as e:
-             logger.error(f"Failed to store token: {e}")
-             return False
-
-     def retrieve_token(self) -> Optional[str]:
-         """
-         Retrieve stored authentication token.
-         """
-         try:
-             config = self._load_config()
-             if "token" in config:
-                 fernet = Fernet(self._get_encryption_key())
-                 return fernet.decrypt(config["token"].encode()).decode()
-         except Exception as e:
-             logger.error(f"Failed to retrieve token: {e}")
-         return None
-
-     def delete_token(self) -> bool:
-         """
-         Delete stored authentication token.
-         """
-         try:
-             config = self._load_config()
-             if "token" in config:
-                 del config["token"]
-                 self._save_config(config)
-             return True
-         except Exception as e:
-             logger.error(f"Failed to delete token: {e}")
-             return False
-
-
- class DependencyManager:
-     """
-     Manages and verifies system dependencies using importlib.metadata.
-     """
-     REQUIRED_PACKAGES = {
-         "torch": None,
-         "pyannote.audio": None,
-         "openai-whisper": None,
-         "pytorch-lightning": None,
-         "keyring": None,
-     }
-
-     @classmethod
-     def verify_dependencies(cls) -> bool:
-         """
-         Verify all required dependencies are installed with correct versions
-         (if specified). Returns True if all are installed and correct, False otherwise.
-         """
-         missing = []
-         outdated = []
-
-         for package, required_version in cls.REQUIRED_PACKAGES.items():
-             try:
-                 installed_version = importlib.metadata.version(package)
-                 if required_version and installed_version != required_version:
-                     outdated.append(
-                         f"{package} (installed: {installed_version}, required: {required_version})"
-                     )
-             except PackageNotFoundError:
-                 missing.append(package)
-
-         if missing or outdated:
-             if missing:
-                 logger.error("Missing packages: %s", ", ".join(missing))
-             if outdated:
-                 logger.error("Outdated packages: %s", ", ".join(outdated))
-             logger.info(
-                 "Install required packages: pip install %s",
-                 " ".join(
-                     f"{pkg}=={ver}" if ver else pkg
-                     for pkg, ver in cls.REQUIRED_PACKAGES.items()
-                 ),
-             )
-             return False
-         return True
-
-
- class AudioProcessor:
-     """
-     Handles audio file processing and segmentation using the `wave` module.
-     """
-     def __init__(self, config: TranscriptionConfig):
-         self.config = config
-
-     def load_audio_segment(
-         self,
-         audio_path: Path,
-         start_time: float,
-         end_time: float,
-         output_path: Path,
-     ) -> bool:
-         """
-         Extract and save the audio segment from `start_time` to `end_time`.
-         """
-         try:
-             with wave.open(str(audio_path), "rb") as infile:
-                 params = infile.getparams()
-                 frame_rate = params.framerate
-                 start_frame = int(start_time * frame_rate)
-                 end_frame = min(int(end_time * frame_rate), infile.getnframes())
-
-                 infile.setpos(start_frame)
-                 frames = infile.readframes(end_frame - start_frame)
-
-                 with wave.open(str(output_path), "wb") as outfile:
-                     outfile.setparams(params)
-                     outfile.writeframes(frames)
-             return True
-         except Exception as e:
-             logger.error(f"Failed to process audio segment: {e}")
-             return False
-
-
- class TranscriptionPipeline:
-     """
-     Main pipeline for audio transcription (Whisper) and speaker diarization (Pyannote).
-     """
-     def __init__(self, config: TranscriptionConfig):
-         self.config = config
-         self.diarization_pipeline = None
-         self.whisper_model = None
-         self.token_manager = TokenManager()
-         self._running = False  # used for resource monitor thread
-
-     def initialize_models(self, auth_token: str) -> bool:
-         """
-         Initialize the Pyannote diarization pipeline and the Whisper model.
-         """
-         try:
-             # Load Whisper model (set download root to avoid clutter in home directory)
-             self.whisper_model = whisper.load_model(
-                 self.config.whisper_model,
-                 device=self.config.device,
-                 download_root=str(self.config.output_directory / "models"),
-             )
-
-             # Load Pyannote diarization pipeline
-             self.diarization_pipeline = Pipeline.from_pretrained(
-                 self.config.diarization_model, use_auth_token=auth_token
-             )
-             self.diarization_pipeline.to(torch.device(self.config.device))
-
-             if self.config.device == "cpu":
-                 warnings.warn("Running on CPU. GPU is recommended for better performance.")
-
-             return True
-         except Exception as e:
-             logger.error(f"Model initialization failed: {e}")
-             logger.error("Please ensure you have accepted the model conditions at:")
-             logger.error(" 1. https://huggingface.co/pyannote/segmentation-3.0")
-             logger.error(" 2. https://huggingface.co/pyannote/speaker-diarization-3.1")
-             return False
-
-     def _update_resources(self, bar):
-         """
-         Continuously update progress bar text with CPU/MEM/GPU usage, until self._running is False.
-         """
-         while self._running:
-             try:
-                 import time
-                 time.sleep(0.5)
-
-                 cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
-                 memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0
-
-                 if HAVE_PROGRESS_SUPPORT and GPUtil.getGPUs():
-                     gpus = GPUtil.getGPUs()
-                     gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
-                     gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
-                     gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
-                 else:
-                     gpu_usage_text = "N/A"
-
-                 resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
-                 bar.text(resource_text)
-             except Exception as e:
-                 logger.error(f"Resource monitoring error: {e}")
-
-     def process_file(self, audio_path: Path) -> bool:
-         """
-         Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback.
-         """
-         try:
-             logger.info("Starting audio processing...")
-             diarization = self.diarization_pipeline(str(audio_path))
-             segments = list(diarization.itertracks(yield_label=True))
-             total_segments = len(segments)
-
-             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-             output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
-             audio_processor = AudioProcessor(self.config)
-
-             if not HAVE_PROGRESS_SUPPORT:
-                 # No alive_progress, psutil, or GPUtil installed
-                 logger.info("Processing audio without progress bar (missing optional packages).")
-                 with output_file.open("w", encoding="utf-8") as f:
-                     for turn, _, speaker in segments:
-                         segment_path = (
-                             self.config.temp_directory
-                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
-                         )
-                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
-                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
-                             segment_path.unlink(missing_ok=True)
-
-                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
-                             f.write(line)
-                             logger.info(line.strip())
-             else:
-                 # Use a progress bar to track segment transcription
-                 from alive_progress import alive_bar
-                 import threading
-
-                 self._running = True
-                 with output_file.open("w", encoding="utf-8") as f, alive_bar(
-                     total_segments,
-                     title="Transcribing Audio",
-                     spinner="pulse",
-                     theme="classic",
-                     stats=False,
-                     elapsed=True,
-                     monitor=True,
-                 ) as bar:
-
-                     # Start a background thread for resource monitoring
-                     resource_thread = threading.Thread(target=self._update_resources, args=(bar,))
-                     resource_thread.start()
-
-                     for turn, _, speaker in segments:
-                         segment_path = (
-                             self.config.temp_directory
-                             / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
-                         )
-                         if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
-                             transcription = self.whisper_model.transcribe(str(segment_path))["text"]
-                             segment_path.unlink(missing_ok=True)
-
-                             line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
-                             f.write(line)
-                             logger.info(line.strip())
-
-                         # Update the progress bar
-                         bar()
-
-                     # Stop resource monitoring
-                     self._running = False
-                     resource_thread.join()
-
-             logger.info(f"Transcription completed. Output saved to: {output_file}")
-             return True
-
-         except Exception as e:
-             logger.error(f"Processing failed: {e}")
-             return False
-
-
- def get_token(token_manager: TokenManager) -> Optional[str]:
-     """
-     Get authentication token from storage or user input.
-     """
-     stored_token = token_manager.retrieve_token()
-     if stored_token:
-         choice = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
-         if choice == "y":
-             return stored_token
-
-     print("\nA HuggingFace token is required for speaker diarization.")
-     print("Get your token at: https://huggingface.co/settings/tokens")
-     print("\nEnsure you have accepted:")
-     print(" 1. pyannote/segmentation-3.0 conditions")
-     print(" 2. pyannote/speaker-diarization-3.1 conditions")
-
-     token = input("\nEnter HuggingFace token: ").strip()
-     if token:
-         choice = input("Save token for future use? (y/n): ").lower().strip()
-         if choice == "y":
-             if token_manager.store_token(token):
-                 print("Token saved successfully.")
-             else:
-                 print("Failed to save token. It will be used for this session only.")
-     return token if token else None
-
-
- def main():
-     parser = argparse.ArgumentParser(
-         description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
-     )
-     parser.add_argument(
-         "--audio",
-         type=Path,
-         help="Path to the audio file to transcribe."
-     )
-     parser.add_argument(
-         "--token",
-         help="HuggingFace API token. Overrides any saved token."
-     )
-     parser.add_argument(
-         "--output",
-         type=Path,
-         help="Path to the output directory for transcripts and temporary files.",
-     )
-     parser.add_argument(
-         "--delete-token",
-         action="store_true",
-         help="Delete any stored Hugging Face token and exit.",
-     )
-     parser.add_argument(
-         "--show-warnings",
-         action="store_true",
-         help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
-     )
-     parser.add_argument(
-         "--whisper-model",
-         default="base.en",
-         help="Specify the Whisper model to use (default: 'base.en').",
-     )
-     args = parser.parse_args()
-
-     # Manage user warnings
-     if not args.show_warnings:
-         warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
-         warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
-     else:
-         warnings.resetwarnings()
-
-     # Check dependencies
-     if not DependencyManager.verify_dependencies():
-         sys.exit(1)
-
-     # Initialize tab-completion for file paths (Unix-like only, or with pyreadline on Windows)
-     readline.set_completer_delims(' \t\n;')
-     readline.set_completer(complete_path)
-     readline.parse_and_bind("tab: complete")
-
-     # Initialize the token manager
-     token_manager = TokenManager()
-
-     # If user wants to delete the stored token, do so and exit
-     if args.delete_token:
-         success = token_manager.delete_token()
-         sys.exit(0 if success else 1)
-
-     # Prepare configuration
-     output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
-     config = TranscriptionConfig(
-         output_directory=output_dir,
-         whisper_model=args.whisper_model
-     )
-
-     # Initialize pipeline
-     pipeline = TranscriptionPipeline(config)
-     hf_token = args.token or get_token(token_manager)
-     if not hf_token:
-         logger.error("No Hugging Face token provided. Exiting.")
-         sys.exit(1)
-
-     # Initialize models
-     if not pipeline.initialize_models(hf_token):
-         logger.error("Failed to initialize pipeline. Exiting.")
-         sys.exit(1)
-
-     # Prompt user for audio file path if not passed in
-     audio_path = args.audio
-     while not audio_path or not audio_path.exists():
-         audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
-         audio_path = Path(audio_path_str)
-         if not audio_path.exists():
-             print(f"File '{audio_path}' not found. Please try again.")
-
-     print("Audio file path accepted. Preparing to process the audio...")
-     sys.stdout.flush()
-
-     # Process the audio file
-     if not pipeline.process_file(audio_path):
-         sys.exit(1)
-
-
- if __name__ == "__main__":
-     main()
@@ -1,7 +0,0 @@
1
- audio_scribe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- audio_scribe/cli.py,sha256=dabh7fe9wAEORwVIBd-V8FAzHBBzbkjnfMSR-wOywO8,20286
3
- audio_scribe-0.1.0.dist-info/METADATA,sha256=BBx81TI9DPCYgsdKyBn2PWEJ9pJsnhqTUb8ZsWoS1Ps,9503
4
- audio_scribe-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- audio_scribe-0.1.0.dist-info/entry_points.txt,sha256=eaO9r_zAFnrWseKyJcBpGUHQq-P7NXBw5er8sZaPfFU,55
6
- audio_scribe-0.1.0.dist-info/top_level.txt,sha256=L1mltKt-5HrbTXPpAXwht8SXQCgcCceoqpCq4OCZRsk,13
7
- audio_scribe-0.1.0.dist-info/RECORD,,
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- audio-scribe = audio_scribe.cli:main