audio-scribe 0.1.1__tar.gz → 0.1.2__tar.gz
- {audio_scribe-0.1.1 → audio_scribe-0.1.2}/PKG-INFO +8 -13
- {audio_scribe-0.1.1 → audio_scribe-0.1.2}/README.md +7 -12
- {audio_scribe-0.1.1 → audio_scribe-0.1.2}/setup.py +5 -6
- audio_scribe-0.1.2/src/audio_scribe/__init__.py +24 -0
- audio_scribe-0.1.2/src/audio_scribe/auth.py +119 -0
- audio_scribe-0.1.2/src/audio_scribe/config.py +24 -0
- audio_scribe-0.1.2/src/audio_scribe/models.py +196 -0
- audio_scribe-0.1.2/src/audio_scribe/transcriber.py +131 -0
- audio_scribe-0.1.2/src/audio_scribe/utils.py +93 -0
- {audio_scribe-0.1.1 → audio_scribe-0.1.2/src}/audio_scribe.egg-info/PKG-INFO +8 -13
- audio_scribe-0.1.2/src/audio_scribe.egg-info/SOURCES.txt +19 -0
- audio_scribe-0.1.2/src/audio_scribe.egg-info/entry_points.txt +2 -0
- {audio_scribe-0.1.1 → audio_scribe-0.1.2/src}/audio_scribe.egg-info/top_level.txt +0 -1
- audio_scribe-0.1.2/tests/test_auth.py +92 -0
- audio_scribe-0.1.2/tests/test_config.py +47 -0
- audio_scribe-0.1.2/tests/test_models.py +350 -0
- audio_scribe-0.1.2/tests/test_transcriber.py +166 -0
- audio_scribe-0.1.2/tests/test_utils.py +83 -0
- audio_scribe-0.1.1/audio_scribe/__init__.py +0 -32
- audio_scribe-0.1.1/audio_scribe/cli.py +0 -567
- audio_scribe-0.1.1/audio_scribe.egg-info/SOURCES.txt +0 -12
- audio_scribe-0.1.1/audio_scribe.egg-info/entry_points.txt +0 -2
- audio_scribe-0.1.1/tests/__init__.py +0 -0
- audio_scribe-0.1.1/tests/test_audio_scribe_main.py +0 -468
- {audio_scribe-0.1.1 → audio_scribe-0.1.2}/setup.cfg +0 -0
- {audio_scribe-0.1.1 → audio_scribe-0.1.2/src}/audio_scribe.egg-info/dependency_links.txt +0 -0
- {audio_scribe-0.1.1 → audio_scribe-0.1.2/src}/audio_scribe.egg-info/requires.txt +0 -0
--- audio_scribe-0.1.1/PKG-INFO
+++ audio_scribe-0.1.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: audio_scribe
-Version: 0.1.1
+Version: 0.1.2
 Summary: A command-line tool for audio transcription with Whisper and Pyannote.
 Home-page: https://gitlab.genomicops.cloud/genomicops/audio-scribe
 Author: Gurasis Osahan
@@ -46,20 +46,16 @@ Dynamic: summary
 
 # Audio Scribe
 
-**A Command-Line Tool for Audio Transcription
+**A Command-Line Tool for Audio Transcription and Speaker Diarization Using OpenAI Whisper and Pyannote**
 
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
-
-
-
-# Current Working Badges
+![Coverage](https://img.shields.io/badge/coverage-94.3%25-brightgreen)
 [![Pipeline Status](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/pipeline.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main)
-[![PyPI
-[![
-[![PyPI
-[![PyPI
-[![Coverage Report](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/coverage.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main)
-![Coverage](https://img.shields.io/badge/coverage-{percentage}%25-{color})
+[![PyPI Version](https://badge.fury.io/py/audio-scribe.svg)](https://badge.fury.io/py/audio-scribe)
+[![Python Versions](https://img.shields.io/pypi/pyversions/audio-scribe)](https://pypi.org/project/audio-scribe/)
+[![PyPI Downloads](https://img.shields.io/pypi/dm/audio-scribe)](https://pypi.org/project/audio-scribe/)
+[![PyPI License](https://img.shields.io/pypi/l/audio-scribe)](https://pypi.org/project/audio-scribe/)
+<!-- [![Coverage Report](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/coverage.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main) -->
 
 ## Overview
 
@@ -78,7 +74,6 @@ This repository is licensed under the [Apache License 2.0](#license).
 ## Table of Contents
 
 - [Audio Scribe](#audio-scribe)
-- [Current Working Badges](#current-working-badges)
 - [Overview](#overview)
 - [Table of Contents](#table-of-contents)
 - [Features](#features)
--- audio_scribe-0.1.1/README.md
+++ audio_scribe-0.1.2/README.md
@@ -1,19 +1,15 @@
 # Audio Scribe
 
-**A Command-Line Tool for Audio Transcription
+**A Command-Line Tool for Audio Transcription and Speaker Diarization Using OpenAI Whisper and Pyannote**
 
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
-
-
-
-# Current Working Badges
+![Coverage](https://img.shields.io/badge/coverage-94.3%25-brightgreen)
 [![Pipeline Status](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/pipeline.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main)
-[![PyPI
-[![
-[![PyPI
-[![PyPI
-[![Coverage Report](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/coverage.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main)
-![Coverage](https://img.shields.io/badge/coverage-{percentage}%25-{color})
+[![PyPI Version](https://badge.fury.io/py/audio-scribe.svg)](https://badge.fury.io/py/audio-scribe)
+[![Python Versions](https://img.shields.io/pypi/pyversions/audio-scribe)](https://pypi.org/project/audio-scribe/)
+[![PyPI Downloads](https://img.shields.io/pypi/dm/audio-scribe)](https://pypi.org/project/audio-scribe/)
+[![PyPI License](https://img.shields.io/pypi/l/audio-scribe)](https://pypi.org/project/audio-scribe/)
+<!-- [![Coverage Report](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/badges/main/coverage.svg)](https://gitlab.genomicops.cloud/innovation-hub/audio-scribe/-/commits/main) -->
 
 ## Overview
 
@@ -32,7 +28,6 @@ This repository is licensed under the [Apache License 2.0](#license).
 ## Table of Contents
 
 - [Audio Scribe](#audio-scribe)
-- [Current Working Badges](#current-working-badges)
 - [Overview](#overview)
 - [Table of Contents](#table-of-contents)
 - [Features](#features)
--- audio_scribe-0.1.1/setup.py
+++ audio_scribe-0.1.2/setup.py
@@ -5,14 +5,15 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setuptools.setup(
     name="audio_scribe",
-    version="0.1.1",
+    version="0.1.2",
     author="Gurasis Osahan",
     author_email="contact@genomicops.com",
     description="A command-line tool for audio transcription with Whisper and Pyannote.",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://gitlab.genomicops.cloud/genomicops/audio-scribe",
-
+    package_dir={"": "src"},
+    packages=setuptools.find_packages(where="src"),
     python_requires=">=3.8",
     install_requires=[
         "torch",
@@ -26,9 +27,7 @@ setuptools.setup(
         "GPUtil",
     ],
     entry_points={
-        "console_scripts": [
-            "audio-scribe=audio_scribe.cli:main",
-        ]
+        "console_scripts": ["audio-scribe=audio_scribe.transcriber:main"]
     },
     keywords="whisper pyannote transcription audio diarization",
     license="Apache-2.0",
@@ -49,4 +48,4 @@ setuptools.setup(
         "Programming Language :: Python :: 3.10",
         "Operating System :: OS Independent",
     ],
-)
+)
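
Packaging note: 0.1.2 moves to a src layout and repoints the console script at `audio_scribe.transcriber:main` (0.1.1 used the now-removed `audio_scribe.cli:main`). A minimal sketch of the programmatic equivalent of the installed `audio-scribe` command; the file name is hypothetical:

```python
# run_cli.py -- hypothetical helper; calling main() directly is equivalent
# to running the installed `audio-scribe` console script.
from audio_scribe.transcriber import main

if __name__ == "__main__":
    main()  # argparse reads sys.argv; missing inputs are prompted for interactively
```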
--- /dev/null
+++ audio_scribe-0.1.2/src/audio_scribe/__init__.py
@@ -0,0 +1,24 @@
+"""
+Audio Scribe
+-----------------
+A Python package for transcribing audio files with speaker diarization
+using Whisper and Pyannote.
+"""
+
+from .transcriber import main
+from .models import TranscriptionPipeline, AudioProcessor
+from .config import TranscriptionConfig
+from .auth import TokenManager
+from .utils import DependencyManager, complete_path
+
+__version__ = "0.1.2"
+
+__all__ = [
+    "main",
+    "TranscriptionPipeline",
+    "TranscriptionConfig",
+    "AudioProcessor",
+    "TokenManager",
+    "DependencyManager",
+    "complete_path",
+]
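
The new package root re-exports the public API listed in `__all__`, so downstream code can import from `audio_scribe` directly. A minimal usage sketch, assuming the 0.1.2 package is installed:

```python
from pathlib import Path

import audio_scribe

print(audio_scribe.__version__)  # "0.1.2"

# TranscriptionConfig and TranscriptionPipeline are re-exported at the top level.
config = audio_scribe.TranscriptionConfig(output_directory=Path("transcripts"))
pipeline = audio_scribe.TranscriptionPipeline(config)
```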
--- /dev/null
+++ audio_scribe-0.1.2/src/audio_scribe/auth.py
@@ -0,0 +1,119 @@
+"""Authentication and token management for Audio Scribe."""
+
+import os
+import json
+import base64
+import logging
+from pathlib import Path
+from typing import Optional
+from cryptography.fernet import Fernet
+from cryptography.hazmat.primitives import hashes
+from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC
+
+logger = logging.getLogger(__name__)
+
+class TokenManager:
+    """Handles secure storage and retrieval of the Hugging Face authentication token."""
+    def __init__(self):
+        # Store config in ~/.pyannote/config.json
+        self.config_dir = Path.home() / ".pyannote"
+        self.config_file = self.config_dir / "config.json"
+        self._initialize_config()
+
+    def _initialize_config(self) -> None:
+        """Initialize configuration directory and file with secure permissions."""
+        self.config_dir.mkdir(exist_ok=True)
+        if not self.config_file.exists():
+            self._save_config({})
+
+        # Set secure file and directory permissions on POSIX systems
+        if os.name == "posix":
+            os.chmod(self.config_dir, 0o700)
+            os.chmod(self.config_file, 0o600)
+
+    def _get_encryption_key(self) -> bytes:
+        """Generate an encryption key from system-specific data."""
+        salt = b"pyannote-audio-salt"
+        kdf = PBKDF2HMAC(
+            algorithm=hashes.SHA256(),
+            length=32,
+            salt=salt,
+            iterations=100000,
+        )
+        key = kdf.derive(str(Path.home()).encode())
+        return base64.urlsafe_b64encode(key)
+
+    def _save_config(self, config: dict) -> None:
+        """Securely save configuration to file."""
+        with open(self.config_file, "w", encoding="utf-8") as f:
+            json.dump(config, f)
+
+    def _load_config(self) -> dict:
+        """Load configuration from file."""
+        try:
+            with open(self.config_file, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except Exception:
+            return {}
+
+    def store_token(self, token: str) -> bool:
+        """Securely store authentication token."""
+        try:
+            fernet = Fernet(self._get_encryption_key())
+            encrypted_token = fernet.encrypt(token.encode())
+
+            config = self._load_config()
+            config["token"] = encrypted_token.decode()
+
+            self._save_config(config)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to store token: {e}")
+            return False
+
+    def retrieve_token(self) -> Optional[str]:
+        """Retrieve stored authentication token."""
+        try:
+            config = self._load_config()
+            if "token" in config:
+                fernet = Fernet(self._get_encryption_key())
+                return fernet.decrypt(config["token"].encode()).decode()
+        except Exception as e:
+            logger.error(f"Failed to retrieve token: {e}")
+        return None
+
+    def delete_token(self) -> bool:
+        """Delete stored authentication token."""
+        try:
+            config = self._load_config()
+            if "token" in config:
+                del config["token"]
+                self._save_config(config)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to delete token: {e}")
+            return False
+
+def get_token(token_manager: TokenManager) -> Optional[str]:
+    """Get authentication token from storage or user input."""
+    stored_token = token_manager.retrieve_token()
+    if stored_token:
+        choice = input("\nUse the stored Hugging Face token? (y/n): ").lower().strip()
+        if choice == "y":
+            return stored_token
+
+    print("\nA HuggingFace token is required for speaker diarization.")
+    print("Get your token at: https://huggingface.co/settings/tokens")
+    print("\nEnsure you have accepted:")
+    print("  1. pyannote/segmentation-3.0 conditions")
+    print("  2. pyannote/speaker-diarization-3.1 conditions")
+
+    token = input("\nEnter HuggingFace token: ").strip()
+    if token:
+        choice = input("Save token for future use? (y/n): ").lower().strip()
+        if choice == "y":
+            if token_manager.store_token(token):
+                print("Token saved successfully.")
+            else:
+                print("Failed to save token. It will be used for this session only.")
+    return token if token else None
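
A sketch of the `TokenManager` round trip; the token value is a placeholder, not a real credential:

```python
from audio_scribe.auth import TokenManager

tm = TokenManager()               # ensures ~/.pyannote/config.json exists (0600 on POSIX)
tm.store_token("hf_placeholder")  # saved Fernet-encrypted
assert tm.retrieve_token() == "hf_placeholder"
tm.delete_token()
assert tm.retrieve_token() is None
```

Design note: the Fernet key is derived via PBKDF2 from `str(Path.home())` with a fixed salt, so the stored token is protected against casual disclosure but not against an attacker who can read the file and knows the derivation scheme.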
--- /dev/null
+++ audio_scribe-0.1.2/src/audio_scribe/config.py
@@ -0,0 +1,24 @@
+"""Configuration management for Audio Scribe."""
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+import torch
+
+@dataclass
+class TranscriptionConfig:
+    """Configuration settings for the transcription pipeline."""
+    output_directory: Path
+    whisper_model: str = "base.en"
+    diarization_model: str = "pyannote/speaker-diarization-3.1"
+    temp_directory: Optional[Path] = None
+    device: Optional[str] = None
+
+    def __post_init__(self):
+        # Use CUDA if available, else fall back to CPU
+        self.device = self.device or ("cuda" if torch.cuda.is_available() else "cpu")
+        # Default temp directory inside the output directory
+        self.temp_directory = self.temp_directory or (self.output_directory / "temp")
+        # Ensure directories exist
+        self.temp_directory.mkdir(parents=True, exist_ok=True)
+        self.output_directory.mkdir(parents=True, exist_ok=True)
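
Usage sketch: `output_directory` is the only required field; `__post_init__` selects the device and creates both directories as a side effect of construction.

```python
from pathlib import Path

from audio_scribe.config import TranscriptionConfig

config = TranscriptionConfig(output_directory=Path("transcripts/demo"))
print(config.device)          # "cuda" if torch.cuda.is_available() else "cpu"
print(config.temp_directory)  # transcripts/demo/temp, created on construction
```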
--- /dev/null
+++ audio_scribe-0.1.2/src/audio_scribe/models.py
@@ -0,0 +1,196 @@
+"""Model handling and audio processing for Audio Scribe."""
+
+import wave
+import torch
+import whisper
+import logging
+import warnings
+import threading
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+from pyannote.audio import Pipeline
+
+from .config import TranscriptionConfig
+from .auth import TokenManager
+
+logger = logging.getLogger(__name__)
+
+try:
+    from alive_progress import alive_bar
+    import psutil
+    import GPUtil
+    HAVE_PROGRESS_SUPPORT = True
+except ImportError:
+    HAVE_PROGRESS_SUPPORT = False
+
+
+class AudioProcessor:
+    """Handles audio file processing and segmentation."""
+
+    def __init__(self, config: TranscriptionConfig):
+        self.config = config
+
+    def load_audio_segment(
+        self,
+        audio_path: Path,
+        start_time: float,
+        end_time: float,
+        output_path: Path,
+    ) -> bool:
+        """Extract and save the audio segment from start_time to end_time."""
+        try:
+            with wave.open(str(audio_path), "rb") as infile:
+                params = infile.getparams()
+                frame_rate = params.framerate
+                start_frame = int(start_time * frame_rate)
+                end_frame = min(int(end_time * frame_rate), infile.getnframes())
+
+                infile.setpos(start_frame)
+                frames = infile.readframes(end_frame - start_frame)
+
+                with wave.open(str(output_path), "wb") as outfile:
+                    outfile.setparams(params)
+                    outfile.writeframes(frames)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to process audio segment: {e}")
+            return False
+
+
+class TranscriptionPipeline:
+    """Main pipeline for audio transcription and speaker diarization."""
+
+    def __init__(self, config: TranscriptionConfig):
+        self.config = config
+        self.diarization_pipeline = None
+        self.whisper_model = None
+        self.token_manager = TokenManager()
+        self._running = False  # used for resource monitor thread
+
+    def initialize_models(self, auth_token: str) -> bool:
+        """Initialize the Pyannote diarization pipeline and Whisper model."""
+        try:
+            # Load Whisper model
+            self.whisper_model = whisper.load_model(
+                self.config.whisper_model,
+                device=self.config.device,
+                download_root=str(self.config.output_directory / "models"),
+            )
+
+            # Load Pyannote diarization pipeline
+            self.diarization_pipeline = Pipeline.from_pretrained(
+                self.config.diarization_model,
+                use_auth_token=auth_token
+            )
+            self.diarization_pipeline.to(torch.device(self.config.device))
+
+            if self.config.device == "cpu":
+                warnings.warn("Running on CPU. GPU is recommended for better performance.")
+
+            return True
+        except Exception as e:
+            logger.error(f"Model initialization failed: {e}")
+            logger.error("Please ensure you have accepted the model conditions at:")
+            logger.error("  1. https://huggingface.co/pyannote/segmentation-3.0")
+            logger.error("  2. https://huggingface.co/pyannote/speaker-diarization-3.1")
+            return False
+
+    def _update_resources(self, bar):
+        """Update progress bar with resource usage information."""
+        while self._running:
+            try:
+                import time
+                time.sleep(0.5)
+
+                cpu_usage = psutil.cpu_percent(interval=None) if HAVE_PROGRESS_SUPPORT else 0
+                memory_usage = psutil.virtual_memory().percent if HAVE_PROGRESS_SUPPORT else 0
+
+                if HAVE_PROGRESS_SUPPORT and GPUtil.getGPUs():
+                    gpus = GPUtil.getGPUs()
+                    gpu_mem_used = f"{gpus[0].memoryUsed:.0f}"
+                    gpu_mem_total = f"{gpus[0].memoryTotal:.0f}"
+                    gpu_usage_text = f"{gpu_mem_used}/{gpu_mem_total} MB"
+                else:
+                    gpu_usage_text = "N/A"
+
+                resource_text = f"CPU: {cpu_usage}%, MEM: {memory_usage}%, GPU Mem: {gpu_usage_text}"
+                bar.text(resource_text)
+            except Exception as e:
+                logger.error(f"Resource monitoring error: {e}")
+
+    def process_file(self, audio_path: Path) -> bool:
+        """Diarize, segment, and transcribe using Whisper + Pyannote with progress feedback."""
+        try:
+            logger.info("Starting audio processing...")
+            diarization = self.diarization_pipeline(str(audio_path))
+            segments = list(diarization.itertracks(yield_label=True))
+            total_segments = len(segments)
+
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_file = self.config.output_directory / f"transcript_{timestamp}.txt"
+            audio_processor = AudioProcessor(self.config)
+
+            if not HAVE_PROGRESS_SUPPORT:
+                # No alive_progress, psutil, or GPUtil installed
+                logger.info("Processing audio without progress bar (missing optional packages).")
+                with output_file.open("w", encoding="utf-8") as f:
+                    for turn, _, speaker in segments:
+                        segment_path = (
+                            self.config.temp_directory
+                            / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
+                        )
+                        if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
+                            transcription = self.whisper_model.transcribe(str(segment_path))["text"]
+                            segment_path.unlink(missing_ok=True)
+
+                            line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
+                            f.write(line)
+                            logger.info(line.strip())
+                return True
+            else:
+                # Use a progress bar to track segment transcription
+                from alive_progress import alive_bar
+                import threading
+
+                self._running = True
+                with output_file.open("w", encoding="utf-8") as f, alive_bar(
+                    total_segments,
+                    title="Transcribing Audio",
+                    spinner="pulse",
+                    theme="classic",
+                    stats=False,
+                    elapsed=True,
+                    monitor=True,
+                ) as bar:
+
+                    # Start a background thread for resource monitoring
+                    resource_thread = threading.Thread(target=self._update_resources, args=(bar,))
+                    resource_thread.start()
+
+                    for turn, _, speaker in segments:
+                        segment_path = (
+                            self.config.temp_directory
+                            / f"segment_{speaker}_{turn.start:.2f}_{turn.end:.2f}.wav"
+                        )
+                        if audio_processor.load_audio_segment(audio_path, turn.start, turn.end, segment_path):
+                            transcription = self.whisper_model.transcribe(str(segment_path))["text"]
+                            segment_path.unlink(missing_ok=True)
+
+                            line = f"[{turn.start:.2f}s - {turn.end:.2f}s] Speaker {speaker}: {transcription.strip()}\n"
+                            f.write(line)
+                            logger.info(line.strip())
+
+                        # Update the progress bar
+                        bar()
+
+                    # Stop resource monitoring
+                    self._running = False
+                    resource_thread.join()
+
+            logger.info(f"Transcription completed. Output saved to: {output_file}")
+            return True
+
+        except Exception as e:
+            logger.error(f"Processing failed: {e}")
+            return False
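
End-to-end sketch of the two classes above; the token and file name are placeholders, and models are downloaded on first run:

```python
from pathlib import Path

from audio_scribe.config import TranscriptionConfig
from audio_scribe.models import TranscriptionPipeline

config = TranscriptionConfig(output_directory=Path("transcripts/meeting"))
pipeline = TranscriptionPipeline(config)

if pipeline.initialize_models(auth_token="hf_placeholder"):
    # Diarizes, cuts per-speaker segments, transcribes each with Whisper,
    # and writes transcript_<timestamp>.txt under the output directory.
    pipeline.process_file(Path("meeting.wav"))
```

Since `AudioProcessor` segments audio through the stdlib `wave` module, inputs must be uncompressed WAV files.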
--- /dev/null
+++ audio_scribe-0.1.2/src/audio_scribe/transcriber.py
@@ -0,0 +1,131 @@
+"""
+Main entry point for Audio Scribe transcription tool.
+Handles CLI interface and orchestrates the transcription process.
+"""
+
+import sys
+import logging
+import warnings
+import argparse
+import readline
+from pathlib import Path
+from datetime import datetime
+
+from .config import TranscriptionConfig
+from .models import TranscriptionPipeline
+from .auth import TokenManager, get_token
+from .utils import DependencyManager, complete_path
+
+# Configure logging
+LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
+logging.basicConfig(
+    level=logging.INFO,
+    format=LOG_FORMAT,
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler("transcription.log", mode="a", encoding="utf-8"),
+    ],
+)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Main entry point for the Audio Scribe CLI."""
+    print("Initializing environment... Please wait while we load dependencies and models.")
+    sys.stdout.flush()
+
+    parser = argparse.ArgumentParser(
+        description="Audio Transcription Pipeline using Whisper + Pyannote, with optional progress bar."
+    )
+    parser.add_argument(
+        "--audio",
+        type=Path,
+        help="Path to the audio file to transcribe."
+    )
+    parser.add_argument(
+        "--token",
+        help="HuggingFace API token. Overrides any saved token."
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        help="Path to the output directory for transcripts and temporary files.",
+    )
+    parser.add_argument(
+        "--delete-token",
+        action="store_true",
+        help="Delete any stored Hugging Face token and exit.",
+    )
+    parser.add_argument(
+        "--show-warnings",
+        action="store_true",
+        help="Enable user warnings (e.g., from pyannote.audio). Disabled by default.",
+    )
+    parser.add_argument(
+        "--whisper-model",
+        default="base.en",
+        help="Specify the Whisper model to use (default: 'base.en').",
+    )
+    args = parser.parse_args()
+
+    # Manage user warnings
+    if not args.show_warnings:
+        warnings.filterwarnings("ignore", category=UserWarning, module=r"pyannote\.audio")
+        warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
+    else:
+        warnings.resetwarnings()
+
+    # Check dependencies
+    if not DependencyManager.verify_dependencies():
+        sys.exit(1)
+
+    # Initialize tab-completion for file paths
+    readline.set_completer_delims(' \t\n;')
+    readline.set_completer(complete_path)
+    readline.parse_and_bind("tab: complete")
+
+    # Initialize the token manager
+    token_manager = TokenManager()
+
+    # If user wants to delete the stored token, do so and exit
+    if args.delete_token:
+        success = token_manager.delete_token()
+        sys.exit(0 if success else 1)
+
+    # Prepare configuration
+    output_dir = args.output or (Path("transcripts") / datetime.now().strftime("%Y%m%d"))
+    config = TranscriptionConfig(
+        output_directory=output_dir,
+        whisper_model=args.whisper_model
+    )
+
+    # Initialize pipeline
+    pipeline = TranscriptionPipeline(config)
+    hf_token = args.token or get_token(token_manager)
+    if not hf_token:
+        logger.error("No Hugging Face token provided. Exiting.")
+        sys.exit(1)
+
+    # Initialize models
+    if not pipeline.initialize_models(hf_token):
+        logger.error("Failed to initialize pipeline. Exiting.")
+        sys.exit(1)
+
+    # Prompt user for audio file path if not passed in
+    audio_path = args.audio
+    while not audio_path or not audio_path.exists():
+        audio_path_str = input("\nEnter path to audio file (Tab for autocomplete): ").strip()
+        audio_path = Path(audio_path_str)
+        if not audio_path.exists():
+            print(f"File '{audio_path}' not found. Please try again.")
+
+    print("Audio file path accepted. Preparing to process the audio...")
+    sys.stdout.flush()
+
+    # Process the audio file
+    if not pipeline.process_file(audio_path):
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
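
For completeness, a sketch of driving the CLI from Python by substituting `sys.argv` (paths are placeholders); this mirrors `audio-scribe --audio meeting.wav --output transcripts`:

```python
import sys

from audio_scribe.transcriber import main

sys.argv = [
    "audio-scribe",
    "--audio", "meeting.wav",       # placeholder input WAV
    "--output", "transcripts",
    "--whisper-model", "base.en",
]
main()
```

Note that the module imports `readline` at import time, which the standard library provides only on POSIX systems.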