audio2sub-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- audio2sub/__init__.py +113 -0
- audio2sub/audio.py +50 -0
- audio2sub/cli.py +141 -0
- audio2sub/transcribers/__init__.py +22 -0
- audio2sub/transcribers/base.py +286 -0
- audio2sub/transcribers/faster_whisper.py +65 -0
- audio2sub/transcribers/gemini.py +55 -0
- audio2sub/transcribers/whisper.py +65 -0
- audio2sub/vad.py +74 -0
- audio2sub-0.1.0.dist-info/METADATA +116 -0
- audio2sub-0.1.0.dist-info/RECORD +15 -0
- audio2sub-0.1.0.dist-info/WHEEL +5 -0
- audio2sub-0.1.0.dist-info/entry_points.txt +2 -0
- audio2sub-0.1.0.dist-info/licenses/LICENSE +21 -0
- audio2sub-0.1.0.dist-info/top_level.txt +1 -0
audio2sub/__init__.py
ADDED
@@ -0,0 +1,113 @@
"""Audio2Sub package: convert media to subtitles."""

from __future__ import annotations

import tempfile
from pathlib import Path
from typing import Callable, Iterable, List, Optional
from dataclasses import dataclass

import pysrt

__all__ = ["__version__", "transcribe", "segments_to_srt", "Segment", "Usage"]
__title__ = "audio2sub"
__description__ = "Transcribe media files to SRT subtitles."
__url__ = "https://github.com/Xavier-Lam/audio2sub"
__version__ = "0.1.0"
__author__ = "Xavier-Lam"
__author_email__ = "xavierlam7@hotmail.com"


ReporterCallback = Callable[[str, dict], None]


@dataclass
class Segment:
    index: int
    start: float
    end: float
    text: str = ""
    audio: Optional[Path] = None


from .audio import convert_media_to_wav, cut_wav_segment  # noqa: E402
from .transcribers.base import Base, Usage  # noqa: E402
from .vad import SileroVAD  # noqa: E402


def transcribe(
    input_media: str | Path,
    transcriber: Base,
    lang: Optional[str] = None,
    reporter: Optional[ReporterCallback] = None,
    stats: Optional[Usage | dict] = None,
    opts: Optional[dict] = None,
) -> List[Segment]:
    """Convert media to segments using Silero VAD and batch transcription."""

    input_media = Path(input_media)
    if not input_media.exists():
        raise FileNotFoundError(f"Input media not found: {input_media}")

    _output = lambda message: reporter and reporter("status", message=message)
    _progress = lambda name, current, total, **payload: reporter and reporter(
        "progress",
        name=name,
        current=current,
        total=total,
        **payload,
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        wav_path = Path(tmpdir) / "audio.wav"
        _output("Converting audio...")
        convert_media_to_wav(input_media, wav_path)

        vad = SileroVAD(sample_rate=16_000)
        _output("Running voice activity detection (VAD)...")
        segments = vad.detect_segments(wav_path)
        if not segments:
            raise RuntimeError("No speech detected by Silero VAD")
        total_segments = len(segments)
        _output(f"Detected {total_segments} speech segment(s).")
        _output("Cutting audio into clips...")

        # Attach indices and extract audio clips for each segment
        for idx, seg in enumerate(segments, start=1):
            seg.index = idx
            seg_path = Path(tmpdir) / f"segment_{idx}.wav"
            cut_wav_segment(wav_path, seg.start, seg.end, seg_path)
            seg.audio = seg_path

        _output("Starting transcription...")
        _progress("transcription", 0, total_segments, unit="seg")

        # Batch transcribe for potential backend optimizations (generator)
        transcribed_segments: List[Segment] = []
        completed = 0
        for seg in transcriber.batch_transcribe(
            segments, lang=lang, stats=stats, **(opts or {})
        ):
            if seg.text.strip():
                transcribed_segments.append(seg)
            completed += 1
            _progress("transcription", completed, total_segments, unit="seg")

        if len(transcribed_segments) == 0:
            raise RuntimeError("Transcription produced no subtitle lines.")

        _output("Transcription completed.")
        return transcribed_segments


def segments_to_srt(segments: Iterable[Segment]) -> pysrt.SubRipFile:
    srt = pysrt.SubRipFile()
    for seg in segments:
        item = pysrt.SubRipItem(
            index=seg.index,
            start=pysrt.SubRipTime(seconds=seg.start),
            end=pysrt.SubRipTime(seconds=seg.end),
            text=seg.text,
        )
        srt.append(item)
    return srt
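
Note: a minimal usage sketch of the public API defined above (not part of the package). It assumes the `faster_whisper` extra is installed, and `lecture.mp4` / `lecture.srt` are placeholder paths.

```python
from audio2sub import segments_to_srt, transcribe
from audio2sub.transcribers import FasterWhisper

# transcribe() converts the media to WAV, runs Silero VAD to find speech
# clips, and feeds them to the chosen backend, returning Segment objects.
segments = transcribe("lecture.mp4", FasterWhisper(model_name="turbo"), lang="en")

# segments_to_srt() packs the segments into a pysrt.SubRipFile for saving.
segments_to_srt(segments).save("lecture.srt")
```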
audio2sub/audio.py
ADDED
@@ -0,0 +1,50 @@
from __future__ import annotations  # needed for `str | Path` annotations on Python 3.9

from pathlib import Path

import ffmpeg


def convert_media_to_wav(
    input_path: str | Path,
    output_path: str | Path,
    sample_rate: int = 16_000,
    channels: int = 1,
    overwrite: bool = True,
):
    """Convert any media file to a WAV"""

    input_path = Path(input_path)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if not input_path.exists():
        raise FileNotFoundError(f"Input file does not exist: {input_path}")

    stream = ffmpeg.input(str(input_path)).output(
        str(output_path),
        ac=channels,
        ar=sample_rate,
        format="wav",
    )
    if overwrite:
        stream = stream.overwrite_output()
    else:
        stream = stream.global_args("-n")
    stream.run(quiet=True)


def cut_wav_segment(
    input_wav: str | Path,
    start: float,
    end: float,
    output_path: str | Path,
):
    """Cut a WAV segment using ffmpeg"""

    input_wav = Path(input_wav)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    stream = ffmpeg.input(str(input_wav), ss=start, to=end).output(
        str(output_path), acodec="copy"
    )
    stream.overwrite_output().run(quiet=True)
audio2sub/cli.py
ADDED
@@ -0,0 +1,141 @@
from __future__ import annotations

import argparse
import inspect
from pathlib import Path
import warnings
from typing import Dict, Type

import torch
from tqdm.auto import tqdm

from . import __version__, segments_to_srt, transcribe, transcribers
from .transcribers import Base


def _available_transcribers() -> Dict[str, Type[Base]]:
    return {
        obj.name: obj
        for _, obj in inspect.getmembers(transcribers, inspect.isclass)
        if issubclass(obj, Base) and not inspect.isabstract(obj)
    }


def _build_backend_parser(choices: list[str]) -> argparse.ArgumentParser:
    default = "faster_whisper"
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        "-t",
        "--transcriber",
        choices=choices,
        default=default,
        help=f"Transcription backend to use (default: {default})",
    )
    return parser


def build_parser(
    available: Dict[str, Type[Base]], args=None
) -> argparse.ArgumentParser:
    backend_parser = _build_backend_parser(choices=sorted(available.keys()))
    backend_args, _remaining = backend_parser.parse_known_args(args)

    parser = argparse.ArgumentParser(
        prog="audio2sub",
        description=(
            "Convert media files to SRT subtitles using FFmpeg, Silero VAD, "
            "and transcription backends."
        ),
        parents=[backend_parser],
    )

    parser.add_argument("input", help="Path to input media file (audio or video)")
    parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="Output SRT file path",
    )
    parser.add_argument(
        "--lang",
        default=None,
        help=(
            "Language code (e.g., en, es, fr). If omitted, backend may default to en. "
            "See https://github.com/openai/whisper/blob/main/whisper/tokenizer.py for "
            "a list of available languages."
        ),
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )

    available[backend_args.transcriber].contribute_to_cli(parser)
    return parser


def main() -> int:
    if not torch.cuda.is_available():
        warnings.warn(
            "CUDA is not available; performance may be degraded significantly. "
            "For more information, please refer to the README.md of the project."
        )

    available = _available_transcribers()
    parser = build_parser(available)
    args = parser.parse_args()
    backend = args.transcriber

    input_media = Path(args.input)
    output_srt = Path(args.output)

    bars: dict[str, tqdm] = {}

    def reporter(kind: str, **payload):
        if kind == "status":
            print(payload.get("message", ""))
        if kind == "progress":
            name = payload.pop("name")
            current = payload.pop("current", 0)
            total = payload.pop("total", 0)

            bar = bars.get(name)
            if bar is None:
                bar = tqdm(
                    total=total,
                    desc=name.capitalize(),
                    leave=True,
                    **payload,
                )
                bars[name] = bar
            bar.n = current
            bar.refresh()
            if current >= total:
                bar.close()
                bars.pop(name, None)

    stats = {}
    transcriber_cls = available[backend]
    transcriber = transcriber_cls.from_cli_args(args)
    batch_opts = transcriber_cls.opts_from_cli(args)

    segments = transcribe(
        input_media,
        transcriber,
        lang=args.lang,
        reporter=reporter,
        stats=stats,
        opts=batch_opts,
    )
    segments_to_srt(segments).save(str(output_srt))

    print("Stats:")
    for k, v in stats.items():
        print(f" {k}: {v}")
    print(f"SRT written to {output_srt}")
    return 0


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
audio2sub/transcribers/__init__.py
ADDED
@@ -0,0 +1,22 @@
from .base import (
    AIAPITranscriber,
    Base,
    MissingDependencyException,
    Usage,
)
from .whisper import Whisper
from .faster_whisper import FasterWhisper
from .gemini import Gemini
from audio2sub import Segment


__all__ = [
    "Base",
    "AIAPITranscriber",
    "Whisper",
    "FasterWhisper",
    "Gemini",
    "MissingDependencyException",
    "Segment",
    "Usage",
]
audio2sub/transcribers/base.py
ADDED
@@ -0,0 +1,286 @@
from __future__ import annotations

import json
import os
from abc import ABC, abstractmethod
import argparse
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

from audio2sub import Segment


@dataclass
class Usage:
    tokens_in: int = 0
    tokens_out: int = 0

    def export(self, stats: Optional[dict]) -> None:
        if stats is None:
            return
        stats["tokens_in"] = self.tokens_in
        stats["tokens_out"] = self.tokens_out


class Base(ABC):
    """Base class for transcription backends."""

    name: str = "base"

    @classmethod
    def contribute_to_cli(cls, parser: argparse.ArgumentParser) -> None:
        """Hook for CLI option registration."""

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "Base":
        """Instantiate transcriber from CLI args."""
        return cls()  # pragma: no cover - overridden when needed

    @classmethod
    def opts_from_cli(cls, args: argparse.Namespace) -> dict:
        """Extract transcriber-specific options from CLI args."""
        return {}

    @abstractmethod
    def transcribe(
        self,
        audio_path: str,
        lang: Optional[str] = None,
        stats: Optional[dict] = None,
    ) -> str:
        """Transcribe a single audio segment and return text."""
        raise NotImplementedError

    def batch_transcribe(
        self,
        segments: List[Segment],
        lang: Optional[str] = None,
        stats: Optional[dict] = None,
    ) -> Iterable[Segment]:
        """Transcribe a list of segments. Yields updated segments."""
        for seg in segments:
            if seg.audio is None:
                raise FileNotFoundError("Segment has no audio path set")
            text = self.transcribe(str(seg.audio), lang=lang, stats=stats)
            seg.text = text
            yield seg


class AIAPITranscriber(Base, ABC):
    """Base class for AI API transcribers"""

    base_prompt: str = (
        "You will receive multiple audio clips. Return a JSON array of objects "
        "with `index` and `text` fields in the same order. Transcribe each "
        "clip verbatim (no paraphrasing). Omit non-speech clips or return "
        "empty text."
    )

    default_model: str = ""
    default_chunk: int = 20

    api_key_env_var: Optional[str] = None

    def __init__(self, model="", api_key=None) -> None:
        self.model = model or self.default_model
        self.api_key = api_key

    @classmethod
    def contribute_to_cli(cls, parser: argparse.ArgumentParser) -> None:
        parser.add_argument(
            "--model",
            default=cls.default_model or None,
            help=(
                "Model name to use"
                + (f" (default: {cls.default_model})" if cls.default_model else "")
            ),
        )
        parser.add_argument(
            "--api-key",
            dest="api_key",
            required=False,
            help=(
                f"API key (optional; env {cls.api_key_env_var or 'API key env var'} "
                "is used if not provided)"
            ),
        )

        # Add batch transcription options
        parser.add_argument(
            "--chunk",
            type=int,
            default=cls.default_chunk,
            help=("Number of clips per API request " f"(default: {cls.default_chunk})"),
        )
        parser.add_argument(
            "--outline",
            dest="outline",
            required=False,
            help=("Context outline to guide transcription"),
        )
        parser.add_argument(
            "--prompt",
            dest="prompt",
            required=False,
            help=("Additional system prompt/instructions"),
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "AIAPITranscriber":
        return cls(model=args.model, api_key=args.api_key)

    @classmethod
    def opts_from_cli(cls, args: argparse.Namespace) -> dict:
        return {
            "chunk": args.chunk,
            "outline": args.outline,
            "prompt": args.prompt,
        }

    def transcribe(
        self, audio_path: str, lang: Optional[str] = None, stats: Optional[dict] = None
    ) -> str:
        segments_iter = self.batch_transcribe(
            [Segment(index=1, start=0, end=0, audio=Path(audio_path))],
            lang=lang,
            stats=stats,
        )
        segments = list(segments_iter)
        return segments[0].text if segments else ""

    def batch_transcribe(
        self,
        segments: List[Segment],
        lang: Optional[str] = None,
        stats: Optional[dict] = None,
        chunk: Optional[int] = None,
        outline: Optional[str] = None,
        prompt: Optional[str] = None,
    ) -> Iterable[Segment]:
        """Transcribe segments with shared chunking, prompt, and stats."""

        chunk_size = chunk if chunk and chunk > 0 else self.default_chunk
        prompt_cfg = self._build_prompt(lang=lang, outline=outline, prompt=prompt)
        client = self._ensure_client()
        usage_tracker = Usage()

        for batch in self._iter_chunks(segments, chunk_size):
            raw_text, usage = self._request_transcription(client, batch, prompt_cfg)
            self._parse_response_text(raw_text, batch)
            if usage:
                usage_tracker.tokens_in += usage.tokens_in
                usage_tracker.tokens_out += usage.tokens_out
                usage_tracker.export(stats)
            for seg in batch:
                yield seg

    def _ensure_client(self):
        if getattr(self, "_client", None):
            return self._client
        self._client = self._create_client()
        return self._client

    @abstractmethod
    def _create_client(self):
        """Instantiate the API client."""

    @abstractmethod
    def _request_transcription(
        self,
        client,
        batch: List[Segment],
        prompt: List[str],
    ) -> Tuple[str, Optional[Usage]]:
        """Call the provider and return (raw_text_response, Usage)."""

    def _iter_chunks(self, items: List[Segment], size: int) -> Iterable[List[Segment]]:
        if size <= 0:
            size = len(items)
        for i in range(0, len(items), size):
            yield items[i : i + size]

    def _build_prompt(
        self,
        lang: Optional[str],
        outline: Optional[str],
        prompt: Optional[str],
    ) -> List[str]:
        prompt_text = self.base_prompt
        if lang:
            prompt_text += (
                " Primary language is "
                f"{lang}, but audio may include other languages."
            )
        prompt_text += (
            " Each object's `text` must be the transcription of that specific "
            "clip, with no labels or formatting. Respond as plain JSON text "
            "only; do not include markdown or code fences such as ``` or "
            "other wrappers."
        )
        prompt_text += (
            '\nRespond with JSON array of objects: [{"index": <clip '
            'index>, "text": <transcription>}, ...].'
        )

        system_prompts = [prompt_text]
        if outline:
            system_prompts.append(
                "Outline to guide transcription (context only). *Use the outline "
                "only to make minor corrections to what you hear in the audio "
                "(for example: fix homophones, obvious mis-hearings, or minor "
                "punctuation). Do NOT use the outline or any external knowledge "
                "to create or add words that are not present in the audio*:\n" + outline
            )
        if prompt:
            system_prompts.append("Additional instructions:\n" + prompt)

        # Return raw list of system prompts (was PromptConfig.system_prompts)
        return system_prompts

    def _parse_response_text(self, raw_text: str, batch: List[Segment]) -> None:
        raw_text = raw_text.strip()
        parsed: List[dict] = json.loads(raw_text)

        by_index = {
            entry.get("index"): entry
            for entry in parsed
            if isinstance(entry, dict) and "index" in entry
        }

        for idx, seg in enumerate(batch):
            entry = by_index.get(seg.index)
            if entry:
                seg.text = entry.get("text", "").strip()

    def _segments_to_audio_bytes(
        self, batch: List[Segment]
    ) -> List[Tuple[Segment, bytes]]:
        payloads: List[Tuple[Segment, bytes]] = []
        for seg in batch:
            if seg.audio is None:
                raise FileNotFoundError("Segment has no audio path set")
            audio_path = Path(seg.audio)
            if not audio_path.exists():
                raise FileNotFoundError(f"Audio not found: {audio_path}")
            payloads.append((seg, audio_path.read_bytes()))
        return payloads

    def _resolve_api_key(self) -> str:
        api_key = self.api_key
        if not api_key and self.api_key_env_var:
            api_key = os.getenv(self.api_key_env_var)
        if not api_key:
            env_hint = self.api_key_env_var or "API key"
            raise RuntimeError(f"{env_hint} is required for {self.name} transcriber.")
        return api_key


class MissingDependencyException(RuntimeError):
    def __init__(self, transcriber) -> None:
        name = transcriber.name
        msg = (
            f"Transcriber '{name}' is not installed. Install with `pip install "
            f"audio2sub[{name}]`."
        )
        super().__init__(msg)
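
Note: the JSON contract that `base_prompt` describes and `_parse_response_text` consumes looks like the sketch below; the transcript strings are invented for illustration only.

```python
# Invented example of a provider response for a two-clip batch, in the shape
# base_prompt requests and _parse_response_text expects:
raw_text = '[{"index": 1, "text": "Hello and welcome back."}, {"index": 2, "text": ""}]'

# _parse_response_text json.loads() this string, builds an index -> entry map,
# and copies each entry's "text" onto the Segment with the matching index;
# an empty "text" marks a non-speech clip.
```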
audio2sub/transcribers/faster_whisper.py
ADDED
@@ -0,0 +1,65 @@
from __future__ import annotations

import argparse
from pathlib import Path
from typing import Optional

from .base import Base, MissingDependencyException


class FasterWhisper(Base):
    """Transcriber using faster-whisper (ctranslate2 backend)."""

    name = "faster_whisper"

    def __init__(self, model_name: str = "turbo") -> None:
        self.model_name = model_name
        self._model = None

    @classmethod
    def contribute_to_cli(cls, parser: argparse.ArgumentParser) -> None:
        parser.add_argument(
            "--model",
            default="turbo",
            help="Faster-Whisper model name (default: turbo)",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "FasterWhisper":
        return cls(model_name=args.model)

    def transcribe(
        self,
        audio_path: str,
        lang: Optional[str] = None,
        stats: Optional[dict] = None,
    ) -> str:
        model = self._ensure_model()

        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio not found: {audio_path}")

        segments, _info = model.transcribe(
            str(audio_path),
            language=lang,
        )

        return " ".join(seg.text.strip() for seg in segments).strip()

    def _ensure_model(self):  # pragma: no cover - exercised in integration
        if self._model is not None:
            return self._model
        try:
            import torch
            from faster_whisper import WhisperModel
        except ImportError as exc:
            raise MissingDependencyException(self) from exc

        device = "cuda" if torch.cuda.is_available() else "cpu"
        compute_type = "float16" if device == "cuda" else "int8"

        self._model = WhisperModel(
            self.model_name, device=device, compute_type=compute_type
        )
        return self._model
audio2sub/transcribers/gemini.py
ADDED
@@ -0,0 +1,55 @@
from __future__ import annotations

from typing import List, Optional, Tuple

from audio2sub import Segment
from .base import AIAPITranscriber, MissingDependencyException, Usage


class Gemini(AIAPITranscriber):
    """Transcriber using Gemini API (google-genai)."""

    name = "gemini"
    default_model = "gemini-2.5-flash"
    api_key_env_var = "GEMINI_API_KEY"

    def _create_client(self):
        try:
            from google import genai
        except ImportError as exc:
            raise MissingDependencyException(self) from exc

        api_key = self._resolve_api_key()
        return genai.Client(api_key=api_key)

    def _request_transcription(
        self,
        client,
        batch: List[Segment],
        prompt: List[str],
    ) -> Tuple[str, Optional[Usage]]:
        parts = [{"text": "\n\n".join(prompt)}]
        parts.extend(self._build_parts(batch=batch))
        contents = [{"role": "user", "parts": parts}]

        response = client.models.generate_content(
            model=self.model,
            contents=contents,
        )
        raw_text = response.text.strip() if hasattr(response, "text") else ""

        usage = Usage(
            tokens_in=getattr(response.usage_metadata, "prompt_token_count", 0),
            tokens_out=getattr(response.usage_metadata, "candidates_token_count", 0),
        )
        return raw_text, usage

    def _build_parts(self, batch: List[Segment]) -> List[dict]:
        parts: List[dict] = []
        for seg, audio_bytes in self._segments_to_audio_bytes(batch):
            parts.append({"text": f"Clip {seg.index}"})
            parts.append(
                {"inline_data": {"mime_type": "audio/wav", "data": audio_bytes}}
            )

        return parts
audio2sub/transcribers/whisper.py
ADDED
@@ -0,0 +1,65 @@
from __future__ import annotations

import argparse
from pathlib import Path
from typing import Optional

from .base import Base, MissingDependencyException


class Whisper(Base):
    """Whisper-based transcriber (openai/whisper) for single audio segments."""

    name = "whisper"

    def __init__(self, model_name: str = "turbo") -> None:
        self.model_name = model_name
        self._model = None

    @classmethod
    def contribute_to_cli(cls, parser: argparse.ArgumentParser) -> None:
        parser.add_argument(
            "--model",
            default="turbo",
            help="Whisper model name (default: turbo)",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "Whisper":
        return cls(args.model)

    def transcribe(
        self,
        audio_path: str,
        lang: Optional[str] = None,
        stats: Optional[dict] = None,
    ) -> str:
        model, whisper = self._ensure_model()

        audio_path = Path(audio_path)
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio not found: {audio_path}")

        audio = whisper.load_audio(str(audio_path))
        result = model.transcribe(
            audio,
            language=lang or "en",
            task="transcribe",
            fp16=model.device.type == "cuda",
        )
        text = result.get("text", "")
        return str(text).strip()

    def _ensure_model(self):
        try:
            import torch
            import whisper
        except ImportError as exc:
            raise MissingDependencyException(self) from exc

        if self._model is not None:
            return self._model, whisper

        device = "cuda" if torch.cuda.is_available() else "cpu"
        self._model = whisper.load_model(self.model_name, device=device)
        return self._model, whisper
audio2sub/vad.py
ADDED
@@ -0,0 +1,74 @@
from __future__ import annotations

from pathlib import Path
from typing import List

import ffmpeg
import numpy as np
import torch

from audio2sub import Segment


class SileroVAD:
    """Thin wrapper around snakers4/silero-vad for speech timestamp detection."""

    def __init__(
        self,
        threshold: float = 0.5,
        min_silence_duration: float = 0.5,
        window_size_samples: int = 512,
        sample_rate: int = 16_000,
    ) -> None:
        self.threshold = threshold
        self.min_silence_duration = min_silence_duration
        self.window_size_samples = window_size_samples
        self.sample_rate = sample_rate

    def detect_segments(self, wav_path: str | Path) -> List[Segment]:
        try:
            model, utils = torch.hub.load(
                repo_or_dir="snakers4/silero-vad",
                model="silero_vad",
                force_reload=False,
                onnx=False,
                trust_repo=True,
            )
            get_speech_timestamps = utils[0]
        except Exception as exc:
            raise RuntimeError(f"Failed to load silero-vad: {exc}") from exc

        # Read WAV via ffmpeg pipe (float32 mono at target sample rate)
        process = (
            ffmpeg.input(str(wav_path))
            .output(
                "pipe:",
                format="f32le",
                ac=1,
                ar=self.sample_rate,
            )
            .run(capture_stdout=True, capture_stderr=True)
        )
        audio_bytes, stderr = process

        wav_np = np.frombuffer(audio_bytes, dtype=np.float32).copy()
        if wav_np.size == 0:
            raise RuntimeError("No audio data decoded from WAV.")
        wav = torch.from_numpy(wav_np)

        timestamps = get_speech_timestamps(
            wav,
            model,
            sampling_rate=self.sample_rate,
            threshold=self.threshold,
            min_silence_duration_ms=int(self.min_silence_duration * 1000),
            window_size_samples=self.window_size_samples,
        )

        segments: List[Segment] = []
        for idx, ts in enumerate(timestamps, start=1):
            start = ts.get("start", 0) / self.sample_rate
            end = ts.get("end", 0) / self.sample_rate
            if end > start:
                segments.append(Segment(index=idx, start=start, end=end))
        return segments
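
Note: a rough sketch of how this wrapper is driven (it is normally called through `transcribe()`); `audio.wav` is a placeholder for a WAV produced by `convert_media_to_wav`.

```python
from audio2sub.vad import SileroVAD

vad = SileroVAD(sample_rate=16_000)
for seg in vad.detect_segments("audio.wav"):
    # start/end are in seconds and are later cut into clips with cut_wav_segment()
    print(f"segment {seg.index}: {seg.start:.2f}s - {seg.end:.2f}s")
```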
audio2sub-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,116 @@
Metadata-Version: 2.4
Name: audio2sub
Version: 0.1.0
Summary: Transcribe media files to SRT subtitles.
Home-page: https://github.com/Xavier-Lam/audio2sub
Author: Xavier-Lam
Author-email: xavierlam7@hotmail.com
License: MIT
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Topic :: Multimedia :: Sound/Audio
Classifier: Topic :: Multimedia :: Video
Classifier: Topic :: Text Processing :: Linguistic
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.1.0
Requires-Dist: torchaudio>=2.1.0
Requires-Dist: ffmpeg-python>=0.2.0
Requires-Dist: pysrt>=1.1.2
Requires-Dist: tqdm
Requires-Dist: onnxruntime<2,>=1.14
Requires-Dist: numpy
Provides-Extra: faster-whisper
Requires-Dist: faster-whisper>=1.0.1; extra == "faster-whisper"
Provides-Extra: whisper
Requires-Dist: openai-whisper>=20231117; extra == "whisper"
Provides-Extra: gemini
Requires-Dist: google-genai>=1.0.0; extra == "gemini"
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: openai-whisper>=20231117; extra == "dev"
Requires-Dist: faster-whisper>=1.0.1; extra == "dev"
Requires-Dist: google-genai>=1.0.0; extra == "dev"
Provides-Extra: all
Requires-Dist: openai-whisper>=20231117; extra == "all"
Requires-Dist: faster-whisper>=1.0.1; extra == "all"
Requires-Dist: google-genai>=1.0.0; extra == "all"
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license
Dynamic: license-file
Dynamic: provides-extra
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# Audio2Sub

**Audio2Sub** is a command-line tool that automatically transcribes audio from video or audio files and generates subtitles in the `.srt` format. It uses FFmpeg for media handling, [Silero VAD](https://github.com/snakers4/silero-vad) for precise voice activity detection, and supports multiple transcription backends to convert speech to text.

## Installation

Before installing, you must have [FFmpeg](https://ffmpeg.org/download.html) installed and available in your system's PATH.

You can install Audio2Sub using `pip`. The command below installs it together with the default `faster_whisper` backend.

```bash
pip install audio2sub[faster_whisper]
```

To install with a different backend, see the table in the [Backends](#backends) section below.

## Usage

### Basic Example

```bash
audio2sub my_video.mp4 -o my_video.srt --lang en
```

This command transcribes the audio from `my_video.mp4` into English and saves the subtitles to `my_video.srt`.

**Notes:**
* **First-Time Use**: The first time you run the program, it will download the necessary transcription models. This may take some time and require significant disk space.
* **CUDA**: Performance is significantly degraded without CUDA when using Whisper-based local models; the program emits a warning at startup if CUDA is not available. If your system has a compatible GPU, install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive) first. If you are sure CUDA is installed correctly and still get the warning, you may need to [reinstall a compatible PyTorch version manually](https://pytorch.org/get-started/locally/). Reinstalling PyTorch may break other dependencies if you choose a version different from the one you currently have; in that case, reinstall those packages according to the warnings shown.

### Using a Different Transcriber

Use the `-t` or `--transcriber` flag to select a different backend.

```bash
audio2sub my_audio.wav -o my_audio.srt --lang en -t whisper --model medium
```

Each transcriber has its own options. To see them, use `--help` with the transcriber specified.

```bash
audio2sub -t faster_whisper --help
```

## Backends

Audio2Sub supports the following transcription backends.

| Backend Name | Description |
| --- | --- |
| `faster_whisper` | A faster reimplementation of Whisper using CTranslate2. See [Faster Whisper](https://github.com/guillaumekln/faster-whisper). This is the default backend. |
| `whisper` | The original speech recognition model by OpenAI. See [OpenAI Whisper](https://github.com/openai/whisper). |
| `gemini` | Google's Gemini model via their API. Requires a `GEMINI_API_KEY` environment variable or the `--api-key` argument. |

Use `pip install audio2sub[<backend>]` to install support for the desired backend, then select the corresponding transcriber with the `-t` flag, as in the example below.
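For example, an end-to-end invocation of the `gemini` backend might look like this sketch (placeholder file names; supply your own key):

```bash
pip install audio2sub[gemini]
export GEMINI_API_KEY="your-api-key"   # or pass --api-key on the command line
audio2sub my_video.mp4 -o my_video.srt --lang en -t gemini --chunk 20
```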

## Contributing

Contributions are welcome! Please open an issue or submit a pull request on the GitHub repository.
audio2sub-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
audio2sub/__init__.py,sha256=vtNHWilMeUJh8aMQqeWqIq_hWNr4TJ_kawfdz2XSxrQ,3661
audio2sub/audio.py,sha256=WsSJNqT62Q-7aq4JwD4bpNRlTMjNKcAMr451sOo82tY,1226
audio2sub/cli.py,sha256=rh2QxNfqKsXtsuhmZpJIyYxMrOYnoT8bq9eU93tegQQ,3930
audio2sub/vad.py,sha256=0s710xcdupOvGcLMtadUcB_86oKeK4z9Klh5CR_mZqs,2360
audio2sub/transcribers/__init__.py,sha256=NnwNvg_RXXXNcSctcHSDKMRFbZ0-uW_ABiWX2YJKyiw,389
audio2sub/transcribers/base.py,sha256=-uwK7xfJ_i9yEPg9g3bD8XXiPnpGhajSGhWmSobThZE,9508
audio2sub/transcribers/faster_whisper.py,sha256=Dc5KXsmZbdrS71r3KBQH_ExOYYXY4NqEu8ZKGqigKuQ,1898
audio2sub/transcribers/gemini.py,sha256=-67zkMnhB4ruxWCLspVY4JVXWZgkcNL72RpVRruELno,1741
audio2sub/transcribers/whisper.py,sha256=2iQP6kqqL1CzkZeSfcIGYyutT8DyWSurY-IO3NRCVBk,1840
audio2sub-0.1.0.dist-info/licenses/LICENSE,sha256=NoqtIpP2SxhnVDs1RxKe2oLL5QdcHxyg5Ytzvenh5w0,1076
audio2sub-0.1.0.dist-info/METADATA,sha256=cQsgoEjP-wmjPS5vcGUlNg9rEoLk2WjJ0qOZvdKD_6c,5013
audio2sub-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
audio2sub-0.1.0.dist-info/entry_points.txt,sha256=8dJOZbTc4JrkIXJbWauaCqoZeASo76z6jTWC_kFPpvU,49
audio2sub-0.1.0.dist-info/top_level.txt,sha256=LAGOkV7oCPKbeFHyx_U0tM5_vj7X3BiG_FbrizXM1JI,10
audio2sub-0.1.0.dist-info/RECORD,,
audio2sub-0.1.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2026 Xavier-Lam

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
audio2sub-0.1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
audio2sub