audio2sub-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2026 Xavier-Lam
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,116 @@
+ Metadata-Version: 2.4
+ Name: audio2sub
+ Version: 0.1.0
+ Summary: Transcribe media files to SRT subtitles.
+ Home-page: https://github.com/Xavier-Lam/audio2sub
+ Author: Xavier-Lam
+ Author-email: xavierlam7@hotmail.com
+ License: MIT
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Topic :: Multimedia :: Sound/Audio
+ Classifier: Topic :: Multimedia :: Video
+ Classifier: Topic :: Text Processing :: Linguistic
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: torch>=2.1.0
+ Requires-Dist: torchaudio>=2.1.0
+ Requires-Dist: ffmpeg-python>=0.2.0
+ Requires-Dist: pysrt>=1.1.2
+ Requires-Dist: tqdm
+ Requires-Dist: onnxruntime<2,>=1.14
+ Requires-Dist: numpy
+ Provides-Extra: faster-whisper
+ Requires-Dist: faster-whisper>=1.0.1; extra == "faster-whisper"
+ Provides-Extra: whisper
+ Requires-Dist: openai-whisper>=20231117; extra == "whisper"
+ Provides-Extra: gemini
+ Requires-Dist: google-genai>=1.0.0; extra == "gemini"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
+ Requires-Dist: openai-whisper>=20231117; extra == "dev"
+ Requires-Dist: faster-whisper>=1.0.1; extra == "dev"
+ Requires-Dist: google-genai>=1.0.0; extra == "dev"
+ Provides-Extra: all
+ Requires-Dist: openai-whisper>=20231117; extra == "all"
+ Requires-Dist: faster-whisper>=1.0.1; extra == "all"
+ Requires-Dist: google-genai>=1.0.0; extra == "all"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Audio2Sub
+
+ **Audio2Sub** is a command-line tool that automatically transcribes audio from video or audio files and generates subtitles in the `.srt` format. It uses FFmpeg for media handling, [Silero VAD](https://github.com/snakers4/silero-vad) for precise voice activity detection, and supports multiple transcription backends to convert speech to text.
+
+ ## Installation
+
+ Before installing, you must have [FFmpeg](https://ffmpeg.org/download.html) installed and available in your system's PATH.
+
+ You can install Audio2Sub using `pip`. The default installation includes the `faster_whisper` backend.
+
+ ```bash
+ pip install "audio2sub[faster_whisper]"
+ ```
+
+ To install with a different backend, see the table in the [Backends](#backends) section below.
+
+ ## Usage
+
+ ### Basic Example
+
+ ```bash
+ audio2sub my_video.mp4 -o my_video.srt --lang en
+ ```
+
+ This command transcribes the audio from `my_video.mp4`, treating it as English speech, and saves the subtitles to `my_video.srt`.
+
+ **Notes:**
+ * **First-Time Use**: The first time you run the program, it downloads the necessary transcription models. This may take some time and require significant disk space.
+ * **CUDA**: Performance is significantly degraded without CUDA when using Whisper-based local models, and the program emits a warning at startup if CUDA is not available. If your system has a compatible GPU, install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive) first. If you are sure CUDA is installed correctly and still see the warning, you may need to [reinstall a compatible PyTorch build manually](https://pytorch.org/get-started/locally/). Reinstalling PyTorch can break other dependencies if you choose a version different from the one you currently have; in that case, reinstall those packages according to the warnings shown. A quick GPU sanity check is shown below.
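+
+ To verify that PyTorch can see your GPU, run this one-liner (plain PyTorch, not specific to Audio2Sub):
+
+ ```bash
+ python -c "import torch; print(torch.cuda.is_available())"
+ ```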
+
+ ### Using a Different Transcriber
+
+ Use the `-t` or `--transcriber` flag to select a different backend.
+
+ ```bash
+ audio2sub my_audio.wav -o my_audio.srt --lang en -t whisper --model medium
+ ```
+
+ Each transcriber has its own options. To see them, pass `--help` together with the transcriber flag.
+
+ ```bash
+ audio2sub -t faster_whisper --help
+ ```
+
+ ## Backends
+
+ Audio2Sub supports the following transcription backends.
+
+ | Backend Name | Description |
+ | --- | --- |
+ | `faster_whisper` | A faster reimplementation of Whisper using CTranslate2. See [Faster Whisper](https://github.com/guillaumekln/faster-whisper). This is the default backend. |
+ | `whisper` | The original speech recognition model by OpenAI. See [OpenAI Whisper](https://github.com/openai/whisper). |
+ | `gemini` | Google's Gemini model via their API. Requires a `GEMINI_API_KEY` environment variable or the `--gemini-api-key` argument. |
+
+ Use `pip install "audio2sub[<backend>]"` to install support for the desired backend, then select the corresponding transcriber with the `-t` flag.
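+
+ For example, to set up and run the Gemini backend (a minimal sketch; assumes you already have a valid API key):
+
+ ```bash
+ pip install "audio2sub[gemini]"
+ export GEMINI_API_KEY="your-api-key"  # or pass --gemini-api-key instead
+ audio2sub my_video.mp4 -o my_video.srt --lang en -t gemini
+ ```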
+
+ ## Contributing
+
+ Contributions are welcome! Please open an issue or submit a pull request on the GitHub repository.
@@ -0,0 +1,58 @@
+ # Audio2Sub
+
+ **Audio2Sub** is a command-line tool that automatically transcribes audio from video or audio files and generates subtitles in the `.srt` format. It uses FFmpeg for media handling, [Silero VAD](https://github.com/snakers4/silero-vad) for precise voice activity detection, and supports multiple transcription backends to convert speech to text.
+
+ ## Installation
+
+ Before installing, you must have [FFmpeg](https://ffmpeg.org/download.html) installed and available in your system's PATH.
+
+ You can install Audio2Sub using `pip`. The default installation includes the `faster_whisper` backend.
+
+ ```bash
+ pip install "audio2sub[faster_whisper]"
+ ```
+
+ To install with a different backend, see the table in the [Backends](#backends) section below.
+
+ ## Usage
+
+ ### Basic Example
+
+ ```bash
+ audio2sub my_video.mp4 -o my_video.srt --lang en
+ ```
+
+ This command transcribes the audio from `my_video.mp4`, treating it as English speech, and saves the subtitles to `my_video.srt`.
+
+ **Notes:**
+ * **First-Time Use**: The first time you run the program, it downloads the necessary transcription models. This may take some time and require significant disk space.
+ * **CUDA**: Performance is significantly degraded without CUDA when using Whisper-based local models, and the program emits a warning at startup if CUDA is not available. If your system has a compatible GPU, install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive) first. If you are sure CUDA is installed correctly and still see the warning, you may need to [reinstall a compatible PyTorch build manually](https://pytorch.org/get-started/locally/). Reinstalling PyTorch can break other dependencies if you choose a version different from the one you currently have; in that case, reinstall those packages according to the warnings shown. A quick GPU sanity check is shown below.
+
30
+ ### Using a Different Transcriber
31
+
32
+ Use the `-t` or `--transcriber` flag to select a different backend.
33
+
34
+ ```bash
35
+ audio2sub my_audio.wav -o my_audio.srt --lang en -t whisper --model medium
36
+ ```
37
+
38
+ Each transcriber has its own options. To see them, use `--help` with the transcriber specified.
39
+
40
+ ```bash
41
+ audio2sub -t faster_whisper --help
42
+ ```
43
+
44
+ ## Backends
45
+
46
+ Audio2Sub supports the following transcription backends.
47
+
48
+ | Backend Name | Description |
49
+ | --- | --- |
50
+ | `faster_whisper` | A faster reimplementation of Whisper using CTranslate2. See [Faster Whisper](https://github.com/guillaumekln/faster-whisper). This is the default backend. |
51
+ | `whisper` | The original speech recognition model by OpenAI. See [OpenAI Whisper](https://github.com/openai/whisper). |
52
+ | `gemini` | Google's Gemini model via their API. Requires a `GEMINI_API_KEY` environment variable or `--gemini-api-key` argument.|
53
+
54
+ You should use `pip install audio2sub[<backend>]` to install the desired backend support and use the corresponding transcriber with the `-t` flag.
55
+
56
+ ## Contributing
57
+
58
+ Contributions are welcome! Please open an issue or submit a pull request on the GitHub repository.
@@ -0,0 +1,113 @@
+ """Audio2Sub package: convert media to subtitles."""
+
+ from __future__ import annotations
+
+ import tempfile
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Callable, Iterable, List, Optional
+
+ import pysrt
+
+ __all__ = ["__version__", "transcribe", "segments_to_srt", "Segment", "Usage"]
+ __title__ = "audio2sub"
+ __description__ = "Transcribe media files to SRT subtitles."
+ __url__ = "https://github.com/Xavier-Lam/audio2sub"
+ __version__ = "0.1.0"
+ __author__ = "Xavier-Lam"
+ __author_email__ = "xavierlam7@hotmail.com"
+
+
+ # Reporter callbacks are invoked as reporter(kind, **payload).
+ ReporterCallback = Callable[..., None]
+
+
+ @dataclass
+ class Segment:
+     index: int
+     start: float
+     end: float
+     text: str = ""
+     audio: Optional[Path] = None
+
+
+ # Imported after Segment is defined to avoid circular imports.
+ from .audio import convert_media_to_wav, cut_wav_segment  # noqa: E402
+ from .transcribers.base import Base, Usage  # noqa: E402
+ from .vad import SileroVAD  # noqa: E402
+
+
+ def transcribe(
+     input_media: str | Path,
+     transcriber: Base,
+     lang: Optional[str] = None,
+     reporter: Optional[ReporterCallback] = None,
+     stats: Optional[Usage | dict] = None,
+     opts: Optional[dict] = None,
+ ) -> List[Segment]:
+     """Convert media to segments using Silero VAD and batch transcription."""
+
+     input_media = Path(input_media)
+     if not input_media.exists():
+         raise FileNotFoundError(f"Input media not found: {input_media}")
+
+     def _output(message: str) -> None:
+         if reporter:
+             reporter("status", message=message)
+
+     def _progress(name: str, current: int, total: int, **payload) -> None:
+         if reporter:
+             reporter("progress", name=name, current=current, total=total, **payload)
+
+     with tempfile.TemporaryDirectory() as tmpdir:
+         wav_path = Path(tmpdir) / "audio.wav"
+         _output("Converting audio...")
+         convert_media_to_wav(input_media, wav_path)
+
+         vad = SileroVAD(sample_rate=16_000)
+         _output("Running voice activity detection (VAD)...")
+         segments = vad.detect_segments(wav_path)
+         if not segments:
+             raise RuntimeError("No speech detected by Silero VAD")
+         total_segments = len(segments)
+         _output(f"Detected {total_segments} speech segment(s).")
+         _output("Cutting audio into clips...")
+
+         # Attach indices and extract an audio clip for each segment
+         for idx, seg in enumerate(segments, start=1):
+             seg.index = idx
+             seg_path = Path(tmpdir) / f"segment_{idx}.wav"
+             cut_wav_segment(wav_path, seg.start, seg.end, seg_path)
+             seg.audio = seg_path
+
+         _output("Starting transcription...")
+         _progress("transcription", 0, total_segments, unit="seg")
+
+         # Batch transcribe so backends can optimize; results arrive lazily.
+         transcribed_segments: List[Segment] = []
+         completed = 0
+         for seg in transcriber.batch_transcribe(
+             segments, lang=lang, stats=stats, **(opts or {})
+         ):
+             if seg.text.strip():
+                 transcribed_segments.append(seg)
+             completed += 1
+             _progress("transcription", completed, total_segments, unit="seg")
+
+     if not transcribed_segments:
+         raise RuntimeError("Transcription produced no subtitle lines.")
+
+     _output("Transcription completed.")
+     return transcribed_segments
+
+
+ def segments_to_srt(segments: Iterable[Segment]) -> pysrt.SubRipFile:
+     srt = pysrt.SubRipFile()
+     for seg in segments:
+         item = pysrt.SubRipItem(
+             index=seg.index,
+             start=pysrt.SubRipTime(seconds=seg.start),
+             end=pysrt.SubRipTime(seconds=seg.end),
+             text=seg.text,
+         )
+         srt.append(item)
+     return srt
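The module above doubles as a small library API. A minimal sketch of using the pieces defined here directly, constructing `Segment` objects by hand and serializing them with `segments_to_srt` (file names are illustrative):

```python
from audio2sub import Segment, segments_to_srt

# Two hand-built subtitle entries; transcribe() would normally produce these.
segments = [
    Segment(index=1, start=0.0, end=2.5, text="Hello, world."),
    Segment(index=2, start=3.0, end=5.0, text="This is Audio2Sub."),
]

# pysrt's SubRipFile.save() writes a standard .srt file to disk.
segments_to_srt(segments).save("example.srt")
```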
@@ -0,0 +1,50 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ import ffmpeg
+
+
+ def convert_media_to_wav(
+     input_path: str | Path,
+     output_path: str | Path,
+     sample_rate: int = 16_000,
+     channels: int = 1,
+     overwrite: bool = True,
+ ):
+     """Convert a media file to WAV with the given sample rate and channel count."""
+
+     input_path = Path(input_path)
+     output_path = Path(output_path)
+
+     if not input_path.exists():
+         raise FileNotFoundError(f"Input file does not exist: {input_path}")
+
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     stream = ffmpeg.input(str(input_path)).output(
+         str(output_path),
+         ac=channels,
+         ar=sample_rate,
+         format="wav",
+     )
+     if overwrite:
+         stream = stream.overwrite_output()
+     else:
+         stream = stream.global_args("-n")
+     stream.run(quiet=True)
+
+
+ def cut_wav_segment(
+     input_wav: str | Path,
+     start: float,
+     end: float,
+     output_path: str | Path,
+ ):
+     """Cut the [start, end] span out of a WAV file, stream-copying the audio."""
+
+     input_wav = Path(input_wav)
+     output_path = Path(output_path)
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     stream = ffmpeg.input(str(input_wav), ss=start, to=end).output(
+         str(output_path), acodec="copy"
+     )
+     stream.overwrite_output().run(quiet=True)
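A short sketch of how these two helpers compose (paths are illustrative; FFmpeg must be on PATH):

```python
from audio2sub.audio import convert_media_to_wav, cut_wav_segment

# Normalize the input to 16 kHz mono WAV, then cut out the first ten seconds.
convert_media_to_wav("talk.mp4", "talk.wav")
cut_wav_segment("talk.wav", start=0.0, end=10.0, output_path="clip.wav")
```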
@@ -0,0 +1,141 @@
+ from __future__ import annotations
+
+ import argparse
+ import inspect
+ import warnings
+ from pathlib import Path
+ from typing import Dict, Type
+
+ import torch
+ from tqdm.auto import tqdm
+
+ from . import __version__, segments_to_srt, transcribe, transcribers
+ from .transcribers import Base
+
+
+ def _available_transcribers() -> Dict[str, Type[Base]]:
+     return {
+         obj.name: obj
+         for _, obj in inspect.getmembers(transcribers, inspect.isclass)
+         if issubclass(obj, Base) and not inspect.isabstract(obj)
+     }
+
+
+ def _build_backend_parser(choices: list[str]) -> argparse.ArgumentParser:
+     default = "faster_whisper"
+     parser = argparse.ArgumentParser(add_help=False)
+     parser.add_argument(
+         "-t",
+         "--transcriber",
+         choices=choices,
+         default=default,
+         help=f"Transcription backend to use (default: {default})",
+     )
+     return parser
+
+
+ def build_parser(
+     available: Dict[str, Type[Base]], args=None
+ ) -> argparse.ArgumentParser:
+     # Pre-parse only the backend flag so the chosen backend can add its options.
+     backend_parser = _build_backend_parser(choices=sorted(available.keys()))
+     backend_args, _remaining = backend_parser.parse_known_args(args)
+
+     parser = argparse.ArgumentParser(
+         prog="audio2sub",
+         description=(
+             "Convert media files to SRT subtitles using FFmpeg, Silero VAD, "
+             "and transcription backends."
+         ),
+         parents=[backend_parser],
+     )
+
+     parser.add_argument("input", help="Path to input media file (audio or video)")
+     parser.add_argument(
+         "-o",
+         "--output",
+         required=True,
+         help="Output SRT file path",
+     )
+     parser.add_argument(
+         "--lang",
+         default=None,
+         help=(
+             "Language code (e.g., en, es, fr). If omitted, the backend may default "
+             "to en. See https://github.com/openai/whisper/blob/main/whisper/tokenizer.py "
+             "for a list of available languages."
+         ),
+     )
+     parser.add_argument(
+         "--version",
+         action="version",
+         version=f"%(prog)s {__version__}",
+     )
+
+     available[backend_args.transcriber].contribute_to_cli(parser)
+     return parser
+
+
+ def main() -> int:
+     if not torch.cuda.is_available():
+         warnings.warn(
+             "CUDA is not available; performance may be significantly degraded. "
+             "For more information, please refer to the project's README.md."
+         )
+
+     available = _available_transcribers()
+     parser = build_parser(available)
+     args = parser.parse_args()
+     backend = args.transcriber
+
+     input_media = Path(args.input)
+     output_srt = Path(args.output)
+
+     bars: dict[str, tqdm] = {}
+
+     def reporter(kind: str, **payload):
+         if kind == "status":
+             print(payload.get("message", ""))
+         elif kind == "progress":
+             name = payload.pop("name")
+             current = payload.pop("current", 0)
+             total = payload.pop("total", 0)
+
+             bar = bars.get(name)
+             if bar is None:
+                 bar = tqdm(
+                     total=total,
+                     desc=name.capitalize(),
+                     leave=True,
+                     **payload,
+                 )
+                 bars[name] = bar
+             bar.n = current
+             bar.refresh()
+             if current >= total:
+                 bar.close()
+                 bars.pop(name, None)
+
+     stats = {}
+     transcriber_cls = available[backend]
+     transcriber = transcriber_cls.from_cli_args(args)
+     batch_opts = transcriber_cls.opts_from_cli(args)
+
+     segments = transcribe(
+         input_media,
+         transcriber,
+         lang=args.lang,
+         reporter=reporter,
+         stats=stats,
+         opts=batch_opts,
+     )
+     segments_to_srt(segments).save(str(output_srt))
+
+     print("Stats:")
+     for k, v in stats.items():
+         print(f"  {k}: {v}")
+     print(f"SRT written to {output_srt}")
+     return 0
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     raise SystemExit(main())
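The `reporter` protocol used above is intentionally small: `transcribe` invokes the callback with a kind string ("status" or "progress") plus keyword arguments. A minimal non-tqdm reporter for contexts where a progress bar is unwanted (illustrative only, matching the call sites in this diff):

```python
def quiet_reporter(kind: str, **payload) -> None:
    # "status" events carry a message; "progress" events carry name/current/total.
    if kind == "status":
        print(payload.get("message", ""))
    elif kind == "progress":
        print(f"{payload['name']}: {payload['current']}/{payload['total']}")
```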
@@ -0,0 +1,22 @@
+ from .base import (
+     AIAPITranscriber,
+     Base,
+     MissingDependencyException,
+     Usage,
+ )
+ from .whisper import Whisper
+ from .faster_whisper import FasterWhisper
+ from .gemini import Gemini
+ from audio2sub import Segment
+
+
+ __all__ = [
+     "Base",
+     "AIAPITranscriber",
+     "Whisper",
+     "FasterWhisper",
+     "Gemini",
+     "MissingDependencyException",
+     "Segment",
+     "Usage",
+ ]
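Because the CLI discovers backends by scanning this module for concrete `Base` subclasses (see `_available_transcribers` in cli.py), a new backend only needs to be exposed here. A hypothetical sketch of the shape such a class takes; the method names and signatures below are inferred from the call sites in this diff, since `base.py` itself is not shown, and the real `Base` interface may require more:

```python
# Hypothetical: every method here is inferred from how cli.py and
# transcribe() call into transcriber classes, not from Base itself.
from audio2sub.transcribers import Base


class Echo(Base):
    name = "echo"  # key used for the -t/--transcriber flag

    @classmethod
    def contribute_to_cli(cls, parser):
        parser.add_argument("--echo-prefix", default="")

    @classmethod
    def from_cli_args(cls, args):
        return cls()

    @classmethod
    def opts_from_cli(cls, args):
        return {"prefix": args.echo_prefix}

    def batch_transcribe(self, segments, lang=None, stats=None, **opts):
        # Yield segments back with placeholder text instead of real speech.
        for seg in segments:
            seg.text = f"{opts.get('prefix', '')}[segment {seg.index}]"
            yield seg
```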