audio2sub-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- audio2sub-0.1.0/LICENSE +21 -0
- audio2sub-0.1.0/PKG-INFO +116 -0
- audio2sub-0.1.0/README.md +58 -0
- audio2sub-0.1.0/audio2sub/__init__.py +113 -0
- audio2sub-0.1.0/audio2sub/audio.py +50 -0
- audio2sub-0.1.0/audio2sub/cli.py +141 -0
- audio2sub-0.1.0/audio2sub/transcribers/__init__.py +22 -0
- audio2sub-0.1.0/audio2sub/transcribers/base.py +286 -0
- audio2sub-0.1.0/audio2sub/transcribers/faster_whisper.py +65 -0
- audio2sub-0.1.0/audio2sub/transcribers/gemini.py +55 -0
- audio2sub-0.1.0/audio2sub/transcribers/whisper.py +65 -0
- audio2sub-0.1.0/audio2sub/vad.py +74 -0
- audio2sub-0.1.0/audio2sub.egg-info/PKG-INFO +116 -0
- audio2sub-0.1.0/audio2sub.egg-info/SOURCES.txt +19 -0
- audio2sub-0.1.0/audio2sub.egg-info/dependency_links.txt +1 -0
- audio2sub-0.1.0/audio2sub.egg-info/entry_points.txt +2 -0
- audio2sub-0.1.0/audio2sub.egg-info/requires.txt +27 -0
- audio2sub-0.1.0/audio2sub.egg-info/top_level.txt +1 -0
- audio2sub-0.1.0/setup.cfg +4 -0
- audio2sub-0.1.0/setup.py +75 -0
- audio2sub-0.1.0/tests/test_backends.py +63 -0
audio2sub-0.1.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2026 Xavier-Lam

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
audio2sub-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,116 @@
Metadata-Version: 2.4
Name: audio2sub
Version: 0.1.0
Summary: Transcribe media files to SRT subtitles.
Home-page: https://github.com/Xavier-Lam/audio2sub
Author: Xavier-Lam
Author-email: xavierlam7@hotmail.com
License: MIT
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Topic :: Multimedia :: Sound/Audio
Classifier: Topic :: Multimedia :: Video
Classifier: Topic :: Text Processing :: Linguistic
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.1.0
Requires-Dist: torchaudio>=2.1.0
Requires-Dist: ffmpeg-python>=0.2.0
Requires-Dist: pysrt>=1.1.2
Requires-Dist: tqdm
Requires-Dist: onnxruntime<2,>=1.14
Requires-Dist: numpy
Provides-Extra: faster-whisper
Requires-Dist: faster-whisper>=1.0.1; extra == "faster-whisper"
Provides-Extra: whisper
Requires-Dist: openai-whisper>=20231117; extra == "whisper"
Provides-Extra: gemini
Requires-Dist: google-genai>=1.0.0; extra == "gemini"
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: openai-whisper>=20231117; extra == "dev"
Requires-Dist: faster-whisper>=1.0.1; extra == "dev"
Requires-Dist: google-genai>=1.0.0; extra == "dev"
Provides-Extra: all
Requires-Dist: openai-whisper>=20231117; extra == "all"
Requires-Dist: faster-whisper>=1.0.1; extra == "all"
Requires-Dist: google-genai>=1.0.0; extra == "all"
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license
Dynamic: license-file
Dynamic: provides-extra
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# Audio2Sub

**Audio2Sub** is a command-line tool that automatically transcribes audio from video or audio files and generates subtitles in the `.srt` format. It uses FFmpeg for media handling, [Silero VAD](https://github.com/snakers4/silero-vad) for precise voice activity detection, and supports multiple transcription backends to convert speech to text.

## Installation

Before installing, you must have [FFmpeg](https://ffmpeg.org/download.html) installed and available in your system's PATH.

You can install Audio2Sub using `pip`. The default installation includes the `faster_whisper` backend.

```bash
pip install audio2sub[faster_whisper]
```

To install with a different backend, see the table in the [Backends](#backends) section below.

## Usage

### Basic Example

```bash
audio2sub my_video.mp4 -o my_video.srt --lang en
```

This command transcribes the audio from `my_video.mp4` in English and saves the subtitles to `my_video.srt`.

**Notes:**

* **First-Time Use**: The first time you run the program, it downloads the necessary transcription models. This may take some time and require significant disk space.
* **CUDA**: Performance is significantly degraded without CUDA when using Whisper-based local models, and the program emits a warning at startup if CUDA is unavailable. If your system has a compatible GPU, install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive) first. If you are sure CUDA is installed correctly and still see the warning, you may need to [reinstall a compatible PyTorch build manually](https://pytorch.org/get-started/locally/). Note that reinstalling a different PyTorch version may break other dependencies; reinstall those as indicated by the resulting warnings.

### Using a Different Transcriber

Use the `-t` or `--transcriber` flag to select a different backend.

```bash
audio2sub my_audio.wav -o my_audio.srt --lang en -t whisper --model medium
```

Each transcriber has its own options. To see them, use `--help` with the transcriber specified.

```bash
audio2sub -t faster_whisper --help
```

## Backends

Audio2Sub supports the following transcription backends.

| Backend Name | Description |
| --- | --- |
| `faster_whisper` | A faster reimplementation of Whisper using CTranslate2. See [Faster Whisper](https://github.com/guillaumekln/faster-whisper). This is the default backend. |
| `whisper` | The original speech recognition model by OpenAI. See [OpenAI Whisper](https://github.com/openai/whisper). |
| `gemini` | Google's Gemini model via their API. Requires a `GEMINI_API_KEY` environment variable or the `--gemini-api-key` argument. |

Use `pip install audio2sub[<backend>]` to install support for the desired backend, then select it with the `-t` flag.

## Contributing

Contributions are welcome! Please open an issue or submit a pull request on the GitHub repository.
audio2sub-0.1.0/README.md
ADDED
@@ -0,0 +1,58 @@
# Audio2Sub

**Audio2Sub** is a command-line tool that automatically transcribes audio from video or audio files and generates subtitles in the `.srt` format. It uses FFmpeg for media handling, [Silero VAD](https://github.com/snakers4/silero-vad) for precise voice activity detection, and supports multiple transcription backends to convert speech to text.

## Installation

Before installing, you must have [FFmpeg](https://ffmpeg.org/download.html) installed and available in your system's PATH.

You can install Audio2Sub using `pip`. The default installation includes the `faster_whisper` backend.

```bash
pip install audio2sub[faster_whisper]
```

To install with a different backend, see the table in the [Backends](#backends) section below.

## Usage

### Basic Example

```bash
audio2sub my_video.mp4 -o my_video.srt --lang en
```

This command transcribes the audio from `my_video.mp4` in English and saves the subtitles to `my_video.srt`.

**Notes:**

* **First-Time Use**: The first time you run the program, it downloads the necessary transcription models. This may take some time and require significant disk space.
* **CUDA**: Performance is significantly degraded without CUDA when using Whisper-based local models, and the program emits a warning at startup if CUDA is unavailable. If your system has a compatible GPU, install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive) first. If you are sure CUDA is installed correctly and still see the warning, you may need to [reinstall a compatible PyTorch build manually](https://pytorch.org/get-started/locally/). Note that reinstalling a different PyTorch version may break other dependencies; reinstall those as indicated by the resulting warnings.

### Using a Different Transcriber

Use the `-t` or `--transcriber` flag to select a different backend.

```bash
audio2sub my_audio.wav -o my_audio.srt --lang en -t whisper --model medium
```

Each transcriber has its own options. To see them, use `--help` with the transcriber specified.

```bash
audio2sub -t faster_whisper --help
```

## Backends

Audio2Sub supports the following transcription backends.

| Backend Name | Description |
| --- | --- |
| `faster_whisper` | A faster reimplementation of Whisper using CTranslate2. See [Faster Whisper](https://github.com/guillaumekln/faster-whisper). This is the default backend. |
| `whisper` | The original speech recognition model by OpenAI. See [OpenAI Whisper](https://github.com/openai/whisper). |
| `gemini` | Google's Gemini model via their API. Requires a `GEMINI_API_KEY` environment variable or the `--gemini-api-key` argument. |

Use `pip install audio2sub[<backend>]` to install support for the desired backend, then select it with the `-t` flag.

## Contributing

Contributions are welcome! Please open an issue or submit a pull request on the GitHub repository.
audio2sub-0.1.0/audio2sub/__init__.py
ADDED
@@ -0,0 +1,113 @@
"""Audio2Sub package: convert media to subtitles."""

from __future__ import annotations

import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Iterable, List, Optional

import pysrt

__all__ = ["__version__", "transcribe", "segments_to_srt", "Segment", "Usage"]
__title__ = "audio2sub"
__description__ = "Transcribe media files to SRT subtitles."
__url__ = "https://github.com/Xavier-Lam/audio2sub"
__version__ = "0.1.0"
__author__ = "Xavier-Lam"
__author_email__ = "xavierlam7@hotmail.com"


# Reporters are invoked as reporter(kind, **payload), e.g.
# reporter("status", message=...) or reporter("progress", name=..., ...).
ReporterCallback = Callable[..., None]


@dataclass
class Segment:
    index: int
    start: float
    end: float
    text: str = ""
    audio: Optional[Path] = None


# Imported after Segment is defined so that submodules can import it back
# without a circular-import error.
from .audio import convert_media_to_wav, cut_wav_segment  # noqa: E402
from .transcribers.base import Base, Usage  # noqa: E402
from .vad import SileroVAD  # noqa: E402


def transcribe(
    input_media: str | Path,
    transcriber: Base,
    lang: Optional[str] = None,
    reporter: Optional[ReporterCallback] = None,
    stats: Optional[Usage | dict] = None,
    opts: Optional[dict] = None,
) -> List[Segment]:
    """Convert media to segments using Silero VAD and batch transcription."""

    input_media = Path(input_media)
    if not input_media.exists():
        raise FileNotFoundError(f"Input media not found: {input_media}")

    def _output(message: str) -> None:
        if reporter:
            reporter("status", message=message)

    def _progress(name: str, current: int, total: int, **payload) -> None:
        if reporter:
            reporter("progress", name=name, current=current, total=total, **payload)

    with tempfile.TemporaryDirectory() as tmpdir:
        wav_path = Path(tmpdir) / "audio.wav"
        _output("Converting audio...")
        convert_media_to_wav(input_media, wav_path)

        vad = SileroVAD(sample_rate=16_000)
        _output("Running voice activity detection (VAD)...")
        segments = vad.detect_segments(wav_path)
        if not segments:
            raise RuntimeError("No speech detected by Silero VAD")
        total_segments = len(segments)
        _output(f"Detected {total_segments} speech segment(s).")
        _output("Cutting audio into clips...")

        # Attach indices and extract audio clips for each segment
        for idx, seg in enumerate(segments, start=1):
            seg.index = idx
            seg_path = Path(tmpdir) / f"segment_{idx}.wav"
            cut_wav_segment(wav_path, seg.start, seg.end, seg_path)
            seg.audio = seg_path

        _output("Starting transcription...")
        _progress("transcription", 0, total_segments, unit="seg")

        # Batch transcribe for potential backend optimizations (generator)
        transcribed_segments: List[Segment] = []
        completed = 0
        for seg in transcriber.batch_transcribe(
            segments, lang=lang, stats=stats, **(opts or {})
        ):
            if seg.text.strip():
                transcribed_segments.append(seg)
            completed += 1
            _progress("transcription", completed, total_segments, unit="seg")

        if not transcribed_segments:
            raise RuntimeError("Transcription produced no subtitle lines.")

        _output("Transcription completed.")
        return transcribed_segments


def segments_to_srt(segments: Iterable[Segment]) -> pysrt.SubRipFile:
    srt = pysrt.SubRipFile()
    for seg in segments:
        item = pysrt.SubRipItem(
            index=seg.index,
            start=pysrt.SubRipTime(seconds=seg.start),
            end=pysrt.SubRipTime(seconds=seg.end),
            text=seg.text,
        )
        srt.append(item)
    return srt
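The same pipeline can be driven from Python rather than the CLI. A minimal sketch, assuming the `faster_whisper` extra is installed and that `FasterWhisper` can be constructed with no arguments (its actual signature lives in `transcribers/faster_whisper.py`, not shown in full here):

```python
from audio2sub import segments_to_srt, transcribe
from audio2sub.transcribers import FasterWhisper


def report(kind: str, **payload) -> None:
    # "status" events carry a message; "progress" events carry
    # name/current/total, mirroring what cli.py consumes.
    if kind == "status":
        print(payload["message"])


# Hypothetical zero-argument construction; pass model options here if
# FasterWhisper's __init__ requires them.
transcriber = FasterWhisper()
segments = transcribe("my_video.mp4", transcriber, lang="en", reporter=report)
segments_to_srt(segments).save("my_video.srt")
```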
audio2sub-0.1.0/audio2sub/audio.py
ADDED
@@ -0,0 +1,50 @@
# Needed so the `str | Path` annotations below don't raise on Python 3.9.
from __future__ import annotations

from pathlib import Path

import ffmpeg


def convert_media_to_wav(
    input_path: str | Path,
    output_path: str | Path,
    sample_rate: int = 16_000,
    channels: int = 1,
    overwrite: bool = True,
):
    """Convert any media file to a WAV."""

    input_path = Path(input_path)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if not input_path.exists():
        raise FileNotFoundError(f"Input file does not exist: {input_path}")

    stream = ffmpeg.input(str(input_path)).output(
        str(output_path),
        ac=channels,
        ar=sample_rate,
        format="wav",
    )
    if overwrite:
        stream = stream.overwrite_output()
    else:
        stream = stream.global_args("-n")
    stream.run(quiet=True)


def cut_wav_segment(
    input_wav: str | Path,
    start: float,
    end: float,
    output_path: str | Path,
):
    """Cut a WAV segment using ffmpeg."""

    input_wav = Path(input_wav)
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    stream = ffmpeg.input(str(input_wav), ss=start, to=end).output(
        str(output_path), acodec="copy"
    )
    stream.overwrite_output().run(quiet=True)
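For reference, a short sketch of calling these helpers in isolation; the paths and timestamps are illustrative:

```python
from pathlib import Path

from audio2sub.audio import convert_media_to_wav, cut_wav_segment

wav = Path("work/audio.wav")
convert_media_to_wav("my_video.mp4", wav)        # 16 kHz mono WAV, as the pipeline uses
cut_wav_segment(wav, 1.5, 4.0, "work/clip.wav")  # stream-copies samples between 1.5 s and 4.0 s
```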
audio2sub-0.1.0/audio2sub/cli.py
ADDED
@@ -0,0 +1,141 @@
from __future__ import annotations

import argparse
import inspect
import warnings
from pathlib import Path
from typing import Dict, Type

import torch
from tqdm.auto import tqdm

from . import __version__, segments_to_srt, transcribe, transcribers
from .transcribers import Base


def _available_transcribers() -> Dict[str, Type[Base]]:
    return {
        obj.name: obj
        for _, obj in inspect.getmembers(transcribers, inspect.isclass)
        if issubclass(obj, Base) and not inspect.isabstract(obj)
    }


def _build_backend_parser(choices: list[str]) -> argparse.ArgumentParser:
    default = "faster_whisper"
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        "-t",
        "--transcriber",
        choices=choices,
        default=default,
        help=f"Transcription backend to use (default: {default})",
    )
    return parser


def build_parser(
    available: Dict[str, Type[Base]], args=None
) -> argparse.ArgumentParser:
    backend_parser = _build_backend_parser(choices=sorted(available.keys()))
    backend_args, _remaining = backend_parser.parse_known_args(args)

    parser = argparse.ArgumentParser(
        prog="audio2sub",
        description=(
            "Convert media files to SRT subtitles using FFmpeg, Silero VAD, "
            "and transcription backends."
        ),
        parents=[backend_parser],
    )

    parser.add_argument("input", help="Path to input media file (audio or video)")
    parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="Output SRT file path",
    )
    parser.add_argument(
        "--lang",
        default=None,
        help=(
            "Language code (e.g., en, es, fr). If omitted, the backend may default to en. "
            "See https://github.com/openai/whisper/blob/main/whisper/tokenizer.py for "
            "a list of available languages."
        ),
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )

    # Let the selected backend contribute its own CLI options (e.g., --model).
    available[backend_args.transcriber].contribute_to_cli(parser)
    return parser


def main() -> int:
    if not torch.cuda.is_available():
        warnings.warn(
            "CUDA is not available; performance may be degraded significantly. "
            "For more information, please refer to the README.md of the project."
        )

    available = _available_transcribers()
    parser = build_parser(available)
    args = parser.parse_args()
    backend = args.transcriber

    input_media = Path(args.input)
    output_srt = Path(args.output)

    bars: dict[str, tqdm] = {}

    def reporter(kind: str, **payload):
        if kind == "status":
            print(payload.get("message", ""))
        if kind == "progress":
            name = payload.pop("name")
            current = payload.pop("current", 0)
            total = payload.pop("total", 0)

            bar = bars.get(name)
            if bar is None:
                bar = tqdm(
                    total=total,
                    desc=name.capitalize(),
                    leave=True,
                    **payload,
                )
                bars[name] = bar
            bar.n = current
            bar.refresh()
            if current >= total:
                bar.close()
                bars.pop(name, None)

    stats = {}
    transcriber_cls = available[backend]
    transcriber = transcriber_cls.from_cli_args(args)
    batch_opts = transcriber_cls.opts_from_cli(args)

    segments = transcribe(
        input_media,
        transcriber,
        lang=args.lang,
        reporter=reporter,
        stats=stats,
        opts=batch_opts,
    )
    segments_to_srt(segments).save(str(output_srt))

    print("Stats:")
    for k, v in stats.items():
        print(f"  {k}: {v}")
    print(f"SRT written to {output_srt}")
    return 0


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
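The CLI discovers backends by inspecting the `transcribers` package for concrete `Base` subclasses, so a new backend only needs a `name` and to be importable from that package. `base.py` is not shown in this diff, so the following is a sketch inferred solely from the hooks `cli.py` calls (`name`, `contribute_to_cli`, `from_cli_args`, `opts_from_cli`, `batch_transcribe`); the real `Base` may require overriding additional abstract methods.

```python
from typing import Iterable, Iterator, Optional

from audio2sub import Segment
from audio2sub.transcribers import Base


class Dummy(Base):
    """Hypothetical backend that labels every segment with fixed text."""

    name = "dummy"

    @classmethod
    def contribute_to_cli(cls, parser) -> None:
        parser.add_argument("--dummy-text", default="[speech]")

    @classmethod
    def from_cli_args(cls, args) -> "Dummy":
        return cls(args.dummy_text)

    @classmethod
    def opts_from_cli(cls, args) -> dict:
        return {}  # extra kwargs forwarded to batch_transcribe

    def __init__(self, text: str):
        self.text = text

    def batch_transcribe(
        self,
        segments: Iterable[Segment],
        lang: Optional[str] = None,
        stats=None,
        **opts,
    ) -> Iterator[Segment]:
        for seg in segments:
            seg.text = self.text  # a real backend would transcribe seg.audio
            yield seg
```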
audio2sub-0.1.0/audio2sub/transcribers/__init__.py
ADDED
@@ -0,0 +1,22 @@
from .base import (
    AIAPITranscriber,
    Base,
    MissingDependencyException,
    Usage,
)
from .whisper import Whisper
from .faster_whisper import FasterWhisper
from .gemini import Gemini
from audio2sub import Segment


__all__ = [
    "Base",
    "AIAPITranscriber",
    "Whisper",
    "FasterWhisper",
    "Gemini",
    "MissingDependencyException",
    "Segment",
    "Usage",
]