audio2sub-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
audio2sub/__init__.py ADDED
@@ -0,0 +1,113 @@
1
+ """Audio2Sub package: convert media to subtitles."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Callable, Iterable, List, Optional
8
+ from dataclasses import dataclass
9
+
10
+ import pysrt
11
+
12
+ __all__ = ["__version__", "transcribe", "segments_to_srt", "Segment", "Usage"]
13
+ __title__ = "audio2sub"
14
+ __description__ = "Transcribe media files to SRT subtitles."
15
+ __url__ = "https://github.com/Xavier-Lam/audio2sub"
16
+ __version__ = "0.1.0"
17
+ __author__ = "Xavier-Lam"
18
+ __author_email__ = "xavierlam7@hotmail.com"
19
+
20
+
21
+ ReporterCallback = Callable[..., None]
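+ # The reporter is invoked as reporter("status", message=...) or reporter("progress", name=..., current=..., total=..., **extra).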
22
+
23
+
24
+ @dataclass
25
+ class Segment:
26
+ index: int
27
+ start: float
28
+ end: float
29
+ text: str = ""
30
+ audio: Optional[Path] = None
31
+
32
+
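+ # Imported after Segment is defined to avoid a circular import (these modules import Segment from this package).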
33
+ from .audio import convert_media_to_wav, cut_wav_segment # noqa: E402
34
+ from .transcribers.base import Base, Usage # noqa: E402
35
+ from .vad import SileroVAD # noqa: E402
36
+
37
+
38
+ def transcribe(
39
+ input_media: str | Path,
40
+ transcriber: Base,
41
+ lang: Optional[str] = None,
42
+ reporter: Optional[ReporterCallback] = None,
43
+ stats: Optional[Usage | dict] = None,
44
+ opts: Optional[dict] = None,
45
+ ) -> List[Segment]:
46
+ """Convert media to segments using Silero VAD and batch transcription."""
47
+
48
+ input_media = Path(input_media)
49
+ if not input_media.exists():
50
+ raise FileNotFoundError(f"Input media not found: {input_media}")
51
+
52
+ _output = lambda message: reporter and reporter("status", message=message)
53
+ _progress = lambda name, current, total, **payload: reporter and reporter(
54
+ "progress",
55
+ name=name,
56
+ current=current,
57
+ total=total,
58
+ **payload,
59
+ )
60
+
61
+ with tempfile.TemporaryDirectory() as tmpdir:
62
+ wav_path = Path(tmpdir) / "audio.wav"
63
+ _output("Converting audio...")
64
+ convert_media_to_wav(input_media, wav_path)
65
+
66
+ vad = SileroVAD(sample_rate=16_000)
67
+ _output("Running voice activity detection (VAD)...")
68
+ segments = vad.detect_segments(wav_path)
69
+ if not segments:
70
+ raise RuntimeError("No speech detected by Silero VAD")
71
+ total_segments = len(segments)
72
+ _output(f"Detected {total_segments} speech segment(s).")
73
+ _output("Cutting audio into clips...")
74
+
75
+ # Attach indices and extract audio clips for each segment
76
+ for idx, seg in enumerate(segments, start=1):
77
+ seg.index = idx
78
+ seg_path = Path(tmpdir) / f"segment_{idx}.wav"
79
+ cut_wav_segment(wav_path, seg.start, seg.end, seg_path)
80
+ seg.audio = seg_path
81
+
82
+ _output("Starting transcription...")
83
+ _progress("transcription", 0, total_segments, unit="seg")
84
+
85
+ # Batch transcribe for potential backend optimizations (generator)
86
+ transcribed_segments: List[Segment] = []
87
+ completed = 0
88
+ for seg in transcriber.batch_transcribe(
89
+ segments, lang=lang, stats=stats, **(opts or {})
90
+ ):
91
+ if seg.text.strip():
92
+ transcribed_segments.append(seg)
93
+ completed += 1
94
+ _progress("transcription", completed, total_segments, unit="seg")
95
+
96
+ if len(transcribed_segments) == 0:
97
+ raise RuntimeError("Transcription produced no subtitle lines.")
98
+
99
+ _output("Transcription completed.")
100
+ return transcribed_segments
101
+
102
+
103
+ def segments_to_srt(segments: Iterable[Segment]) -> pysrt.SubRipFile:
104
+ srt = pysrt.SubRipFile()
105
+ for seg in segments:
106
+ item = pysrt.SubRipItem(
107
+ index=seg.index,
108
+ start=pysrt.SubRipTime(seconds=seg.start),
109
+ end=pysrt.SubRipTime(seconds=seg.end),
110
+ text=seg.text,
111
+ )
112
+ srt.append(item)
113
+ return srt
audio2sub/audio.py ADDED
@@ -0,0 +1,50 @@
1
+ from pathlib import Path
2
+
3
+ import ffmpeg
4
+
5
+
6
+ def convert_media_to_wav(
7
+ input_path: str | Path,
8
+ output_path: str | Path,
9
+ sample_rate: int = 16_000,
10
+ channels: int = 1,
11
+ overwrite: bool = True,
12
+ ):
13
+ """Convert a media file to WAV with the given sample rate and channel count."""
14
+
15
+ input_path = Path(input_path)
16
+ output_path = Path(output_path)
17
+ output_path.parent.mkdir(parents=True, exist_ok=True)
18
+
19
+ if not input_path.exists():
20
+ raise FileNotFoundError(f"Input file does not exist: {input_path}")
21
+
22
+ stream = ffmpeg.input(str(input_path)).output(
23
+ str(output_path),
24
+ ac=channels,
25
+ ar=sample_rate,
26
+ format="wav",
27
+ )
28
+ if overwrite:
29
+ stream = stream.overwrite_output()
30
+ else:
31
+ stream = stream.global_args("-n")
32
+ stream.run(quiet=True)
33
+
34
+
35
+ def cut_wav_segment(
36
+ input_wav: str | Path,
37
+ start: float,
38
+ end: float,
39
+ output_path: str | Path,
40
+ ):
41
+ """Cut a segment between start and end (seconds) from a WAV file using ffmpeg."""
42
+
43
+ input_wav = Path(input_wav)
44
+ output_path = Path(output_path)
45
+ output_path.parent.mkdir(parents=True, exist_ok=True)
46
+
47
+ stream = ffmpeg.input(str(input_wav), ss=start, to=end).output(
48
+ str(output_path), acodec="copy"
49
+ )
50
+ stream.overwrite_output().run(quiet=True)
audio2sub/cli.py ADDED
@@ -0,0 +1,141 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import inspect
5
+ from pathlib import Path
6
+ import warnings
7
+ from typing import Dict, Type
8
+
9
+ import torch
10
+ from tqdm.auto import tqdm
11
+
12
+ from . import __version__, segments_to_srt, transcribe, transcribers
13
+ from .transcribers import Base
14
+
15
+
16
+ def _available_transcribers() -> Dict[str, Type[Base]]:
17
+ return {
18
+ obj.name: obj
19
+ for _, obj in inspect.getmembers(transcribers, inspect.isclass)
20
+ if issubclass(obj, Base) and not inspect.isabstract(obj)
21
+ }
22
+
23
+
24
+ def _build_backend_parser(choices: list[str]) -> argparse.ArgumentParser:
25
+ default = "faster_whisper"
26
+ parser = argparse.ArgumentParser(add_help=False)
27
+ parser.add_argument(
28
+ "-t",
29
+ "--transcriber",
30
+ choices=choices,
31
+ default=default,
32
+ help=f"Transcription backend to use (default: {default})",
33
+ )
34
+ return parser
35
+
36
+
37
+ def build_parser(
38
+ available: Dict[str, Type[Base]], args=None
39
+ ) -> argparse.ArgumentParser:
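+ # Two-pass parsing: parse_known_args() first discovers which backend was selected so that backend can register its own options on the full parser via contribute_to_cli().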
40
+ backend_parser = _build_backend_parser(choices=sorted(available.keys()))
41
+ backend_args, _remaining = backend_parser.parse_known_args(args)
42
+
43
+ parser = argparse.ArgumentParser(
44
+ prog="audio2sub",
45
+ description=(
46
+ "Convert media files to SRT subtitles using FFmpeg, Silero VAD, "
47
+ "and transcription backends."
48
+ ),
49
+ parents=[backend_parser],
50
+ )
51
+
52
+ parser.add_argument("input", help="Path to input media file (audio or video)")
53
+ parser.add_argument(
54
+ "-o",
55
+ "--output",
56
+ required=True,
57
+ help="Output SRT file path",
58
+ )
59
+ parser.add_argument(
60
+ "--lang",
61
+ default=None,
62
+ help=(
63
+ "Language code (e.g., en, es, fr). If omitted, backend may default to en. "
64
+ "See https://github.com/openai/whisper/blob/main/whisper/tokenizer.py for "
65
+ "a list of available languages."
66
+ ),
67
+ )
68
+ parser.add_argument(
69
+ "--version",
70
+ action="version",
71
+ version=f"%(prog)s {__version__}",
72
+ )
73
+
74
+ available[backend_args.transcriber].contribute_to_cli(parser)
75
+ return parser
76
+
77
+
78
+ def main() -> int:
79
+ if not torch.cuda.is_available():
80
+ warnings.warn(
81
+ "CUDA is not available; performance may be degraded significantly. "
82
+ "For more information, please refer to the README.md of the project."
83
+ )
84
+
85
+ available = _available_transcribers()
86
+ parser = build_parser(available)
87
+ args = parser.parse_args()
88
+ backend = args.transcriber
89
+
90
+ input_media = Path(args.input)
91
+ output_srt = Path(args.output)
92
+
93
+ bars: dict[str, tqdm] = {}
94
+
95
+ def reporter(kind: str, **payload):
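+ # transcribe() reports two event kinds: "status" messages (printed) and named "progress" updates (rendered as tqdm bars).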
96
+ if kind == "status":
97
+ print(payload.get("message", ""))
98
+ if kind == "progress":
99
+ name = payload.pop("name")
100
+ current = payload.pop("current", 0)
101
+ total = payload.pop("total", 0)
102
+
103
+ bar = bars.get(name)
104
+ if bar is None:
105
+ bar = tqdm(
106
+ total=total,
107
+ desc=name.capitalize(),
108
+ leave=True,
109
+ **payload,
110
+ )
111
+ bars[name] = bar
112
+ bar.n = current
113
+ bar.refresh()
114
+ if current >= total:
115
+ bar.close()
116
+ bars.pop(name, None)
117
+
118
+ stats = {}
119
+ transcriber_cls = available[backend]
120
+ transcriber = transcriber_cls.from_cli_args(args)
121
+ batch_opts = transcriber_cls.opts_from_cli(args)
122
+
123
+ segments = transcribe(
124
+ input_media,
125
+ transcriber,
126
+ lang=args.lang,
127
+ reporter=reporter,
128
+ stats=stats,
129
+ opts=batch_opts,
130
+ )
131
+ segments_to_srt(segments).save(str(output_srt))
132
+
133
+ print("Stats:")
134
+ for k, v in stats.items():
135
+ print(f" {k}: {v}")
136
+ print(f"SRT written to {output_srt}")
137
+ return 0
138
+
139
+
140
+ if __name__ == "__main__": # pragma: no cover
141
+ raise SystemExit(main())
audio2sub/transcribers/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ from .base import (
2
+ AIAPITranscriber,
3
+ Base,
4
+ MissingDependencyException,
5
+ Usage,
6
+ )
7
+ from .whisper import Whisper
8
+ from .faster_whisper import FasterWhisper
9
+ from .gemini import Gemini
10
+ from audio2sub import Segment
11
+
12
+
13
+ __all__ = [
14
+ "Base",
15
+ "AIAPITranscriber",
16
+ "Whisper",
17
+ "FasterWhisper",
18
+ "Gemini",
19
+ "MissingDependencyException",
20
+ "Segment",
21
+ "Usage",
22
+ ]
audio2sub/transcribers/base.py ADDED
@@ -0,0 +1,286 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from abc import ABC, abstractmethod
6
+ import argparse
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Iterable, List, Optional, Tuple
10
+
11
+ from audio2sub import Segment
12
+
13
+
14
+ @dataclass
15
+ class Usage:
16
+ tokens_in: int = 0
17
+ tokens_out: int = 0
18
+
19
+ def export(self, stats: Optional[dict]) -> None:
20
+ if stats is None:
21
+ return
22
+ stats["tokens_in"] = self.tokens_in
23
+ stats["tokens_out"] = self.tokens_out
24
+
25
+
26
+ class Base(ABC):
27
+ """Base class for transcription backends."""
28
+
29
+ name: str = "base"
30
+
31
+ @classmethod
32
+ def contribute_to_cli(cls, parser: argparse.ArgumentParser) -> None:
33
+ """Hook for CLI option registration."""
34
+
35
+ @classmethod
36
+ def from_cli_args(cls, args: argparse.Namespace) -> "Base":
37
+ """Instantiate transcriber from CLI args."""
38
+ return cls() # pragma: no cover - overridden when needed
39
+
40
+ @classmethod
41
+ def opts_from_cli(cls, args: argparse.Namespace) -> dict:
42
+ """Extract transcriber-specific options from CLI args."""
43
+ return {}
44
+
45
+ @abstractmethod
46
+ def transcribe(
47
+ self,
48
+ audio_path: str,
49
+ lang: Optional[str] = None,
50
+ stats: Optional[dict] = None,
51
+ ) -> str:
52
+ """Transcribe a single audio segment and return text."""
53
+ raise NotImplementedError
54
+
55
+ def batch_transcribe(
56
+ self,
57
+ segments: List[Segment],
58
+ lang: Optional[str] = None,
59
+ stats: Optional[dict] = None,
60
+ ) -> Iterable[Segment]:
61
+ """Transcribe a list of segments. Yields updated segments."""
62
+ for seg in segments:
63
+ if seg.audio is None:
64
+ raise FileNotFoundError("Segment has no audio path set")
65
+ text = self.transcribe(str(seg.audio), lang=lang, stats=stats)
66
+ seg.text = text
67
+ yield seg
68
+
69
+
70
+ class AIAPITranscriber(Base, ABC):
71
+ """Base class for AI API transcribers"""
72
+
73
+ base_prompt: str = (
74
+ "You will receive multiple audio clips. Return a JSON array of objects "
75
+ "with `index` and `text` fields in the same order. Transcribe each "
76
+ "clip verbatim (no paraphrasing). Omit non-speech clips or return "
77
+ "empty text."
78
+ )
79
+
80
+ default_model: str = ""
81
+ default_chunk: int = 20
82
+
83
+ api_key_env_var: Optional[str] = None
84
+
85
+ def __init__(self, model="", api_key=None) -> None:
86
+ self.model = model or self.default_model
87
+ self.api_key = api_key
88
+
89
+ @classmethod
90
+ def contribute_to_cli(cls, parser: argparse.ArgumentParser) -> None:
91
+ parser.add_argument(
92
+ "--model",
93
+ default=cls.default_model or None,
94
+ help=(
95
+ "Model name to use"
96
+ + (f" (default: {cls.default_model})" if cls.default_model else "")
97
+ ),
98
+ )
99
+ parser.add_argument(
100
+ "--api-key",
101
+ dest="api_key",
102
+ required=False,
103
+ help=(
104
+ f"API key (optional; env {cls.api_key_env_var or 'API key env var'} "
105
+ "is used if not provided)"
106
+ ),
107
+ )
108
+
109
+ # Add batch transcription options
110
+ parser.add_argument(
111
+ "--chunk",
112
+ type=int,
113
+ default=cls.default_chunk,
114
+ help=("Number of clips per API request " f"(default: {cls.default_chunk})"),
115
+ )
116
+ parser.add_argument(
117
+ "--outline",
118
+ dest="outline",
119
+ required=False,
120
+ help=("Context outline to guide transcription"),
121
+ )
122
+ parser.add_argument(
123
+ "--prompt",
124
+ dest="prompt",
125
+ required=False,
126
+ help=("Additional system prompt/instructions"),
127
+ )
128
+
129
+ @classmethod
130
+ def from_cli_args(cls, args: argparse.Namespace) -> "AIAPITranscriber":
131
+ return cls(model=args.model, api_key=args.api_key)
132
+
133
+ @classmethod
134
+ def opts_from_cli(cls, args: argparse.Namespace) -> dict:
135
+ return {
136
+ "chunk": args.chunk,
137
+ "outline": args.outline,
138
+ "prompt": args.prompt,
139
+ }
140
+
141
+ def transcribe(
142
+ self, audio_path: str, lang: Optional[str] = None, stats: Optional[dict] = None
143
+ ) -> str:
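+ # Single-clip convenience wrapper: reuses batch_transcribe with a one-element batch.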
144
+ segments_iter = self.batch_transcribe(
145
+ [Segment(index=1, start=0, end=0, audio=Path(audio_path))],
146
+ lang=lang,
147
+ stats=stats,
148
+ )
149
+ segments = list(segments_iter)
150
+ return segments[0].text if segments else ""
151
+
152
+ def batch_transcribe(
153
+ self,
154
+ segments: List[Segment],
155
+ lang: Optional[str] = None,
156
+ stats: Optional[dict] = None,
157
+ chunk: Optional[int] = None,
158
+ outline: Optional[str] = None,
159
+ prompt: Optional[str] = None,
160
+ ) -> Iterable[Segment]:
161
+ """Transcribe segments with shared chunking, prompt, and stats."""
162
+
163
+ chunk_size = chunk if chunk and chunk > 0 else self.default_chunk
164
+ prompt_cfg = self._build_prompt(lang=lang, outline=outline, prompt=prompt)
165
+ client = self._ensure_client()
166
+ usage_tracker = Usage()
167
+
168
+ for batch in self._iter_chunks(segments, chunk_size):
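+ # Each chunk becomes a single API request; cumulative usage is written back to stats after every batch.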
169
+ raw_text, usage = self._request_transcription(client, batch, prompt_cfg)
170
+ self._parse_response_text(raw_text, batch)
171
+ if usage:
172
+ usage_tracker.tokens_in += usage.tokens_in
173
+ usage_tracker.tokens_out += usage.tokens_out
174
+ usage_tracker.export(stats)
175
+ for seg in batch:
176
+ yield seg
177
+
178
+ def _ensure_client(self):
179
+ if getattr(self, "_client", None):
180
+ return self._client
181
+ self._client = self._create_client()
182
+ return self._client
183
+
184
+ @abstractmethod
185
+ def _create_client(self):
186
+ """Instantiate the API client."""
187
+
188
+ @abstractmethod
189
+ def _request_transcription(
190
+ self,
191
+ client,
192
+ batch: List[Segment],
193
+ prompt: List[str],
194
+ ) -> Tuple[str, Optional[Usage]]:
195
+ """Call the provider and return (raw_text_response, Usage)."""
196
+
197
+ def _iter_chunks(self, items: List[Segment], size: int) -> Iterable[List[Segment]]:
198
+ if size <= 0:
199
+ size = len(items)
200
+ for i in range(0, len(items), size):
201
+ yield items[i : i + size]
202
+
203
+ def _build_prompt(
204
+ self,
205
+ lang: Optional[str],
206
+ outline: Optional[str],
207
+ prompt: Optional[str],
208
+ ) -> List[str]:
209
+ prompt_text = self.base_prompt
210
+ if lang:
211
+ prompt_text += (
212
+ " Primary language is "
213
+ f"{lang}, but audio may include other languages."
214
+ )
215
+ prompt_text += (
216
+ " Each object's `text` must be the transcription of that specific "
217
+ "clip, with no labels or formatting. Respond as plain JSON text "
218
+ "only; do not include markdown or code fences such as ``` or "
219
+ "other wrappers."
220
+ )
221
+ prompt_text += (
222
+ '\nRespond with JSON array of objects: [{"index": <clip '
223
+ 'index>, "text": <transcription>}, ...].'
224
+ )
225
+
226
+ system_prompts = [prompt_text]
227
+ if outline:
228
+ system_prompts.append(
229
+ "Outline to guide transcription (context only). *Use the outline "
230
+ "only to make minor corrections to what you hear in the audio "
231
+ "(for example: fix homophones, obvious mis-hearings, or minor "
232
+ "punctuation). Do NOT use the outline or any external knowledge "
233
+ "to create or add words that are not present in the audio*:\n" + outline
234
+ )
235
+ if prompt:
236
+ system_prompts.append("Additional instructions:\n" + prompt)
237
+
238
+ # Return raw list of system prompts (was PromptConfig.system_prompts)
239
+ return system_prompts
240
+
241
+ def _parse_response_text(self, raw_text: str, batch: List[Segment]) -> None:
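+ # Segments missing from the model's JSON reply keep empty text; transcribe() later drops segments whose text is blank.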
242
+ raw_text = raw_text.strip()
243
+ parsed: List[dict] = json.loads(raw_text)
244
+
245
+ by_index = {
246
+ entry.get("index"): entry
247
+ for entry in parsed
248
+ if isinstance(entry, dict) and "index" in entry
249
+ }
250
+
251
+ for idx, seg in enumerate(batch):
252
+ entry = by_index.get(seg.index)
253
+ if entry:
254
+ seg.text = entry.get("text", "").strip()
255
+
256
+ def _segments_to_audio_bytes(
257
+ self, batch: List[Segment]
258
+ ) -> List[Tuple[Segment, bytes]]:
259
+ payloads: List[Tuple[Segment, bytes]] = []
260
+ for seg in batch:
261
+ if seg.audio is None:
262
+ raise FileNotFoundError("Segment has no audio path set")
263
+ audio_path = Path(seg.audio)
264
+ if not audio_path.exists():
265
+ raise FileNotFoundError(f"Audio not found: {audio_path}")
266
+ payloads.append((seg, audio_path.read_bytes()))
267
+ return payloads
268
+
269
+ def _resolve_api_key(self) -> str:
270
+ api_key = self.api_key
271
+ if not api_key and self.api_key_env_var:
272
+ api_key = os.getenv(self.api_key_env_var)
273
+ if not api_key:
274
+ env_hint = self.api_key_env_var or "API key"
275
+ raise RuntimeError(f"{env_hint} is required for {self.name} transcriber.")
276
+ return api_key
277
+
278
+
279
+ class MissingDependencyException(RuntimeError):
280
+ def __init__(self, transcriber) -> None:
281
+ name = transcriber.name
282
+ msg = (
283
+ f"Transcriber '{name}' is not installed. Install with `pip install "
284
+ f"audio2sub[{name}]`."
285
+ )
286
+ super().__init__(msg)
audio2sub/transcribers/faster_whisper.py ADDED
@@ -0,0 +1,65 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from .base import Base, MissingDependencyException
8
+
9
+
10
+ class FasterWhisper(Base):
11
+ """Transcriber using faster-whisper (ctranslate2 backend)."""
12
+
13
+ name = "faster_whisper"
14
+
15
+ def __init__(self, model_name: str = "turbo") -> None:
16
+ self.model_name = model_name
17
+ self._model = None
18
+
19
+ @classmethod
20
+ def contribute_to_cli(cls, parser: argparse.ArgumentParser) -> None:
21
+ parser.add_argument(
22
+ "--model",
23
+ default="turbo",
24
+ help="Faster-Whisper model name (default: turbo)",
25
+ )
26
+
27
+ @classmethod
28
+ def from_cli_args(cls, args: argparse.Namespace) -> "FasterWhisper":
29
+ return cls(model_name=args.model)
30
+
31
+ def transcribe(
32
+ self,
33
+ audio_path: str,
34
+ lang: Optional[str] = None,
35
+ stats: Optional[dict] = None,
36
+ ) -> str:
37
+ model = self._ensure_model()
38
+
39
+ audio_path = Path(audio_path)
40
+ if not audio_path.exists():
41
+ raise FileNotFoundError(f"Audio not found: {audio_path}")
42
+
43
+ segments, _info = model.transcribe(
44
+ str(audio_path),
45
+ language=lang,
46
+ )
47
+
48
+ return " ".join(seg.text.strip() for seg in segments).strip()
49
+
50
+ def _ensure_model(self): # pragma: no cover - exercised in integration
51
+ if self._model is not None:
52
+ return self._model
53
+ try:
54
+ import torch
55
+ from faster_whisper import WhisperModel
56
+ except ImportError as exc:
57
+ raise MissingDependencyException(self) from exc
58
+
59
+ device = "cuda" if torch.cuda.is_available() else "cpu"
60
+ compute_type = "float16" if device == "cuda" else "int8"
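+ # float16 keeps GPU inference fast; int8 quantization is the CPU fallback for the CTranslate2 model.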
61
+
62
+ self._model = WhisperModel(
63
+ self.model_name, device=device, compute_type=compute_type
64
+ )
65
+ return self._model
audio2sub/transcribers/gemini.py ADDED
@@ -0,0 +1,55 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Optional, Tuple
4
+
5
+ from audio2sub import Segment
6
+ from .base import AIAPITranscriber, MissingDependencyException, Usage
7
+
8
+
9
+ class Gemini(AIAPITranscriber):
10
+ """Transcriber using Gemini API (google-genai)."""
11
+
12
+ name = "gemini"
13
+ default_model = "gemini-2.5-flash"
14
+ api_key_env_var = "GEMINI_API_KEY"
15
+
16
+ def _create_client(self):
17
+ try:
18
+ from google import genai
19
+ except ImportError as exc:
20
+ raise MissingDependencyException(self) from exc
21
+
22
+ api_key = self._resolve_api_key()
23
+ return genai.Client(api_key=api_key)
24
+
25
+ def _request_transcription(
26
+ self,
27
+ client,
28
+ batch: List[Segment],
29
+ prompt: List[str],
30
+ ) -> Tuple[str, Optional[Usage]]:
31
+ parts = [{"text": "\n\n".join(prompt)}]
32
+ parts.extend(self._build_parts(batch=batch))
33
+ contents = [{"role": "user", "parts": parts}]
34
+
35
+ response = client.models.generate_content(
36
+ model=self.model,
37
+ contents=contents,
38
+ )
39
+ raw_text = response.text.strip() if hasattr(response, "text") else ""
40
+
41
+ usage = Usage(
42
+ tokens_in=getattr(response.usage_metadata, "prompt_token_count", 0),
43
+ tokens_out=getattr(response.usage_metadata, "candidates_token_count", 0),
44
+ )
45
+ return raw_text, usage
46
+
47
+ def _build_parts(self, batch: List[Segment]) -> List[dict]:
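+ # Interleave a "Clip N" label before each inline WAV so the model can map its JSON reply back to segment indices.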
48
+ parts: List[dict] = []
49
+ for seg, audio_bytes in self._segments_to_audio_bytes(batch):
50
+ parts.append({"text": f"Clip {seg.index}"})
51
+ parts.append(
52
+ {"inline_data": {"mime_type": "audio/wav", "data": audio_bytes}}
53
+ )
54
+
55
+ return parts
audio2sub/transcribers/whisper.py ADDED
@@ -0,0 +1,65 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ from .base import Base, MissingDependencyException
8
+
9
+
10
+ class Whisper(Base):
11
+ """Whisper-based transcriber (openai/whisper) for single audio segments."""
12
+
13
+ name = "whisper"
14
+
15
+ def __init__(self, model_name: str = "turbo") -> None:
16
+ self.model_name = model_name
17
+ self._model = None
18
+
19
+ @classmethod
20
+ def contribute_to_cli(cls, parser: argparse.ArgumentParser) -> None:
21
+ parser.add_argument(
22
+ "--model",
23
+ default="turbo",
24
+ help="Whisper model name (default: turbo)",
25
+ )
26
+
27
+ @classmethod
28
+ def from_cli_args(cls, args: argparse.Namespace) -> "Whisper":
29
+ return cls(args.model)
30
+
31
+ def transcribe(
32
+ self,
33
+ audio_path: str,
34
+ lang: Optional[str] = None,
35
+ stats: Optional[dict] = None,
36
+ ) -> str:
37
+ model, whisper = self._ensure_model()
38
+
39
+ audio_path = Path(audio_path)
40
+ if not audio_path.exists():
41
+ raise FileNotFoundError(f"Audio not found: {audio_path}")
42
+
43
+ audio = whisper.load_audio(str(audio_path))
44
+ result = model.transcribe(
45
+ audio,
46
+ language=lang or "en",
47
+ task="transcribe",
48
+ fp16=model.device.type == "cuda",
49
+ )
50
+ text = result.get("text", "")
51
+ return str(text).strip()
52
+
53
+ def _ensure_model(self):
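+ # Returns both the loaded model and the whisper module so transcribe() can call whisper.load_audio without a module-level import of the optional dependency.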
54
+ try:
55
+ import torch
56
+ import whisper
57
+ except ImportError as exc:
58
+ raise MissingDependencyException(self) from exc
59
+
60
+ if self._model is not None:
61
+ return self._model, whisper
62
+
63
+ device = "cuda" if torch.cuda.is_available() else "cpu"
64
+ self._model = whisper.load_model(self.model_name, device=device)
65
+ return self._model, whisper
audio2sub/vad.py ADDED
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import List
5
+
6
+ import ffmpeg
7
+ import numpy as np
8
+ import torch
9
+
10
+ from audio2sub import Segment
11
+
12
+
13
+ class SileroVAD:
14
+ """Thin wrapper around snakers4/silero-vad for speech timestamp detection."""
15
+
16
+ def __init__(
17
+ self,
18
+ threshold: float = 0.5,
19
+ min_silence_duration: float = 0.5,
20
+ window_size_samples: int = 512,
21
+ sample_rate: int = 16_000,
22
+ ) -> None:
23
+ self.threshold = threshold
24
+ self.min_silence_duration = min_silence_duration
25
+ self.window_size_samples = window_size_samples
26
+ self.sample_rate = sample_rate
27
+
28
+ def detect_segments(self, wav_path: str | Path) -> List[Segment]:
29
+ try:
30
+ model, utils = torch.hub.load(
31
+ repo_or_dir="snakers4/silero-vad",
32
+ model="silero_vad",
33
+ force_reload=False,
34
+ onnx=False,
35
+ trust_repo=True,
36
+ )
37
+ get_speech_timestamps = utils[0]
38
+ except Exception as exc:
39
+ raise RuntimeError(f"Failed to load silero-vad: {exc}") from exc
40
+
41
+ # Read WAV via ffmpeg pipe (float32 mono at target sample rate)
42
+ process = (
43
+ ffmpeg.input(str(wav_path))
44
+ .output(
45
+ "pipe:",
46
+ format="f32le",
47
+ ac=1,
48
+ ar=self.sample_rate,
49
+ )
50
+ .run(capture_stdout=True, capture_stderr=True)
51
+ )
52
+ audio_bytes, stderr = process
53
+
54
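+ # np.frombuffer returns a read-only view of the stdout bytes; copy() gives torch.from_numpy a writable array.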
+ wav_np = np.frombuffer(audio_bytes, dtype=np.float32).copy()
55
+ if wav_np.size == 0:
56
+ raise RuntimeError("No audio data decoded from WAV.")
57
+ wav = torch.from_numpy(wav_np)
58
+
59
+ timestamps = get_speech_timestamps(
60
+ wav,
61
+ model,
62
+ sampling_rate=self.sample_rate,
63
+ threshold=self.threshold,
64
+ min_silence_duration_ms=int(self.min_silence_duration * 1000),
65
+ window_size_samples=self.window_size_samples,
66
+ )
67
+
68
+ segments: List[Segment] = []
69
+ for idx, ts in enumerate(timestamps, start=1):
70
+ start = ts.get("start", 0) / self.sample_rate
71
+ end = ts.get("end", 0) / self.sample_rate
72
+ if end > start:
73
+ segments.append(Segment(index=idx, start=start, end=end))
74
+ return segments
audio2sub-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,116 @@
1
+ Metadata-Version: 2.4
2
+ Name: audio2sub
3
+ Version: 0.1.0
4
+ Summary: Transcribe media files to SRT subtitles.
5
+ Home-page: https://github.com/Xavier-Lam/audio2sub
6
+ Author: Xavier-Lam
7
+ Author-email: xavierlam7@hotmail.com
8
+ License: MIT
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Classifier: Topic :: Multimedia :: Sound/Audio
19
+ Classifier: Topic :: Multimedia :: Video
20
+ Classifier: Topic :: Text Processing :: Linguistic
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: torch>=2.1.0
25
+ Requires-Dist: torchaudio>=2.1.0
26
+ Requires-Dist: ffmpeg-python>=0.2.0
27
+ Requires-Dist: pysrt>=1.1.2
28
+ Requires-Dist: tqdm
29
+ Requires-Dist: onnxruntime<2,>=1.14
30
+ Requires-Dist: numpy
31
+ Provides-Extra: faster-whisper
32
+ Requires-Dist: faster-whisper>=1.0.1; extra == "faster-whisper"
33
+ Provides-Extra: whisper
34
+ Requires-Dist: openai-whisper>=20231117; extra == "whisper"
35
+ Provides-Extra: gemini
36
+ Requires-Dist: google-genai>=1.0.0; extra == "gemini"
37
+ Provides-Extra: dev
38
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
39
+ Requires-Dist: openai-whisper>=20231117; extra == "dev"
40
+ Requires-Dist: faster-whisper>=1.0.1; extra == "dev"
41
+ Requires-Dist: google-genai>=1.0.0; extra == "dev"
42
+ Provides-Extra: all
43
+ Requires-Dist: openai-whisper>=20231117; extra == "all"
44
+ Requires-Dist: faster-whisper>=1.0.1; extra == "all"
45
+ Requires-Dist: google-genai>=1.0.0; extra == "all"
46
+ Dynamic: author
47
+ Dynamic: author-email
48
+ Dynamic: classifier
49
+ Dynamic: description
50
+ Dynamic: description-content-type
51
+ Dynamic: home-page
52
+ Dynamic: license
53
+ Dynamic: license-file
54
+ Dynamic: provides-extra
55
+ Dynamic: requires-dist
56
+ Dynamic: requires-python
57
+ Dynamic: summary
58
+
59
+ # Audio2Sub
60
+
61
+ **Audio2Sub** is a command-line tool that automatically transcribes audio from video or audio files and generates subtitles in the `.srt` format. It uses FFmpeg for media handling, [Silero VAD](https://github.com/snakers4/silero-vad) for precise voice activity detection, and supports multiple transcription backends to convert speech to text.
62
+
63
+ ## Installation
64
+
65
+ Before installing, you must have [FFmpeg](https://ffmpeg.org/download.html) installed and available in your system's PATH.
66
+
67
+ You can install Audio2Sub with `pip`. The command below installs the package together with the default `faster_whisper` backend.
68
+
69
+ ```bash
70
+ pip install audio2sub[faster_whisper]
71
+ ```
72
+
73
+ To install with a different backend, see the table in the [Backends](#backends) section below.
74
+
75
+ ## Usage
76
+ ### Basic Example
77
+
78
+ ```bash
79
+ audio2sub my_video.mp4 -o my_video.srt --lang en
80
+ ```
81
+
82
+ This command will transcribe the audio from `my_video.mp4` into English and save the subtitles to `my_video.srt`.
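+
+ The same pipeline can also be driven from Python. Below is a minimal sketch (assuming the `faster_whisper` extra is installed) using the `transcribe`, `segments_to_srt`, and `FasterWhisper` names this package exports:
+
+ ```python
+ from audio2sub import segments_to_srt, transcribe
+ from audio2sub.transcribers import FasterWhisper
+
+ # Detect speech with Silero VAD, transcribe each clip, and write an SRT file.
+ segments = transcribe(
+     "my_video.mp4",
+     transcriber=FasterWhisper(model_name="turbo"),
+     lang="en",
+ )
+ segments_to_srt(segments).save("my_video.srt")
+ ```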
83
+
84
+ **Notes:**
85
+ * **First-Time Use**: The first time you run the program, it will download the necessary transcription models. This may take some time and require significant disk space.
86
+ * **CUDA**: Performance is significantly degraded without CUDA when using Whisper-based local models, and the program emits a warning at startup if CUDA is not available. If your system has a compatible GPU, install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive) first. If you are sure CUDA is installed correctly and still see the warning, you may need to [reinstall a compatible PyTorch version manually](https://pytorch.org/get-started/locally/). Note that reinstalling PyTorch with a different version may break other dependencies; in that case, reinstall them as indicated by the warnings shown.
87
+
88
+ ### Using a Different Transcriber
89
+
90
+ Use the `-t` or `--transcriber` flag to select a different backend.
91
+
92
+ ```bash
93
+ audio2sub my_audio.wav -o my_audio.srt --lang en -t whisper --model medium
94
+ ```
95
+
96
+ Each transcriber has its own options. To see them, use `--help` with the transcriber specified.
97
+
98
+ ```bash
99
+ audio2sub -t faster_whisper --help
100
+ ```
101
+
102
+ ## Backends
103
+
104
+ Audio2Sub supports the following transcription backends.
105
+
106
+ | Backend Name | Description |
107
+ | --- | --- |
108
+ | `faster_whisper` | A faster reimplementation of Whisper using CTranslate2. See [Faster Whisper](https://github.com/guillaumekln/faster-whisper). This is the default backend. |
109
+ | `whisper` | The original speech recognition model by OpenAI. See [OpenAI Whisper](https://github.com/openai/whisper). |
110
+ | `gemini` | Google's Gemini models via the Gemini API. Requires the `GEMINI_API_KEY` environment variable or the `--api-key` argument. |
111
+
112
+ Install support for a backend with `pip install audio2sub[<backend>]`, then select it at runtime with the `-t` flag.
113
+
114
+ ## Contributing
115
+
116
+ Contributions are welcome! Please open an issue or submit a pull request on the GitHub repository.
audio2sub-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
1
+ audio2sub/__init__.py,sha256=vtNHWilMeUJh8aMQqeWqIq_hWNr4TJ_kawfdz2XSxrQ,3661
2
+ audio2sub/audio.py,sha256=WsSJNqT62Q-7aq4JwD4bpNRlTMjNKcAMr451sOo82tY,1226
3
+ audio2sub/cli.py,sha256=rh2QxNfqKsXtsuhmZpJIyYxMrOYnoT8bq9eU93tegQQ,3930
4
+ audio2sub/vad.py,sha256=0s710xcdupOvGcLMtadUcB_86oKeK4z9Klh5CR_mZqs,2360
5
+ audio2sub/transcribers/__init__.py,sha256=NnwNvg_RXXXNcSctcHSDKMRFbZ0-uW_ABiWX2YJKyiw,389
6
+ audio2sub/transcribers/base.py,sha256=-uwK7xfJ_i9yEPg9g3bD8XXiPnpGhajSGhWmSobThZE,9508
7
+ audio2sub/transcribers/faster_whisper.py,sha256=Dc5KXsmZbdrS71r3KBQH_ExOYYXY4NqEu8ZKGqigKuQ,1898
8
+ audio2sub/transcribers/gemini.py,sha256=-67zkMnhB4ruxWCLspVY4JVXWZgkcNL72RpVRruELno,1741
9
+ audio2sub/transcribers/whisper.py,sha256=2iQP6kqqL1CzkZeSfcIGYyutT8DyWSurY-IO3NRCVBk,1840
10
+ audio2sub-0.1.0.dist-info/licenses/LICENSE,sha256=NoqtIpP2SxhnVDs1RxKe2oLL5QdcHxyg5Ytzvenh5w0,1076
11
+ audio2sub-0.1.0.dist-info/METADATA,sha256=cQsgoEjP-wmjPS5vcGUlNg9rEoLk2WjJ0qOZvdKD_6c,5013
12
+ audio2sub-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
13
+ audio2sub-0.1.0.dist-info/entry_points.txt,sha256=8dJOZbTc4JrkIXJbWauaCqoZeASo76z6jTWC_kFPpvU,49
14
+ audio2sub-0.1.0.dist-info/top_level.txt,sha256=LAGOkV7oCPKbeFHyx_U0tM5_vj7X3BiG_FbrizXM1JI,10
15
+ audio2sub-0.1.0.dist-info/RECORD,,
audio2sub-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
audio2sub-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ audio2sub = audio2sub.cli:main
audio2sub-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2026 Xavier-Lam
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
audio2sub-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ audio2sub