subs-down-n-sync 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- subs_down_n_sync/__init__.py +5 -0
- subs_down_n_sync/__main__.py +3 -0
- subs_down_n_sync/cli.py +324 -0
- subs_down_n_sync/core.py +538 -0
- subs_down_n_sync/exceptions.py +31 -0
- subs_down_n_sync-1.0.0.dist-info/METADATA +195 -0
- subs_down_n_sync-1.0.0.dist-info/RECORD +10 -0
- subs_down_n_sync-1.0.0.dist-info/WHEEL +5 -0
- subs_down_n_sync-1.0.0.dist-info/entry_points.txt +2 -0
- subs_down_n_sync-1.0.0.dist-info/top_level.txt +1 -0
subs_down_n_sync/cli.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.panel import Panel
|
|
10
|
+
from rich.progress import (
|
|
11
|
+
BarColumn,
|
|
12
|
+
MofNCompleteColumn,
|
|
13
|
+
Progress,
|
|
14
|
+
SpinnerColumn,
|
|
15
|
+
TaskID,
|
|
16
|
+
TextColumn,
|
|
17
|
+
TimeElapsedColumn,
|
|
18
|
+
)
|
|
19
|
+
from rich.table import Table
|
|
20
|
+
|
|
21
|
+
from subs_down_n_sync.core import DEFAULT_LANG, VIDEO_EXTENSIONS, RunSummary, run
|
|
22
|
+
from subs_down_n_sync.exceptions import SubsDownError
|
|
23
|
+
|
|
24
|
+
MAX_PARALLEL_WORKERS = 2
|
|
25
|
+
|
|
26
|
+
console = Console()
|
|
27
|
+
err_console = Console(stderr=True)
|
|
28
|
+
|
|
29
|
+
_STEP_LABELS: dict[str, str] = {
|
|
30
|
+
"validando": "Validando vídeo...",
|
|
31
|
+
"buscando": "Buscando legenda no OpenSubtitles...",
|
|
32
|
+
"baixado": "Legenda encontrada e baixada",
|
|
33
|
+
"referencia": "Baixando legenda EN de referência...",
|
|
34
|
+
"sem_referencia": "Referência EN não encontrada — sincronização ignorada",
|
|
35
|
+
"sincronizando": "Alinhando com embeddings semânticos...",
|
|
36
|
+
"sincronizado": "Sincronização concluída",
|
|
37
|
+
"sem_sync": "Legenda já sincronizada",
|
|
38
|
+
"erro_sync": "Erro na sincronização",
|
|
39
|
+
"concluido": "Finalizado",
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
_STEPS_WITH_SYNC = [
|
|
43
|
+
"validando",
|
|
44
|
+
"buscando",
|
|
45
|
+
"baixado",
|
|
46
|
+
"referencia",
|
|
47
|
+
"sincronizando",
|
|
48
|
+
"concluido",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
_STEPS_NO_SYNC = [
|
|
52
|
+
"validando",
|
|
53
|
+
"buscando",
|
|
54
|
+
"baixado",
|
|
55
|
+
"concluido",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _make_progress() -> Progress:
    """Build the progress display used while processing a single video.

    Columns: spinner, bold-blue task description, dimmed ``detail`` field
    and elapsed time. The display is not transient, so it stays on screen
    once stopped.
    """
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        TextColumn("[dim]{task.fields[detail]}"),
        TimeElapsedColumn(),
    )
    return Progress(*columns, console=console, transient=False)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _print_summary(summary: RunSummary) -> None:
    """Print a bordered panel describing the outcome of a single-video run.

    The status line and border colour are chosen in priority order:
    a sync failure (yellow), a subtitle that was re-timed (green), or a
    subtitle left untouched (cyan).
    """
    if summary.sync_error:
        border = "yellow"
        status_line = (
            "[yellow]Aviso:[/yellow] sincronização falhou — legenda original mantida.\n"
            f"Detalhe: {summary.sync_error}"
        )
    elif summary.synced:
        border = "green"
        status_line = (
            "[green]Sincronizada[/green] "
            f"(ajuste médio: {summary.offset_seconds:.2f}s, modo: {summary.sync_mode})"
        )
    else:
        border = "cyan"
        status_line = (
            f"[cyan]Já sincronizada[/cyan] (offset médio: {summary.offset_seconds:.2f}s < 0.10s)"
        )

    header = " | ".join(
        (
            f"Idioma: [bold]{summary.lang_tag}[/bold]",
            f"Provider: {summary.provider}",
            f"Match: {summary.match_type}",
        )
    )
    footer = f"Tempo total: {summary.elapsed_seconds:.2f}s"
    body = "\n".join((header, status_line, footer))

    console.print(Panel(body, title="subs-down-n-sync", border_style=border))
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _make_batch_progress() -> Progress:
    """Build the progress display used while processing a directory.

    Same layout as the single-video display plus a bar and an
    "M of N" counter, which the overall batch task uses.
    """
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TextColumn("[dim]{task.fields[detail]}"),
        TimeElapsedColumn(),
    )
    return Progress(*columns, console=console, transient=False)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _process_video(
    video: Path,
    lang_tag: str,
    progress: Progress,
) -> RunSummary:
    """Run the pipeline for one video, mirroring its steps on ``progress``.

    A dedicated task row is added for the video and is always removed
    again, whether ``run`` returns or raises.
    """
    file_name = video.name
    task_id = progress.add_task(file_name, detail="aguardando...", total=None)

    def on_progress(step: str, detail: str) -> None:
        # Unknown step keys are shown verbatim.
        step_label = _STEP_LABELS.get(step, step)
        progress.update(task_id, description=f"{file_name} — {step_label}", detail=detail)

    try:
        summary = run(str(video), lang_tag=lang_tag, on_progress=on_progress)
    finally:
        progress.remove_task(task_id)
    return summary
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _run_directory(
    dir_path: Path,
    lang_tag: str,
    overwrite: bool,
    parallel: bool = False,
) -> tuple[list[RunSummary], list[Path], list[tuple[Path, str]]]:
    """Process every video file found recursively under ``dir_path``.

    Args:
        dir_path: Directory to scan.
        lang_tag: Language tag used both for the search and for the
            ``<video stem>.<lang_tag>.srt`` output name.
        overwrite: When false, videos that already have that output file
            are skipped.
        parallel: When true, videos are processed by a thread pool of
            ``MAX_PARALLEL_WORKERS`` workers instead of sequentially.

    Returns:
        ``(results, skipped, errors)``: summaries of processed videos,
        skipped video paths, and ``(video, message)`` pairs for videos
        whose processing raised ``SubsDownError``.
    """
    # Only regular files: a directory whose name ends in a video extension
    # would otherwise be reported as a failed video.
    videos = sorted(
        p for p in dir_path.rglob("*") if p.is_file() and p.suffix.lower() in VIDEO_EXTENSIONS
    )

    results: list[RunSummary] = []
    skipped: list[Path] = []
    errors: list[tuple[Path, str]] = []

    to_process: list[Path] = []
    for video in videos:
        # Must match the name finalize_output_path() writes, which replaces
        # only the video extension. Stripping the suffix first would also
        # drop a dotted part of the stem ("Movie.2020.mkv" -> "Movie.<lang>.srt")
        # and the existing subtitle would never be found.
        srt_path = video.with_suffix(f".{lang_tag}.srt")
        if srt_path.exists() and not overwrite:
            skipped.append(video)
            continue
        to_process.append(video)

    if not to_process:
        return results, skipped, errors

    progress = _make_batch_progress()
    overall = progress.add_task(
        f"Lote ({len(to_process)} vídeo(s))",
        detail="",
        total=len(to_process),
    )

    with progress:
        if parallel:
            with ThreadPoolExecutor(max_workers=MAX_PARALLEL_WORKERS) as pool:
                futures = {
                    pool.submit(_process_video, v, lang_tag, progress): v for v in to_process
                }
                # Results are collected in completion order, not input order.
                for fut in as_completed(futures):
                    video = futures[fut]
                    try:
                        results.append(fut.result())
                    except SubsDownError as e:
                        errors.append((video, str(e)))
                    progress.advance(overall)
        else:
            for video in to_process:
                try:
                    results.append(_process_video(video, lang_tag, progress))
                except SubsDownError as e:
                    errors.append((video, str(e)))
                progress.advance(overall)

    return results, skipped, errors
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _print_batch_summary(
    results: list[RunSummary],
    skipped: list[Path],
    errors: list[tuple[Path, str]],
) -> None:
    """Print the per-file table and the totals line for a directory run.

    Processed files come first, then skipped ones, then failures. The
    error messages themselves are written to stderr after the totals.
    """
    table = Table(title="subs-down-n-sync — lote", show_lines=False)
    table.add_column("Arquivo", style="bold")
    for heading in ("Status", "Idioma", "Provider", "Offset"):
        table.add_column(heading)

    for s in results:
        if s.sync_error:
            status = "[yellow]aviso[/yellow]"
        elif s.synced:
            status = "[green]sincronizado[/green]"
        else:
            status = "[cyan]ok[/cyan]"
        table.add_row(s.output_path.name, status, s.lang_tag, s.provider, f"{s.offset_seconds:.2f}s")

    for skipped_path in skipped:
        table.add_row(skipped_path.name, "[dim]pulado[/dim]", "-", "-", "-")

    for failed_path, _msg in errors:
        table.add_row(failed_path.name, "[red]erro[/red]", "-", "-", "-")

    console.print(table)

    # One fragment per non-empty category, in a fixed order.
    categories = (
        (results, "[green]{} processado(s)[/green]"),
        (skipped, "[dim]{} pulado(s)[/dim]"),
        (errors, "[red]{} erro(s)[/red]"),
    )
    parts = [template.format(len(items)) for items, template in categories if items]

    if parts:
        console.print(" ".join(parts))
    else:
        console.print("[dim]Nenhum vídeo encontrado.[/dim]")

    if not errors:
        return

    console.print()
    for failed_path, msg in errors:
        err_console.print(f"[bold red]Erro[/bold red] {failed_path.name}: {msg}")
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser.

    Arguments: a positional ``path`` (video file or directory), ``-l/--lang``
    (defaults to ``DEFAULT_LANG``), and the boolean flags ``-o/--overwrite``
    and ``-p/--parallel``.
    """
    lang_help = f"Código de idioma BCP 47 (ex: pt-BR, en, es). Default: {DEFAULT_LANG}."
    overwrite_help = (
        "Sobrescrever legendas existentes. Por padrão, vídeos com legenda "
        "já existente são pulados."
    )
    parallel_help = (
        f"Processar vídeos em paralelo (até {MAX_PARALLEL_WORKERS} simultâneos) "
        "quando o caminho for um diretório."
    )

    parser = argparse.ArgumentParser(
        prog="subs-down-n-sync",
        description="Busca e sincroniza legenda para arquivo(s) de vídeo.",
    )
    parser.add_argument("path", help="Caminho para arquivo de vídeo ou diretório.")
    parser.add_argument("-l", "--lang", default=DEFAULT_LANG, help=lang_help)
    parser.add_argument(
        "-o", "--overwrite", action="store_true", default=False, help=overwrite_help
    )
    parser.add_argument(
        "-p", "--parallel", action="store_true", default=False, help=parallel_help
    )
    return parser
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        Process exit code: 0 on success, 1 when the path does not exist,
        when the single video fails with ``SubsDownError``, or when any
        video of a directory run failed.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    p = Path(args.path).expanduser()

    # Directory mode: batch processing with its own summary table.
    if p.is_dir():
        results, skipped, errors = _run_directory(
            p,
            lang_tag=args.lang,
            overwrite=args.overwrite,
            parallel=args.parallel,
        )
        _print_batch_summary(results, skipped, errors)
        return 1 if errors else 0

    if not p.exists():
        err_console.print(f"[bold red]Erro:[/bold red] Caminho não existe: {p}")
        return 1

    progress = _make_progress()
    # The task row is created lazily on the first progress notification.
    task_id: TaskID | None = None

    def on_progress(step: str, detail: str) -> None:
        nonlocal task_id

        label = _STEP_LABELS.get(step, step)

        if task_id is None:
            task_id = progress.add_task(label, detail=detail, total=None)
        else:
            progress.update(task_id, description=label, detail=detail)

        # Milestone steps are also logged so they remain visible after the
        # single task row moves on to the next step.
        if step in ("baixado", "sincronizado", "sem_sync", "sem_referencia", "erro_sync"):
            console.log(f"[dim]{label}[/dim] {detail}")

    with progress:
        try:
            summary = run(args.path, lang_tag=args.lang, on_progress=on_progress)
        except SubsDownError as e:
            # Stop the live display before writing the error message.
            progress.stop()
            err_console.print(f"[bold red]Erro:[/bold red] {e}")
            return 1

    _print_summary(summary)

    return 0
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
if __name__ == "__main__":
|
|
324
|
+
sys.exit(main())
|
subs_down_n_sync/core.py
ADDED
|
@@ -0,0 +1,538 @@
|
|
|
1
|
+
"""subs_down_n_sync: busca e sincroniza legendas (pt-BR por padrão, qualquer BCP 47)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import tempfile
|
|
9
|
+
import time
|
|
10
|
+
from collections.abc import Callable
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
import subliminal
|
|
16
|
+
from babelfish import Language
|
|
17
|
+
from scipy.spatial.distance import cdist
|
|
18
|
+
from sentence_transformers import SentenceTransformer
|
|
19
|
+
from subliminal.refiners.hash import refine as hash_refine
|
|
20
|
+
from subliminal.score import compute_score
|
|
21
|
+
|
|
22
|
+
from subs_down_n_sync.exceptions import (
|
|
23
|
+
InvalidLanguageError,
|
|
24
|
+
InvalidVideoError,
|
|
25
|
+
MissingCredentialsError,
|
|
26
|
+
MissingDependencyError,
|
|
27
|
+
SubtitleNotFoundError,
|
|
28
|
+
SubtitleSyncError,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
VIDEO_EXTENSIONS = {".mkv", ".mp4", ".avi", ".mov", ".m4v", ".wmv", ".flv", ".webm"}
|
|
32
|
+
|
|
33
|
+
DEFAULT_LANG = "pt-BR"
|
|
34
|
+
|
|
35
|
+
SCORE_THRESHOLD = 0.9 # score/max_score >= 90% → sem sync
|
|
36
|
+
|
|
37
|
+
SYNC_THRESHOLD_SECONDS = 0.1
|
|
38
|
+
|
|
39
|
+
_TS_RE = re.compile(
|
|
40
|
+
r"^(\d{2}):(\d{2}):(\d{2}),(\d{3})\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}",
|
|
41
|
+
re.MULTILINE,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
_SRT_BLOCK_RE = re.compile(
|
|
45
|
+
r"(\d+)\n"
|
|
46
|
+
r"(\d{2}):(\d{2}):(\d{2}),(\d{3})\s*-->\s*(\d{2}):(\d{2}):(\d{2}),(\d{3})\n"
|
|
47
|
+
r"((?:.+\n?)+)",
|
|
48
|
+
re.MULTILINE,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass(frozen=True)
class SubtitleInfo:
    """Immutable description of the subtitle that was selected and downloaded."""

    # Name of the provider the subtitle came from.
    provider: str
    # How the subtitle was matched to the video: "hash" | "release" | "fallback".
    match_type: str
    # Whether the subtitle should go through the re-timing step.
    needs_sync: bool
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass(frozen=True)
class SyncResult:
    """Immutable outcome of the subtitle re-timing step."""

    # True when the subtitle file was rewritten with new timestamps.
    synced: bool
    # Mean absolute shift, in seconds, between original and aligned cue starts.
    offset_seconds: float
    # "ref" when re-timed against a reference subtitle, "none" otherwise.
    sync_mode: str = "none"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True)
class RunSummary:
    """Immutable report of one complete run for a single video."""

    # Final location of the subtitle file.
    output_path: Path
    # Provider the subtitle was downloaded from.
    provider: str
    # How the subtitle was matched to the video.
    match_type: str
    # True when the subtitle was re-timed.
    synced: bool
    # Mean timestamp shift, in seconds, reported by the sync step.
    offset_seconds: float
    # Sync mode reported by the sync step.
    sync_mode: str
    # Message explaining why syncing did not happen, or None.
    sync_error: str | None
    # Duration of the whole run in seconds.
    elapsed_seconds: float
    # Language tag the run was requested for.
    lang_tag: str
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def check_ffmpeg() -> None:
    """Ensure an ``ffmpeg`` executable can be found on PATH.

    Raises:
        MissingDependencyError: If ``shutil.which`` cannot locate ``ffmpeg``.
    """
    if shutil.which("ffmpeg") is not None:
        return

    raise MissingDependencyError(
        "ffmpeg não encontrado no PATH. Instale via gerenciador de pacotes "
        "(ex.: sudo apt install ffmpeg, brew install ffmpeg)."
    )
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def load_credentials() -> tuple[str, str]:
    """Read the OpenSubtitles username and password from the environment.

    Returns:
        ``(username, password)`` taken from ``OPENSUBTITLES_USERNAME`` and
        ``OPENSUBTITLES_PASSWORD``.

    Raises:
        MissingCredentialsError: If either variable is unset or empty; the
            message lists every missing variable.
    """
    required = ("OPENSUBTITLES_USERNAME", "OPENSUBTITLES_PASSWORD")
    values = {name: os.environ.get(name) for name in required}

    # Empty strings count as missing, same as unset variables.
    missing = [name for name in required if not values[name]]
    if missing:
        raise MissingCredentialsError(
            "Variáveis de ambiente obrigatórias faltando: " + ", ".join(missing)
        )

    user_var, pwd_var = required
    return values[user_var], values[pwd_var]  # type: ignore[return-value]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def validate_video_path(raw: str) -> Path:
    """Turn a user-supplied string into a validated video file path.

    ``~`` is expanded before checking.

    Raises:
        InvalidVideoError: If the path does not exist, is not a regular
            file, or its extension (case-insensitive) is not in
            ``VIDEO_EXTENSIONS``.
    """
    p = Path(raw).expanduser()

    if not p.exists():
        raise InvalidVideoError(f"Arquivo de vídeo não existe: {p}")
    if not p.is_file():
        raise InvalidVideoError(f"Caminho não é um arquivo: {p}")
    if p.suffix.lower() in VIDEO_EXTENSIONS:
        return p

    supported = ", ".join(sorted(VIDEO_EXTENSIONS))
    raise InvalidVideoError(
        f"Extensão não suportada ({p.suffix}). Esperado um destes: {supported}"
    )
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def parse_language(raw: str) -> Language:
    """Parse an IETF language tag into a ``Language``.

    Raises:
        InvalidLanguageError: If ``Language.fromietf`` fails for any reason;
            the original exception is chained as the cause.
    """
    try:
        language = Language.fromietf(raw)
    except Exception as e:
        raise InvalidLanguageError(
            f"Código de idioma inválido: {raw!r}. Use tags BCP 47 como 'pt-BR', 'en', 'es', 'ja'."
        ) from e
    return language
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _classify_match(matches: set[str]) -> str:
|
|
134
|
+
if "hash" in matches:
|
|
135
|
+
return "hash"
|
|
136
|
+
if "release_group" in matches:
|
|
137
|
+
return "release"
|
|
138
|
+
|
|
139
|
+
return "fallback"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _filename_similarity(sub_filename: str, video_name: str) -> float:
|
|
143
|
+
"""Fração de tokens do stem do vídeo presentes no nome da legenda."""
|
|
144
|
+
norm = re.compile(r"[\W_]+")
|
|
145
|
+
video_stem = Path(video_name).stem
|
|
146
|
+
sub_tokens = set(norm.sub(" ", sub_filename.lower()).split())
|
|
147
|
+
video_tokens = set(norm.sub(" ", video_stem.lower()).split())
|
|
148
|
+
if not video_tokens:
|
|
149
|
+
return 0.0
|
|
150
|
+
return len(sub_tokens & video_tokens) / len(video_tokens)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _pick_subtitle(
    candidates: list,
    video: object,
) -> tuple[object, str, bool]:
    """Choose the best subtitle and decide whether it needs syncing.

    Returns ``(subtitle, match_type, needs_sync)``. Order of preference:
      1. a hash match: returned as-is, no sync needed;
      2. release-group matches: the one whose filename is most similar to
         the video's name, sync needed;
      3. otherwise any candidate: most similar filename, sync needed.

    ``candidates`` must not be empty (``max`` raises ``ValueError`` on an
    empty pool).
    """
    video_name = getattr(video, "name", "") or ""

    # Highest score first. Both sorted() and max() keep the first of equal
    # elements, so the score acts as the tie-breaker for filename similarity.
    ranked = sorted(candidates, key=lambda sub: compute_score(sub, video), reverse=True)

    # Evaluate each candidate's matches once and reuse them for both checks.
    matched = [(sub, set(sub.get_matches(video))) for sub in ranked]

    # 1. hash match
    for sub, names in matched:
        if "hash" in names:
            return sub, "hash", False

    # 2. release_group match, else 3. fallback to every candidate
    release_candidates = [sub for sub, names in matched if "release_group" in names]
    pool = release_candidates if release_candidates else ranked
    match_type = "release" if release_candidates else "fallback"

    best_sub = max(
        pool,
        key=lambda sub: _filename_similarity(getattr(sub, "filename", "") or "", video_name),
    )
    return best_sub, match_type, True
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _download_sub(
    video: object,
    language: Language,
    provider_configs: dict,
) -> object | None:
    """Find and download the highest-scoring subtitle for ``language``.

    Only the "opensubtitles" provider is queried. Returns the subtitle, or
    ``None`` when nothing in that language is listed or the downloaded
    subtitle has no text.
    """
    listing = subliminal.list_subtitles(
        {video},
        {language},
        providers=["opensubtitles"],
        provider_configs=provider_configs,
    )
    same_language = [
        sub for sub in listing.get(video, []) if getattr(sub, "language", None) == language
    ]
    if not same_language:
        return None

    # max() returns the first of equally scored candidates, like taking the
    # head of a stable descending sort.
    best = max(same_language, key=lambda sub: compute_score(sub, video))

    subliminal.download_subtitles([best], provider_configs=provider_configs)
    if not best.text:
        return None
    return best
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def find_and_download_subtitle(
    video_path: Path,
    language: Language,
    credentials: tuple[str, str],
) -> tuple[Path, SubtitleInfo]:
    """Download the best subtitle for the video and save it next to it.

    Args:
        video_path: Video file to find a subtitle for.
        language: Language the subtitle must be in.
        credentials: ``(username, password)`` for the "opensubtitles" provider.

    Returns:
        The path of the written ``.srt`` file and a ``SubtitleInfo``
        describing provider, match type and whether syncing is needed.

    Raises:
        SubtitleNotFoundError: If no subtitle in ``language`` is listed, or
            the downloaded subtitle has no text.
    """
    username, password = credentials
    provider_configs = {"opensubtitles": {"username": username, "password": password}}

    video = subliminal.scan_video(str(video_path))
    hash_refine(video)

    listing = subliminal.list_subtitles(
        {video},
        {language},
        providers=["opensubtitles"],
        provider_configs=provider_configs,
    )
    in_language = [
        sub for sub in listing.get(video, []) if getattr(sub, "language", None) == language
    ]
    if not in_language:
        raise SubtitleNotFoundError(
            f"Nenhuma legenda em {language.alpha3} encontrada para: {video_path.name}"
        )

    subtitle, match_type, needs_sync = _pick_subtitle(in_language, video)
    subliminal.download_subtitles([subtitle], provider_configs=provider_configs)

    # subliminal.save_subtitles is deliberately not used: it writes the raw
    # bytes in the detected encoding (e.g. cp1252), producing mojibake. The
    # already-decoded text is written as UTF-8 instead.
    if not subtitle.text:
        raise SubtitleNotFoundError(f"Legenda veio vazia do provider para: {video_path.name}")

    srt_name = Path(subtitle.get_path(video)).name
    srt_path = video_path.parent / srt_name
    srt_path.write_text(subtitle.text, encoding="utf-8")

    return srt_path, SubtitleInfo(
        provider=subtitle.provider_name,
        match_type=match_type,
        needs_sync=needs_sync,
    )
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def find_reference_subtitle(
    video_path: Path,
    credentials: tuple[str, str],
) -> Path | None:
    """Download an English subtitle to use as the alignment reference.

    The subtitle is written as UTF-8 into a newly created temporary
    directory (prefix ``subs_ref_``); this function does not delete it.

    Returns:
        Path of the reference file, or ``None`` when no English subtitle
        could be downloaded.
    """
    username, password = credentials
    provider_configs = {"opensubtitles": {"username": username, "password": password}}

    video = subliminal.scan_video(str(video_path))
    hash_refine(video)

    subtitle = _download_sub(video, Language("eng"), provider_configs)
    if subtitle is None:
        return None

    ref_dir = Path(tempfile.mkdtemp(prefix="subs_ref_"))
    ref_path = ref_dir / Path(subtitle.get_path(video)).name
    ref_path.write_text(subtitle.text, encoding="utf-8")
    return ref_path
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _ts(h: str, m: str, s: str, ms: str) -> float:
|
|
286
|
+
return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _seconds_to_ts(total: float) -> str:
|
|
290
|
+
total = max(0.0, total)
|
|
291
|
+
h = int(total // 3600)
|
|
292
|
+
m = int((total % 3600) // 60)
|
|
293
|
+
s = int(total % 60)
|
|
294
|
+
ms = round((total - int(total)) * 1000)
|
|
295
|
+
if ms == 1000:
|
|
296
|
+
ms = 0
|
|
297
|
+
s += 1
|
|
298
|
+
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _srt_to_segments(srt_text: str) -> list[dict]:
    """Parse SRT text into cue dicts with ``start``, ``end`` and ``text`` keys.

    ``start`` and ``end`` are seconds as floats; ``text`` is the cue body
    with surrounding whitespace stripped. Only blocks matched by
    ``_SRT_BLOCK_RE`` are returned.
    """
    return [
        {
            "start": _ts(*block.group(2, 3, 4, 5)),
            "end": _ts(*block.group(6, 7, 8, 9)),
            "text": block.group(10).strip(),
        }
        for block in _SRT_BLOCK_RE.finditer(srt_text)
    ]
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def _parse_srt_timestamps(srt_text: str) -> list[float]:
    """Return the start time, in seconds, of every timing line in the SRT text."""
    return [
        int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(millis) / 1000
        for hours, minutes, seconds, millis in _TS_RE.findall(srt_text)
    ]
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _mean_offset_seconds(orig: list[float], synced: list[float]) -> float:
|
|
324
|
+
n = min(len(orig), len(synced))
|
|
325
|
+
|
|
326
|
+
if n == 0:
|
|
327
|
+
return 0.0
|
|
328
|
+
|
|
329
|
+
total = sum(abs(synced[i] - orig[i]) for i in range(n))
|
|
330
|
+
|
|
331
|
+
return total / n
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
_SEMANTIC_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _align_cues_by_semantics(
|
|
338
|
+
target_cues: list[dict],
|
|
339
|
+
ref_cues: list[dict],
|
|
340
|
+
) -> list[dict]:
|
|
341
|
+
"""Alinha cues de legenda alvo aos timestamps da referência via embeddings semânticos + DTW.
|
|
342
|
+
|
|
343
|
+
Para cada cue alvo, encontra o(s) cue(s) ref mais similar(es) semanticamente
|
|
344
|
+
e copia os timestamps. Garante ordem monotônica crescente no resultado.
|
|
345
|
+
"""
|
|
346
|
+
model = SentenceTransformer(_SEMANTIC_MODEL)
|
|
347
|
+
|
|
348
|
+
target_texts = [c["text"] for c in target_cues]
|
|
349
|
+
ref_texts = [c["text"] for c in ref_cues]
|
|
350
|
+
|
|
351
|
+
target_emb = model.encode(target_texts, convert_to_numpy=True)
|
|
352
|
+
ref_emb = model.encode(ref_texts, convert_to_numpy=True)
|
|
353
|
+
|
|
354
|
+
# matriz de distância coseno: shape (len_target, len_ref)
|
|
355
|
+
dist_matrix = cdist(target_emb, ref_emb, metric="cosine")
|
|
356
|
+
|
|
357
|
+
# DTW simples: encontra caminho de alinhamento ótimo
|
|
358
|
+
n, m = dist_matrix.shape
|
|
359
|
+
cost = np.full((n + 1, m + 1), np.inf)
|
|
360
|
+
cost[0, 0] = 0.0
|
|
361
|
+
|
|
362
|
+
for i in range(1, n + 1):
|
|
363
|
+
for j in range(1, m + 1):
|
|
364
|
+
cost[i, j] = dist_matrix[i - 1, j - 1] + min(
|
|
365
|
+
cost[i - 1, j], # inserção
|
|
366
|
+
cost[i, j - 1], # deleção
|
|
367
|
+
cost[i - 1, j - 1], # match
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
# traceback
|
|
371
|
+
path: list[tuple[int, int]] = []
|
|
372
|
+
i, j = n, m
|
|
373
|
+
while i > 0 and j > 0:
|
|
374
|
+
path.append((i - 1, j - 1))
|
|
375
|
+
options = [
|
|
376
|
+
(cost[i - 1, j - 1], i - 1, j - 1),
|
|
377
|
+
(cost[i - 1, j], i - 1, j),
|
|
378
|
+
(cost[i, j - 1], i, j - 1),
|
|
379
|
+
]
|
|
380
|
+
_, i, j = min(options, key=lambda x: x[0])
|
|
381
|
+
path.reverse()
|
|
382
|
+
|
|
383
|
+
# para cada cue alvo, coleta os ref_cues mapeados e usa média dos timestamps
|
|
384
|
+
from collections import defaultdict
|
|
385
|
+
|
|
386
|
+
target_to_refs: dict[int, list[int]] = defaultdict(list)
|
|
387
|
+
for ti, ri in path:
|
|
388
|
+
target_to_refs[ti].append(ri)
|
|
389
|
+
|
|
390
|
+
result = []
|
|
391
|
+
for ti, cue in enumerate(target_cues):
|
|
392
|
+
mapped = target_to_refs.get(ti, [])
|
|
393
|
+
orig_duration = max(cue["end"] - cue["start"], 0.0)
|
|
394
|
+
|
|
395
|
+
if mapped:
|
|
396
|
+
# usa start do primeiro ref mapeado (não média, evita puxar p/ meio)
|
|
397
|
+
start = float(ref_cues[mapped[0]]["start"])
|
|
398
|
+
# preserva duração original do target (frase pt pode ser mais longa que en)
|
|
399
|
+
end = start + orig_duration
|
|
400
|
+
else:
|
|
401
|
+
start, end = cue["start"], cue["end"]
|
|
402
|
+
|
|
403
|
+
# duração mínima de leitura: ~60ms/char, mínimo 1s, teto 7s
|
|
404
|
+
min_duration = max(1.0, min(len(cue["text"]) * 0.06, 7.0))
|
|
405
|
+
if end - start < min_duration:
|
|
406
|
+
end = start + min_duration
|
|
407
|
+
|
|
408
|
+
result.append({"start": start, "end": end, "text": cue["text"]})
|
|
409
|
+
|
|
410
|
+
# garante ordem monotônica e clamp contra próximo cue
|
|
411
|
+
for i in range(1, len(result)):
|
|
412
|
+
if result[i]["start"] <= result[i - 1]["start"]:
|
|
413
|
+
gap = max(result[i - 1]["end"] - result[i - 1]["start"], 0.1)
|
|
414
|
+
result[i]["start"] = result[i - 1]["start"] + gap
|
|
415
|
+
duration = result[i]["end"] - result[i]["start"]
|
|
416
|
+
result[i]["end"] = result[i]["start"] + max(duration, 0.1)
|
|
417
|
+
|
|
418
|
+
# clamp: end do cue i não invade start do cue i+1 (deixa 50ms de gap)
|
|
419
|
+
for i in range(len(result) - 1):
|
|
420
|
+
max_end = result[i + 1]["start"] - 0.05
|
|
421
|
+
if result[i]["end"] > max_end:
|
|
422
|
+
result[i]["end"] = max(max_end, result[i]["start"] + 0.1)
|
|
423
|
+
|
|
424
|
+
return result
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _cues_to_srt(cues: list[dict]) -> str:
    """Serialise cue dicts to SRT text, numbering the cues from 1.

    Each block is ``index``, the timing line and the cue text, each followed
    by a newline; blocks are joined with one more newline, which produces
    the blank separator line.
    """
    blocks = []
    for index, cue in enumerate(cues, start=1):
        timing = f"{_seconds_to_ts(cue['start'])} --> {_seconds_to_ts(cue['end'])}"
        blocks.append(f"{index}\n{timing}\n{cue['text']}\n")
    return "\n".join(blocks)
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def sync_subtitle(
    srt_path: Path,
    ref_path: Path,
) -> SyncResult:
    """Re-time the subtitle at ``srt_path`` against the reference at ``ref_path``.

    Both files are read as UTF-8 (undecodable bytes replaced). When the mean
    shift of cue start times is below ``SYNC_THRESHOLD_SECONDS`` the file is
    left untouched; otherwise it is overwritten with the aligned cues.

    Returns:
        A ``SyncResult`` with the mean shift; ``synced`` tells whether the
        file was rewritten.

    Raises:
        SubtitleSyncError: If the alignment step raises any exception.
    """

    def load_cues(path: Path) -> list[dict]:
        return _srt_to_segments(path.read_text(encoding="utf-8", errors="replace"))

    target_cues = load_cues(srt_path)
    ref_cues = load_cues(ref_path)

    try:
        aligned_cues = _align_cues_by_semantics(target_cues, ref_cues)
    except Exception as e:
        raise SubtitleSyncError(f"alinhamento semântico falhou: {e}") from e

    starts_before = [cue["start"] for cue in target_cues]
    starts_after = [cue["start"] for cue in aligned_cues]
    offset = _mean_offset_seconds(starts_before, starts_after)

    if offset < SYNC_THRESHOLD_SECONDS:
        return SyncResult(synced=False, offset_seconds=offset, sync_mode="none")

    srt_path.write_text(_cues_to_srt(aligned_cues), encoding="utf-8")
    return SyncResult(synced=True, offset_seconds=offset, sync_mode="ref")
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def finalize_output_path(video_path: Path, srt_path: Path, lang_tag: str) -> Path:
    """Move the subtitle to its final name next to the video.

    The final name is the video path with its extension replaced by
    ``.<lang_tag>.srt``. An existing file at that location is replaced.
    Nothing is moved when the subtitle is already there.

    Returns:
        The final subtitle path.
    """
    target = video_path.with_suffix(f".{lang_tag}.srt")
    if srt_path != target:
        srt_path.replace(target)
    return target
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
ProgressCallback = Callable[[str, str], None]
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def run(
    video_arg: str,
    lang_tag: str = DEFAULT_LANG,
    on_progress: ProgressCallback | None = None,
) -> RunSummary:
    """Download a subtitle for one video and, if needed, re-time it.

    Steps: validate the path, check for ffmpeg, parse the language, load
    credentials, download the best subtitle, optionally align it against an
    English reference subtitle, then move it to ``<video stem>.<lang_tag>.srt``.

    A missing reference subtitle or a failed alignment does not abort the
    run: the downloaded subtitle is kept and the reason is reported in
    ``RunSummary.sync_error``.

    Args:
        video_arg: Path to the video file, as typed by the user.
        lang_tag: Language tag of the wanted subtitle.
        on_progress: Optional callback receiving ``(step, detail)`` for each
            pipeline step.

    Returns:
        A ``RunSummary`` describing the result.

    Raises:
        SubsDownError: Any subclass raised by the validation, credential or
            download steps.
    """

    def _notify(step: str, detail: str = "") -> None:
        if on_progress:
            on_progress(step, detail)

    start = time.monotonic()

    _notify("validando", video_arg)
    video_path = validate_video_path(video_arg)
    check_ffmpeg()
    language = parse_language(lang_tag)
    credentials = load_credentials()

    _notify("buscando", f"idioma={lang_tag}")
    srt_path, info = find_and_download_subtitle(
        video_path, language=language, credentials=credentials
    )
    _notify("baixado", f"provider={info.provider} match={info.match_type}")

    sync_error: str | None = None
    # Result used whenever syncing is skipped or fails.
    sync_result = SyncResult(synced=False, offset_seconds=0.0, sync_mode="none")

    if info.needs_sync:
        _notify("referencia", "buscando EN")
        ref_path = find_reference_subtitle(video_path, credentials=credentials)

        if ref_path is None:
            sync_error = "legenda EN de referência não encontrada — sincronização ignorada"
            _notify("sem_referencia", "")
        else:
            _notify("sincronizando", "embeddings semânticos")
            try:
                sync_result = sync_subtitle(srt_path, ref_path=ref_path)
                _notify("sincronizado", f"offset={sync_result.offset_seconds:.2f}s")
            except SubtitleSyncError as e:
                sync_error = str(e)
                _notify("erro_sync", str(e))
            finally:
                # The reference lives in its own mkdtemp() directory that
                # nothing else removes. Cleanup is best effort: rmdir() only
                # deletes an empty directory, and a failure here must not
                # turn a finished run into an error.
                try:
                    ref_path.unlink(missing_ok=True)
                    ref_path.parent.rmdir()
                except OSError:
                    pass
    else:
        _notify("sem_sync", f"offset={sync_result.offset_seconds:.2f}s")

    final_path = finalize_output_path(video_path, srt_path, lang_tag=lang_tag)
    elapsed = time.monotonic() - start

    _notify("concluido", str(final_path))

    return RunSummary(
        output_path=final_path,
        provider=info.provider,
        match_type=info.match_type,
        synced=sync_result.synced,
        offset_seconds=sync_result.offset_seconds,
        sync_mode=sync_result.sync_mode,
        sync_error=sync_error,
        elapsed_seconds=elapsed,
        lang_tag=lang_tag,
    )
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Exceptions raised by subs_down_n_sync. Message is user-facing (in Portuguese)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SubsDownError(Exception):
|
|
7
|
+
"""Erro base do script — mensagem é o que vai para o usuário."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class InvalidVideoError(SubsDownError):
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MissingDependencyError(SubsDownError):
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MissingCredentialsError(SubsDownError):
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class InvalidLanguageError(SubsDownError):
|
|
23
|
+
pass
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class SubtitleNotFoundError(SubsDownError):
|
|
27
|
+
pass
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SubtitleSyncError(SubsDownError):
|
|
31
|
+
pass
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: subs-down-n-sync
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: CLI to download and sync subtitles for video files using semantic embeddings + DTW
|
|
5
|
+
Author-email: Airton Soares <airtonnsoares@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://pypi.org/project/subs-down-n-sync/
|
|
8
|
+
Project-URL: Repository, https://github.com/airton-soares/subs_down_n_sync
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/airton-soares/subs_down_n_sync/issues
|
|
10
|
+
Keywords: subtitles,subtitle-sync,opensubtitles,nlp,cli,video,sentence-transformers,dtw
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Topic :: Multimedia :: Video
|
|
15
|
+
Classifier: Topic :: Utilities
|
|
16
|
+
Classifier: Natural Language :: Portuguese (Brazilian)
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: subliminal>=2.2
|
|
22
|
+
Requires-Dist: sentence-transformers>=3.0
|
|
23
|
+
Requires-Dist: scipy>=1.13
|
|
24
|
+
Requires-Dist: setuptools<81
|
|
25
|
+
Requires-Dist: rich>=13
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-mock>=3.12; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov>=6.0; extra == "dev"
|
|
30
|
+
Requires-Dist: ruff>=0.9; extra == "dev"
|
|
31
|
+
|
|
32
|
+
# subs_down_n_sync
|
|
33
|
+
|
|
34
|
+

|
|
35
|
+
[](https://pypi.org/project/subs-down-n-sync/)
|
|
36
|
+
[](https://pypi.org/project/subs-down-n-sync/)
|
|
37
|
+
[](LICENSE)
|
|
38
|
+
|
|
39
|
+
CLI Python para baixar e sincronizar legendas para arquivos de vídeo. Idioma padrão: **pt-BR**, configurável via flag `--lang` (qualquer tag BCP 47).
|
|
40
|
+
|
|
41
|
+
A sincronização usa embeddings semânticos multilíngues ([sentence-transformers](https://www.sbert.net/), modelo `paraphrase-multilingual-MiniLM-L12-v2`) combinados com DTW: baixa uma legenda EN de referência e alinha os cues da legenda alvo aos timestamps da referência por similaridade semântica. Legendas com match exato (hash ou release group) são usadas sem sincronização.
|
|
42
|
+
|
|
43
|
+
## Instalação
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install subs-down-n-sync
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Instale também o `ffmpeg`:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
sudo apt install ffmpeg # Debian/Ubuntu
|
|
53
|
+
brew install ffmpeg # macOS
|
|
54
|
+
winget install Gyan.FFmpeg # Windows
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Configure as credenciais do OpenSubtitles:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
export OPENSUBTITLES_USERNAME="seu_usuario"
|
|
61
|
+
export OPENSUBTITLES_PASSWORD="sua_senha"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
> Para desenvolvimento, veja [Setup](#setup).
|
|
65
|
+
|
|
66
|
+
## Setup
|
|
67
|
+
|
|
68
|
+
Linux/macOS:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
python -m venv .venv
|
|
72
|
+
source .venv/bin/activate
|
|
73
|
+
pip install -e ".[dev]"
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Windows (PowerShell):
|
|
77
|
+
|
|
78
|
+
```powershell
|
|
79
|
+
python -m venv .venv
|
|
80
|
+
.\.venv\Scripts\Activate.ps1
|
|
81
|
+
pip install -e ".[dev]"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Windows (cmd.exe):
|
|
85
|
+
|
|
86
|
+
```cmd
|
|
87
|
+
python -m venv .venv
|
|
88
|
+
.venv\Scripts\activate.bat
|
|
89
|
+
pip install -e ".[dev]"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Instale também o `ffmpeg` no sistema:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
sudo apt install ffmpeg # Debian/Ubuntu
|
|
96
|
+
brew install ffmpeg # macOS
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
```powershell
|
|
100
|
+
winget install Gyan.FFmpeg # Windows (winget)
|
|
101
|
+
choco install ffmpeg # Windows (Chocolatey)
|
|
102
|
+
scoop install ffmpeg # Windows (Scoop)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Confirme que `ffmpeg` está no `PATH` rodando `ffmpeg -version` em um novo terminal.
|
|
106
|
+
|
|
107
|
+
## Configuração (uma única vez)
|
|
108
|
+
|
|
109
|
+
Linux/macOS:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
export OPENSUBTITLES_USERNAME="seu_usuario"
|
|
113
|
+
export OPENSUBTITLES_PASSWORD="sua_senha"
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Windows (PowerShell, sessão atual):
|
|
117
|
+
|
|
118
|
+
```powershell
|
|
119
|
+
$env:OPENSUBTITLES_USERNAME = "seu_usuario"
|
|
120
|
+
$env:OPENSUBTITLES_PASSWORD = "sua_senha"
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Windows (persistente, próximas sessões):
|
|
124
|
+
|
|
125
|
+
```powershell
|
|
126
|
+
setx OPENSUBTITLES_USERNAME "seu_usuario"
|
|
127
|
+
setx OPENSUBTITLES_PASSWORD "sua_senha"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Uso
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# Default: pt-BR
|
|
134
|
+
subs-down-n-sync /caminho/para/filme.mkv
|
|
135
|
+
|
|
136
|
+
# Outro idioma (BCP 47: 'en', 'pt-BR', 'en-US', 'es', 'ja', ...)
|
|
137
|
+
subs-down-n-sync /caminho/para/filme.mkv --lang en
|
|
138
|
+
subs-down-n-sync /caminho/para/filme.mkv -l es
|
|
139
|
+
|
|
140
|
+
# Processar diretório inteiro (busca vídeos recursivamente)
|
|
141
|
+
subs-down-n-sync /caminho/para/pasta/
|
|
142
|
+
subs-down-n-sync /caminho/para/pasta/ --lang en
|
|
143
|
+
subs-down-n-sync /caminho/para/pasta/ --overwrite # sobrescreve legendas existentes
|
|
144
|
+
subs-down-n-sync /caminho/para/pasta/ --parallel # processa até 2 vídeos simultaneamente
|
|
145
|
+
|
|
146
|
+
# Ou via módulo Python
|
|
147
|
+
python -m subs_down_n_sync /caminho/para/filme.mkv
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Ao passar um diretório, vídeos que já têm legenda (`<video>.<lang>.srt`) são pulados por padrão. Use `--overwrite` / `-o` para reprocessar. Use `--parallel` / `-p` para processar até 2 vídeos em paralelo.
|
|
151
|
+
|
|
152
|
+
Saída: `/caminho/para/filme.<lang>.srt` (ex.: `filme.pt-BR.srt`, `filme.en.srt`). Isso permite manter legendas do mesmo vídeo em idiomas diferentes sem sobrescrever.
|
|
153
|
+
|
|
154
|
+
## Desenvolvimento
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
pip install -e ".[dev]"
|
|
158
|
+
pytest
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Os testes unitários rodam com gate de cobertura de 90% (configurado em `pyproject.toml`). O CI falha se a cobertura cair abaixo disso.
|
|
162
|
+
|
|
163
|
+
Para rodar sem o gate (útil ao explorar com `-k` ou `--collect-only`):
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
pytest --no-cov
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Lint e formatação
|
|
170
|
+
|
|
171
|
+
O projeto usa [Ruff](https://docs.astral.sh/ruff/) para formatação e lint.
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
ruff format . # aplica formatação
|
|
175
|
+
ruff format --check . # verifica sem escrever (usado no CI)
|
|
176
|
+
ruff check . # roda lint
|
|
177
|
+
ruff check --fix . # aplica fixes automáticos
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
O CI falha se `ruff format --check` ou `ruff check` encontrarem problemas.
|
|
181
|
+
|
|
182
|
+
## Testes de integração
|
|
183
|
+
|
|
184
|
+
O projeto tem duas camadas de testes:
|
|
185
|
+
|
|
186
|
+
- **Testes unitários** (padrão, `pytest`) — rápidos, mockam `subliminal` e `sentence_transformers`. Não precisam de rede nem de binários externos além do Python.
|
|
187
|
+
- **Testes de integração** (`pytest -m integration`) — exercitam o pipeline real de alinhamento semântico (download do modelo `sentence-transformers` + DTW) sobre legendas reais. Requerem acesso à internet na primeira execução para baixar o modelo (~120 MB), que fica armazenado em cache pelo Hugging Face em `~/.cache/huggingface/`.
|
|
188
|
+
|
|
189
|
+
Como rodar cada camada:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
pytest # só unit (rápido)
|
|
193
|
+
pytest -m integration # só integração (baixa modelo de embeddings, roda DTW real)
|
|
194
|
+
pytest -m "" # tudo (unit + integração)
|
|
195
|
+
```
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
subs_down_n_sync/__init__.py,sha256=jlLhNwF8RXLhwXUYKN0v8AeiCwKbb9pejoJecLFCGxk,95
|
|
2
|
+
subs_down_n_sync/__main__.py,sha256=4UUonBkrqVeT-ZqrBs96xCz1wsuAltma9tqYBAS-X-w,46
|
|
3
|
+
subs_down_n_sync/cli.py,sha256=8j-T1DXmxnVQtdORQZYmTScB19YWPA6tCicukddXsnU,9298
|
|
4
|
+
subs_down_n_sync/core.py,sha256=ScH0q6mvm2NSynO4Ylqbm7n_937y7KoO73vH6tnjua0,16714
|
|
5
|
+
subs_down_n_sync/exceptions.py,sha256=HeOfy398O1dwhXGB8QuJ2PU8RvU5bFcBfrlEDzj8ti0,552
|
|
6
|
+
subs_down_n_sync-1.0.0.dist-info/METADATA,sha256=uPvBQ-9BsbAQRPRzfzHK8Rz4uZc0tRoekVg4hh8s24g,6351
|
|
7
|
+
subs_down_n_sync-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
subs_down_n_sync-1.0.0.dist-info/entry_points.txt,sha256=ml-sdvqTipASHrCRNYVhlKa5gSHOXEgXik80pcx-Db4,63
|
|
9
|
+
subs_down_n_sync-1.0.0.dist-info/top_level.txt,sha256=uMLMnRnFOJuCkpl_5DIM-E2tl8Rg11rg8k0bwSMFze8,17
|
|
10
|
+
subs_down_n_sync-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
subs_down_n_sync
|