subs-down-n-sync 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
# Package version string; the distribution metadata declares the same value.
__version__ = "1.0.0"

from subs_down_n_sync.core import run

# Names exported as the package's public API.
__all__ = ["run", "__version__"]
@@ -0,0 +1,3 @@
1
from subs_down_n_sync.cli import main

# main() returns an integer status (0 on success, 1 on error). Raise it as
# SystemExit so `python -m subs_down_n_sync` reports failures to the shell
# instead of always exiting with status 0.
raise SystemExit(main())
@@ -0,0 +1,324 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from pathlib import Path
7
+
8
+ from rich.console import Console
9
+ from rich.panel import Panel
10
+ from rich.progress import (
11
+ BarColumn,
12
+ MofNCompleteColumn,
13
+ Progress,
14
+ SpinnerColumn,
15
+ TaskID,
16
+ TextColumn,
17
+ TimeElapsedColumn,
18
+ )
19
+ from rich.table import Table
20
+
21
+ from subs_down_n_sync.core import DEFAULT_LANG, VIDEO_EXTENSIONS, RunSummary, run
22
+ from subs_down_n_sync.exceptions import SubsDownError
23
+
24
# Upper bound on worker threads used when a directory is processed in parallel.
MAX_PARALLEL_WORKERS = 2

# Shared rich consoles: regular output goes to stdout, errors to stderr.
console = Console()
err_console = Console(stderr=True)

# Maps the step keys reported through the progress callback to the
# user-facing (Portuguese) text shown in the progress display.
_STEP_LABELS: dict[str, str] = {
    "validando": "Validando vídeo...",
    "buscando": "Buscando legenda no OpenSubtitles...",
    "baixado": "Legenda encontrada e baixada",
    "referencia": "Baixando legenda EN de referência...",
    "sem_referencia": "Referência EN não encontrada — sincronização ignorada",
    "sincronizando": "Alinhando com embeddings semânticos...",
    "sincronizado": "Sincronização concluída",
    "sem_sync": "Legenda já sincronizada",
    "erro_sync": "Erro na sincronização",
    "concluido": "Finalizado",
}

# Ordered step keys for a run that includes synchronization.
# NOTE(review): not referenced in the visible part of this module — confirm
# whether it is still needed.
_STEPS_WITH_SYNC = [
    "validando",
    "buscando",
    "baixado",
    "referencia",
    "sincronizando",
    "concluido",
]

# Ordered step keys for a run without synchronization.
# NOTE(review): not referenced in the visible part of this module — confirm
# whether it is still needed.
_STEPS_NO_SYNC = [
    "validando",
    "buscando",
    "baixado",
    "concluido",
]
57
+
58
+
59
def _make_progress() -> Progress:
    """Build the progress display used when a single video is processed.

    Columns: spinner, task description, the task's ``detail`` field and the
    elapsed time. Output goes to the shared stdout console and stays on
    screen after completion (``transient=False``).
    """
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        TextColumn("[dim]{task.fields[detail]}"),
        TimeElapsedColumn(),
    )
    return Progress(*columns, console=console, transient=False)
68
+
69
+
70
def _print_summary(summary: RunSummary) -> None:
    """Print a panel describing the outcome of a single-video run.

    The panel border is yellow when synchronization failed, green when the
    subtitle was re-timed, and cyan when no re-timing was applied.
    """
    if summary.sync_error:
        border = "yellow"
        status_line = (
            f"[yellow]Aviso:[/yellow] sincronização falhou — legenda original mantida.\n"
            f"Detalhe: {summary.sync_error}"
        )
    elif summary.synced:
        border = "green"
        status_line = (
            f"[green]Sincronizada[/green] "
            f"(ajuste médio: {summary.offset_seconds:.2f}s, modo: {summary.sync_mode})"
        )
    else:
        border = "cyan"
        status_line = (
            f"[cyan]Já sincronizada[/cyan] (offset médio: {summary.offset_seconds:.2f}s < 0.10s)"
        )

    header = (
        f"Idioma: [bold]{summary.lang_tag}[/bold] | "
        f"Provider: {summary.provider} | "
        f"Match: {summary.match_type}"
    )
    footer = f"Tempo total: {summary.elapsed_seconds:.2f}s"
    body = "\n".join((header, status_line, footer))

    panel = Panel(body, title="subs-down-n-sync", border_style=border)
    console.print(panel)
98
+
99
+
100
def _make_batch_progress() -> Progress:
    """Build the progress display used when a directory is processed.

    Same layout as the single-video display plus a bar and an "M of N"
    counter for the overall batch task.
    """
    columns = (
        SpinnerColumn(),
        TextColumn("[bold blue]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TextColumn("[dim]{task.fields[detail]}"),
        TimeElapsedColumn(),
    )
    return Progress(*columns, console=console, transient=False)
111
+
112
+
113
def _process_video(
    video: Path,
    lang_tag: str,
    progress: Progress,
) -> RunSummary:
    """Run the pipeline for one video while mirroring its steps in ``progress``.

    A task named after the video is added for the duration of the call and
    always removed afterwards, whether ``run`` returns or raises.
    """
    name = video.name
    task_id = progress.add_task(name, detail="aguardando...", total=None)

    def on_progress(step: str, detail: str) -> None:
        # Unknown step keys are displayed as-is.
        text = _STEP_LABELS.get(step, step)
        progress.update(task_id, description=f"{name} — {text}", detail=detail)

    try:
        summary = run(str(video), lang_tag=lang_tag, on_progress=on_progress)
    finally:
        progress.remove_task(task_id)
    return summary
128
+
129
+
130
def _run_directory(
    dir_path: Path,
    lang_tag: str,
    overwrite: bool,
    parallel: bool = False,
) -> tuple[list[RunSummary], list[Path], list[tuple[Path, str]]]:
    """Process every video found recursively under ``dir_path``.

    Args:
        dir_path: Directory searched recursively for files whose suffix is in
            ``VIDEO_EXTENSIONS``.
        lang_tag: Language tag used for the download and for the output name.
        overwrite: When false, videos that already have a
            ``<video stem>.<lang_tag>.srt`` next to them are skipped.
        parallel: When true, videos are processed by up to
            ``MAX_PARALLEL_WORKERS`` threads.

    Returns:
        ``(results, skipped, errors)``: summaries of processed videos, the
        videos skipped because a subtitle already existed, and
        ``(video, message)`` pairs for videos that raised ``SubsDownError``.
    """
    videos = sorted(p for p in dir_path.rglob("*") if p.suffix.lower() in VIDEO_EXTENSIONS)

    results: list[RunSummary] = []
    skipped: list[Path] = []
    errors: list[tuple[Path, str]] = []

    to_process: list[Path] = []
    for video in videos:
        # Must mirror the naming used when the subtitle is written
        # (video.with_suffix(f".{lang_tag}.srt")): only the video extension is
        # replaced. Stripping the suffix first would also drop the last dotted
        # part of the stem ("Show.S01E01.mkv" -> "Show.<lang>.srt"), so existing
        # subtitles of such videos would never be detected.
        srt_path = video.with_suffix(f".{lang_tag}.srt")
        if srt_path.exists() and not overwrite:
            skipped.append(video)
            continue
        to_process.append(video)

    if not to_process:
        return results, skipped, errors

    progress = _make_batch_progress()
    overall = progress.add_task(
        f"Lote ({len(to_process)} vídeo(s))",
        detail="",
        total=len(to_process),
    )

    with progress:
        if parallel:
            with ThreadPoolExecutor(max_workers=MAX_PARALLEL_WORKERS) as pool:
                futures = {
                    pool.submit(_process_video, v, lang_tag, progress): v for v in to_process
                }
                # Collect results in completion order; a failure of one video
                # is recorded and does not stop the others.
                for fut in as_completed(futures):
                    video = futures[fut]
                    try:
                        results.append(fut.result())
                    except SubsDownError as e:
                        errors.append((video, str(e)))
                    progress.advance(overall)
        else:
            for video in to_process:
                try:
                    results.append(_process_video(video, lang_tag, progress))
                except SubsDownError as e:
                    errors.append((video, str(e)))
                progress.advance(overall)

    return results, skipped, errors
182
+
183
+
184
def _print_batch_summary(
    results: list[RunSummary],
    skipped: list[Path],
    errors: list[tuple[Path, str]],
) -> None:
    """Print the per-file table and totals for a directory run.

    Processed, skipped and failed videos each get one table row; a one-line
    count follows, and error messages are then written to stderr.
    """
    table = Table(title="subs-down-n-sync — lote", show_lines=False)
    for header, options in (
        ("Arquivo", {"style": "bold"}),
        ("Status", {}),
        ("Idioma", {}),
        ("Provider", {}),
        ("Offset", {}),
    ):
        table.add_column(header, **options)

    for item in results:
        if item.sync_error:
            status = "[yellow]aviso[/yellow]"
        elif item.synced:
            status = "[green]sincronizado[/green]"
        else:
            status = "[cyan]ok[/cyan]"
        table.add_row(
            item.output_path.name,
            status,
            item.lang_tag,
            item.provider,
            f"{item.offset_seconds:.2f}s",
        )

    for skipped_video in skipped:
        table.add_row(skipped_video.name, "[dim]pulado[/dim]", "-", "-", "-")

    for failed_video, _ in errors:
        table.add_row(failed_video.name, "[red]erro[/red]", "-", "-", "-")

    console.print(table)

    # Only non-empty categories appear in the totals line.
    parts = [
        text
        for group, text in (
            (results, f"[green]{len(results)} processado(s)[/green]"),
            (skipped, f"[dim]{len(skipped)} pulado(s)[/dim]"),
            (errors, f"[red]{len(errors)} erro(s)[/red]"),
        )
        if group
    ]
    totals = " ".join(parts) if parts else "[dim]Nenhum vídeo encontrado.[/dim]"
    console.print(totals)

    if not errors:
        return

    console.print()
    for failed_video, message in errors:
        err_console.print(f"[bold red]Erro[/bold red] {failed_video.name}: {message}")
233
+
234
+
235
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser.

    Arguments: positional ``path`` (video file or directory), ``-l/--lang``,
    and the boolean flags ``-o/--overwrite`` and ``-p/--parallel``.
    """
    parser = argparse.ArgumentParser(
        prog="subs-down-n-sync",
        description="Busca e sincroniza legenda para arquivo(s) de vídeo.",
    )
    add = parser.add_argument
    switch = {"action": "store_true", "default": False}

    add("path", help="Caminho para arquivo de vídeo ou diretório.")

    lang_help = f"Código de idioma BCP 47 (ex: pt-BR, en, es). Default: {DEFAULT_LANG}."
    add("-l", "--lang", default=DEFAULT_LANG, help=lang_help)

    overwrite_help = (
        "Sobrescrever legendas existentes. Por padrão, vídeos com legenda "
        "já existente são pulados."
    )
    add("-o", "--overwrite", help=overwrite_help, **switch)

    parallel_help = (
        f"Processar vídeos em paralelo (até {MAX_PARALLEL_WORKERS} simultâneos) "
        "quando o caminho for um diretório."
    )
    add("-p", "--parallel", help=parallel_help, **switch)

    return parser
268
+
269
+
270
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success; ``1`` when the path does not exist, when the single
        video fails with ``SubsDownError``, or when any video of a directory
        run failed.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    p = Path(args.path).expanduser()

    if p.is_dir():
        results, skipped, errors = _run_directory(
            p,
            lang_tag=args.lang,
            overwrite=args.overwrite,
            parallel=args.parallel,
        )
        _print_batch_summary(results, skipped, errors)
        return 1 if errors else 0

    if not p.exists():
        err_console.print(f"[bold red]Erro:[/bold red] Caminho não existe: {p}")
        return 1

    progress = _make_progress()
    # Created lazily on the first progress notification.
    task_id: TaskID | None = None

    def on_progress(step: str, detail: str) -> None:
        nonlocal task_id

        label = _STEP_LABELS.get(step, step)

        if task_id is None:
            task_id = progress.add_task(label, detail=detail, total=None)
        else:
            progress.update(task_id, description=label, detail=detail)

        # Milestone steps are also logged so they remain visible after the
        # progress line moves on to the next step.
        if step in ("baixado", "sincronizado", "sem_sync", "sem_referencia", "erro_sync"):
            console.log(f"[dim]{label}[/dim] {detail}")

    with progress:
        try:
            summary = run(args.path, lang_tag=args.lang, on_progress=on_progress)
        except SubsDownError as e:
            # Stop the live display before printing the error message.
            progress.stop()
            err_console.print(f"[bold red]Erro:[/bold red] {e}")
            return 1

    _print_summary(summary)

    return 0
321
+
322
+
323
# When executed as a script, exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,538 @@
1
+ """subs_down_n_sync: busca e sincroniza legendas (pt-BR por padrão, qualquer BCP 47)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import shutil
8
+ import tempfile
9
+ import time
10
+ from collections.abc import Callable
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+ import subliminal
16
+ from babelfish import Language
17
+ from scipy.spatial.distance import cdist
18
+ from sentence_transformers import SentenceTransformer
19
+ from subliminal.refiners.hash import refine as hash_refine
20
+ from subliminal.score import compute_score
21
+
22
+ from subs_down_n_sync.exceptions import (
23
+ InvalidLanguageError,
24
+ InvalidVideoError,
25
+ MissingCredentialsError,
26
+ MissingDependencyError,
27
+ SubtitleNotFoundError,
28
+ SubtitleSyncError,
29
+ )
30
+
31
# File suffixes (lower-case, with leading dot) accepted as video files.
VIDEO_EXTENSIONS = {".mkv", ".mp4", ".avi", ".mov", ".m4v", ".wmv", ".flv", ".webm"}

# Language tag used when the caller does not specify one.
DEFAULT_LANG = "pt-BR"

# NOTE(review): not referenced in the visible code — confirm it is still needed.
SCORE_THRESHOLD = 0.9  # score/max_score >= 90% → no sync

# Mean start-time change (in seconds) below which the aligned subtitle is not
# written back and the file is reported as not re-synced.
SYNC_THRESHOLD_SECONDS = 0.1

# Matches an SRT timing line; captures only the START time as (hh, mm, ss, ms).
_TS_RE = re.compile(
    r"^(\d{2}):(\d{2}):(\d{2}),(\d{3})\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}",
    re.MULTILINE,
)

# Matches a whole SRT cue: group 1 is the index, groups 2-5 the start time,
# groups 6-9 the end time and group 10 the run of non-empty text lines.
_SRT_BLOCK_RE = re.compile(
    r"(\d+)\n"
    r"(\d{2}):(\d{2}):(\d{2}),(\d{3})\s*-->\s*(\d{2}):(\d{2}):(\d{2}),(\d{3})\n"
    r"((?:.+\n?)+)",
    re.MULTILINE,
)
50
+
51
+
52
@dataclass(frozen=True)
class SubtitleInfo:
    """Immutable description of the subtitle chosen for a video."""

    # Name of the provider the subtitle was downloaded from.
    provider: str
    match_type: str  # "hash" | "release" | "fallback"
    # True when the subtitle should be aligned against a reference.
    needs_sync: bool
57
+
58
+
59
@dataclass(frozen=True)
class SyncResult:
    """Immutable outcome of a synchronization attempt."""

    # True when the re-timed subtitle was written to disk.
    synced: bool
    # Mean absolute change of the cue start times, in seconds.
    offset_seconds: float
    sync_mode: str = "none"  # "ref" (aligned against a reference subtitle) | "none"
64
+
65
+
66
@dataclass(frozen=True)
class RunSummary:
    """Immutable summary of one complete run for a single video."""

    # Final location of the subtitle file.
    output_path: Path
    # Name of the provider the subtitle came from.
    provider: str
    # How the subtitle matched the video: "hash" | "release" | "fallback".
    match_type: str
    # True when the subtitle was re-timed and rewritten.
    synced: bool
    # Mean absolute change of the cue start times, in seconds.
    offset_seconds: float
    # Synchronization mode reported by the sync step ("none" when not synced).
    sync_mode: str
    # Reason synchronization was skipped or failed; None when there was no problem.
    sync_error: str | None
    # Duration of the whole run, in seconds.
    elapsed_seconds: float
    # Language tag requested by the caller.
    lang_tag: str
77
+
78
+
79
def check_ffmpeg() -> None:
    """Ensure an ``ffmpeg`` executable can be found on ``PATH``.

    Raises:
        MissingDependencyError: If ``shutil.which`` cannot locate ``ffmpeg``.
    """
    ffmpeg_location = shutil.which("ffmpeg")
    if ffmpeg_location is not None:
        return

    raise MissingDependencyError(
        "ffmpeg não encontrado no PATH. Instale via gerenciador de pacotes "
        "(ex.: sudo apt install ffmpeg, brew install ffmpeg)."
    )
85
+
86
+
87
def load_credentials() -> tuple[str, str]:
    """Read the OpenSubtitles username and password from the environment.

    Returns:
        ``(username, password)`` taken from ``OPENSUBTITLES_USERNAME`` and
        ``OPENSUBTITLES_PASSWORD``.

    Raises:
        MissingCredentialsError: If either variable is unset or empty; the
            message lists the missing variable names.
    """
    variable_names = ("OPENSUBTITLES_USERNAME", "OPENSUBTITLES_PASSWORD")
    values = [os.environ.get(name) for name in variable_names]

    # An empty string counts as missing, just like an unset variable.
    absent = [name for name, value in zip(variable_names, values) if not value]
    if absent:
        raise MissingCredentialsError(
            "Variáveis de ambiente obrigatórias faltando: " + ", ".join(absent)
        )

    user, pwd = values
    return user, pwd  # type: ignore[return-value]
106
+
107
+
108
def validate_video_path(raw: str) -> Path:
    """Turn ``raw`` into a ``Path`` and verify it points to a supported video.

    Args:
        raw: Path as given by the user; ``~`` is expanded.

    Returns:
        The expanded path.

    Raises:
        InvalidVideoError: If the path does not exist, is not a regular file,
            or its suffix is not in ``VIDEO_EXTENSIONS`` (case-insensitive).
    """
    candidate = Path(raw).expanduser()

    if not candidate.exists():
        raise InvalidVideoError(f"Arquivo de vídeo não existe: {candidate}")
    if not candidate.is_file():
        raise InvalidVideoError(f"Caminho não é um arquivo: {candidate}")

    if candidate.suffix.lower() in VIDEO_EXTENSIONS:
        return candidate

    raise InvalidVideoError(
        f"Extensão não suportada ({candidate.suffix}). "
        f"Esperado um destes: {', '.join(sorted(VIDEO_EXTENSIONS))}"
    )
122
+
123
+
124
def parse_language(raw: str) -> Language:
    """Convert an IETF/BCP 47 tag such as ``"pt-BR"`` into a ``Language``.

    Raises:
        InvalidLanguageError: If ``Language.fromietf`` raises for ``raw``; the
            original exception is chained as the cause.
    """
    try:
        language = Language.fromietf(raw)
    except Exception as e:
        raise InvalidLanguageError(
            f"Código de idioma inválido: {raw!r}. Use tags BCP 47 como 'pt-BR', 'en', 'es', 'ja'."
        ) from e
    return language
131
+
132
+
133
+ def _classify_match(matches: set[str]) -> str:
134
+ if "hash" in matches:
135
+ return "hash"
136
+ if "release_group" in matches:
137
+ return "release"
138
+
139
+ return "fallback"
140
+
141
+
142
+ def _filename_similarity(sub_filename: str, video_name: str) -> float:
143
+ """Fração de tokens do stem do vídeo presentes no nome da legenda."""
144
+ norm = re.compile(r"[\W_]+")
145
+ video_stem = Path(video_name).stem
146
+ sub_tokens = set(norm.sub(" ", sub_filename.lower()).split())
147
+ video_tokens = set(norm.sub(" ", video_stem.lower()).split())
148
+ if not video_tokens:
149
+ return 0.0
150
+ return len(sub_tokens & video_tokens) / len(video_tokens)
151
+
152
+
153
def _pick_subtitle(
    candidates: list,
    video: object,
) -> tuple[object, str, bool]:
    """Choose the best subtitle and decide whether it needs synchronization.

    Returns ``(subtitle, match_type, needs_sync)``.
    Order of preference:
      1. hash match → no sync
      2. release_group match → best filename similarity → sync
      3. fallback → best filename similarity → sync
    """
    video_name = getattr(video, "name", "") or ""

    # Highest score first; this only determines the order candidates are examined in.
    ranked = sorted(
        ((sub, compute_score(sub, video)) for sub in candidates),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # 1. hash match
    hash_hit = next(
        (sub for sub, _ in ranked if "hash" in set(sub.get_matches(video))),
        None,
    )
    if hash_hit is not None:
        return hash_hit, "hash", False

    # 2. release_group match, otherwise 3. every candidate
    with_release = [pair for pair in ranked if "release_group" in set(pair[0].get_matches(video))]
    if with_release:
        pool, match_type = with_release, "release"
    else:
        pool, match_type = ranked, "fallback"

    def similarity(pair: tuple) -> float:
        return _filename_similarity(getattr(pair[0], "filename", "") or "", video_name)

    chosen = max(pool, key=similarity)[0]
    return chosen, match_type, True
187
+
188
+
189
def _download_sub(
    video: object,
    language: Language,
    provider_configs: dict,
) -> object | None:
    """Find and download the highest-scored subtitle for ``language``.

    Returns the downloaded subtitle, or ``None`` when no candidate in the
    requested language exists or the downloaded text is empty.
    """
    listing = subliminal.list_subtitles(
        {video},
        {language},
        providers=["opensubtitles"],
        provider_configs=provider_configs,
    )
    matching = [
        sub for sub in listing.get(video, []) if getattr(sub, "language", None) == language
    ]
    if not matching:
        return None

    # First candidate with the highest score.
    best = max(matching, key=lambda sub: compute_score(sub, video))

    subliminal.download_subtitles([best], provider_configs=provider_configs)
    if not best.text:
        return None
    return best
213
+
214
+
215
def find_and_download_subtitle(
    video_path: Path,
    language: Language,
    credentials: tuple[str, str],
) -> tuple[Path, SubtitleInfo]:
    """Search OpenSubtitles for ``language`` and save the chosen subtitle as UTF-8.

    Returns:
        ``(srt_path, info)``: the file written next to the video and a
        description of the provider, match type and need for synchronization.

    Raises:
        SubtitleNotFoundError: If no candidate exists in the requested
            language or the downloaded subtitle has no text.
    """
    username, password = credentials
    provider_configs = {"opensubtitles": {"username": username, "password": password}}

    video = subliminal.scan_video(str(video_path))
    hash_refine(video)

    listing = subliminal.list_subtitles(
        {video},
        {language},
        providers=["opensubtitles"],
        provider_configs=provider_configs,
    )
    in_language = [
        sub for sub in listing.get(video, []) if getattr(sub, "language", None) == language
    ]
    if not in_language:
        raise SubtitleNotFoundError(
            f"Nenhuma legenda em {language.alpha3} encontrada para: {video_path.name}"
        )

    subtitle, match_type, needs_sync = _pick_subtitle(in_language, video)
    subliminal.download_subtitles([subtitle], provider_configs=provider_configs)

    # subliminal.save_subtitles is deliberately not used: it writes the raw
    # bytes in the detected encoding (e.g. cp1252), producing mojibake. The
    # already-decoded text is written as UTF-8 instead.
    if not subtitle.text:
        raise SubtitleNotFoundError(f"Legenda veio vazia do provider para: {video_path.name}")

    file_name = Path(subtitle.get_path(video)).name
    srt_path = video_path.parent / file_name
    srt_path.write_text(subtitle.text, encoding="utf-8")

    return srt_path, SubtitleInfo(
        provider=subtitle.provider_name,
        match_type=match_type,
        needs_sync=needs_sync,
    )
260
+
261
+
262
def find_reference_subtitle(
    video_path: Path,
    credentials: tuple[str, str],
) -> Path | None:
    """Download an English subtitle to use as the alignment reference.

    Returns the path of the file, written as UTF-8 inside a freshly created
    temporary directory (prefix ``subs_ref_``), or ``None`` when no English
    subtitle could be downloaded. The temporary directory is not removed here.
    """
    username, password = credentials
    provider_configs = {"opensubtitles": {"username": username, "password": password}}

    video = subliminal.scan_video(str(video_path))
    hash_refine(video)

    reference = _download_sub(video, Language("eng"), provider_configs)
    if reference is None:
        return None

    holder = Path(tempfile.mkdtemp(prefix="subs_ref_"))
    ref_path = holder / Path(reference.get_path(video)).name
    ref_path.write_text(reference.text, encoding="utf-8")
    return ref_path
283
+
284
+
285
+ def _ts(h: str, m: str, s: str, ms: str) -> float:
286
+ return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000
287
+
288
+
289
+ def _seconds_to_ts(total: float) -> str:
290
+ total = max(0.0, total)
291
+ h = int(total // 3600)
292
+ m = int((total % 3600) // 60)
293
+ s = int(total % 60)
294
+ ms = round((total - int(total)) * 1000)
295
+ if ms == 1000:
296
+ ms = 0
297
+ s += 1
298
+ return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
299
+
300
+
301
def _srt_to_segments(srt_text: str) -> list[dict]:
    """Parse SRT text into cues with ``start``, ``end`` (seconds) and ``text``.

    Only blocks matched by ``_SRT_BLOCK_RE`` are returned; the cue text is
    stripped of surrounding whitespace.
    """
    return [
        {
            "start": _ts(*block.group(2, 3, 4, 5)),
            "end": _ts(*block.group(6, 7, 8, 9)),
            "text": block.group(10).strip(),
        }
        for block in _SRT_BLOCK_RE.finditer(srt_text)
    ]
312
+
313
+
314
def _parse_srt_timestamps(srt_text: str) -> list[float]:
    """Return the start time, in seconds, of every timing line in ``srt_text``."""
    return [
        int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(millis) / 1000
        for hours, minutes, seconds, millis in _TS_RE.findall(srt_text)
    ]
321
+
322
+
323
+ def _mean_offset_seconds(orig: list[float], synced: list[float]) -> float:
324
+ n = min(len(orig), len(synced))
325
+
326
+ if n == 0:
327
+ return 0.0
328
+
329
+ total = sum(abs(synced[i] - orig[i]) for i in range(n))
330
+
331
+ return total / n
332
+
333
+
334
# Name of the sentence-transformers model loaded for the semantic alignment.
_SEMANTIC_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"
335
+
336
+
337
def _align_cues_by_semantics(
    target_cues: list[dict],
    ref_cues: list[dict],
) -> list[dict]:
    """Align target cues to the reference timestamps via semantic embeddings + DTW.

    Each target cue takes the start time of the first reference cue it is
    mapped to on the DTW path and keeps its own duration, extended to a
    minimum reading time. Start times are then forced to be strictly
    increasing and cue ends are clamped against the next cue's start.

    Args:
        target_cues: Cues to re-time; each is a dict with "start", "end", "text".
        ref_cues: Reference cues with the same keys.

    Returns:
        A new list of cue dicts (same texts, adjusted "start"/"end").
    """
    model = SentenceTransformer(_SEMANTIC_MODEL)

    target_texts = [c["text"] for c in target_cues]
    ref_texts = [c["text"] for c in ref_cues]

    target_emb = model.encode(target_texts, convert_to_numpy=True)
    ref_emb = model.encode(ref_texts, convert_to_numpy=True)

    # cosine distance matrix: shape (len_target, len_ref)
    dist_matrix = cdist(target_emb, ref_emb, metric="cosine")

    # simple DTW: accumulate the cost of the optimal alignment path
    n, m = dist_matrix.shape
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost[i, j] = dist_matrix[i - 1, j - 1] + min(
                cost[i - 1, j],  # insertion
                cost[i, j - 1],  # deletion
                cost[i - 1, j - 1],  # match
            )

    # traceback from the last cell, always stepping to the cheapest predecessor
    path: list[tuple[int, int]] = []
    i, j = n, m
    while i > 0 and j > 0:
        path.append((i - 1, j - 1))
        options = [
            (cost[i - 1, j - 1], i - 1, j - 1),
            (cost[i - 1, j], i - 1, j),
            (cost[i, j - 1], i, j - 1),
        ]
        _, i, j = min(options, key=lambda x: x[0])
    path.reverse()

    # for each target cue, collect the reference cues mapped to it (in path order)
    from collections import defaultdict

    target_to_refs: dict[int, list[int]] = defaultdict(list)
    for ti, ri in path:
        target_to_refs[ti].append(ri)

    result = []
    for ti, cue in enumerate(target_cues):
        mapped = target_to_refs.get(ti, [])
        orig_duration = max(cue["end"] - cue["start"], 0.0)

        if mapped:
            # use the start of the first mapped ref (not the mean, which would pull toward the middle)
            start = float(ref_cues[mapped[0]]["start"])
            # keep the target's original duration (the target sentence may be longer than the EN one)
            end = start + orig_duration
        else:
            # no reference mapped: keep the original timing
            start, end = cue["start"], cue["end"]

        # minimum reading duration: ~60ms/char, at least 1s, capped at 7s
        min_duration = max(1.0, min(len(cue["text"]) * 0.06, 7.0))
        if end - start < min_duration:
            end = start + min_duration

        result.append({"start": start, "end": end, "text": cue["text"]})

    # enforce strictly increasing start times
    for i in range(1, len(result)):
        if result[i]["start"] <= result[i - 1]["start"]:
            gap = max(result[i - 1]["end"] - result[i - 1]["start"], 0.1)
            result[i]["start"] = result[i - 1]["start"] + gap
            duration = result[i]["end"] - result[i]["start"]
            result[i]["end"] = result[i]["start"] + max(duration, 0.1)

    # clamp: the end of cue i must not run into the start of cue i+1 (leaves a 50ms gap)
    for i in range(len(result) - 1):
        max_end = result[i + 1]["start"] - 0.05
        if result[i]["end"] > max_end:
            result[i]["end"] = max(max_end, result[i]["start"] + 0.1)

    return result
425
+
426
+
427
def _cues_to_srt(cues: list[dict]) -> str:
    """Serialize cues to SRT text, numbering them from 1.

    Each cue becomes ``index``, a ``start --> end`` timing line and the text;
    cues are separated by a blank line.
    """
    blocks = [
        f"{number}\n{_seconds_to_ts(cue['start'])} --> {_seconds_to_ts(cue['end'])}\n{cue['text']}\n"
        for number, cue in enumerate(cues, start=1)
    ]
    return "\n".join(blocks)
434
+
435
+
436
def sync_subtitle(
    srt_path: Path,
    ref_path: Path,
) -> SyncResult:
    """Align the target subtitle to a reference subtitle via semantic embeddings.

    Args:
        srt_path: Target subtitle; rewritten in place (UTF-8) when re-timed.
        ref_path: Reference subtitle whose timestamps are used.

    Returns:
        ``SyncResult(synced=True, sync_mode="ref")`` when the file was
        rewritten, or ``synced=False`` when the mean start-time change is
        below ``SYNC_THRESHOLD_SECONDS`` and the file is left untouched.

    Raises:
        SubtitleSyncError: If either file contains no parseable cue or the
            alignment itself fails.
    """
    target_text = srt_path.read_text(encoding="utf-8", errors="replace")
    target_cues = _srt_to_segments(target_text)
    target_ts_orig = [c["start"] for c in target_cues]

    ref_text = ref_path.read_text(encoding="utf-8", errors="replace")
    ref_cues = _srt_to_segments(ref_text)

    # Without cues on either side there is nothing to align. Fail explicitly
    # (and before loading the embedding model) instead of letting the aligner
    # error out obscurely or report a misleading zero offset.
    if not target_cues:
        raise SubtitleSyncError(f"nenhum cue válido encontrado na legenda alvo: {srt_path.name}")
    if not ref_cues:
        raise SubtitleSyncError(
            f"nenhum cue válido encontrado na legenda de referência: {ref_path.name}"
        )

    try:
        aligned_cues = _align_cues_by_semantics(target_cues, ref_cues)
    except Exception as e:
        raise SubtitleSyncError(f"alinhamento semântico falhou: {e}") from e

    aligned_ts = [c["start"] for c in aligned_cues]
    offset = _mean_offset_seconds(target_ts_orig, aligned_ts)

    if offset < SYNC_THRESHOLD_SECONDS:
        return SyncResult(synced=False, offset_seconds=offset, sync_mode="none")

    srt_path.write_text(_cues_to_srt(aligned_cues), encoding="utf-8")
    return SyncResult(synced=True, offset_seconds=offset, sync_mode="ref")
461
+
462
+
463
def finalize_output_path(video_path: Path, srt_path: Path, lang_tag: str) -> Path:
    """Move the subtitle to its final name next to the video.

    The final name is the video path with its suffix replaced by
    ``.<lang_tag>.srt``. Nothing is moved when ``srt_path`` already is that
    path; otherwise an existing file at the destination is replaced.

    Returns:
        The final subtitle path.
    """
    destination = video_path.with_suffix(f".{lang_tag}.srt")
    if srt_path != destination:
        srt_path.replace(destination)
    return destination
472
+
473
+
474
# Signature of the progress callback: called as callback(step, detail).
ProgressCallback = Callable[[str, str], None]
475
+
476
+
477
def run(
    video_arg: str,
    lang_tag: str = DEFAULT_LANG,
    on_progress: ProgressCallback | None = None,
) -> RunSummary:
    """Download a subtitle for one video and synchronize it when needed.

    Steps: validate the input, download the subtitle, and — unless the
    subtitle was a hash match — download an English reference and align the
    subtitle against it. The subtitle is finally moved to
    ``<video stem>.<lang_tag>.srt``.

    Args:
        video_arg: Path to the video file.
        lang_tag: BCP 47 tag of the wanted subtitle language.
        on_progress: Optional callback invoked as ``on_progress(step, detail)``
            at each pipeline step.

    Returns:
        A ``RunSummary``. A missing reference or a failed synchronization does
        not raise; it is reported through ``sync_error`` and the downloaded
        subtitle is kept as is.

    Raises:
        SubsDownError: (a subclass) when validation, credentials, or the
            subtitle download fail.
    """

    def _notify(step: str, detail: str = "") -> None:
        if on_progress:
            on_progress(step, detail)

    start = time.monotonic()

    _notify("validando", video_arg)
    video_path = validate_video_path(video_arg)
    check_ffmpeg()
    language = parse_language(lang_tag)
    credentials = load_credentials()

    _notify("buscando", f"idioma={lang_tag}")
    srt_path, info = find_and_download_subtitle(
        video_path, language=language, credentials=credentials
    )
    _notify("baixado", f"provider={info.provider} match={info.match_type}")

    sync_error: str | None = None

    if info.needs_sync:
        _notify("referencia", "buscando EN")
        ref_path = find_reference_subtitle(video_path, credentials=credentials)

        if ref_path is None:
            sync_error = "legenda EN de referência não encontrada — sincronização ignorada"
            sync_result = SyncResult(synced=False, offset_seconds=0.0, sync_mode="none")
            _notify("sem_referencia", "")
        else:
            _notify("sincronizando", "embeddings semânticos")
            try:
                sync_result = sync_subtitle(srt_path, ref_path=ref_path)
                _notify("sincronizado", f"offset={sync_result.offset_seconds:.2f}s")
            except SubtitleSyncError as e:
                sync_error = str(e)
                sync_result = SyncResult(synced=False, offset_seconds=0.0, sync_mode="none")
                _notify("erro_sync", str(e))
            finally:
                # The reference lives in a temporary directory created by
                # find_reference_subtitle (prefix "subs_ref_") and is no longer
                # needed. Remove it so runs do not leave directories behind;
                # the prefix check keeps us from deleting any other location.
                if ref_path.parent.name.startswith("subs_ref_"):
                    shutil.rmtree(ref_path.parent, ignore_errors=True)
    else:
        sync_result = SyncResult(synced=False, offset_seconds=0.0, sync_mode="none")
        _notify("sem_sync", f"offset={sync_result.offset_seconds:.2f}s")

    final_path = finalize_output_path(video_path, srt_path, lang_tag=lang_tag)
    elapsed = time.monotonic() - start

    _notify("concluido", str(final_path))

    return RunSummary(
        output_path=final_path,
        provider=info.provider,
        match_type=info.match_type,
        synced=sync_result.synced,
        offset_seconds=sync_result.offset_seconds,
        sync_mode=sync_result.sync_mode,
        sync_error=sync_error,
        elapsed_seconds=elapsed,
        lang_tag=lang_tag,
    )
@@ -0,0 +1,31 @@
1
+ """Exceptions raised by subs_down_n_sync. Message is user-facing (in Portuguese)."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class SubsDownError(Exception):
    """Base error of the package — the message is what is shown to the user."""
8
+
9
+
10
class InvalidVideoError(SubsDownError):
    """The video path does not exist, is not a file, or has an unsupported extension."""

    pass
12
+
13
+
14
class MissingDependencyError(SubsDownError):
    """A required external program (ffmpeg) was not found on PATH."""

    pass
16
+
17
+
18
class MissingCredentialsError(SubsDownError):
    """A required OpenSubtitles credential environment variable is unset or empty."""

    pass
20
+
21
+
22
class InvalidLanguageError(SubsDownError):
    """The language tag could not be parsed as an IETF/BCP 47 tag."""

    pass
24
+
25
+
26
class SubtitleNotFoundError(SubsDownError):
    """No subtitle was found in the requested language, or the download was empty."""

    pass
28
+
29
+
30
class SubtitleSyncError(SubsDownError):
    """Aligning the subtitle against the reference subtitle failed."""

    pass
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: subs-down-n-sync
3
+ Version: 1.0.0
4
+ Summary: CLI to download and sync subtitles for video files using semantic embeddings + DTW
5
+ Author-email: Airton Soares <airtonnsoares@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://pypi.org/project/subs-down-n-sync/
8
+ Project-URL: Repository, https://github.com/airton-soares/subs_down_n_sync
9
+ Project-URL: Bug Tracker, https://github.com/airton-soares/subs_down_n_sync/issues
10
+ Keywords: subtitles,subtitle-sync,opensubtitles,nlp,cli,video,sentence-transformers,dtw
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Environment :: Console
14
+ Classifier: Topic :: Multimedia :: Video
15
+ Classifier: Topic :: Utilities
16
+ Classifier: Natural Language :: Portuguese (Brazilian)
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Requires-Python: >=3.12
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: subliminal>=2.2
22
+ Requires-Dist: sentence-transformers>=3.0
23
+ Requires-Dist: scipy>=1.13
24
+ Requires-Dist: setuptools<81
25
+ Requires-Dist: rich>=13
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=8.0; extra == "dev"
28
+ Requires-Dist: pytest-mock>=3.12; extra == "dev"
29
+ Requires-Dist: pytest-cov>=6.0; extra == "dev"
30
+ Requires-Dist: ruff>=0.9; extra == "dev"
31
+
32
+ # subs_down_n_sync
33
+
34
+ ![CI](https://github.com/airton-soares/subs_down_n_sync/actions/workflows/ci.yml/badge.svg)
35
+ [![PyPI version](https://img.shields.io/pypi/v/subs-down-n-sync)](https://pypi.org/project/subs-down-n-sync/)
36
+ [![Python versions](https://img.shields.io/pypi/pyversions/subs-down-n-sync)](https://pypi.org/project/subs-down-n-sync/)
37
+ [![License](https://img.shields.io/github/license/airton-soares/subs_down_n_sync)](LICENSE)
38
+
39
+ CLI Python para baixar e sincronizar legendas para arquivos de vídeo. Idioma padrão: **pt-BR**, configurável via flag `--lang` (qualquer tag BCP 47).
40
+
41
+ A sincronização usa embeddings semânticos multilíngues ([sentence-transformers](https://www.sbert.net/), modelo `paraphrase-multilingual-MiniLM-L12-v2`) combinados com DTW: baixa uma legenda EN de referência e alinha os cues da legenda alvo aos timestamps da referência por similaridade semântica. Legendas com match exato por hash são usadas sem sincronização; as demais (match por release group ou fallback) passam pela sincronização.
42
+
43
+ ## Instalação
44
+
45
+ ```bash
46
+ pip install subs-down-n-sync
47
+ ```
48
+
49
+ Instale também o `ffmpeg`:
50
+
51
+ ```bash
52
+ sudo apt install ffmpeg # Debian/Ubuntu
53
+ brew install ffmpeg # macOS
54
+ winget install Gyan.FFmpeg # Windows
55
+ ```
56
+
57
+ Configure as credenciais do OpenSubtitles:
58
+
59
+ ```bash
60
+ export OPENSUBTITLES_USERNAME="seu_usuario"
61
+ export OPENSUBTITLES_PASSWORD="sua_senha"
62
+ ```
63
+
64
+ > Para desenvolvimento, veja [Setup](#setup).
65
+
66
+ ## Setup
67
+
68
+ Linux/macOS:
69
+
70
+ ```bash
71
+ python -m venv .venv
72
+ source .venv/bin/activate
73
+ pip install -e ".[dev]"
74
+ ```
75
+
76
+ Windows (PowerShell):
77
+
78
+ ```powershell
79
+ python -m venv .venv
80
+ .\.venv\Scripts\Activate.ps1
81
+ pip install -e ".[dev]"
82
+ ```
83
+
84
+ Windows (cmd.exe):
85
+
86
+ ```cmd
87
+ python -m venv .venv
88
+ .venv\Scripts\activate.bat
89
+ pip install -e ".[dev]"
90
+ ```
91
+
92
+ Instale também o `ffmpeg` no sistema:
93
+
94
+ ```bash
95
+ sudo apt install ffmpeg # Debian/Ubuntu
96
+ brew install ffmpeg # macOS
97
+ ```
98
+
99
+ ```powershell
100
+ winget install Gyan.FFmpeg # Windows (winget)
101
+ choco install ffmpeg # Windows (Chocolatey)
102
+ scoop install ffmpeg # Windows (Scoop)
103
+ ```
104
+
105
+ Confirme que `ffmpeg` está no `PATH` rodando `ffmpeg -version` em novo terminal.
106
+
107
+ ## Configuração (uma única vez)
108
+
109
+ Linux/macOS:
110
+
111
+ ```bash
112
+ export OPENSUBTITLES_USERNAME="seu_usuario"
113
+ export OPENSUBTITLES_PASSWORD="sua_senha"
114
+ ```
115
+
116
+ Windows (PowerShell, sessão atual):
117
+
118
+ ```powershell
119
+ $env:OPENSUBTITLES_USERNAME = "seu_usuario"
120
+ $env:OPENSUBTITLES_PASSWORD = "sua_senha"
121
+ ```
122
+
123
+ Windows (persistente, próximas sessões):
124
+
125
+ ```powershell
126
+ setx OPENSUBTITLES_USERNAME "seu_usuario"
127
+ setx OPENSUBTITLES_PASSWORD "sua_senha"
128
+ ```
129
+
130
+ ## Uso
131
+
132
+ ```bash
133
+ # Default: pt-BR
134
+ subs-down-n-sync /caminho/para/filme.mkv
135
+
136
+ # Outro idioma (BCP 47: 'en', 'pt-BR', 'en-US', 'es', 'ja', ...)
137
+ subs-down-n-sync /caminho/para/filme.mkv --lang en
138
+ subs-down-n-sync /caminho/para/filme.mkv -l es
139
+
140
+ # Processar diretório inteiro (busca vídeos recursivamente)
141
+ subs-down-n-sync /caminho/para/pasta/
142
+ subs-down-n-sync /caminho/para/pasta/ --lang en
143
+ subs-down-n-sync /caminho/para/pasta/ --overwrite # sobrescreve legendas existentes
144
+ subs-down-n-sync /caminho/para/pasta/ --parallel # processa até 2 vídeos simultâneos
145
+
146
+ # Ou via módulo Python
147
+ python -m subs_down_n_sync /caminho/para/filme.mkv
148
+ ```
149
+
150
+ Ao passar um diretório, vídeos que já têm legenda (`<video>.<lang>.srt`) são pulados por padrão. Use `--overwrite` / `-o` para reprocessar. Use `--parallel` / `-p` para processar até 2 vídeos em paralelo.
151
+
152
+ Saída: `/caminho/para/filme.<lang>.srt` (ex.: `filme.pt-BR.srt`, `filme.en.srt`). Isso permite manter legendas do mesmo vídeo em idiomas diferentes sem sobrescrever.
153
+
154
+ ## Desenvolvimento
155
+
156
+ ```bash
157
+ pip install -e ".[dev]"
158
+ pytest
159
+ ```
160
+
161
+ Os testes unitários rodam com gate de cobertura de 90% (configurado em `pyproject.toml`). O CI falha se a cobertura cair abaixo disso.
162
+
163
+ Para rodar sem o gate (útil ao explorar com `-k` ou `--collect-only`):
164
+
165
+ ```bash
166
+ pytest --no-cov
167
+ ```
168
+
169
+ ## Lint e formatação
170
+
171
+ O projeto usa [Ruff](https://docs.astral.sh/ruff/) para formatação e lint.
172
+
173
+ ```bash
174
+ ruff format . # aplica formatação
175
+ ruff format --check . # verifica sem escrever (usado no CI)
176
+ ruff check . # roda lint
177
+ ruff check --fix . # aplica fixes automáticos
178
+ ```
179
+
180
+ O CI falha se `ruff format --check` ou `ruff check` encontrarem problemas.
181
+
182
+ ## Testes de integração
183
+
184
+ O projeto tem duas camadas de testes:
185
+
186
+ - **Testes unitários** (padrão, `pytest`) — rápidos, mockam `subliminal` e `sentence_transformers`. Não precisam de rede nem de binários externos além do Python.
187
+ - **Testes de integração** (`pytest -m integration`) — exercitam o pipeline real de alinhamento semântico (download do modelo `sentence-transformers` + DTW) sobre legendas reais. Requer acesso à internet no primeiro run para baixar o modelo (~120 MB), cacheado pelo Hugging Face em `~/.cache/huggingface/`.
188
+
189
+ Como rodar cada camada:
190
+
191
+ ```bash
192
+ pytest # só unit (rápido)
193
+ pytest -m integration # só integração (baixa modelo de embeddings, roda DTW real)
194
+ pytest -m "" # tudo (unit + integração)
195
+ ```
@@ -0,0 +1,10 @@
1
+ subs_down_n_sync/__init__.py,sha256=jlLhNwF8RXLhwXUYKN0v8AeiCwKbb9pejoJecLFCGxk,95
2
+ subs_down_n_sync/__main__.py,sha256=4UUonBkrqVeT-ZqrBs96xCz1wsuAltma9tqYBAS-X-w,46
3
+ subs_down_n_sync/cli.py,sha256=8j-T1DXmxnVQtdORQZYmTScB19YWPA6tCicukddXsnU,9298
4
+ subs_down_n_sync/core.py,sha256=ScH0q6mvm2NSynO4Ylqbm7n_937y7KoO73vH6tnjua0,16714
5
+ subs_down_n_sync/exceptions.py,sha256=HeOfy398O1dwhXGB8QuJ2PU8RvU5bFcBfrlEDzj8ti0,552
6
+ subs_down_n_sync-1.0.0.dist-info/METADATA,sha256=uPvBQ-9BsbAQRPRzfzHK8Rz4uZc0tRoekVg4hh8s24g,6351
7
+ subs_down_n_sync-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ subs_down_n_sync-1.0.0.dist-info/entry_points.txt,sha256=ml-sdvqTipASHrCRNYVhlKa5gSHOXEgXik80pcx-Db4,63
9
+ subs_down_n_sync-1.0.0.dist-info/top_level.txt,sha256=uMLMnRnFOJuCkpl_5DIM-E2tl8Rg11rg8k0bwSMFze8,17
10
+ subs_down_n_sync-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ subs-down-n-sync = subs_down_n_sync.cli:main
@@ -0,0 +1 @@
1
+ subs_down_n_sync