aisrt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aisrt/__init__.py ADDED
@@ -0,0 +1,3 @@
1
"""Ultimate SRT Generator."""

# Package version string.
__version__ = "0.1.0"
aisrt/assembly.py ADDED
@@ -0,0 +1,192 @@
1
+ """Broadcast-quality SubRip (SRT) formatting and Atomic File I/O."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from loguru import logger
8
+
9
+
10
+ def _format_timestamp(seconds: float) -> str:
11
+ """Format a timestamp (in float seconds) to SRT standard: HH:MM:SS,mmm."""
12
+ hours = int(seconds // 3600)
13
+ minutes = int((seconds % 3600) // 60)
14
+ secs = int(seconds % 60)
15
+ millis = int(round((seconds - int(seconds)) * 1000))
16
+
17
+ if millis == 1000:
18
+ secs += 1
19
+ millis = 0
20
+ if secs == 60:
21
+ secs = 0
22
+ minutes += 1
23
+ if minutes == 60:
24
+ minutes = 0
25
+ hours += 1
26
+
27
+ return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
28
+
29
+
30
class SRTFormatter:
    """Chunks Whisper words into broadcast-standard SRT format.

    Blocks are accumulated on the instance while ``format_segments`` runs, so a
    single formatter must not be shared between concurrent calls.
    """

    def __init__(
        self,
        max_chars_per_line: int = 42,
        max_lines: int = 2,
        max_gap_seconds: float = 1.5,
    ) -> None:
        """Initialize the SRT chunker.

        Args:
            max_chars_per_line: Maximum rendered characters (spaces included)
                before wrapping a line.
            max_lines: Maximum lines per subtitle block.
            max_gap_seconds: Silence between two words, in seconds, above which
                the current block is closed.
        """
        self.max_chars_per_line = max_chars_per_line
        self.max_lines = max_lines
        self.max_gap_seconds = max_gap_seconds
        # ASCII terminators, the CJK full stop, and the fullwidth question and
        # exclamation marks (written as escapes so they cannot be silently
        # normalised back to their ASCII look-alikes).
        self.terminal_punctuation = {".", "?", "!", "。", "\uff1f", "\uff01"}
        # Output state; reset at the start of every format_segments() call.
        self._srt_blocks: list[str] = []
        self._block_idx = 1

    def format_segments(self, segments: Any) -> str:
        """Iterate over faster-whisper Segment/Word objects and build the SRT text.

        Segments that expose a non-empty ``words`` attribute are chunked word by
        word; all others are emitted as one block each. Word-level chunking
        requires word_timestamps=True in the Whisper model transcribe() call.

        Args:
            segments: An iterable of faster-whisper Segment objects.

        Returns:
            The complete SRT file content as a string.
        """
        self._srt_blocks = []
        self._block_idx = 1

        for segment in segments:
            if getattr(segment, "words", None):
                self._format_word_segment(segment)
            else:
                self._format_raw_segment(segment)

        # Every block already ends with a newline, so joining with "\n" leaves
        # the blank separator line between consecutive blocks.
        return "\n".join(self._srt_blocks)

    def _format_raw_segment(self, segment: Any) -> None:
        """Fallback formatter for segments without word timestamps."""
        self._flush_words([segment.text], segment.start, segment.end)

    def _format_word_segment(self, segment: Any) -> None:
        """Chunk a segment's words into blocks by line length, pauses and punctuation.

        A block is closed when a word ends with terminal punctuation, when the
        pause before the next word exceeds ``max_gap_seconds``, or when the next
        word would not fit on the last permitted line. Line length is measured
        on the rendered text, spaces included, so a line only exceeds
        ``max_chars_per_line`` when a single word is longer than the limit.

        Word tokens are joined without a separator; they are assumed to carry
        their own leading whitespace where the language uses spaces.
        """
        terminal = tuple(self.terminal_punctuation)
        block_tokens: list[str] = []
        block_start: float | None = None
        block_end = 0.0
        line_text = ""  # rendered text of the line currently being filled
        line_count = 1

        def flush() -> None:
            """Emit the pending block (if any) and reset the accumulators."""
            nonlocal block_tokens, block_start, line_text, line_count
            if block_tokens and block_start is not None:
                self._flush_words(block_tokens, block_start, block_end)
            block_tokens = []
            block_start = None
            line_text = ""
            line_count = 1

        for word_obj in segment.words:
            raw = word_obj.word
            word = raw.strip()
            if not word:
                continue

            # Temporal gap check: close the block after a long silence.
            if block_tokens and (word_obj.start - block_end) > self.max_gap_seconds:
                flush()

            # Decide where the word goes BEFORE adding it, so the overflowing
            # word starts the next line (or the next block) instead of
            # lengthening the current one.
            if line_text and len((line_text + raw).strip()) > self.max_chars_per_line:
                if line_count < self.max_lines:
                    block_tokens.append("\n")
                    line_text = ""
                    line_count += 1
                else:
                    flush()

            if block_start is None:
                block_start = word_obj.start

            # The first token of a line must not bring its leading space along.
            token = raw if line_text else raw.lstrip()
            block_tokens.append(token)
            line_text += token
            block_end = word_obj.end

            last_line_full = (
                line_count >= self.max_lines
                and len(line_text.strip()) >= self.max_chars_per_line
            )
            if word.endswith(terminal) or last_line_full:
                flush()

        flush()

    def _flush_words(self, words: list[str], start_time: float, end_time: float) -> None:
        """Append one numbered SRT block built from ``words``.

        Nothing is written, and the block counter is not advanced, when the
        joined text is empty after stripping.

        Args:
            words: Text fragments to concatenate without a separator.
            start_time: Block start in seconds.
            end_time: Block end in seconds.
        """
        text = "".join(words).strip()
        if text:
            start_str = _format_timestamp(start_time)
            end_str = _format_timestamp(end_time)
            self._srt_blocks.append(f"{self._block_idx}\n{start_str} --> {end_str}\n{text}\n")
            self._block_idx += 1
135
+
136
+
137
class AtomicWriter:
    """Writes subtitle files through a same-directory temp file and an atomic rename."""

    @staticmethod
    def write_srt(source_video: Path, srt_content: str, language_code: str = "en") -> Path:
        """Write the SRT securely, inheriting the permissions of the source video.

        The content is first written to a hidden temp file next to the video,
        ownership and mode are copied from the video on a best-effort basis,
        and the temp file is then renamed over the final path.

        Args:
            source_video: The original MKV/MP4 file.
            srt_content: The fully formatted SRT text block.
            language_code: The locale suffix for the subtitle (e.g., 'en', 'eng').

        Returns:
            The Path to the finalized, atomically committed SRT file.

        Raises:
            RuntimeError: If writing, stat-ing the source, or the final rename
                fails. The temp file is removed before raising.
        """
        final_srt_path = source_video.with_suffix(f".{language_code}.srt")
        # The language code is part of the temp name so that writers for
        # different languages of the same video never share a temp file.
        temp_srt_path = source_video.with_name(
            f".{source_video.stem}.{language_code}.srt.tmp"
        )

        logger.debug(f"Assembling atomic SRT chunks in {temp_srt_path}")

        try:
            # 1. Write to hidden temp file in the same directory
            temp_srt_path.write_text(srt_content, encoding="utf-8")

            # 2. Inherit metadata from the source video (best effort)
            source_stat = source_video.stat()

            # os.chown does not exist on every platform.
            chown = getattr(os, "chown", None)
            if chown is not None:
                try:
                    chown(temp_srt_path, source_stat.st_uid, source_stat.st_gid)
                except OSError as exc:
                    # Non-root users and SMB/NFS mounts commonly refuse chown,
                    # and not always with EPERM.
                    logger.debug(
                        f"Could not chown {temp_srt_path} to "
                        f"UID:{source_stat.st_uid}/GID:{source_stat.st_gid} ({exc}). "
                        f"Proceeding anyway."
                    )

            try:
                # Pass only the permission bits, not the file-type bits.
                os.chmod(temp_srt_path, source_stat.st_mode & 0o7777)
            except OSError as exc:
                logger.debug(f"Could not chmod {temp_srt_path} ({exc}). Proceeding anyway.")

            # 3. Atomic rename
            # os.replace is atomic on POSIX if both files are on the same filesystem.
            # The temp file lives in the same folder to guarantee this and avoid EXDEV.
            os.replace(temp_srt_path, final_srt_path)
        except Exception as e:
            # Clean up the temp file if the atomic commit fails
            try:
                temp_srt_path.unlink(missing_ok=True)
            except OSError as cleanup_error:
                logger.debug(f"Could not remove temp file {temp_srt_path}: {cleanup_error}")
            raise RuntimeError(f"Atomic subtitle write failed for {source_video.name}: {e}") from e

        logger.info(f"Successfully generated and committed {final_srt_path.name}")
        return final_srt_path
aisrt/cli.py ADDED
@@ -0,0 +1,177 @@
1
+ """CLI commands for the SRT Generator."""
2
+
3
+ import asyncio
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ import typer
9
+ from loguru import logger
10
+ from rich.console import Console
11
+ from rich.table import Table
12
+
13
+ from aisrt.config import AppConfig, FilterConfig, HardwareConfig
14
+ from aisrt.discovery import DiscoveryEngine
15
+ from aisrt.hardware import HardwareProfiler, ModelRouter, setup_thread_safety
16
+ from aisrt.state import StateTracker
17
+
18
# Typer application object; the @app.command() functions in this module register on it.
app = typer.Typer(help="Ultimate SRT Generator", add_completion=False)
# Shared Rich console used by the commands for user-facing output.
console = Console()
20
+
21
+
22
def configure_logging(verbose: bool) -> None:
    """Replace Loguru's existing sinks with a single colorized stderr sink.

    Args:
        verbose: When True the sink level is DEBUG, otherwise INFO.
    """
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if verbose else "INFO", colorize=True)
27
+
28
+
29
@app.command()
def scan(
    media_dir: Annotated[Path, typer.Argument(help="Root directory containing media files")],
    min_age_mins: Annotated[int, typer.Option(help="Minimum file age in minutes")] = 15,
    force_device: Annotated[str | None, typer.Option(help="Force specific device")] = None,
    force_model: Annotated[str | None, typer.Option(help="Force specific model")] = None,
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable debug logging")] = False,
) -> None:
    """Perform a dry-run scan of the media directory and profile hardware."""
    configure_logging(verbose)

    # Build the dry-run configuration from the command-line options.
    config = AppConfig(
        media_dir=media_dir,
        dry_run=True,
        hardware=HardwareConfig(force_device=force_device, force_model=force_model),
        filters=FilterConfig(min_age_mins=min_age_mins),
    )

    # Step 1: profile the hardware. The resolved model config is not used here.
    setup_thread_safety()
    console.print("\n[bold cyan]1. Profiling Hardware...[/bold cyan]")
    hw_profile = HardwareProfiler.profile()
    ModelRouter.get_config(hw_profile, config.hardware)

    # Step 2: run the async discovery report.
    console.print(f"\n[bold cyan]2. Scanning Directory: {config.media_dir}[/bold cyan]")
    asyncio.run(_run_scan(config))
59
+
60
+
61
async def _run_scan(config: AppConfig) -> None:
    """Run discovery over ``config.media_dir`` and print a per-file report table.

    Each discovered file becomes one table row showing its path relative to the
    media root, its size in MB, and whether it would be processed or skipped
    (with the reason). A summary line with both counts follows the table.
    """
    report = Table(title="Media File Discovery Report", show_lines=True)
    column_specs = (
        ("File Path", {"style": "dim", "no_wrap": False}),
        ("Size (MB)", {"justify": "right", "style": "green"}),
        ("Action", {"style": "magenta"}),
        ("Reason", {"style": "yellow"}),
    )
    for header, options in column_specs:
        report.add_column(header, **options)

    async with StateTracker(config.db_path) as tracker:
        engine = DiscoveryEngine(config.media_dir, config.filters, tracker)
        process_count, skip_count = 0, 0

        async for media_file, action_str in engine.scan():
            size_cell = f"{media_file.size / (1024 * 1024):.1f}"
            rel_path = str(media_file.path.relative_to(config.media_dir))

            if action_str != "PROCESS":
                # Skip actions arrive as "SKIP: <reason>"; show only the reason.
                report.add_row(
                    rel_path, size_cell, "[dim]SKIP[/dim]", action_str.replace("SKIP: ", "")
                )
                skip_count += 1
                continue

            report.add_row(
                rel_path,
                size_cell,
                "[bold green]PROCESS[/bold green]",
                "Needs Subtitle",
            )
            process_count += 1

        console.print(report)
        console.print(
            f"\n[bold]Summary:[/bold] {process_count} files to process, {skip_count} files skipped."
        )
96
+
97
+
98
@app.command()
def run(
    media_dir: Annotated[Path, typer.Argument(help="Root directory containing media files")],
    min_age_mins: Annotated[int, typer.Option(help="Minimum file age in minutes")] = 15,
    translate: Annotated[
        bool, typer.Option("--translate", help="Enable AI translation to English")
    ] = False,
    watch: Annotated[bool, typer.Option("--watch", help="Run continuously in daemon mode")] = False,
    watch_interval: Annotated[
        int, typer.Option("--watch-interval", help="Minutes between scans in watch mode")
    ] = 60,
    force_device: Annotated[str | None, typer.Option(help="Force specific device")] = None,
    force_model: Annotated[str | None, typer.Option(help="Force specific model")] = None,
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable debug logging")] = False,
) -> None:
    """Run the live SRT generation pipeline."""
    configure_logging(verbose)

    # Build the live-run configuration from the command-line options.
    config = AppConfig(
        media_dir=media_dir,
        dry_run=False,
        translate=translate,
        watch=watch,
        watch_interval_mins=watch_interval,
        hardware=HardwareConfig(force_device=force_device, force_model=force_model),
        filters=FilterConfig(min_age_mins=min_age_mins),
    )

    setup_thread_safety()

    console.print("\n[bold cyan]1. Profiling Hardware & Initializing Models...[/bold cyan]")
    profile = HardwareProfiler.profile()
    model_cfg = ModelRouter.get_config(profile, config.hardware)

    # The STT worker is created and initialized before the event loop starts.
    from aisrt.stt import STTWorker

    stt_worker = STTWorker()
    stt_worker.initialize(model_cfg)

    console.print(f"\n[bold cyan]2. Starting Async Pipeline on {config.media_dir}[/bold cyan]")

    async def _execute_pipeline() -> None:
        """Run the pipeline once, or repeatedly with a sleep in between in watch mode."""
        from aisrt.pipeline import Pipeline

        # One state tracker is shared by every scan cycle.
        async with StateTracker(config.db_path) as tracker:
            while True:
                await Pipeline(
                    DiscoveryEngine(config.media_dir, config.filters, tracker),
                    cpu_cores=profile.physical_cores,
                    translate=config.translate,
                ).run()

                if not config.watch:
                    return

                console.print(
                    f"\n[bold yellow]Sleeping for {config.watch_interval_mins} "
                    f"minutes...[/bold yellow]"
                )
                await asyncio.sleep(config.watch_interval_mins * 60)
                console.print(
                    f"\n[bold cyan]Waking up and scanning Directory: {config.media_dir}[/bold cyan]"
                )

    try:
        asyncio.run(_execute_pipeline())
        console.print("\n[bold green]Pipeline finished successfully.[/bold green]")
    except KeyboardInterrupt:
        console.print("\n[bold red]Pipeline interrupted by user.[/bold red]")
    finally:
        # Close the worker whether the pipeline finished, failed or was interrupted.
        stt_worker.close()
174
+
175
+
176
# Invoke the Typer app when this module is executed as a script.
if __name__ == "__main__":
    app()
aisrt/config.py ADDED
@@ -0,0 +1,80 @@
1
+ """Configuration schemas for the SRT Generator."""
2
+
3
+ from pathlib import Path
4
+
5
+ from pydantic import BaseModel, Field
6
+ from pydantic_settings import BaseSettings, SettingsConfigDict
7
+
8
+
9
class HardwareConfig(BaseModel):
    """Configuration for hardware acceleration and inference limits.

    Every field is an optional override; ``None`` (the default) leaves the
    choice to auto-detection, as stated in each field description.
    """

    # Compute device override.
    force_device: str | None = Field(
        default=None,
        description="Force a specific compute device (e.g., 'cuda'). Auto-detect if None.",
    )
    # Numeric precision override.
    force_compute_type: str | None = Field(
        default=None,
        description="Force compute type (e.g., 'float16', 'int8'). Auto-detect if None.",
    )
    # Whisper model name override.
    force_model: str | None = Field(
        default=None,
        description="Force a Whisper model (e.g., 'large-v3-turbo'). Auto-detect if None.",
    )
24
+
25
+
26
class FilterConfig(BaseModel):
    """Configuration for filtering media files during discovery.

    The list-valued fields use ``default_factory`` so each instance gets its
    own list rather than sharing one mutable default.
    """

    # Age threshold in minutes; see the field description for the rationale.
    min_age_mins: int = Field(
        default=15,
        description="Minimum file age in minutes to avoid processing active downloads.",
    )
    # Extensions are listed lowercase and with a leading dot.
    extensions: list[str] = Field(
        default_factory=lambda: [".mkv", ".mp4", ".avi", ".webm"],
        description="List of valid media file extensions to process.",
    )
    # Glob patterns; matching entries are ignored.
    exclude_patterns: list[str] = Field(
        default_factory=lambda: ["*sample*", "*extras*", "*featurettes*"],
        description="Glob patterns for directories or files to ignore.",
    )
    # Language codes; both three-letter and two-letter forms are listed by default.
    target_languages: list[str] = Field(
        default_factory=lambda: ["eng", "en"],
        description="Target subtitle languages to generate/check for.",
    )
45
+
46
+
47
class AppConfig(BaseSettings):
    """Main application configuration.

    Because this is a settings model with ``env_prefix="AISRT_"`` and
    ``env_nested_delimiter="__"``, values may also come from environment
    variables (for example ``AISRT_WATCH_INTERVAL_MINS`` or
    ``AISRT_FILTERS__MIN_AGE_MINS``). Unknown settings are ignored.
    """

    media_dir: Path = Field(
        description="The root directory containing media to scan.",
    )
    db_path: Path = Field(
        default_factory=lambda: Path.home() / ".config" / "aisrt" / "state.db",
        description="Path to the local SQLite state database.",
    )
    dry_run: bool = Field(
        default=False,
        description="If True, only scan and report what would be done (no execution).",
    )
    translate: bool = Field(
        default=False,
        description="If True, translates foreign audio to English using Whisper's translate task.",
    )
    watch: bool = Field(
        default=False,
        description="If True, runs the pipeline continuously in daemon mode.",
    )
    # At least one minute: a zero or negative interval would turn watch mode
    # into a back-to-back rescan loop with no pause between scans.
    watch_interval_mins: int = Field(
        default=60,
        ge=1,
        description="Interval in minutes between scans when running in watch mode.",
    )
    hardware: HardwareConfig = Field(default_factory=HardwareConfig)
    filters: FilterConfig = Field(default_factory=FilterConfig)

    model_config = SettingsConfigDict(
        env_prefix="AISRT_",
        env_nested_delimiter="__",
        extra="ignore",
    )
aisrt/discovery.py ADDED
@@ -0,0 +1,195 @@
1
+ """NAS-Safe File Discovery Engine."""
2
+
3
+ import asyncio
4
+ import json
5
+ import os
6
+ import time
7
+ from collections.abc import AsyncGenerator
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+
11
+ from loguru import logger
12
+
13
+ from aisrt.config import FilterConfig
14
+ from aisrt.state import StateTracker
15
+
16
+
17
@dataclass
class MediaFile:
    """Represents a discovered media file pending processing."""

    # Location of the file on disk.
    path: Path
    # File size in bytes (from st_size).
    size: int
    # Last-modification time as a POSIX timestamp (from st_mtime).
    mtime: float
    # Inode number (from st_ino).
    inode: int
25
+
26
+
27
class DiscoveryEngine:
    """Safely crawls a media directory and filters files based on state and config."""

    # Text-based subtitle codecs. Only these count as an existing embedded
    # subtitle; image-based tracks (e.g. hdmv_pgs_subtitle) are ignored.
    _TEXT_SUBTITLE_CODECS = frozenset({"subrip", "ass", "mov_text", "webvtt"})

    def __init__(self, media_dir: Path, config: FilterConfig, state_tracker: StateTracker) -> None:
        """Initialize the discovery engine.

        Args:
            media_dir: The root directory to scan.
            config: The filtering configuration rules.
            state_tracker: The active SQLite state tracker.
        """
        self.media_dir = media_dir
        self.config = config
        self.state_tracker = state_tracker

    async def scan(self) -> AsyncGenerator[tuple[MediaFile, str], None]:
        """Scan the media directory and yield files with their action status.

        The directory walk runs in the default executor so it does not block
        the event loop. Files that cannot be stat-ed are logged and skipped.

        Yields:
            A tuple of (MediaFile, action_string).
            action_string is 'PROCESS' if the file needs STT, or a 'SKIP: <reason>' string.
        """
        loop = asyncio.get_running_loop()

        logger.info(f"Starting directory scan at {self.media_dir}...")
        all_files = await loop.run_in_executor(None, self._walk, self.media_dir)
        logger.info(f"Found {len(all_files)} potential media files. Analyzing...")

        # One reference time for the whole pass keeps the age check consistent.
        current_time = time.time()

        for file_path in all_files:
            try:
                stat = file_path.stat()
            except OSError as e:
                logger.warning(f"Could not stat {file_path}: {e}")
                continue

            media_file = MediaFile(
                path=file_path,
                size=stat.st_size,
                mtime=stat.st_mtime,
                inode=stat.st_ino,
            )
            action_str = await self._analyze_file(media_file, current_time)
            yield media_file, action_str

    def _walk(self, directory: Path) -> list[Path]:
        """Recursively collect media files below ``directory``.

        Entries matching an exclude pattern are skipped (directories are not
        descended into), symlinks are not followed, and only files whose
        lowercased suffix is in ``config.extensions`` are kept. A directory
        that cannot be read is logged and contributes whatever was collected
        before the error; it never aborts the whole scan.
        """
        found: list[Path] = []
        try:
            # The context manager closes the directory handle deterministically.
            with os.scandir(directory) as entries:
                for entry in entries:
                    path = Path(entry.path)

                    if any(path.match(p) for p in self.config.exclude_patterns):
                        continue

                    if entry.is_dir(follow_symlinks=False):
                        found.extend(self._walk(path))
                    elif entry.is_file(follow_symlinks=False):
                        if path.suffix.lower() in self.config.extensions:
                            found.append(path)
        except PermissionError:
            logger.warning(f"Permission denied: {directory}")
        except OSError as e:
            # E.g. the directory vanished mid-scan or the share returned an I/O error.
            logger.warning(f"Could not scan {directory}: {e}")
        return found

    async def _analyze_file(self, media_file: MediaFile, current_time: float) -> str:
        """Determine if a single file should be processed or skipped.

        Checks run from cheapest to most expensive: file age, sibling subtitle,
        cached database state, hardlink lookup, and finally an ffprobe call.
        Cached database states are only trusted while the recorded size still
        matches the file on disk.

        Returns:
            'PROCESS', or a 'SKIP: <reason>' string.
        """
        min_age_seconds = self.config.min_age_mins * 60

        if (current_time - media_file.mtime) < min_age_seconds:
            return f"SKIP: Modified recently (< {self.config.min_age_mins}m)"

        if self._has_sibling_subtitle(media_file.path):
            return "SKIP: External sibling subtitle exists"

        db_state = await self.state_tracker.get_state(str(media_file.path))
        # A size change means the file was replaced, so the cached verdict is stale.
        state_is_current = bool(db_state) and db_state.size == media_file.size

        if state_is_current and db_state.status == "COMPLETED":
            return "SKIP: Already processed (Database)"

        is_hardlink = await self.state_tracker.check_hardlink_processed(
            media_file.inode, media_file.size
        )
        if is_hardlink:
            return "SKIP: Hardlink to already processed file"

        if state_is_current and db_state.status == "EMBEDDED_EXISTS":
            return "SKIP: Embedded English subtitle exists (Database)"

        has_embedded = await self._check_embedded_subtitles(media_file.path)
        if has_embedded:
            # Cache the result so later scans can skip the ffprobe call.
            await self.state_tracker.update_state(
                file_path=str(media_file.path),
                inode=media_file.inode,
                mtime=media_file.mtime,
                size=media_file.size,
                status="EMBEDDED_EXISTS",
            )
            return "SKIP: Embedded English subtitle detected"

        return "PROCESS"

    def _has_sibling_subtitle(self, video_path: Path) -> bool:
        """Check if an external SRT file exists next to the video.

        Looks for ``<stem>.srt`` and ``<stem>.<lang>.srt`` for every configured
        target language, in the video's own directory.
        """
        suffixes = [".srt", *(f".{lang}.srt" for lang in self.config.target_languages)]
        return any(
            (video_path.parent / f"{video_path.stem}{suffix}").exists() for suffix in suffixes
        )

    async def _check_embedded_subtitles(self, video_path: Path) -> bool:
        """Run ffprobe to check if an embedded target-language subtitle exists.

        Returns:
            True only when a text-based subtitle stream tagged with one of the
            target languages is found. Probe failures and unparsable output are
            logged and reported as False.

        Raises:
            FileNotFoundError: If the ffprobe executable cannot be found.
        """
        cmd = [
            "ffprobe",
            "-v",
            "error",
            "-select_streams",
            "s",
            "-show_entries",
            "stream=index,codec_name:stream_tags=language",
            "-of",
            "json",
            str(video_path),
        ]

        try:
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await process.communicate()

            if process.returncode != 0:
                logger.warning(f"ffprobe failed on {video_path}")
                return False

            data = json.loads(stdout.decode("utf-8"))

            for stream in data.get("streams", []):
                codec = stream.get("codec_name", "").lower()
                lang = stream.get("tags", {}).get("language", "").lower()

                if lang not in self.config.target_languages:
                    continue

                # Only skip if we find a text-based subtitle track in the target language.
                # Image-based subs (hdmv_pgs_subtitle) force transcodes on many players.
                if codec in self._TEXT_SUBTITLE_CODECS:
                    return True
                logger.debug(
                    f"Ignoring embedded {codec} subtitle in {video_path} (forces transcode)"
                )

        except FileNotFoundError:
            # A missing ffprobe is a setup problem, not a per-file one: re-raise.
            logger.error("ffprobe not found. Please ensure FFmpeg is installed and in PATH.")
            raise
        except json.JSONDecodeError:
            logger.warning(f"Failed to parse ffprobe JSON for {video_path}")
        except Exception as e:
            logger.warning(f"Error checking embedded streams for {video_path}: {e}")

        return False