mkv-episode-matcher 0.3.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mkv_episode_matcher/__init__.py +8 -0
  2. mkv_episode_matcher/__main__.py +2 -177
  3. mkv_episode_matcher/asr_models.py +506 -0
  4. mkv_episode_matcher/cli.py +558 -0
  5. mkv_episode_matcher/core/config_manager.py +100 -0
  6. mkv_episode_matcher/core/engine.py +577 -0
  7. mkv_episode_matcher/core/matcher.py +214 -0
  8. mkv_episode_matcher/core/models.py +91 -0
  9. mkv_episode_matcher/core/providers/asr.py +85 -0
  10. mkv_episode_matcher/core/providers/subtitles.py +341 -0
  11. mkv_episode_matcher/core/utils.py +148 -0
  12. mkv_episode_matcher/episode_identification.py +550 -118
  13. mkv_episode_matcher/subtitle_utils.py +82 -0
  14. mkv_episode_matcher/tmdb_client.py +56 -14
  15. mkv_episode_matcher/ui/flet_app.py +708 -0
  16. mkv_episode_matcher/utils.py +262 -139
  17. mkv_episode_matcher-1.0.0.dist-info/METADATA +242 -0
  18. mkv_episode_matcher-1.0.0.dist-info/RECORD +23 -0
  19. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/WHEEL +1 -1
  20. mkv_episode_matcher-1.0.0.dist-info/licenses/LICENSE +21 -0
  21. mkv_episode_matcher/config.py +0 -82
  22. mkv_episode_matcher/episode_matcher.py +0 -100
  23. mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
  24. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
  25. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
  26. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
  27. mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
  28. mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  29. mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
  30. mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
  31. mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
  32. mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
  33. mkv_episode_matcher/mkv_to_srt.py +0 -302
  34. mkv_episode_matcher/speech_to_text.py +0 -90
  35. mkv_episode_matcher-0.3.3.dist-info/METADATA +0 -125
  36. mkv_episode_matcher-0.3.3.dist-info/RECORD +0 -25
  37. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/entry_points.txt +0 -0
  38. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/top_level.txt +0 -0
--- a/mkv_episode_matcher/episode_identification.py
+++ b/mkv_episode_matcher/episode_identification.py
@@ -1,150 +1,582 @@
-import json
-import os
+import re
 import subprocess
 import tempfile
+from functools import lru_cache
 from pathlib import Path
+
+import chardet
+import numpy as np
 import torch
-from rapidfuzz import fuzz
 from loguru import logger
-import whisper
-import numpy as np
-import re
+from rich import print
+from rich.console import Console
+
+from mkv_episode_matcher.asr_models import get_cached_model
+from mkv_episode_matcher.utils import extract_season_episode
+
+console = Console()
+
+
+class SubtitleCache:
+    """Cache for storing parsed subtitle data to avoid repeated loading and parsing."""
+
+    def __init__(self):
+        self.subtitles = {}  # {file_path: parsed_content}
+        self.chunk_cache = {}  # {(file_path, chunk_idx): text}
+
+    def get_subtitle_content(self, srt_file):
+        """Get the full content of a subtitle file, loading it only once."""
+        srt_file = str(srt_file)
+        if srt_file not in self.subtitles:
+            reader = SubtitleReader()
+            self.subtitles[srt_file] = reader.read_srt_file(srt_file)
+        return self.subtitles[srt_file]
+
+    def get_chunk(self, srt_file, chunk_idx, chunk_start, chunk_end):
+        """Get a specific time chunk from a subtitle file, with caching."""
+        srt_file = str(srt_file)
+        cache_key = (srt_file, chunk_idx)
+
+        if cache_key not in self.chunk_cache:
+            content = self.get_subtitle_content(srt_file)
+            reader = SubtitleReader()
+            text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
+            self.chunk_cache[cache_key] = " ".join(text_lines)
+
+        return self.chunk_cache[cache_key]
+
+
 class EpisodeMatcher:
-    def __init__(self, cache_dir, show_name, min_confidence=0.6):
+    def __init__(self, cache_dir, show_name, min_confidence=0.6, device=None):
         self.cache_dir = Path(cache_dir)
         self.min_confidence = min_confidence
         self.show_name = show_name
-        self.chunk_duration = 300  # 5 minutes
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.chunk_duration = 30
+        self.skip_initial_duration = 300
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
-
+        # Initialize subtitle cache
+        self.subtitle_cache = SubtitleCache()
+        # Cache for extracted audio chunks
+        self.audio_chunks = {}
+        # Store reference files to avoid repeated glob operations
+        self.reference_files_cache = {}
+
     def clean_text(self, text):
         text = text.lower().strip()
-        text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
-        text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
-        return ' '.join(text.split())
-
-    def chunk_score(self, whisper_chunk, ref_chunk):
-        whisper_clean = self.clean_text(whisper_chunk)
-        ref_clean = self.clean_text(ref_chunk)
-        return (fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7 +
-                fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3) / 100.0
+        text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
+        text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
+        return " ".join(text.split())
 
     def extract_audio_chunk(self, mkv_file, start_time):
-        """Extract a chunk of audio from MKV file."""
+        """Extract a chunk of audio from MKV file with caching."""
+        cache_key = (str(mkv_file), start_time)
+
+        if cache_key in self.audio_chunks:
+            return self.audio_chunks[cache_key]
+
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
             cmd = [
-                'ffmpeg',
-                '-ss', str(start_time),
-                '-t', str(self.chunk_duration),
-                '-i', mkv_file,
-                '-vn',
-                '-acodec', 'pcm_s16le',
-                '-ar', '16000',
-                '-ac', '1',
-                str(chunk_path)
+                "ffmpeg",
+                "-ss",
+                str(start_time),
+                "-t",
+                str(self.chunk_duration),
+                "-i",
+                str(mkv_file),
+                "-vn",  # Disable video
+                "-sn",  # Disable subtitles
+                "-dn",  # Disable data streams
+                "-acodec",
+                "pcm_s16le",
+                "-ar",
+                "16000",
+                "-ac",
+                "1",
+                "-y",  # Overwrite output files without asking
+                str(chunk_path),
             ]
-            subprocess.run(cmd, capture_output=True)
-        return str(chunk_path)
+
+            try:
+                logger.debug(
+                    f"Extracting audio chunk from {mkv_file} at {start_time}s using FFmpeg"
+                )
+                result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+
+                if result.returncode != 0:
+                    error_msg = f"FFmpeg failed with return code {result.returncode}"
+                    if result.stderr:
+                        error_msg += f". Error: {result.stderr.strip()}"
+                    logger.error(error_msg)
+                    logger.debug(f"FFmpeg command: {' '.join(cmd)}")
+                    raise RuntimeError(error_msg)
+
+                # Check if the output file was actually created and has content
+                if not chunk_path.exists():
+                    error_msg = f"FFmpeg completed but output file was not created: {chunk_path}"
+                    logger.error(error_msg)
+                    raise RuntimeError(error_msg)
+
+                # Check if the file has meaningful content (at least 1KB)
+                if chunk_path.stat().st_size < 1024:
+                    error_msg = f"Generated audio chunk is too small ({chunk_path.stat().st_size} bytes), likely corrupted"
+                    logger.warning(error_msg)
+                    # Don't raise an error for small files, but log the warning
+
+                logger.debug(
+                    f"Successfully extracted {chunk_path.stat().st_size} byte audio chunk"
+                )
+
+            except subprocess.TimeoutExpired as e:
+                error_msg = f"FFmpeg timed out after 30 seconds while extracting audio from {mkv_file}"
+                logger.error(error_msg)
+                raise RuntimeError(error_msg) from e
+
+            except Exception as e:
+                error_msg = f"Failed to extract audio chunk from {mkv_file} at {start_time}s: {str(e)}"
+                logger.error(error_msg)
+                # Clean up partial file if it exists
+                if chunk_path.exists():
+                    try:
+                        chunk_path.unlink()
+                    except Exception as cleanup_error:
+                        logger.warning(
+                            f"Failed to clean up partial file {chunk_path}: {cleanup_error}"
+                        )
+                raise RuntimeError(error_msg) from e
+
+        chunk_path_str = str(chunk_path)
+        self.audio_chunks[cache_key] = chunk_path_str
+        return chunk_path_str
 
     def load_reference_chunk(self, srt_file, chunk_idx):
-        """Load reference subtitles for a specific time chunk."""
-        chunk_start = chunk_idx * self.chunk_duration
-        chunk_end = chunk_start + self.chunk_duration
-        text_lines = []
-
-        with open(srt_file, 'r', encoding='utf-8') as f:
-            content = f.read().strip()
-
-        for block in content.split('\n\n'):
-            lines = block.split('\n')
-            if len(lines) < 3 or '-->' not in lines[1]:  # Skip malformed blocks
-                continue
-
+        """
+        Load reference subtitles for a specific time chunk with caching.
+
+        Args:
+            srt_file (str or Path): Path to the SRT file
+            chunk_idx (int): Index of the chunk to load
+
+        Returns:
+            str: Combined text from the subtitle chunk
+        """
+        try:
+            # Apply the same offset as in _try_match_with_model
+            chunk_start = self.skip_initial_duration + (chunk_idx * self.chunk_duration)
+            chunk_end = chunk_start + self.chunk_duration
+
+            return self.subtitle_cache.get_chunk(
+                srt_file, chunk_idx, chunk_start, chunk_end
+            )
+
+        except Exception as e:
+            logger.error(f"Error loading reference chunk from {srt_file}: {e}")
+            return ""
+
+    def get_reference_files(self, season_number):
+        """Get reference subtitle files with caching."""
+        cache_key = (self.show_name, season_number)
+        logger.debug(f"Reference cache key: {cache_key}")
+
+        if cache_key in self.reference_files_cache:
+            logger.debug("Returning cached reference files")
+            return self.reference_files_cache[cache_key]
+
+        reference_dir = self.cache_dir / "data" / self.show_name
+        patterns = [
+            f"S{season_number:02d}E",
+            f"S{season_number}E",
+            f"{season_number:02d}x",
+            f"{season_number}x",
+        ]
+
+        reference_files = []
+        for pattern in patterns:
+            # Use case-insensitive file extension matching by checking both .srt and .SRT
+            srt_files = list(reference_dir.glob("*.srt")) + list(
+                reference_dir.glob("*.SRT")
+            )
+            files = [
+                f
+                for f in srt_files
+                if re.search(f"{pattern}\\d+", f.name, re.IGNORECASE)
+            ]
+            reference_files.extend(files)
+
+        # Remove duplicates while preserving order
+        reference_files = list(dict.fromkeys(reference_files))
+        logger.debug(
+            f"Found {len(reference_files)} reference files for season {season_number}"
+        )
+        self.reference_files_cache[cache_key] = reference_files
+        return reference_files
+
+    def _try_match_with_model(
+        self, video_file, model_config, max_duration, reference_files
+    ):
+        """
+        Attempt to match using the specified model, checking multiple chunks
+        starting from skip_initial_duration and continuing up to max_duration.
+
+        Args:
+            video_file: Path to the video file
+            model_config: Dictionary with ASR model configuration or string for backward compatibility
+            max_duration: Maximum duration in seconds to check
+            reference_files: List of reference subtitle files
+        """
+        # Handle backward compatibility for string model names
+        if isinstance(model_config, str):
+            # Convert old Whisper model names to new format
+            model_config = {
+                "type": "whisper",
+                "name": model_config,
+                "device": self.device,
+            }
+        elif isinstance(model_config, dict):
+            # Ensure device is set if not specified
+            if "device" not in model_config:
+                model_config = model_config.copy()
+                model_config["device"] = self.device
+
+        # Use cached model
+        model = get_cached_model(model_config)
+
+        # Calculate number of chunks to check
+        num_chunks = min(
+            max_duration // self.chunk_duration, 10
+        )  # Limit to 10 chunks for initial check
+
+        # Pre-load all reference chunks for the chunks we'll check
+        for chunk_idx in range(num_chunks):
+            for ref_file in reference_files:
+                self.load_reference_chunk(ref_file, chunk_idx)
+
+        for chunk_idx in range(num_chunks):
+            # Start at self.skip_initial_duration and check subsequent chunks
+            start_time = self.skip_initial_duration + (chunk_idx * self.chunk_duration)
+            model_name = (
+                model_config.get("name", "unknown")
+                if isinstance(model_config, dict)
+                else model_config
+            )
+            logger.debug(f"Trying {model_name} model at {start_time} seconds")
+
             try:
-                timestamp = lines[1]
-                text = ' '.join(lines[2:])
-
-                end_time = timestamp.split(' --> ')[1].strip()
-                hours, minutes, seconds = map(float, end_time.replace(',','.').split(':'))
-                total_seconds = hours * 3600 + minutes * 60 + seconds
-
-                if chunk_start <= total_seconds <= chunk_end:
-                    text_lines.append(text)
-
-            except (IndexError, ValueError):
+                audio_path = self.extract_audio_chunk(video_file, start_time)
+                logger.debug(f"Extracted audio chunk: {audio_path}")
+            except RuntimeError as e:
+                logger.warning(f"Failed to extract audio chunk at {start_time}s: {e}")
+                continue  # Skip this chunk and try the next one
+            except Exception as e:
+                logger.error(
+                    f"Unexpected error extracting audio chunk at {start_time}s: {e}"
+                )
+                continue  # Skip this chunk and try the next one
+
+            try:
+                result = model.transcribe(audio_path)
+            except Exception as e:
+                logger.error(
+                    f"ASR transcription failed for chunk at {start_time}s: {e}"
+                )
+                continue  # Skip this chunk and try the next one
+
+            chunk_text = result["text"]
+            logger.debug(
+                f"Transcription result: {chunk_text} ({len(chunk_text)} characters)"
+            )
+            if len(chunk_text) < 10:
+                logger.debug(
+                    f"Transcription result too short: {chunk_text} ({len(chunk_text)} characters)"
+                )
                 continue
-
-        return ' '.join(text_lines)
+            best_confidence = 0
+            best_match = None
+
+            # Compare with reference chunks
+            for ref_file in reference_files:
+                ref_text = self.load_reference_chunk(ref_file, chunk_idx)
+
+                # Use model's internal scoring logic
+                confidence = model.calculate_match_score(chunk_text, ref_text)
+
+                if confidence > best_confidence:
+                    logger.debug(f"New best confidence: {confidence} for {ref_file}")
+                    best_confidence = confidence
+                    best_match = Path(ref_file)
+
+                if confidence > self.min_confidence:
+                    print(
+                        f"Matched with {best_match} (confidence: {best_confidence:.2f})"
+                    )
+                    try:
+                        season, episode = extract_season_episode(best_match.stem)
+                    except Exception as e:
+                        print(f"Error extracting season/episode: {e}")
+                        continue
+                    print(
+                        f"Season: {season}, Episode: {episode} (confidence: {best_confidence:.2f})"
+                    )
+                    if season and episode:
+                        return {
+                            "season": season,
+                            "episode": episode,
+                            "confidence": best_confidence,
+                            "reference_file": str(best_match),
+                            "matched_at": start_time,
+                        }
+
+            logger.info(
+                f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})"
+            )
+        return None
 
     def identify_episode(self, video_file, temp_dir, season_number):
+        """Progressive episode identification with faster initial attempt."""
         try:
-            # Get video duration
-            duration = float(subprocess.check_output([
-                'ffprobe', '-v', 'error',
-                '-show_entries', 'format=duration',
-                '-of', 'default=noprint_wrappers=1:nokey=1',
-                video_file
-            ]).decode())
-
-            total_chunks = int(np.ceil(duration / self.chunk_duration))
-
-            # Load Whisper model
-            model = whisper.load_model("base", device=self.device)
-
-            # Get season-specific reference files
-            reference_dir = self.cache_dir / "data" / self.show_name
-            season_pattern = f"S{season_number:02d}E"
-            reference_files = [
-                f for f in reference_dir.glob("*.srt")
-                if season_pattern in f.name
-            ]
-
+            # Get reference files first with caching
+            reference_files = self.get_reference_files(season_number)
+
             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
-
-            # Process chunks until match found
-            for chunk_idx in range(min(3, total_chunks)):  # Only try first 3 chunks
-                start_time = chunk_idx * self.chunk_duration
-                audio_path = self.extract_audio_chunk(video_file, start_time)
-
-                # Transcribe chunk
-                result = model.transcribe(
-                    audio_path,
-                    task="transcribe",
-                    language="en"
+
+            # Cache video duration
+            try:
+                duration = get_video_duration(video_file)
+            except Exception as e:
+                logger.error(f"Failed to get video duration for {video_file}: {e}")
+                return None
+
+            # Try with Parakeet CTC model
+            logger.info("Attempting match with Parakeet CTC model...")
+            try:
+                match = self._try_match_with_model(
+                    video_file,
+                    {
+                        "type": "parakeet",
+                        "name": "nvidia/parakeet-ctc-0.6b",
+                        "device": self.device,
+                    },
+                    min(duration, 600),  # Allow up to 10 minutes
+                    reference_files,
                 )
-
-                chunk_text = result["text"]
-                best_confidence = 0
-                best_match = None
-
-                # Compare with reference chunks
-                for ref_file in reference_files:
-                    ref_text = self.load_reference_chunk(ref_file, chunk_idx)
-                    confidence = self.chunk_score(chunk_text, ref_text)
-
-                    if confidence > best_confidence:
-                        best_confidence = confidence
-                        best_match = ref_file
-
-                    if confidence > self.min_confidence:
-                        season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
-                        if season_ep:
-                            season, episode = map(int, season_ep.groups())
-                            return {
-                                'season': season,
-                                'episode': episode,
-                                'confidence': best_confidence,
-                                'reference_file': str(best_match),
-                            }
-
+                if match:
+                    logger.info(
+                        f"Successfully matched with Parakeet CTC model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                    )
+                    return match
+            except Exception as e:
+                logger.warning(f"Parakeet CTC model failed: {e}")
+
+            logger.info(
+                "Speech recognition match failed - no models were able to process this file"
+            )
+            return None
+
+        except Exception as e:
+            logger.error(
+                f"Unexpected error during episode identification for {video_file}: {e}"
+            )
             return None
-
+
         finally:
-            # Cleanup temp files
-            for file in self.temp_dir.glob("chunk_*.wav"):
-                file.unlink()
+            # Cleanup temp files - keep this limited to only files we know we created
+            for chunk_info in self.audio_chunks.values():
+                try:
+                    Path(chunk_info).unlink(missing_ok=True)
+                except Exception as e:
+                    logger.warning(f"Failed to delete temp file {chunk_info}: {e}")
+
+
+@lru_cache(maxsize=100)
+def get_video_duration(video_file):
+    """Get video duration with caching and error handling."""
+    try:
+        logger.debug(f"Getting duration for video file: {video_file}")
+        result = subprocess.run(
+            [
+                "ffprobe",
+                "-v",
+                "error",
+                "-show_entries",
+                "format=duration",
+                "-of",
+                "default=noprint_wrappers=1:nokey=1",
+                str(video_file),
+            ],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+
+        if result.returncode != 0:
+            error_msg = f"ffprobe failed with return code {result.returncode}"
+            if result.stderr:
+                error_msg += f". Error: {result.stderr.strip()}"
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+
+        duration_str = result.stdout.strip()
+        if not duration_str:
+            raise RuntimeError("ffprobe returned empty duration")
+
+        duration = float(duration_str)
+        if duration <= 0:
+            raise RuntimeError(f"Invalid duration: {duration}")
+
+        result_duration = int(np.ceil(duration))
+        logger.debug(f"Video duration: {result_duration} seconds")
+        return result_duration
+
+    except subprocess.TimeoutExpired as e:
+        error_msg = f"ffprobe timed out while getting duration for {video_file}"
+        logger.error(error_msg)
+        raise RuntimeError(error_msg) from e
+    except ValueError as e:
+        error_msg = (
+            f"Failed to parse duration from ffprobe output for {video_file}: {e}"
+        )
+        logger.error(error_msg)
+        raise RuntimeError(error_msg) from e
+    except Exception as e:
+        error_msg = f"Unexpected error getting video duration for {video_file}: {e}"
+        logger.error(error_msg)
+        raise RuntimeError(error_msg) from e
+
+
+def detect_file_encoding(file_path):
+    """
+    Detect the encoding of a file using chardet.
+
+    Args:
+        file_path (str or Path): Path to the file
+
+    Returns:
+        str: Detected encoding, defaults to 'utf-8' if detection fails
+    """
+    try:
+        with open(file_path, "rb") as f:
+            raw_data = f.read(
+                min(1024 * 1024, Path(file_path).stat().st_size)
+            )  # Read up to 1MB
+        result = chardet.detect(raw_data)
+        encoding = result["encoding"]
+        confidence = result["confidence"]
+
+        logger.debug(
+            f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}"
+        )
+        return encoding if encoding else "utf-8"
+    except Exception as e:
+        logger.warning(f"Error detecting encoding for {file_path}: {e}")
+        return "utf-8"
+
+
+@lru_cache(maxsize=100)
+def read_file_with_fallback(file_path, encodings=None):
+    """
+    Read a file trying multiple encodings in order of preference.
+
+    Args:
+        file_path (str or Path): Path to the file
+        encodings (list): List of encodings to try, defaults to common subtitle encodings
+
+    Returns:
+        str: File contents
+
+    Raises:
+        ValueError: If file cannot be read with any encoding
+    """
+    if encodings is None:
+        # First try detected encoding, then fall back to common subtitle encodings
+        detected = detect_file_encoding(file_path)
+        encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]
+
+    file_path = Path(file_path)
+    errors = []
+
+    for encoding in encodings:
+        try:
+            with open(file_path, encoding=encoding) as f:
+                content = f.read()
+            logger.debug(f"Successfully read {file_path} using {encoding} encoding")
+            return content
+        except UnicodeDecodeError as e:
+            errors.append(f"{encoding}: {str(e)}")
+            continue
+
+    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+        errors
+    )
+    logger.error(error_msg)
+    raise ValueError(error_msg)
+
+
+class SubtitleReader:
+    """Helper class for reading and parsing subtitle files."""
+
+    @staticmethod
+    def parse_timestamp(timestamp):
+        """Parse SRT timestamp into seconds."""
+        hours, minutes, seconds = timestamp.replace(",", ".").split(":")
+        return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
+
+    @staticmethod
+    def read_srt_file(file_path):
+        """
+        Read an SRT file and return its contents with robust encoding handling.
+
+        Args:
+            file_path (str or Path): Path to the SRT file
+
+        Returns:
+            str: Contents of the SRT file
+        """
+        return read_file_with_fallback(file_path)
+
+    @staticmethod
+    def extract_subtitle_chunk(content, start_time, end_time):
+        """
+        Extract subtitle text for a specific time window.
+
+        Args:
+            content (str): Full SRT file content
+            start_time (float): Chunk start time in seconds
+            end_time (float): Chunk end time in seconds
+
+        Returns:
+            list: List of subtitle texts within the time window
+        """
+        text_lines = []
+
+        for block in content.strip().split("\n\n"):
+            lines = block.split("\n")
+            if len(lines) < 3 or "-->" not in lines[1]:
+                continue
+
+            try:
+                timestamp = lines[1]
+                time_parts = timestamp.split(" --> ")
+                start_stamp = time_parts[0].strip()
+                end_stamp = time_parts[1].strip()
+
+                subtitle_start = SubtitleReader.parse_timestamp(start_stamp)
+                subtitle_end = SubtitleReader.parse_timestamp(end_stamp)
+
+                # Check if this subtitle overlaps with our chunk
+                if subtitle_end >= start_time and subtitle_start <= end_time:
+                    text = " ".join(lines[2:])
+                    text_lines.append(text)
+
+            except (IndexError, ValueError) as e:
+                logger.warning(f"Error parsing subtitle block: {e}")
+                continue
+
+        return text_lines
+
+
+# Note: Model caching is now handled by the ASR abstraction layer in asr_models.py
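For orientation, here is a minimal sketch of how the reworked matcher in 1.0.0 might be driven, based only on the signatures visible in the hunk above; the cache directory, show name, and video path are hypothetical placeholders.

from mkv_episode_matcher.episode_identification import EpisodeMatcher

# Hypothetical paths and show name, for illustration only.
matcher = EpisodeMatcher(cache_dir="/tmp/mkv-cache", show_name="Example Show")

# identify_episode() returns a dict with season, episode, confidence,
# reference_file, and matched_at on success, or None when no reference
# chunk clears min_confidence. The temp_dir argument is passed through
# unchanged here.
result = matcher.identify_episode("/videos/unknown_episode.mkv", None, season_number=1)
if result:
    print(
        f"S{result['season']:02d}E{result['episode']:02d} "
        f"(confidence {result['confidence']:.2f}, matched at {result['matched_at']}s)"
    )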