mkv-episode-matcher 0.3.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- mkv_episode_matcher/__init__.py +8 -0
- mkv_episode_matcher/__main__.py +2 -177
- mkv_episode_matcher/asr_models.py +506 -0
- mkv_episode_matcher/cli.py +558 -0
- mkv_episode_matcher/core/config_manager.py +100 -0
- mkv_episode_matcher/core/engine.py +577 -0
- mkv_episode_matcher/core/matcher.py +214 -0
- mkv_episode_matcher/core/models.py +91 -0
- mkv_episode_matcher/core/providers/asr.py +85 -0
- mkv_episode_matcher/core/providers/subtitles.py +341 -0
- mkv_episode_matcher/core/utils.py +148 -0
- mkv_episode_matcher/episode_identification.py +550 -118
- mkv_episode_matcher/subtitle_utils.py +82 -0
- mkv_episode_matcher/tmdb_client.py +56 -14
- mkv_episode_matcher/ui/flet_app.py +708 -0
- mkv_episode_matcher/utils.py +262 -139
- mkv_episode_matcher-1.0.0.dist-info/METADATA +242 -0
- mkv_episode_matcher-1.0.0.dist-info/RECORD +23 -0
- {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/WHEEL +1 -1
- mkv_episode_matcher-1.0.0.dist-info/licenses/LICENSE +21 -0
- mkv_episode_matcher/config.py +0 -82
- mkv_episode_matcher/episode_matcher.py +0 -100
- mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
- mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
- mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
- mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
- mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
- mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
- mkv_episode_matcher/mkv_to_srt.py +0 -302
- mkv_episode_matcher/speech_to_text.py +0 -90
- mkv_episode_matcher-0.3.3.dist-info/METADATA +0 -125
- mkv_episode_matcher-0.3.3.dist-info/RECORD +0 -25
- {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/top_level.txt +0 -0
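The headline change in 1.0.0 is the removal of the hard-coded Whisper pipeline (and the vendored pgs2srt library) in favor of a pluggable ASR layer in asr_models.py and core/providers/asr.py. As a rough sketch of how that layer is driven — using only the call sites visible in the episode_identification.py hunk below; the audio path is hypothetical and any return fields beyond result["text"] are assumptions, not documented API:

    # Minimal sketch, based on call sites in the diff below.
    from mkv_episode_matcher.asr_models import get_cached_model

    model_config = {
        "type": "parakeet",                  # bare strings are treated as legacy Whisper model names
        "name": "nvidia/parakeet-ctc-0.6b",  # the model identify_episode() reaches for
        "device": "cpu",                     # EpisodeMatcher fills this from torch.cuda when omitted
    }
    model = get_cached_model(model_config)      # instances are cached across calls
    result = model.transcribe("chunk_300.wav")  # hypothetical 30 s audio chunk
    score = model.calculate_match_score(result["text"], "reference subtitle text")
    # score is compared against EpisodeMatcher.min_confidence (default 0.6)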
--- a/mkv_episode_matcher/episode_identification.py
+++ b/mkv_episode_matcher/episode_identification.py
@@ -1,150 +1,582 @@
-import
-import os
+import re
 import subprocess
 import tempfile
+from functools import lru_cache
 from pathlib import Path
+
+import chardet
+import numpy as np
 import torch
-from rapidfuzz import fuzz
 from loguru import logger
-import
-
-
+from rich import print
+from rich.console import Console
+
+from mkv_episode_matcher.asr_models import get_cached_model
+from mkv_episode_matcher.utils import extract_season_episode
+
+console = Console()
+
+
+class SubtitleCache:
+    """Cache for storing parsed subtitle data to avoid repeated loading and parsing."""
+
+    def __init__(self):
+        self.subtitles = {}  # {file_path: parsed_content}
+        self.chunk_cache = {}  # {(file_path, chunk_idx): text}
+
+    def get_subtitle_content(self, srt_file):
+        """Get the full content of a subtitle file, loading it only once."""
+        srt_file = str(srt_file)
+        if srt_file not in self.subtitles:
+            reader = SubtitleReader()
+            self.subtitles[srt_file] = reader.read_srt_file(srt_file)
+        return self.subtitles[srt_file]
+
+    def get_chunk(self, srt_file, chunk_idx, chunk_start, chunk_end):
+        """Get a specific time chunk from a subtitle file, with caching."""
+        srt_file = str(srt_file)
+        cache_key = (srt_file, chunk_idx)
+
+        if cache_key not in self.chunk_cache:
+            content = self.get_subtitle_content(srt_file)
+            reader = SubtitleReader()
+            text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
+            self.chunk_cache[cache_key] = " ".join(text_lines)
+
+        return self.chunk_cache[cache_key]
+
+
 class EpisodeMatcher:
-    def __init__(self, cache_dir, show_name, min_confidence=0.6):
+    def __init__(self, cache_dir, show_name, min_confidence=0.6, device=None):
         self.cache_dir = Path(cache_dir)
         self.min_confidence = min_confidence
         self.show_name = show_name
-        self.chunk_duration =
-        self.
+        self.chunk_duration = 30
+        self.skip_initial_duration = 300
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
-
+        # Initialize subtitle cache
+        self.subtitle_cache = SubtitleCache()
+        # Cache for extracted audio chunks
+        self.audio_chunks = {}
+        # Store reference files to avoid repeated glob operations
+        self.reference_files_cache = {}
+
     def clean_text(self, text):
         text = text.lower().strip()
-        text = re.sub(r
-        text = re.sub(r
-        return
-
-    def chunk_score(self, whisper_chunk, ref_chunk):
-        whisper_clean = self.clean_text(whisper_chunk)
-        ref_clean = self.clean_text(ref_chunk)
-        return (fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7 +
-                fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3) / 100.0
+        text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
+        text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
+        return " ".join(text.split())
 
     def extract_audio_chunk(self, mkv_file, start_time):
-        """Extract a chunk of audio from MKV file."""
+        """Extract a chunk of audio from MKV file with caching."""
+        cache_key = (str(mkv_file), start_time)
+
+        if cache_key in self.audio_chunks:
+            return self.audio_chunks[cache_key]
+
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
            cmd = [
-
-
-
-
-
-
-
-
-
+                "ffmpeg",
+                "-ss",
+                str(start_time),
+                "-t",
+                str(self.chunk_duration),
+                "-i",
+                str(mkv_file),
+                "-vn",  # Disable video
+                "-sn",  # Disable subtitles
+                "-dn",  # Disable data streams
+                "-acodec",
+                "pcm_s16le",
+                "-ar",
+                "16000",
+                "-ac",
+                "1",
+                "-y",  # Overwrite output files without asking
+                str(chunk_path),
             ]
-
-
+
+            try:
+                logger.debug(
+                    f"Extracting audio chunk from {mkv_file} at {start_time}s using FFmpeg"
+                )
+                result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+
+                if result.returncode != 0:
+                    error_msg = f"FFmpeg failed with return code {result.returncode}"
+                    if result.stderr:
+                        error_msg += f". Error: {result.stderr.strip()}"
+                    logger.error(error_msg)
+                    logger.debug(f"FFmpeg command: {' '.join(cmd)}")
+                    raise RuntimeError(error_msg)
+
+                # Check if the output file was actually created and has content
+                if not chunk_path.exists():
+                    error_msg = f"FFmpeg completed but output file was not created: {chunk_path}"
+                    logger.error(error_msg)
+                    raise RuntimeError(error_msg)
+
+                # Check if the file has meaningful content (at least 1KB)
+                if chunk_path.stat().st_size < 1024:
+                    error_msg = f"Generated audio chunk is too small ({chunk_path.stat().st_size} bytes), likely corrupted"
+                    logger.warning(error_msg)
+                    # Don't raise an error for small files, but log the warning
+
+                logger.debug(
+                    f"Successfully extracted {chunk_path.stat().st_size} byte audio chunk"
+                )
+
+            except subprocess.TimeoutExpired as e:
+                error_msg = f"FFmpeg timed out after 30 seconds while extracting audio from {mkv_file}"
+                logger.error(error_msg)
+                raise RuntimeError(error_msg) from e
+
+            except Exception as e:
+                error_msg = f"Failed to extract audio chunk from {mkv_file} at {start_time}s: {str(e)}"
+                logger.error(error_msg)
+                # Clean up partial file if it exists
+                if chunk_path.exists():
+                    try:
+                        chunk_path.unlink()
+                    except Exception as cleanup_error:
+                        logger.warning(
+                            f"Failed to clean up partial file {chunk_path}: {cleanup_error}"
+                        )
+                raise RuntimeError(error_msg) from e
+
+        chunk_path_str = str(chunk_path)
+        self.audio_chunks[cache_key] = chunk_path_str
+        return chunk_path_str
 
     def load_reference_chunk(self, srt_file, chunk_idx):
-        """
-
-
-
-
-
-
-
-
-
-
-
+        """
+        Load reference subtitles for a specific time chunk with caching.
+
+        Args:
+            srt_file (str or Path): Path to the SRT file
+            chunk_idx (int): Index of the chunk to load
+
+        Returns:
+            str: Combined text from the subtitle chunk
+        """
+        try:
+            # Apply the same offset as in _try_match_with_model
+            chunk_start = self.skip_initial_duration + (chunk_idx * self.chunk_duration)
+            chunk_end = chunk_start + self.chunk_duration
+
+            return self.subtitle_cache.get_chunk(
+                srt_file, chunk_idx, chunk_start, chunk_end
+            )
+
+        except Exception as e:
+            logger.error(f"Error loading reference chunk from {srt_file}: {e}")
+            return ""
+
+    def get_reference_files(self, season_number):
+        """Get reference subtitle files with caching."""
+        cache_key = (self.show_name, season_number)
+        logger.debug(f"Reference cache key: {cache_key}")
+
+        if cache_key in self.reference_files_cache:
+            logger.debug("Returning cached reference files")
+            return self.reference_files_cache[cache_key]
+
+        reference_dir = self.cache_dir / "data" / self.show_name
+        patterns = [
+            f"S{season_number:02d}E",
+            f"S{season_number}E",
+            f"{season_number:02d}x",
+            f"{season_number}x",
+        ]
+
+        reference_files = []
+        for pattern in patterns:
+            # Use case-insensitive file extension matching by checking both .srt and .SRT
+            srt_files = list(reference_dir.glob("*.srt")) + list(
+                reference_dir.glob("*.SRT")
+            )
+            files = [
+                f
+                for f in srt_files
+                if re.search(f"{pattern}\\d+", f.name, re.IGNORECASE)
+            ]
+            reference_files.extend(files)
+
+        # Remove duplicates while preserving order
+        reference_files = list(dict.fromkeys(reference_files))
+        logger.debug(
+            f"Found {len(reference_files)} reference files for season {season_number}"
+        )
+        self.reference_files_cache[cache_key] = reference_files
+        return reference_files
+
+    def _try_match_with_model(
+        self, video_file, model_config, max_duration, reference_files
+    ):
+        """
+        Attempt to match using specified model, checking multiple chunks starting from skip_initial_duration
+        and continuing up to max_duration.
+
+        Args:
+            video_file: Path to the video file
+            model_config: Dictionary with ASR model configuration or string for backward compatibility
+            max_duration: Maximum duration in seconds to check
+            reference_files: List of reference subtitle files
+        """
+        # Handle backward compatibility for string model names
+        if isinstance(model_config, str):
+            # Convert old Whisper model names to new format
+            model_config = {
+                "type": "whisper",
+                "name": model_config,
+                "device": self.device,
+            }
+        elif isinstance(model_config, dict):
+            # Ensure device is set if not specified
+            if "device" not in model_config:
+                model_config = model_config.copy()
+                model_config["device"] = self.device
+
+        # Use cached model
+        model = get_cached_model(model_config)
+
+        # Calculate number of chunks to check
+        num_chunks = min(
+            max_duration // self.chunk_duration, 10
+        )  # Limit to 10 chunks for initial check
+
+        # Pre-load all reference chunks for the chunks we'll check
+        for chunk_idx in range(num_chunks):
+            for ref_file in reference_files:
+                self.load_reference_chunk(ref_file, chunk_idx)
+
+        for chunk_idx in range(num_chunks):
+            # Start at self.skip_initial_duration and check subsequent chunks
+            start_time = self.skip_initial_duration + (chunk_idx * self.chunk_duration)
+            model_name = (
+                model_config.get("name", "unknown")
+                if isinstance(model_config, dict)
+                else model_config
+            )
+            logger.debug(f"Trying {model_name} model at {start_time} seconds")
+
             try:
-
-
-
-
-
-
-
-
-
-
-
+                audio_path = self.extract_audio_chunk(video_file, start_time)
+                logger.debug(f"Extracted audio chunk: {audio_path}")
+            except RuntimeError as e:
+                logger.warning(f"Failed to extract audio chunk at {start_time}s: {e}")
+                continue  # Skip this chunk and try the next one
+            except Exception as e:
+                logger.error(
+                    f"Unexpected error extracting audio chunk at {start_time}s: {e}"
+                )
+                continue  # Skip this chunk and try the next one
+
+            try:
+                result = model.transcribe(audio_path)
+            except Exception as e:
+                logger.error(
+                    f"ASR transcription failed for chunk at {start_time}s: {e}"
+                )
+                continue  # Skip this chunk and try the next one
+
+            chunk_text = result["text"]
+            logger.debug(
+                f"Transcription result: {chunk_text} ({len(chunk_text)} characters)"
+            )
+            if len(chunk_text) < 10:
+                logger.debug(
+                    f"Transcription result too short: {chunk_text} ({len(chunk_text)} characters)"
+                )
                 continue
-
-
+            best_confidence = 0
+            best_match = None
+
+            # Compare with reference chunks
+            # Compare with reference chunks
+            for ref_file in reference_files:
+                ref_text = self.load_reference_chunk(ref_file, chunk_idx)
+
+                # Use model's internal scoring logic
+                confidence = model.calculate_match_score(chunk_text, ref_text)
+
+                if confidence > best_confidence:
+                    logger.debug(f"New best confidence: {confidence} for {ref_file}")
+                    best_confidence = confidence
+                    best_match = Path(ref_file)
+
+                if confidence > self.min_confidence:
+                    print(
+                        f"Matched with {best_match} (confidence: {best_confidence:.2f})"
+                    )
+                    try:
+                        season, episode = extract_season_episode(best_match.stem)
+                    except Exception as e:
+                        print(f"Error extracting season/episode: {e}")
+                        continue
+                    print(
+                        f"Season: {season}, Episode: {episode} (confidence: {best_confidence:.2f})"
+                    )
+                    if season and episode:
+                        return {
+                            "season": season,
+                            "episode": episode,
+                            "confidence": best_confidence,
+                            "reference_file": str(best_match),
+                            "matched_at": start_time,
+                        }
+
+        logger.info(
+            f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})"
+        )
+        return None
 
     def identify_episode(self, video_file, temp_dir, season_number):
+        """Progressive episode identification with faster initial attempt."""
         try:
-            # Get
-
-
-                '-show_entries', 'format=duration',
-                '-of', 'default=noprint_wrappers=1:nokey=1',
-                video_file
-            ]).decode())
-
-            total_chunks = int(np.ceil(duration / self.chunk_duration))
-
-            # Load Whisper model
-            model = whisper.load_model("base", device=self.device)
-
-            # Get season-specific reference files
-            reference_dir = self.cache_dir / "data" / self.show_name
-            season_pattern = f"S{season_number:02d}E"
-            reference_files = [
-                f for f in reference_dir.glob("*.srt")
-                if season_pattern in f.name
-            ]
-
+            # Get reference files first with caching
+            reference_files = self.get_reference_files(season_number)
+
             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
-
-            #
-
-
-
-
-
-
-
-
-
+
+            # Cache video duration
+            try:
+                duration = get_video_duration(video_file)
+            except Exception as e:
+                logger.error(f"Failed to get video duration for {video_file}: {e}")
+                return None
+
+            # Try with Parakeet CTC model
+            logger.info("Attempting match with Parakeet CTC model...")
+            try:
+                match = self._try_match_with_model(
+                    video_file,
+                    {
+                        "type": "parakeet",
+                        "name": "nvidia/parakeet-ctc-0.6b",
+                        "device": self.device,
+                    },
+                    min(duration, 600),  # Allow up to 10 minutes
+                    reference_files,
                 )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    season, episode = map(int, season_ep.groups())
-                    return {
-                        'season': season,
-                        'episode': episode,
-                        'confidence': best_confidence,
-                        'reference_file': str(best_match),
-                    }
-
+                if match:
+                    logger.info(
+                        f"Successfully matched with Parakeet CTC model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                    )
+                    return match
+            except Exception as e:
+                logger.warning(f"Parakeet CTC model failed: {e}")
+
+            logger.info(
+                "Speech recognition match failed - no models were able to process this file"
+            )
+            return None
+
+        except Exception as e:
+            logger.error(
+                f"Unexpected error during episode identification for {video_file}: {e}"
+            )
             return None
-
+
         finally:
-            # Cleanup temp files
-            for
-
+            # Cleanup temp files - keep this limited to only files we know we created
+            for chunk_info in self.audio_chunks.values():
+                try:
+                    Path(chunk_info).unlink(missing_ok=True)
+                except Exception as e:
+                    logger.warning(f"Failed to delete temp file {chunk_info}: {e}")
+
+
+@lru_cache(maxsize=100)
+def get_video_duration(video_file):
+    """Get video duration with caching and error handling."""
+    try:
+        logger.debug(f"Getting duration for video file: {video_file}")
+        result = subprocess.run(
+            [
+                "ffprobe",
+                "-v",
+                "error",
+                "-show_entries",
+                "format=duration",
+                "-of",
+                "default=noprint_wrappers=1:nokey=1",
+                str(video_file),
+            ],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+
+        if result.returncode != 0:
+            error_msg = f"ffprobe failed with return code {result.returncode}"
+            if result.stderr:
+                error_msg += f". Error: {result.stderr.strip()}"
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+
+        duration_str = result.stdout.strip()
+        if not duration_str:
+            raise RuntimeError("ffprobe returned empty duration")
+
+        duration = float(duration_str)
+        if duration <= 0:
+            raise RuntimeError(f"Invalid duration: {duration}")
+
+        result_duration = int(np.ceil(duration))
+        logger.debug(f"Video duration: {result_duration} seconds")
+        return result_duration
+
+    except subprocess.TimeoutExpired as e:
+        error_msg = f"ffprobe timed out while getting duration for {video_file}"
+        logger.error(error_msg)
+        raise RuntimeError(error_msg) from e
+    except ValueError as e:
+        error_msg = (
+            f"Failed to parse duration from ffprobe output for {video_file}: {e}"
+        )
+        logger.error(error_msg)
+        raise RuntimeError(error_msg) from e
+    except Exception as e:
+        error_msg = f"Unexpected error getting video duration for {video_file}: {e}"
+        logger.error(error_msg)
+        raise RuntimeError(error_msg) from e
+
+
+def detect_file_encoding(file_path):
+    """
+    Detect the encoding of a file using chardet.
+
+    Args:
+        file_path (str or Path): Path to the file
+
+    Returns:
+        str: Detected encoding, defaults to 'utf-8' if detection fails
+    """
+    try:
+        with open(file_path, "rb") as f:
+            raw_data = f.read(
+                min(1024 * 1024, Path(file_path).stat().st_size)
+            )  # Read up to 1MB
+        result = chardet.detect(raw_data)
+        encoding = result["encoding"]
+        confidence = result["confidence"]
+
+        logger.debug(
+            f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}"
+        )
+        return encoding if encoding else "utf-8"
+    except Exception as e:
+        logger.warning(f"Error detecting encoding for {file_path}: {e}")
+        return "utf-8"
+
+
+@lru_cache(maxsize=100)
+def read_file_with_fallback(file_path, encodings=None):
+    """
+    Read a file trying multiple encodings in order of preference.
+
+    Args:
+        file_path (str or Path): Path to the file
+        encodings (list): List of encodings to try, defaults to common subtitle encodings
+
+    Returns:
+        str: File contents
+
+    Raises:
+        ValueError: If file cannot be read with any encoding
+    """
+    if encodings is None:
+        # First try detected encoding, then fallback to common subtitle encodings
+        detected = detect_file_encoding(file_path)
+        encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]
+
+    file_path = Path(file_path)
+    errors = []
+
+    for encoding in encodings:
+        try:
+            with open(file_path, encoding=encoding) as f:
+                content = f.read()
+            logger.debug(f"Successfully read {file_path} using {encoding} encoding")
+            return content
+        except UnicodeDecodeError as e:
+            errors.append(f"{encoding}: {str(e)}")
+            continue
+
+    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+        errors
+    )
+    logger.error(error_msg)
+    raise ValueError(error_msg)
+
+
+class SubtitleReader:
+    """Helper class for reading and parsing subtitle files."""
+
+    @staticmethod
+    def parse_timestamp(timestamp):
+        """Parse SRT timestamp into seconds."""
+        hours, minutes, seconds = timestamp.replace(",", ".").split(":")
+        return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
+
+    @staticmethod
+    def read_srt_file(file_path):
+        """
+        Read an SRT file and return its contents with robust encoding handling.
+
+        Args:
+            file_path (str or Path): Path to the SRT file
+
+        Returns:
+            str: Contents of the SRT file
+        """
+        return read_file_with_fallback(file_path)
+
+    @staticmethod
+    def extract_subtitle_chunk(content, start_time, end_time):
+        """
+        Extract subtitle text for a specific time window.
+
+        Args:
+            content (str): Full SRT file content
+            start_time (float): Chunk start time in seconds
+            end_time (float): Chunk end time in seconds
+
+        Returns:
+            list: List of subtitle texts within the time window
+        """
+        text_lines = []
+
+        for block in content.strip().split("\n\n"):
+            lines = block.split("\n")
+            if len(lines) < 3 or "-->" not in lines[1]:
+                continue
+
+            try:
+                timestamp = lines[1]
+                time_parts = timestamp.split(" --> ")
+                start_stamp = time_parts[0].strip()
+                end_stamp = time_parts[1].strip()
+
+                subtitle_start = SubtitleReader.parse_timestamp(start_stamp)
+                subtitle_end = SubtitleReader.parse_timestamp(end_stamp)
+
+                # Check if this subtitle overlaps with our chunk
+                if subtitle_end >= start_time and subtitle_start <= end_time:
+                    text = " ".join(lines[2:])
+                    text_lines.append(text)
+
+            except (IndexError, ValueError) as e:
+                logger.warning(f"Error parsing subtitle block: {e}")
+                continue
+
+        return text_lines
+
+
+# Note: Model caching is now handled by the ASR abstraction layer in asr_models.py
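Taken together, the rewritten module leaves EpisodeMatcher with a small public surface. A usage sketch under stated assumptions — the directory layout follows get_reference_files() (reference SRTs under <cache_dir>/data/<show_name>/ with S01E01-style names), the concrete paths are hypothetical, and temp_dir is accepted but unused in the code shown above:

    from pathlib import Path
    from mkv_episode_matcher.episode_identification import EpisodeMatcher

    # Hypothetical layout: /tmp/mkv-cache/data/Some Show/Some Show - S01E01.srt, ...
    matcher = EpisodeMatcher("/tmp/mkv-cache", "Some Show", min_confidence=0.6)
    result = matcher.identify_episode(Path("episode.mkv"), temp_dir=None, season_number=1)
    if result:
        print(
            f"S{result['season']:02d}E{result['episode']:02d} "
            f"(confidence {result['confidence']:.2f}, matched at {result['matched_at']}s)"
        )

Note the timing constants: matching skips the first skip_initial_duration = 300 seconds of the file and examines at most ten chunk_duration = 30 second windows, so the chunks compared run from 300–330 s up to 570–600 s into the video.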
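The overlap test in extract_subtitle_chunk() keeps any cue that touches the requested window (subtitle_end >= start_time and subtitle_start <= end_time). A self-contained check of that behavior against a made-up two-cue SRT string:

    from mkv_episode_matcher.episode_identification import SubtitleReader

    srt = "\n".join([
        "1",
        "00:05:02,000 --> 00:05:04,500",
        "Hello there.",
        "",
        "2",
        "00:05:31,000 --> 00:05:33,000",
        "Out of range.",
    ])

    print(SubtitleReader.parse_timestamp("00:05:02,000"))        # 302.0
    # Window 300-330 s: only the first cue (302.0-304.5 s) overlaps.
    print(SubtitleReader.extract_subtitle_chunk(srt, 300, 330))  # ['Hello there.']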