mkv-episode-matcher 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mkv-episode-matcher might be problematic.
- mkv_episode_matcher/episode_identification.py +147 -74
- {mkv_episode_matcher-0.7.1.dist-info → mkv_episode_matcher-0.8.0.dist-info}/METADATA +1 -1
- {mkv_episode_matcher-0.7.1.dist-info → mkv_episode_matcher-0.8.0.dist-info}/RECORD +6 -6
- {mkv_episode_matcher-0.7.1.dist-info → mkv_episode_matcher-0.8.0.dist-info}/WHEEL +1 -1
- {mkv_episode_matcher-0.7.1.dist-info → mkv_episode_matcher-0.8.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.7.1.dist-info → mkv_episode_matcher-0.8.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/episode_identification.py CHANGED
@@ -10,9 +10,40 @@ import torch
 import whisper
 from loguru import logger
 from rapidfuzz import fuzz
+from mkv_episode_matcher.utils import extract_season_episode
+from functools import lru_cache

 console = Console()

+class SubtitleCache:
+    """Cache for storing parsed subtitle data to avoid repeated loading and parsing."""
+
+    def __init__(self):
+        self.subtitles = {}  # {file_path: parsed_content}
+        self.chunk_cache = {}  # {(file_path, chunk_idx): text}
+
+    def get_subtitle_content(self, srt_file):
+        """Get the full content of a subtitle file, loading it only once."""
+        srt_file = str(srt_file)
+        if srt_file not in self.subtitles:
+            reader = SubtitleReader()
+            self.subtitles[srt_file] = reader.read_srt_file(srt_file)
+        return self.subtitles[srt_file]
+
+    def get_chunk(self, srt_file, chunk_idx, chunk_start, chunk_end):
+        """Get a specific time chunk from a subtitle file, with caching."""
+        srt_file = str(srt_file)
+        cache_key = (srt_file, chunk_idx)
+
+        if cache_key not in self.chunk_cache:
+            content = self.get_subtitle_content(srt_file)
+            reader = SubtitleReader()
+            text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
+            self.chunk_cache[cache_key] = " ".join(text_lines)
+
+        return self.chunk_cache[cache_key]
+
+
 class EpisodeMatcher:
     def __init__(self, cache_dir, show_name, min_confidence=0.6):
         self.cache_dir = Path(cache_dir)
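The new SubtitleCache memoizes at two levels: whole parsed files in subtitles and per-chunk joined text in chunk_cache, so each (file, chunk) pair is read and sliced at most once per run. A minimal usage sketch of the class as defined above (the file name is hypothetical; chunk_start and chunk_end are seconds, matching the 30-second chunks the matcher uses later):

    cache = SubtitleCache()
    # First call parses the .srt once and slices out the 0-30s window
    text = cache.get_chunk("Show - S01E01.srt", chunk_idx=0, chunk_start=0, chunk_end=30)
    # A repeat call with the same (file, chunk_idx) key is a pure dict lookup
    assert cache.get_chunk("Show - S01E01.srt", 0, 0, 30) == text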
@@ -22,6 +53,12 @@ class EpisodeMatcher:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
+        # Initialize subtitle cache
+        self.subtitle_cache = SubtitleCache()
+        # Cache for extracted audio chunks
+        self.audio_chunks = {}
+        # Store reference files to avoid repeated glob operations
+        self.reference_files_cache = {}

     def clean_text(self, text):
         text = text.lower().strip()
@@ -38,7 +75,12 @@ class EpisodeMatcher:
         ) / 100.0

     def extract_audio_chunk(self, mkv_file, start_time):
-        """Extract a chunk of audio from MKV file."""
+        """Extract a chunk of audio from MKV file with caching."""
+        cache_key = (str(mkv_file), start_time)
+
+        if cache_key in self.audio_chunks:
+            return self.audio_chunks[cache_key]
+
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
             cmd = [
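The ) / 100.0 context line above is the tail of the fuzzy scorer: rapidfuzz reports similarity as 0-100, and dividing by 100 puts it on the 0.0-1.0 scale that min_confidence (default 0.6) is compared against. A sketch of the pattern; which rapidfuzz scorer the method actually calls is not visible in this hunk, so fuzz.ratio here is an assumption:

    from rapidfuzz import fuzz

    # 0-100 similarity scaled onto the 0.0-1.0 confidence range
    confidence = fuzz.ratio("previously on the show", "previously, on the show!") / 100.0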
@@ -58,14 +100,18 @@ class EpisodeMatcher:
             "16000",
             "-ac",
             "1",
+            "-y",  # Overwrite output files without asking
             str(chunk_path),
         ]
         subprocess.run(cmd, capture_output=True)
-        return str(chunk_path)
+
+        chunk_path_str = str(chunk_path)
+        self.audio_chunks[cache_key] = chunk_path_str
+        return chunk_path_str

     def load_reference_chunk(self, srt_file, chunk_idx):
         """
-        Load reference subtitles for a specific time chunk with
+        Load reference subtitles for a specific time chunk with caching.

         Args:
             srt_file (str or Path): Path to the SRT file
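Only the tail of the ffmpeg argument list is visible in this hunk. A hypothetical reconstruction of the full command, for orientation: the seek and duration flags are assumptions, and only the sample-rate, channel, -y, and output arguments actually appear in the diff:

    cmd = [
        "ffmpeg",
        "-ss", str(start_time),   # assumed: seek to the chunk start
        "-i", str(mkv_file),
        "-t", "30",               # assumed: one 30-second chunk
        "-vn",                    # assumed: drop the video stream
        "-ar", "16000",           # shown in the diff: 16 kHz sample rate
        "-ac", "1",               # shown in the diff: mono
        "-y",                     # new in 0.8.0: overwrite without prompting
        str(chunk_path),
    ]
    subprocess.run(cmd, capture_output=True)

The added "-y" matters now that chunk files are cached and reused: a leftover chunk_NN.wav would otherwise make ffmpeg stop at an interactive overwrite prompt, which hangs when output is captured.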
@@ -74,23 +120,48 @@ class EpisodeMatcher:
         Returns:
             str: Combined text from the subtitle chunk
         """
-        chunk_start = chunk_idx * self.chunk_duration
-        chunk_end = chunk_start + self.chunk_duration
-
         try:
-
-
-
-
-
-            text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
-
-            return " ".join(text_lines)
-
+            chunk_start = chunk_idx * self.chunk_duration
+            chunk_end = chunk_start + self.chunk_duration
+
+            return self.subtitle_cache.get_chunk(srt_file, chunk_idx, chunk_start, chunk_end)
+
         except Exception as e:
             logger.error(f"Error loading reference chunk from {srt_file}: {e}")
             return ""

+    def get_reference_files(self, season_number):
+        """Get reference subtitle files with caching."""
+        cache_key = (self.show_name, season_number)
+
+        if cache_key in self.reference_files_cache:
+            return self.reference_files_cache[cache_key]
+
+        reference_dir = self.cache_dir / "data" / self.show_name
+        patterns = [
+            f"S{season_number:02d}E",
+            f"S{season_number}E",
+            f"{season_number:02d}x",
+            f"{season_number}x",
+        ]
+
+        reference_files = []
+        for _pattern in patterns:
+            files = [
+                f
+                for f in reference_dir.glob("*.srt")
+                if any(
+                    re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
+                )
+            ]
+            reference_files.extend(files)
+
+        # Remove duplicates while preserving order
+        reference_files = list(dict.fromkeys(reference_files))
+
+        self.reference_files_cache[cache_key] = reference_files
+        return reference_files
+
     def _try_match_with_model(
         self, video_file, model_name, max_duration, reference_files
     ):
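Note that the for _pattern loop body never uses _pattern: the comprehension's any(...) already tests all four patterns, so each of the four passes selects the same files and the list ends up with quadruplicates (a TODO removed further down in identify_episode flagged the same oddity). The dict.fromkeys idiom then strips the repeats while keeping first-seen order, which set() would not:

    files = ["e01.srt", "e02.srt", "e01.srt", "e02.srt"]
    list(dict.fromkeys(files))   # ['e01.srt', 'e02.srt'], order preserved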
@@ -107,7 +178,12 @@ class EpisodeMatcher:
         model = get_whisper_model(model_name, self.device)

         # Calculate number of chunks to check (30 seconds each)
-        num_chunks = max_duration // self.chunk_duration
+        num_chunks = min(max_duration // self.chunk_duration, 10)  # Limit to 10 chunks for initial check
+
+        # Pre-load all reference chunks for the chunks we'll check
+        for chunk_idx in range(num_chunks):
+            for ref_file in reference_files:
+                self.load_reference_chunk(ref_file, chunk_idx)

         for chunk_idx in range(num_chunks):
             start_time = chunk_idx * self.chunk_duration
@@ -127,13 +203,19 @@ class EpisodeMatcher:
                 confidence = self.chunk_score(chunk_text, ref_text)

                 if confidence > best_confidence:
+                    logger.debug(f"New best confidence: {confidence} for {ref_file}")
                     best_confidence = confidence
-                    best_match = ref_file
+                    best_match = Path(ref_file)

                 if confidence > self.min_confidence:
-
-
-                    season, episode =
+                    print(f"Matched with {best_match} (confidence: {best_confidence:.2f})")
+                    try:
+                        season, episode = extract_season_episode(best_match.stem)
+                    except Exception as e:
+                        print(f"Error extracting season/episode: {e}")
+                        continue
+                    print(f"Season: {season}, Episode: {episode} (confidence: {best_confidence:.2f})")
+                    if season and episode:
                         return {
                             "season": season,
                             "episode": episode,
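extract_season_episode comes from mkv_episode_matcher.utils and is not part of this diff; the if season and episode: guard implies it returns falsy values when a stem does not parse. A hypothetical sketch of the expected contract (the real utility likely also handles the 1x02-style names that get_reference_files globs for):

    import re

    def extract_season_episode(stem):
        # hypothetical stand-in: the real implementation lives in utils.py
        m = re.search(r"S(\d{1,2})E(\d{1,2})", stem, re.IGNORECASE)
        return (int(m.group(1)), int(m.group(2))) if m else (None, None)

    extract_season_episode("Show - S01E02 - Title")   # (1, 2)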
@@ -150,54 +232,22 @@ class EpisodeMatcher:
     def identify_episode(self, video_file, temp_dir, season_number):
         """Progressive episode identification with faster initial attempt."""
         try:
-            # Get reference files first
-            reference_dir = self.cache_dir / "data" / self.show_name
-            patterns = [
-                f"S{season_number:02d}E",
-                f"S{season_number}E",
-                f"{season_number:02d}x",
-                f"{season_number}x",
-            ]
-
-            reference_files = []
-            # TODO Figure our why patterns is not being used
-            for _pattern in patterns:
-                files = [
-                    f
-                    for f in reference_dir.glob("*.srt")
-                    if any(
-                        re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
-                    )
-                ]
-                reference_files.extend(files)
-
-            reference_files = list(dict.fromkeys(reference_files))
+            # Get reference files first with caching
+            reference_files = self.get_reference_files(season_number)

             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
-
-
-
-                    "-v",
-                    "error",
-                    "-show_entries",
-                    "format=duration",
-                    "-of",
-                    "default=noprint_wrappers=1:nokey=1",
-                    video_file,
-                ]).decode()
-            )
+
+            # Cache video duration
+            duration = get_video_duration(video_file)

-            duration = int(np.ceil(duration))
             # Try with tiny model first (fastest)
             logger.info("Attempting match with tiny model...")
             match = self._try_match_with_model(
-                video_file, "tiny", duration, reference_files
+                video_file, "tiny", min(duration, 300), reference_files  # Limit to first 5 minutes
             )
-            if (
-                match and match["confidence"] > 0.65
-            ):  # Slightly lower threshold for tiny
+            if match and match["confidence"] > 0.65:  # Slightly lower threshold for tiny
                 logger.info(
                     f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
                 )
@@ -205,10 +255,10 @@ class EpisodeMatcher:

             # If no match, try base model
             logger.info(
-                "No match
+                "No match with tiny model, extending base model search to 10 minutes..."
             )
             match = self._try_match_with_model(
-                video_file, "base", duration, reference_files
+                video_file, "base", min(duration, 600), reference_files  # Limit to first 10 minutes
             )
             if match:
                 logger.info(
@@ -220,12 +270,30 @@ class EpisodeMatcher:
             return None

         finally:
-            # Cleanup temp files
-            for
+            # Cleanup temp files - keep this limited to only files we know we created
+            for chunk_info in self.audio_chunks.values():
                 try:
-
+                    Path(chunk_info).unlink(missing_ok=True)
                 except Exception as e:
-                    logger.warning(f"Failed to delete temp file {
+                    logger.warning(f"Failed to delete temp file {chunk_info}: {e}")
+
+
+@lru_cache(maxsize=100)
+def get_video_duration(video_file):
+    """Get video duration with caching."""
+    duration = float(
+        subprocess.check_output([
+            "ffprobe",
+            "-v",
+            "error",
+            "-show_entries",
+            "format=duration",
+            "-of",
+            "default=noprint_wrappers=1:nokey=1",
+            video_file,
+        ]).decode()
+    )
+    return int(np.ceil(duration))


 def detect_file_encoding(file_path):
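Hoisting the ffprobe call into a module-level get_video_duration wrapped in functools.lru_cache means each distinct video_file value spawns at most one subprocess per process lifetime. The cache keys on the argument itself, so it must be hashable, and an equal-looking str and pathlib.Path hash as different keys. A toy demonstration of the semantics:

    from functools import lru_cache

    @lru_cache(maxsize=100)
    def probe(video_file):
        print(f"probing {video_file}")   # printed only on a cache miss
        return 42

    probe("episode.mkv")   # miss: the body runs
    probe("episode.mkv")   # hit: returns 42, nothing printed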
@@ -240,7 +308,7 @@ def detect_file_encoding(file_path):
     """
     try:
         with open(file_path, "rb") as f:
-            raw_data = f.read()
+            raw_data = f.read(min(1024 * 1024, Path(file_path).stat().st_size))  # Read up to 1MB
         result = chardet.detect(raw_data)
         encoding = result["encoding"]
         confidence = result["confidence"]
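Capping the sample at 1 MB is safe because chardet only needs a prefix to guess an encoding; note also that f.read(n) already stops at end-of-file, so the min(...) against the file size is belt-and-braces rather than required. The underlying call, for reference:

    import chardet

    sample = "Révélations à São Paulo".encode("latin-1")
    chardet.detect(sample)   # dict with 'encoding' and 'confidence' keys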
@@ -254,6 +322,7 @@ def detect_file_encoding(file_path):
     return "utf-8"


+@lru_cache(maxsize=100)
 def read_file_with_fallback(file_path, encodings=None):
     """
     Read a file trying multiple encodings in order of preference.
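Because lru_cache turns the arguments of read_file_with_fallback into cache keys, the encodings parameter must stay hashable, which the None default already satisfies; a caller passing an explicit preference list now needs a tuple (the file name here is hypothetical):

    read_file_with_fallback("subs.srt", encodings=["utf-8", "latin-1"])   # TypeError: unhashable type: 'list'
    read_file_with_fallback("subs.srt", encodings=("utf-8", "latin-1"))   # fine, tuples hash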
@@ -337,12 +406,16 @@ class SubtitleReader:

         try:
             timestamp = lines[1]
-
-
-            end_stamp =
-
-
-
+            time_parts = timestamp.split(" --> ")
+            start_stamp = time_parts[0].strip()
+            end_stamp = time_parts[1].strip()
+
+            subtitle_start = SubtitleReader.parse_timestamp(start_stamp)
+            subtitle_end = SubtitleReader.parse_timestamp(end_stamp)
+
+            # Check if this subtitle overlaps with our chunk
+            if subtitle_end >= start_time and subtitle_start <= end_time:
+                text = " ".join(lines[2:])
                 text_lines.append(text)

         except (IndexError, ValueError) as e:
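The new overlap test keeps any cue whose time span intersects the requested chunk: two closed intervals [a, b] and [c, d] intersect exactly when b >= c and a <= d, which is what subtitle_end >= start_time and subtitle_start <= end_time expresses. A standalone check of the predicate:

    def overlaps(a, b, c, d):
        # closed intervals [a, b] and [c, d] intersect iff each starts
        # before the other ends
        return b >= c and a <= d

    assert overlaps(0, 30, 25, 55)        # cue straddles the chunk boundary
    assert overlaps(10, 20, 0, 60)        # cue fully inside the chunk
    assert not overlaps(0, 30, 31, 60)    # disjoint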
@@ -352,9 +425,9 @@ class SubtitleReader:
     return text_lines


+# Global whisper model cache with better cache key
 _whisper_models = {}

-
 def get_whisper_model(model_name="tiny", device=None):
     """Cache whisper models to avoid reloading."""
     global _whisper_models
@@ -366,4 +439,4 @@ def get_whisper_model(model_name="tiny", device=None):
         _whisper_models[key] = whisper.load_model(model_name, device=device)
         logger.info(f"Loaded {model_name} model on {device}")

-    return _whisper_models[key]
+    return _whisper_models[key]
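The module-level _whisper_models dict hands back the same loaded model on repeated calls instead of re-reading weights. The exact key construction is not visible in this diff; given the "better cache key" comment it is presumably built from (model_name, device):

    model_a = get_whisper_model("tiny", "cpu")
    model_b = get_whisper_model("tiny", "cpu")
    assert model_a is model_b                     # served from _whisper_models
    model_c = get_whisper_model("base", "cpu")    # different key, separate load

The trailing -/+ pair on the unchanged return _whisper_models[key] line is most likely a newline-at-end-of-file change, not a code change.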
{mkv_episode_matcher-0.7.1.dist-info → mkv_episode_matcher-0.8.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mkv-episode-matcher
-Version: 0.7.1
+Version: 0.8.0
 Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
 Home-page: https://github.com/Jsakkos/mkv-episode-matcher
 Author: Jonathan Sakkos
{mkv_episode_matcher-0.7.1.dist-info → mkv_episode_matcher-0.8.0.dist-info}/RECORD CHANGED
@@ -2,13 +2,13 @@ mkv_episode_matcher/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb
 mkv_episode_matcher/__init__.py,sha256=u3yZcpuK0ICeUjxYKePvW-zS61E5ss5q2AvqnSHuz9E,240
 mkv_episode_matcher/__main__.py,sha256=O3GQk5R9BFuA-QNlqfBgDSS7G_W8IGSxiV8CFUbcaLc,10059
 mkv_episode_matcher/config.py,sha256=EcJJjkekQ7oWtarUkufCYON_QWbQvq55-zMqCTOqSa4,2265
-mkv_episode_matcher/episode_identification.py,sha256=
+mkv_episode_matcher/episode_identification.py,sha256=IMB1m3-oY4Z31XIWCFjpdXDENwmKMgzjctl3CilthJ4,15926
 mkv_episode_matcher/episode_matcher.py,sha256=SxAbnXuTJITD1o0WohE9heE3Fm9zW_w0Nq3GzqtcIpQ,6329
 mkv_episode_matcher/subtitle_utils.py,sha256=Hz9b4CKPV07YKTY4dcN3WbvdbvH-S3J4zcb9CiyvPlE,2551
 mkv_episode_matcher/tmdb_client.py,sha256=LbMCgjmp7sCbrQo_CDlpcnryKPz5S7inE24YY9Pyjk4,4172
 mkv_episode_matcher/utils.py,sha256=modXMLmt2fpny8liXwqe4ylxnwwfg_98OLOacv5izps,14501
-mkv_episode_matcher-0.7.1.dist-info/METADATA,sha256=
-mkv_episode_matcher-0.7.1.dist-info/WHEEL,sha256=
-mkv_episode_matcher-0.7.1.dist-info/entry_points.txt,sha256=
-mkv_episode_matcher-0.7.1.dist-info/top_level.txt,sha256=
-mkv_episode_matcher-0.7.1.dist-info/RECORD,,
+mkv_episode_matcher-0.8.0.dist-info/METADATA,sha256=TcH5g5UfyJop2ZV_tWShEm4O28EkVGLlcpOXbG74mjI,5384
+mkv_episode_matcher-0.8.0.dist-info/WHEEL,sha256=7ciDxtlje1X8OhobNuGgi1t-ACdFSelPnSmDPrtlobY,91
+mkv_episode_matcher-0.8.0.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
+mkv_episode_matcher-0.8.0.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
+mkv_episode_matcher-0.8.0.dist-info/RECORD,,
{mkv_episode_matcher-0.7.1.dist-info → mkv_episode_matcher-0.8.0.dist-info}/entry_points.txt RENAMED
File without changes

{mkv_episode_matcher-0.7.1.dist-info → mkv_episode_matcher-0.8.0.dist-info}/top_level.txt RENAMED
File without changes