mkv-episode-matcher 0.7.2-py3-none-any.whl → 0.8.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mkv-episode-matcher might be problematic.
- mkv_episode_matcher/episode_identification.py +139 -73
- {mkv_episode_matcher-0.7.2.dist-info → mkv_episode_matcher-0.8.0.dist-info}/METADATA +1 -1
- {mkv_episode_matcher-0.7.2.dist-info → mkv_episode_matcher-0.8.0.dist-info}/RECORD +6 -6
- {mkv_episode_matcher-0.7.2.dist-info → mkv_episode_matcher-0.8.0.dist-info}/WHEEL +1 -1
- {mkv_episode_matcher-0.7.2.dist-info → mkv_episode_matcher-0.8.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.7.2.dist-info → mkv_episode_matcher-0.8.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/episode_identification.py
CHANGED

@@ -10,10 +10,40 @@ import torch
 import whisper
 from loguru import logger
 from rapidfuzz import fuzz
-from utils import extract_season_episode
+from mkv_episode_matcher.utils import extract_season_episode
+from functools import lru_cache

 console = Console()

+class SubtitleCache:
+    """Cache for storing parsed subtitle data to avoid repeated loading and parsing."""
+
+    def __init__(self):
+        self.subtitles = {}  # {file_path: parsed_content}
+        self.chunk_cache = {}  # {(file_path, chunk_idx): text}
+
+    def get_subtitle_content(self, srt_file):
+        """Get the full content of a subtitle file, loading it only once."""
+        srt_file = str(srt_file)
+        if srt_file not in self.subtitles:
+            reader = SubtitleReader()
+            self.subtitles[srt_file] = reader.read_srt_file(srt_file)
+        return self.subtitles[srt_file]
+
+    def get_chunk(self, srt_file, chunk_idx, chunk_start, chunk_end):
+        """Get a specific time chunk from a subtitle file, with caching."""
+        srt_file = str(srt_file)
+        cache_key = (srt_file, chunk_idx)
+
+        if cache_key not in self.chunk_cache:
+            content = self.get_subtitle_content(srt_file)
+            reader = SubtitleReader()
+            text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
+            self.chunk_cache[cache_key] = " ".join(text_lines)
+
+        return self.chunk_cache[cache_key]
+
+
 class EpisodeMatcher:
     def __init__(self, cache_dir, show_name, min_confidence=0.6):
         self.cache_dir = Path(cache_dir)
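The new `SubtitleCache` adds two layers of memoization: parsed files in `subtitles` and joined per-chunk text in `chunk_cache`. A minimal usage sketch (the file path is hypothetical; `SubtitleReader` is this module's own parser):

```python
# Sketch of the two-level memoization added above. The first call parses
# the file via SubtitleReader; later calls for the same (file, chunk)
# pair return the cached joined text without re-reading anything.
cache = SubtitleCache()

text_a = cache.get_chunk("S01E01.srt", chunk_idx=0, chunk_start=0, chunk_end=30)
text_b = cache.get_chunk("S01E01.srt", chunk_idx=0, chunk_start=0, chunk_end=30)
assert text_a is text_b  # second call is served from chunk_cache
```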
@@ -23,6 +53,12 @@ class EpisodeMatcher:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
+        # Initialize subtitle cache
+        self.subtitle_cache = SubtitleCache()
+        # Cache for extracted audio chunks
+        self.audio_chunks = {}
+        # Store reference files to avoid repeated glob operations
+        self.reference_files_cache = {}

     def clean_text(self, text):
         text = text.lower().strip()
@@ -39,7 +75,12 @@ class EpisodeMatcher:
         ) / 100.0

     def extract_audio_chunk(self, mkv_file, start_time):
-        """Extract a chunk of audio from MKV file."""
+        """Extract a chunk of audio from MKV file with caching."""
+        cache_key = (str(mkv_file), start_time)
+
+        if cache_key in self.audio_chunks:
+            return self.audio_chunks[cache_key]
+
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
             cmd = [
@@ -59,14 +100,18 @@ class EpisodeMatcher:
                 "16000",
                 "-ac",
                 "1",
+                "-y",  # Overwrite output files without asking
                 str(chunk_path),
             ]
             subprocess.run(cmd, capture_output=True)
-
+
+        chunk_path_str = str(chunk_path)
+        self.audio_chunks[cache_key] = chunk_path_str
+        return chunk_path_str

     def load_reference_chunk(self, srt_file, chunk_idx):
         """
-        Load reference subtitles for a specific time chunk with
+        Load reference subtitles for a specific time chunk with caching.

         Args:
             srt_file (str or Path): Path to the SRT file
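Only the tail of the ffmpeg argument list is visible in this hunk (`-ar 16000`, `-ac 1`, the new `-y`, and the output path), so the seek and stream flags below are assumptions about the rest of `cmd`, shown only to give the fragment context:

```python
# Plausible shape of the full ffmpeg invocation; everything before
# "-ar" is an assumption, only the tail appears in the diff.
cmd = [
    "ffmpeg",
    "-ss", str(start_time),          # seek to the chunk start
    "-t", str(self.chunk_duration),  # one chunk's worth of audio
    "-i", str(mkv_file),             # input MKV
    "-vn",                           # drop the video stream
    "-ar", "16000",                  # 16 kHz sample rate, what Whisper expects
    "-ac", "1",                      # mono
    "-y",                            # overwrite without prompting
    str(chunk_path),
]
subprocess.run(cmd, capture_output=True)
```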
@@ -75,23 +120,48 @@ class EpisodeMatcher:
         Returns:
             str: Combined text from the subtitle chunk
         """
-        chunk_start = chunk_idx * self.chunk_duration
-        chunk_end = chunk_start + self.chunk_duration
-
         try:
-
-
-
-
-
-            text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
-
-            return " ".join(text_lines)
-
+            chunk_start = chunk_idx * self.chunk_duration
+            chunk_end = chunk_start + self.chunk_duration
+
+            return self.subtitle_cache.get_chunk(srt_file, chunk_idx, chunk_start, chunk_end)
+
         except Exception as e:
             logger.error(f"Error loading reference chunk from {srt_file}: {e}")
             return ""

+    def get_reference_files(self, season_number):
+        """Get reference subtitle files with caching."""
+        cache_key = (self.show_name, season_number)
+
+        if cache_key in self.reference_files_cache:
+            return self.reference_files_cache[cache_key]
+
+        reference_dir = self.cache_dir / "data" / self.show_name
+        patterns = [
+            f"S{season_number:02d}E",
+            f"S{season_number}E",
+            f"{season_number:02d}x",
+            f"{season_number}x",
+        ]
+
+        reference_files = []
+        for _pattern in patterns:
+            files = [
+                f
+                for f in reference_dir.glob("*.srt")
+                if any(
+                    re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
+                )
+            ]
+            reference_files.extend(files)
+
+        # Remove duplicates while preserving order
+        reference_files = list(dict.fromkeys(reference_files))
+
+        self.reference_files_cache[cache_key] = reference_files
+        return reference_files
+
     def _try_match_with_model(
         self, video_file, model_name, max_duration, reference_files
     ):
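Note that the comprehension's `any(...)` already tests every pattern, so each pass of the outer loop selects the same file set and the `dict.fromkeys()` call exists to collapse the four duplicate passes (the code removed in a later hunk even carried a TODO about the unused loop variable). A single-pass equivalent, for illustration only, not what the release ships:

```python
import re
from pathlib import Path

# Illustration only: one pass over the directory yields the same result
# as the shipped loop plus dict.fromkeys() dedup.
def find_reference_files(reference_dir: Path, patterns: list[str]) -> list[Path]:
    return [
        f
        for f in reference_dir.glob("*.srt")
        if any(re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns)
    ]
```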
@@ -108,7 +178,12 @@ class EpisodeMatcher:
         model = get_whisper_model(model_name, self.device)

         # Calculate number of chunks to check (30 seconds each)
-        num_chunks = max_duration // self.chunk_duration
+        num_chunks = min(max_duration // self.chunk_duration, 10)  # Limit to 10 chunks for initial check
+
+        # Pre-load all reference chunks for the chunks we'll check
+        for chunk_idx in range(num_chunks):
+            for ref_file in reference_files:
+                self.load_reference_chunk(ref_file, chunk_idx)

         for chunk_idx in range(num_chunks):
             start_time = chunk_idx * self.chunk_duration
@@ -128,14 +203,14 @@ class EpisodeMatcher:
                 confidence = self.chunk_score(chunk_text, ref_text)

                 if confidence > best_confidence:
-
+                    logger.debug(f"New best confidence: {confidence} for {ref_file}")
                     best_confidence = confidence
                     best_match = Path(ref_file)

                 if confidence > self.min_confidence:
                     print(f"Matched with {best_match} (confidence: {best_confidence:.2f})")
                     try:
-                        season,episode = extract_season_episode(best_match.stem)
+                        season, episode = extract_season_episode(best_match.stem)
                     except Exception as e:
                         print(f"Error extracting season/episode: {e}")
                         continue
@@ -157,54 +232,22 @@ class EpisodeMatcher:
     def identify_episode(self, video_file, temp_dir, season_number):
         """Progressive episode identification with faster initial attempt."""
         try:
-            # Get reference files first
-
-            patterns = [
-                f"S{season_number:02d}E",
-                f"S{season_number}E",
-                f"{season_number:02d}x",
-                f"{season_number}x",
-            ]
-
-            reference_files = []
-            # TODO Figure our why patterns is not being used
-            for _pattern in patterns:
-                files = [
-                    f
-                    for f in reference_dir.glob("*.srt")
-                    if any(
-                        re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
-                    )
-                ]
-                reference_files.extend(files)
-
-            reference_files = list(dict.fromkeys(reference_files))
+            # Get reference files first with caching
+            reference_files = self.get_reference_files(season_number)

             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
-
-
-
-                "-v",
-                "error",
-                "-show_entries",
-                "format=duration",
-                "-of",
-                "default=noprint_wrappers=1:nokey=1",
-                video_file,
-            ]).decode()
-            )
+
+            # Cache video duration
+            duration = get_video_duration(video_file)

-            duration = int(np.ceil(duration))
             # Try with tiny model first (fastest)
             logger.info("Attempting match with tiny model...")
             match = self._try_match_with_model(
-                video_file, "tiny", duration, reference_files
+                video_file, "tiny", min(duration, 300), reference_files  # Limit to first 5 minutes
             )
-            if (
-                match and match["confidence"] > 0.65
-            ):  # Slightly lower threshold for tiny
+            if match and match["confidence"] > 0.65:  # Slightly lower threshold for tiny
                 logger.info(
                     f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
                 )
@@ -212,10 +255,10 @@ class EpisodeMatcher:

             # If no match, try base model
             logger.info(
-                "No match
+                "No match with tiny model, extending base model search to 10 minutes..."
             )
             match = self._try_match_with_model(
-                video_file, "base", duration, reference_files
+                video_file, "base", min(duration, 600), reference_files  # Limit to first 10 minutes
             )
             if match:
                 logger.info(
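Taken together, the two hunks above settle on a tiered search: the tiny model scans at most the first 5 minutes and accepts at a slightly lower threshold, and only on a miss does the base model scan up to 10 minutes. A sketch of that control flow (the helper is illustrative, not the shipped method; the thresholds and caps are taken from the diff):

```python
def identify(matcher, video_file, reference_files, duration):
    # Fast pass: tiny model, at most the first 300 s.
    match = matcher._try_match_with_model(
        video_file, "tiny", min(duration, 300), reference_files
    )
    if match and match["confidence"] > 0.65:  # lower bar for the tiny model
        return match

    # Slower fallback: base model, at most the first 600 s.
    return matcher._try_match_with_model(
        video_file, "base", min(duration, 600), reference_files
    )
```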
@@ -227,12 +270,30 @@ class EpisodeMatcher:
             return None

         finally:
-            # Cleanup temp files
-            for
+            # Cleanup temp files - keep this limited to only files we know we created
+            for chunk_info in self.audio_chunks.values():
                 try:
-
+                    Path(chunk_info).unlink(missing_ok=True)
                 except Exception as e:
-                    logger.warning(f"Failed to delete temp file {
+                    logger.warning(f"Failed to delete temp file {chunk_info}: {e}")
+
+
+@lru_cache(maxsize=100)
+def get_video_duration(video_file):
+    """Get video duration with caching."""
+    duration = float(
+        subprocess.check_output([
+            "ffprobe",
+            "-v",
+            "error",
+            "-show_entries",
+            "format=duration",
+            "-of",
+            "default=noprint_wrappers=1:nokey=1",
+            video_file,
+        ]).decode()
+    )
+    return int(np.ceil(duration))


 def detect_file_encoding(file_path):
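The ffprobe call moves out of `identify_episode` into a module-level `get_video_duration` memoized with `functools.lru_cache`, so probing the same file repeatedly costs a single subprocess. The cache keys on the argument, which must be hashable (paths below are hypothetical):

```python
# Behavior the @lru_cache decorator provides:
d1 = get_video_duration("/shows/S01E01.mkv")  # runs ffprobe
d2 = get_video_duration("/shows/S01E01.mkv")  # cache hit, no subprocess
assert d1 == d2
print(get_video_duration.cache_info())  # hits=1, misses=1, maxsize=100

# Caveat: Path("/shows/S01E01.mkv") and the equivalent str compare
# unequal, so they occupy separate cache entries.
```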
@@ -247,7 +308,7 @@ def detect_file_encoding(file_path):
     """
     try:
         with open(file_path, "rb") as f:
-            raw_data = f.read()
+            raw_data = f.read(min(1024 * 1024, Path(file_path).stat().st_size))  # Read up to 1MB
         result = chardet.detect(raw_data)
         encoding = result["encoding"]
         confidence = result["confidence"]
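Capping the `chardet` sample at 1 MB avoids reading multi-gigabyte files just to sniff an encoding. Since `f.read(n)` already stops at end-of-file, the `stat()` bound is defensive rather than required; a minimal equivalent:

```python
# Equivalent sketch: read(n) returns fewer bytes for small files anyway.
with open(file_path, "rb") as f:
    raw_data = f.read(1024 * 1024)  # at most 1 MB
```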
@@ -261,6 +322,7 @@ def detect_file_encoding(file_path):
         return "utf-8"


+@lru_cache(maxsize=100)
 def read_file_with_fallback(file_path, encodings=None):
     """
     Read a file trying multiple encodings in order of preference.
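Because `read_file_with_fallback` is now wrapped in `@lru_cache`, its arguments become cache keys and must be hashable, and the cached text will not reflect later edits to the file within the same process. Assuming the function iterates over `encodings` (the path below is hypothetical), a tuple works where a list would not:

```python
content = read_file_with_fallback("episode.srt")                       # cached by path
content = read_file_with_fallback("episode.srt", ("utf-8", "cp1252"))  # tuple: hashable
# read_file_with_fallback("episode.srt", ["utf-8"])
#   -> TypeError: unhashable type: 'list'
```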
@@ -344,12 +406,16 @@ class SubtitleReader:

         try:
             timestamp = lines[1]
-
-
-            end_stamp =
-
-
-
+            time_parts = timestamp.split(" --> ")
+            start_stamp = time_parts[0].strip()
+            end_stamp = time_parts[1].strip()
+
+            subtitle_start = SubtitleReader.parse_timestamp(start_stamp)
+            subtitle_end = SubtitleReader.parse_timestamp(end_stamp)
+
+            # Check if this subtitle overlaps with our chunk
+            if subtitle_end >= start_time and subtitle_start <= end_time:
+                text = " ".join(lines[2:])
                 text_lines.append(text)

         except (IndexError, ValueError) as e:
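The rewritten parsing splits each cue's `start --> end` stamp and keeps any cue that overlaps the requested chunk window at all, rather than requiring full containment. A worked example, assuming `parse_timestamp` returns seconds (as its usage here implies):

```python
start_time, end_time = 30, 60  # chunk window in seconds

# Cue straddling the window's edge: 58.5 -> 61.0
print(61.0 >= start_time and 58.5 <= end_time)  # True  -> cue is kept

# Cue entirely after the window: 61.0 -> 63.0
print(63.0 >= start_time and 61.0 <= end_time)  # False -> cue is skipped
```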
@@ -359,9 +425,9 @@ class SubtitleReader:
         return text_lines


+# Global whisper model cache with better cache key
 _whisper_models = {}

-
 def get_whisper_model(model_name="tiny", device=None):
     """Cache whisper models to avoid reloading."""
     global _whisper_models
@@ -373,4 +439,4 @@ def get_whisper_model(model_name="tiny", device=None):
     _whisper_models[key] = whisper.load_model(model_name, device=device)
     logger.info(f"Loaded {model_name} model on {device}")

-    return _whisper_models[key]
+    return _whisper_models[key]
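`get_whisper_model` keeps loaded models in the module-level `_whisper_models` dict, so repeated calls return the same instance instead of reloading weights. The exact key shape is not fully visible in this diff (the new comment only promises a "better cache key"), but the observable behavior is:

```python
# Requires openai-whisper; the first call downloads/loads the weights.
model_a = get_whisper_model("tiny", "cpu")  # loads the model once
model_b = get_whisper_model("tiny", "cpu")  # same cached instance
assert model_a is model_b

model_c = get_whisper_model("base", "cpu")  # different key, separate load
```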
{mkv_episode_matcher-0.7.2.dist-info → mkv_episode_matcher-0.8.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mkv-episode-matcher
-Version: 0.7.2
+Version: 0.8.0
 Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
 Home-page: https://github.com/Jsakkos/mkv-episode-matcher
 Author: Jonathan Sakkos
{mkv_episode_matcher-0.7.2.dist-info → mkv_episode_matcher-0.8.0.dist-info}/RECORD
CHANGED

@@ -2,13 +2,13 @@ mkv_episode_matcher/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb
 mkv_episode_matcher/__init__.py,sha256=u3yZcpuK0ICeUjxYKePvW-zS61E5ss5q2AvqnSHuz9E,240
 mkv_episode_matcher/__main__.py,sha256=O3GQk5R9BFuA-QNlqfBgDSS7G_W8IGSxiV8CFUbcaLc,10059
 mkv_episode_matcher/config.py,sha256=EcJJjkekQ7oWtarUkufCYON_QWbQvq55-zMqCTOqSa4,2265
-mkv_episode_matcher/episode_identification.py,sha256=
+mkv_episode_matcher/episode_identification.py,sha256=IMB1m3-oY4Z31XIWCFjpdXDENwmKMgzjctl3CilthJ4,15926
 mkv_episode_matcher/episode_matcher.py,sha256=SxAbnXuTJITD1o0WohE9heE3Fm9zW_w0Nq3GzqtcIpQ,6329
 mkv_episode_matcher/subtitle_utils.py,sha256=Hz9b4CKPV07YKTY4dcN3WbvdbvH-S3J4zcb9CiyvPlE,2551
 mkv_episode_matcher/tmdb_client.py,sha256=LbMCgjmp7sCbrQo_CDlpcnryKPz5S7inE24YY9Pyjk4,4172
 mkv_episode_matcher/utils.py,sha256=modXMLmt2fpny8liXwqe4ylxnwwfg_98OLOacv5izps,14501
-mkv_episode_matcher-0.7.2.dist-info/METADATA,sha256=
-mkv_episode_matcher-0.7.2.dist-info/WHEEL,sha256=
-mkv_episode_matcher-0.7.2.dist-info/entry_points.txt,sha256=
-mkv_episode_matcher-0.7.2.dist-info/top_level.txt,sha256=
-mkv_episode_matcher-0.7.2.dist-info/RECORD,,
+mkv_episode_matcher-0.8.0.dist-info/METADATA,sha256=TcH5g5UfyJop2ZV_tWShEm4O28EkVGLlcpOXbG74mjI,5384
+mkv_episode_matcher-0.8.0.dist-info/WHEEL,sha256=7ciDxtlje1X8OhobNuGgi1t-ACdFSelPnSmDPrtlobY,91
+mkv_episode_matcher-0.8.0.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
+mkv_episode_matcher-0.8.0.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
+mkv_episode_matcher-0.8.0.dist-info/RECORD,,
{mkv_episode_matcher-0.7.2.dist-info → mkv_episode_matcher-0.8.0.dist-info}/entry_points.txt
RENAMED
File without changes

{mkv_episode_matcher-0.7.2.dist-info → mkv_episode_matcher-0.8.0.dist-info}/top_level.txt
RENAMED
File without changes