mkv-episode-matcher 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic. Click here for more details.

@@ -10,9 +10,40 @@ import torch
10
10
  import whisper
11
11
  from loguru import logger
12
12
  from rapidfuzz import fuzz
13
+ from mkv_episode_matcher.utils import extract_season_episode
14
+ from functools import lru_cache
13
15
 
14
16
  console = Console()
15
17
 
18
+ class SubtitleCache:
19
+ """Cache for storing parsed subtitle data to avoid repeated loading and parsing."""
20
+
21
+ def __init__(self):
22
+ self.subtitles = {} # {file_path: parsed_content}
23
+ self.chunk_cache = {} # {(file_path, chunk_idx): text}
24
+
25
+ def get_subtitle_content(self, srt_file):
26
+ """Get the full content of a subtitle file, loading it only once."""
27
+ srt_file = str(srt_file)
28
+ if srt_file not in self.subtitles:
29
+ reader = SubtitleReader()
30
+ self.subtitles[srt_file] = reader.read_srt_file(srt_file)
31
+ return self.subtitles[srt_file]
32
+
33
+ def get_chunk(self, srt_file, chunk_idx, chunk_start, chunk_end):
34
+ """Get a specific time chunk from a subtitle file, with caching."""
35
+ srt_file = str(srt_file)
36
+ cache_key = (srt_file, chunk_idx)
37
+
38
+ if cache_key not in self.chunk_cache:
39
+ content = self.get_subtitle_content(srt_file)
40
+ reader = SubtitleReader()
41
+ text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
42
+ self.chunk_cache[cache_key] = " ".join(text_lines)
43
+
44
+ return self.chunk_cache[cache_key]
45
+
46
+
16
47
  class EpisodeMatcher:
17
48
  def __init__(self, cache_dir, show_name, min_confidence=0.6):
18
49
  self.cache_dir = Path(cache_dir)
@@ -22,6 +53,12 @@ class EpisodeMatcher:
22
53
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
23
54
  self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
24
55
  self.temp_dir.mkdir(exist_ok=True)
56
+ # Initialize subtitle cache
57
+ self.subtitle_cache = SubtitleCache()
58
+ # Cache for extracted audio chunks
59
+ self.audio_chunks = {}
60
+ # Store reference files to avoid repeated glob operations
61
+ self.reference_files_cache = {}
25
62
 
26
63
  def clean_text(self, text):
27
64
  text = text.lower().strip()
@@ -38,7 +75,12 @@ class EpisodeMatcher:
38
75
  ) / 100.0
39
76
 
40
77
  def extract_audio_chunk(self, mkv_file, start_time):
41
- """Extract a chunk of audio from MKV file."""
78
+ """Extract a chunk of audio from MKV file with caching."""
79
+ cache_key = (str(mkv_file), start_time)
80
+
81
+ if cache_key in self.audio_chunks:
82
+ return self.audio_chunks[cache_key]
83
+
42
84
  chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
43
85
  if not chunk_path.exists():
44
86
  cmd = [
@@ -58,14 +100,18 @@ class EpisodeMatcher:
58
100
  "16000",
59
101
  "-ac",
60
102
  "1",
103
+ "-y", # Overwrite output files without asking
61
104
  str(chunk_path),
62
105
  ]
63
106
  subprocess.run(cmd, capture_output=True)
64
- return str(chunk_path)
107
+
108
+ chunk_path_str = str(chunk_path)
109
+ self.audio_chunks[cache_key] = chunk_path_str
110
+ return chunk_path_str
65
111
 
66
112
  def load_reference_chunk(self, srt_file, chunk_idx):
67
113
  """
68
- Load reference subtitles for a specific time chunk with robust encoding handling.
114
+ Load reference subtitles for a specific time chunk with caching.
69
115
 
70
116
  Args:
71
117
  srt_file (str or Path): Path to the SRT file
@@ -74,23 +120,48 @@ class EpisodeMatcher:
74
120
  Returns:
75
121
  str: Combined text from the subtitle chunk
76
122
  """
77
- chunk_start = chunk_idx * self.chunk_duration
78
- chunk_end = chunk_start + self.chunk_duration
79
-
80
123
  try:
81
- # Read the file content using our robust reader
82
- reader = SubtitleReader()
83
- content = reader.read_srt_file(srt_file)
84
-
85
- # Extract subtitles for the time chunk
86
- text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
87
-
88
- return " ".join(text_lines)
89
-
124
+ chunk_start = chunk_idx * self.chunk_duration
125
+ chunk_end = chunk_start + self.chunk_duration
126
+
127
+ return self.subtitle_cache.get_chunk(srt_file, chunk_idx, chunk_start, chunk_end)
128
+
90
129
  except Exception as e:
91
130
  logger.error(f"Error loading reference chunk from {srt_file}: {e}")
92
131
  return ""
93
132
 
133
+ def get_reference_files(self, season_number):
134
+ """Get reference subtitle files with caching."""
135
+ cache_key = (self.show_name, season_number)
136
+
137
+ if cache_key in self.reference_files_cache:
138
+ return self.reference_files_cache[cache_key]
139
+
140
+ reference_dir = self.cache_dir / "data" / self.show_name
141
+ patterns = [
142
+ f"S{season_number:02d}E",
143
+ f"S{season_number}E",
144
+ f"{season_number:02d}x",
145
+ f"{season_number}x",
146
+ ]
147
+
148
+ reference_files = []
149
+ for _pattern in patterns:
150
+ files = [
151
+ f
152
+ for f in reference_dir.glob("*.srt")
153
+ if any(
154
+ re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
155
+ )
156
+ ]
157
+ reference_files.extend(files)
158
+
159
+ # Remove duplicates while preserving order
160
+ reference_files = list(dict.fromkeys(reference_files))
161
+
162
+ self.reference_files_cache[cache_key] = reference_files
163
+ return reference_files
164
+
94
165
  def _try_match_with_model(
95
166
  self, video_file, model_name, max_duration, reference_files
96
167
  ):
@@ -107,7 +178,12 @@ class EpisodeMatcher:
107
178
  model = get_whisper_model(model_name, self.device)
108
179
 
109
180
  # Calculate number of chunks to check (30 seconds each)
110
- num_chunks = max_duration // self.chunk_duration
181
+ num_chunks = min(max_duration // self.chunk_duration, 10) # Limit to 10 chunks for initial check
182
+
183
+ # Pre-load all reference chunks for the chunks we'll check
184
+ for chunk_idx in range(num_chunks):
185
+ for ref_file in reference_files:
186
+ self.load_reference_chunk(ref_file, chunk_idx)
111
187
 
112
188
  for chunk_idx in range(num_chunks):
113
189
  start_time = chunk_idx * self.chunk_duration
@@ -127,13 +203,19 @@ class EpisodeMatcher:
127
203
  confidence = self.chunk_score(chunk_text, ref_text)
128
204
 
129
205
  if confidence > best_confidence:
206
+ logger.debug(f"New best confidence: {confidence} for {ref_file}")
130
207
  best_confidence = confidence
131
- best_match = ref_file
208
+ best_match = Path(ref_file)
132
209
 
133
210
  if confidence > self.min_confidence:
134
- season_ep = re.search(r"S(\d+)E(\d+)", best_match.stem)
135
- if season_ep:
136
- season, episode = map(int, season_ep.groups())
211
+ print(f"Matched with {best_match} (confidence: {best_confidence:.2f})")
212
+ try:
213
+ season, episode = extract_season_episode(best_match.stem)
214
+ except Exception as e:
215
+ print(f"Error extracting season/episode: {e}")
216
+ continue
217
+ print(f"Season: {season}, Episode: {episode} (confidence: {best_confidence:.2f})")
218
+ if season and episode:
137
219
  return {
138
220
  "season": season,
139
221
  "episode": episode,
@@ -150,54 +232,22 @@ class EpisodeMatcher:
150
232
  def identify_episode(self, video_file, temp_dir, season_number):
151
233
  """Progressive episode identification with faster initial attempt."""
152
234
  try:
153
- # Get reference files first
154
- reference_dir = self.cache_dir / "data" / self.show_name
155
- patterns = [
156
- f"S{season_number:02d}E",
157
- f"S{season_number}E",
158
- f"{season_number:02d}x",
159
- f"{season_number}x",
160
- ]
161
-
162
- reference_files = []
163
- # TODO Figure our why patterns is not being used
164
- for _pattern in patterns:
165
- files = [
166
- f
167
- for f in reference_dir.glob("*.srt")
168
- if any(
169
- re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
170
- )
171
- ]
172
- reference_files.extend(files)
173
-
174
- reference_files = list(dict.fromkeys(reference_files))
235
+ # Get reference files first with caching
236
+ reference_files = self.get_reference_files(season_number)
175
237
 
176
238
  if not reference_files:
177
239
  logger.error(f"No reference files found for season {season_number}")
178
240
  return None
179
- duration = float(
180
- subprocess.check_output([
181
- "ffprobe",
182
- "-v",
183
- "error",
184
- "-show_entries",
185
- "format=duration",
186
- "-of",
187
- "default=noprint_wrappers=1:nokey=1",
188
- video_file,
189
- ]).decode()
190
- )
241
+
242
+ # Cache video duration
243
+ duration = get_video_duration(video_file)
191
244
 
192
- duration = int(np.ceil(duration))
193
245
  # Try with tiny model first (fastest)
194
246
  logger.info("Attempting match with tiny model...")
195
247
  match = self._try_match_with_model(
196
- video_file, "tiny", duration, reference_files
248
+ video_file, "tiny", min(duration, 300), reference_files # Limit to first 5 minutes
197
249
  )
198
- if (
199
- match and match["confidence"] > 0.65
200
- ): # Slightly lower threshold for tiny
250
+ if match and match["confidence"] > 0.65: # Slightly lower threshold for tiny
201
251
  logger.info(
202
252
  f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
203
253
  )
@@ -205,10 +255,10 @@ class EpisodeMatcher:
205
255
 
206
256
  # If no match, try base model
207
257
  logger.info(
208
- "No match in first 3 minutes, extending base model search to 10 minutes..."
258
+ "No match with tiny model, extending base model search to 10 minutes..."
209
259
  )
210
260
  match = self._try_match_with_model(
211
- video_file, "base", duration, reference_files
261
+ video_file, "base", min(duration, 600), reference_files # Limit to first 10 minutes
212
262
  )
213
263
  if match:
214
264
  logger.info(
@@ -220,12 +270,30 @@ class EpisodeMatcher:
220
270
  return None
221
271
 
222
272
  finally:
223
- # Cleanup temp files
224
- for file in self.temp_dir.glob("chunk_*.wav"):
273
+ # Cleanup temp files - keep this limited to only files we know we created
274
+ for chunk_info in self.audio_chunks.values():
225
275
  try:
226
- file.unlink()
276
+ Path(chunk_info).unlink(missing_ok=True)
227
277
  except Exception as e:
228
- logger.warning(f"Failed to delete temp file {file}: {e}")
278
+ logger.warning(f"Failed to delete temp file {chunk_info}: {e}")
279
+
280
+
281
+ @lru_cache(maxsize=100)
282
+ def get_video_duration(video_file):
283
+ """Get video duration with caching."""
284
+ duration = float(
285
+ subprocess.check_output([
286
+ "ffprobe",
287
+ "-v",
288
+ "error",
289
+ "-show_entries",
290
+ "format=duration",
291
+ "-of",
292
+ "default=noprint_wrappers=1:nokey=1",
293
+ video_file,
294
+ ]).decode()
295
+ )
296
+ return int(np.ceil(duration))
229
297
 
230
298
 
231
299
  def detect_file_encoding(file_path):
@@ -240,7 +308,7 @@ def detect_file_encoding(file_path):
240
308
  """
241
309
  try:
242
310
  with open(file_path, "rb") as f:
243
- raw_data = f.read()
311
+ raw_data = f.read(min(1024 * 1024, Path(file_path).stat().st_size)) # Read up to 1MB
244
312
  result = chardet.detect(raw_data)
245
313
  encoding = result["encoding"]
246
314
  confidence = result["confidence"]
@@ -254,6 +322,7 @@ def detect_file_encoding(file_path):
254
322
  return "utf-8"
255
323
 
256
324
 
325
+ @lru_cache(maxsize=100)
257
326
  def read_file_with_fallback(file_path, encodings=None):
258
327
  """
259
328
  Read a file trying multiple encodings in order of preference.
@@ -337,12 +406,16 @@ class SubtitleReader:
337
406
 
338
407
  try:
339
408
  timestamp = lines[1]
340
- text = " ".join(lines[2:])
341
-
342
- end_stamp = timestamp.split(" --> ")[1].strip()
343
- total_seconds = SubtitleReader.parse_timestamp(end_stamp)
344
-
345
- if start_time <= total_seconds <= end_time:
409
+ time_parts = timestamp.split(" --> ")
410
+ start_stamp = time_parts[0].strip()
411
+ end_stamp = time_parts[1].strip()
412
+
413
+ subtitle_start = SubtitleReader.parse_timestamp(start_stamp)
414
+ subtitle_end = SubtitleReader.parse_timestamp(end_stamp)
415
+
416
+ # Check if this subtitle overlaps with our chunk
417
+ if subtitle_end >= start_time and subtitle_start <= end_time:
418
+ text = " ".join(lines[2:])
346
419
  text_lines.append(text)
347
420
 
348
421
  except (IndexError, ValueError) as e:
@@ -352,9 +425,9 @@ class SubtitleReader:
352
425
  return text_lines
353
426
 
354
427
 
428
+ # Global whisper model cache with better cache key
355
429
  _whisper_models = {}
356
430
 
357
-
358
431
  def get_whisper_model(model_name="tiny", device=None):
359
432
  """Cache whisper models to avoid reloading."""
360
433
  global _whisper_models
@@ -366,4 +439,4 @@ def get_whisper_model(model_name="tiny", device=None):
366
439
  _whisper_models[key] = whisper.load_model(model_name, device=device)
367
440
  logger.info(f"Loaded {model_name} model on {device}")
368
441
 
369
- return _whisper_models[key]
442
+ return _whisper_models[key]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mkv-episode-matcher
3
- Version: 0.7.1
3
+ Version: 0.8.0
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -2,13 +2,13 @@ mkv_episode_matcher/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb
2
2
  mkv_episode_matcher/__init__.py,sha256=u3yZcpuK0ICeUjxYKePvW-zS61E5ss5q2AvqnSHuz9E,240
3
3
  mkv_episode_matcher/__main__.py,sha256=O3GQk5R9BFuA-QNlqfBgDSS7G_W8IGSxiV8CFUbcaLc,10059
4
4
  mkv_episode_matcher/config.py,sha256=EcJJjkekQ7oWtarUkufCYON_QWbQvq55-zMqCTOqSa4,2265
5
- mkv_episode_matcher/episode_identification.py,sha256=r75AGVSQPdJwOZC1PkyPh89OCyjqDhpMbMuh0J3KWDY,12531
5
+ mkv_episode_matcher/episode_identification.py,sha256=IMB1m3-oY4Z31XIWCFjpdXDENwmKMgzjctl3CilthJ4,15926
6
6
  mkv_episode_matcher/episode_matcher.py,sha256=SxAbnXuTJITD1o0WohE9heE3Fm9zW_w0Nq3GzqtcIpQ,6329
7
7
  mkv_episode_matcher/subtitle_utils.py,sha256=Hz9b4CKPV07YKTY4dcN3WbvdbvH-S3J4zcb9CiyvPlE,2551
8
8
  mkv_episode_matcher/tmdb_client.py,sha256=LbMCgjmp7sCbrQo_CDlpcnryKPz5S7inE24YY9Pyjk4,4172
9
9
  mkv_episode_matcher/utils.py,sha256=modXMLmt2fpny8liXwqe4ylxnwwfg_98OLOacv5izps,14501
10
- mkv_episode_matcher-0.7.1.dist-info/METADATA,sha256=aldFGWAmo9NdKCgkrxI5bZB-5IwvJt78OlWXVB9xzBs,5384
11
- mkv_episode_matcher-0.7.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
12
- mkv_episode_matcher-0.7.1.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
13
- mkv_episode_matcher-0.7.1.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
14
- mkv_episode_matcher-0.7.1.dist-info/RECORD,,
10
+ mkv_episode_matcher-0.8.0.dist-info/METADATA,sha256=TcH5g5UfyJop2ZV_tWShEm4O28EkVGLlcpOXbG74mjI,5384
11
+ mkv_episode_matcher-0.8.0.dist-info/WHEEL,sha256=7ciDxtlje1X8OhobNuGgi1t-ACdFSelPnSmDPrtlobY,91
12
+ mkv_episode_matcher-0.8.0.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
13
+ mkv_episode_matcher-0.8.0.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
14
+ mkv_episode_matcher-0.8.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (78.1.0)
2
+ Generator: setuptools (80.2.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5