mkv-episode-matcher 0.7.2__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mkv-episode-matcher might be problematic.

Files changed (42)
  1. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.coverage +0 -0
  2. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/PKG-INFO +1 -1
  3. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher/episode_identification.py +139 -73
  4. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher.egg-info/PKG-INFO +1 -1
  5. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/setup.cfg +1 -1
  6. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.gitattributes +0 -0
  7. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.github/funding.yml +0 -0
  8. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.github/workflows/documentation.yml +0 -0
  9. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.github/workflows/python-publish.yml +0 -0
  10. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.github/workflows/tests.yml +0 -0
  11. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.gitignore +0 -0
  12. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.gitmodules +0 -0
  13. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.python-version +0 -0
  14. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/.vscode/settings.json +0 -0
  15. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/CHANGELOG.md +0 -0
  16. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/README.md +0 -0
  17. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/docs/api/index.md +0 -0
  18. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/docs/changelog.md +0 -0
  19. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/docs/cli.md +0 -0
  20. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/docs/configuration.md +0 -0
  21. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/docs/installation.md +0 -0
  22. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/docs/quickstart.md +0 -0
  23. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/docs/tips.md +0 -0
  24. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkdocs.yml +0 -0
  25. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher/.gitattributes +0 -0
  26. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher/__init__.py +0 -0
  27. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher/__main__.py +0 -0
  28. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher/config.py +0 -0
  29. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher/episode_matcher.py +0 -0
  30. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher/subtitle_utils.py +0 -0
  31. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher/tmdb_client.py +0 -0
  32. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher/utils.py +0 -0
  33. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher.egg-info/SOURCES.txt +0 -0
  34. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher.egg-info/dependency_links.txt +0 -0
  35. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher.egg-info/entry_points.txt +0 -0
  36. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher.egg-info/requires.txt +0 -0
  37. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/mkv_episode_matcher.egg-info/top_level.txt +0 -0
  38. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/pyproject.toml +0 -0
  39. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/setup.py +0 -0
  40. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/tests/__init__.py +0 -0
  41. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/tests/test_main.py +0 -0
  42. {mkv_episode_matcher-0.7.2 → mkv_episode_matcher-0.8.0}/uv.lock +0 -0
--- mkv_episode_matcher-0.7.2/PKG-INFO
+++ mkv_episode_matcher-0.8.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mkv-episode-matcher
-Version: 0.7.2
+Version: 0.8.0
 Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
 Home-page: https://github.com/Jsakkos/mkv-episode-matcher
 Author: Jonathan Sakkos
--- mkv_episode_matcher-0.7.2/mkv_episode_matcher/episode_identification.py
+++ mkv_episode_matcher-0.8.0/mkv_episode_matcher/episode_identification.py
@@ -10,10 +10,40 @@ import torch
 import whisper
 from loguru import logger
 from rapidfuzz import fuzz
-from utils import extract_season_episode
+from mkv_episode_matcher.utils import extract_season_episode
+from functools import lru_cache
 
 console = Console()
 
+class SubtitleCache:
+    """Cache for storing parsed subtitle data to avoid repeated loading and parsing."""
+
+    def __init__(self):
+        self.subtitles = {}  # {file_path: parsed_content}
+        self.chunk_cache = {}  # {(file_path, chunk_idx): text}
+
+    def get_subtitle_content(self, srt_file):
+        """Get the full content of a subtitle file, loading it only once."""
+        srt_file = str(srt_file)
+        if srt_file not in self.subtitles:
+            reader = SubtitleReader()
+            self.subtitles[srt_file] = reader.read_srt_file(srt_file)
+        return self.subtitles[srt_file]
+
+    def get_chunk(self, srt_file, chunk_idx, chunk_start, chunk_end):
+        """Get a specific time chunk from a subtitle file, with caching."""
+        srt_file = str(srt_file)
+        cache_key = (srt_file, chunk_idx)
+
+        if cache_key not in self.chunk_cache:
+            content = self.get_subtitle_content(srt_file)
+            reader = SubtitleReader()
+            text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
+            self.chunk_cache[cache_key] = " ".join(text_lines)
+
+        return self.chunk_cache[cache_key]
+
+
 class EpisodeMatcher:
     def __init__(self, cache_dir, show_name, min_confidence=0.6):
         self.cache_dir = Path(cache_dir)
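
The new SubtitleCache is a two-level memoization: each .srt file is parsed once, and each (file, chunk) pair is joined into a text blob once. Below is a minimal, self-contained sketch of the same pattern; the parse and chunk callables are stand-ins, not the project's real SubtitleReader API.

class TinySubtitleCache:
    """Illustrative two-level cache: one entry per file, one per (file, chunk)."""

    def __init__(self, parse_fn, chunk_fn):
        self.parse_fn = parse_fn      # stand-in for SubtitleReader().read_srt_file
        self.chunk_fn = chunk_fn      # stand-in for SubtitleReader.extract_subtitle_chunk
        self.parsed = {}
        self.chunks = {}

    def get_chunk(self, srt_file, chunk_idx, start, end):
        srt_file = str(srt_file)
        if srt_file not in self.parsed:                  # parse each file only once
            self.parsed[srt_file] = self.parse_fn(srt_file)
        key = (srt_file, chunk_idx)
        if key not in self.chunks:                       # join each chunk only once
            self.chunks[key] = " ".join(self.chunk_fn(self.parsed[srt_file], start, end))
        return self.chunks[key]

cache = TinySubtitleCache(
    parse_fn=lambda path: f"<parsed {path}>",
    chunk_fn=lambda content, start, end: [content, f"{start}-{end}s"],
)
print(cache.get_chunk("S01E01.srt", 0, 0, 30))   # builds the entry
print(cache.get_chunk("S01E01.srt", 0, 0, 30))   # served from cache, no re-parse
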
@@ -23,6 +53,12 @@ class EpisodeMatcher:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
+        # Initialize subtitle cache
+        self.subtitle_cache = SubtitleCache()
+        # Cache for extracted audio chunks
+        self.audio_chunks = {}
+        # Store reference files to avoid repeated glob operations
+        self.reference_files_cache = {}
 
     def clean_text(self, text):
         text = text.lower().strip()
@@ -39,7 +75,12 @@ class EpisodeMatcher:
         ) / 100.0
 
     def extract_audio_chunk(self, mkv_file, start_time):
-        """Extract a chunk of audio from MKV file."""
+        """Extract a chunk of audio from MKV file with caching."""
+        cache_key = (str(mkv_file), start_time)
+
+        if cache_key in self.audio_chunks:
+            return self.audio_chunks[cache_key]
+
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
             cmd = [
@@ -59,14 +100,18 @@
                 "16000",
                 "-ac",
                 "1",
+                "-y",  # Overwrite output files without asking
                 str(chunk_path),
             ]
             subprocess.run(cmd, capture_output=True)
-        return str(chunk_path)
+
+        chunk_path_str = str(chunk_path)
+        self.audio_chunks[cache_key] = chunk_path_str
+        return chunk_path_str
 
     def load_reference_chunk(self, srt_file, chunk_idx):
         """
-        Load reference subtitles for a specific time chunk with robust encoding handling.
+        Load reference subtitles for a specific time chunk with caching.
 
         Args:
             srt_file (str or Path): Path to the SRT file
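
Only the tail of the ffmpeg argument list is visible in this hunk. A comparable standalone invocation for cutting a 16 kHz mono WAV chunk might look like the sketch below; the seek and duration flags and the 30-second chunk length are assumptions, not taken verbatim from the project.

import subprocess
import tempfile
from pathlib import Path

def extract_wav_chunk(mkv_file, start_time, duration=30):
    """Hypothetical helper mirroring the kind of ffmpeg call shown above."""
    chunk_path = Path(tempfile.gettempdir()) / f"chunk_{start_time}.wav"
    cmd = [
        "ffmpeg",
        "-ss", str(start_time),   # seek to the chunk start
        "-t", str(duration),      # keep only this many seconds
        "-i", str(mkv_file),      # input MKV
        "-vn",                    # drop the video stream
        "-ar", "16000",           # 16 kHz sample rate, as in the diff
        "-ac", "1",               # mono, as in the diff
        "-y",                     # overwrite without prompting (the flag added in 0.8.0)
        str(chunk_path),
    ]
    subprocess.run(cmd, capture_output=True)
    return str(chunk_path)
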
@@ -75,23 +120,48 @@
         Returns:
             str: Combined text from the subtitle chunk
         """
-        chunk_start = chunk_idx * self.chunk_duration
-        chunk_end = chunk_start + self.chunk_duration
-
         try:
-            # Read the file content using our robust reader
-            reader = SubtitleReader()
-            content = reader.read_srt_file(srt_file)
-
-            # Extract subtitles for the time chunk
-            text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
-
-            return " ".join(text_lines)
-
+            chunk_start = chunk_idx * self.chunk_duration
+            chunk_end = chunk_start + self.chunk_duration
+
+            return self.subtitle_cache.get_chunk(srt_file, chunk_idx, chunk_start, chunk_end)
+
         except Exception as e:
             logger.error(f"Error loading reference chunk from {srt_file}: {e}")
             return ""
 
+    def get_reference_files(self, season_number):
+        """Get reference subtitle files with caching."""
+        cache_key = (self.show_name, season_number)
+
+        if cache_key in self.reference_files_cache:
+            return self.reference_files_cache[cache_key]
+
+        reference_dir = self.cache_dir / "data" / self.show_name
+        patterns = [
+            f"S{season_number:02d}E",
+            f"S{season_number}E",
+            f"{season_number:02d}x",
+            f"{season_number}x",
+        ]
+
+        reference_files = []
+        for _pattern in patterns:
+            files = [
+                f
+                for f in reference_dir.glob("*.srt")
+                if any(
+                    re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
+                )
+            ]
+            reference_files.extend(files)
+
+        # Remove duplicates while preserving order
+        reference_files = list(dict.fromkeys(reference_files))
+
+        self.reference_files_cache[cache_key] = reference_files
+        return reference_files
+
     def _try_match_with_model(
         self, video_file, model_name, max_duration, reference_files
     ):
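
To illustrate how the season patterns in get_reference_files select files, here is a standalone example (not project code); note that the inner any() checks every pattern regardless of the loop variable, exactly as written above.

import re

season_number = 1
patterns = [
    f"S{season_number:02d}E",   # S01E
    f"S{season_number}E",       # S1E
    f"{season_number:02d}x",    # 01x
    f"{season_number}x",        # 1x
]

for name in ["Show.S01E03.srt", "Show.1x03.srt", "Show.S02E01.srt", "notes.srt"]:
    # Keep the file if any pattern followed by digits matches, case-insensitively.
    keep = any(re.search(f"{p}\\d+", name, re.IGNORECASE) for p in patterns)
    print(f"{name}: {keep}")
# Show.S01E03.srt: True, Show.1x03.srt: True, Show.S02E01.srt: False, notes.srt: False
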
@@ -108,7 +178,12 @@
         model = get_whisper_model(model_name, self.device)
 
         # Calculate number of chunks to check (30 seconds each)
-        num_chunks = max_duration // self.chunk_duration
+        num_chunks = min(max_duration // self.chunk_duration, 10)  # Limit to 10 chunks for initial check
+
+        # Pre-load all reference chunks for the chunks we'll check
+        for chunk_idx in range(num_chunks):
+            for ref_file in reference_files:
+                self.load_reference_chunk(ref_file, chunk_idx)
 
         for chunk_idx in range(num_chunks):
             start_time = chunk_idx * self.chunk_duration
@@ -128,14 +203,14 @@
                 confidence = self.chunk_score(chunk_text, ref_text)
 
                 if confidence > best_confidence:
-                    print(f"New best confidence: {confidence} for {ref_file}")
+                    logger.debug(f"New best confidence: {confidence} for {ref_file}")
                     best_confidence = confidence
                     best_match = Path(ref_file)
 
                 if confidence > self.min_confidence:
                     print(f"Matched with {best_match} (confidence: {best_confidence:.2f})")
                     try:
-                        season,episode = extract_season_episode(best_match.stem)
+                        season, episode = extract_season_episode(best_match.stem)
                     except Exception as e:
                         print(f"Error extracting season/episode: {e}")
                         continue
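
For context on the confidence values compared here: rapidfuzz scorers return values on a 0-100 scale, and the ") / 100.0" seen in an earlier hunk suggests chunk_score normalizes that to 0-1 before comparing against min_confidence. The tiny standalone example below uses partial_ratio as an assumption; the exact scorer chunk_score uses is not visible in this diff.

from rapidfuzz import fuzz

transcribed = "it was the best of times it was the worst of times"
reference = "It was the best of times, it was the worst of times, it was the age of wisdom."

# partial_ratio tolerates the reference block being longer than the transcript.
confidence = fuzz.partial_ratio(transcribed.lower(), reference.lower()) / 100.0
print(f"confidence ~ {confidence:.2f}")   # close to 1.0 for overlapping text
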
@@ -157,54 +232,22 @@
     def identify_episode(self, video_file, temp_dir, season_number):
         """Progressive episode identification with faster initial attempt."""
         try:
-            # Get reference files first
-            reference_dir = self.cache_dir / "data" / self.show_name
-            patterns = [
-                f"S{season_number:02d}E",
-                f"S{season_number}E",
-                f"{season_number:02d}x",
-                f"{season_number}x",
-            ]
-
-            reference_files = []
-            # TODO Figure our why patterns is not being used
-            for _pattern in patterns:
-                files = [
-                    f
-                    for f in reference_dir.glob("*.srt")
-                    if any(
-                        re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
-                    )
-                ]
-                reference_files.extend(files)
-
-            reference_files = list(dict.fromkeys(reference_files))
+            # Get reference files first with caching
+            reference_files = self.get_reference_files(season_number)
 
             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
-            duration = float(
-                subprocess.check_output([
-                    "ffprobe",
-                    "-v",
-                    "error",
-                    "-show_entries",
-                    "format=duration",
-                    "-of",
-                    "default=noprint_wrappers=1:nokey=1",
-                    video_file,
-                ]).decode()
-            )
+
+            # Cache video duration
+            duration = get_video_duration(video_file)
 
-            duration = int(np.ceil(duration))
             # Try with tiny model first (fastest)
             logger.info("Attempting match with tiny model...")
             match = self._try_match_with_model(
-                video_file, "tiny", duration, reference_files
+                video_file, "tiny", min(duration, 300), reference_files  # Limit to first 5 minutes
             )
-            if (
-                match and match["confidence"] > 0.65
-            ):  # Slightly lower threshold for tiny
+            if match and match["confidence"] > 0.65:  # Slightly lower threshold for tiny
                 logger.info(
                     f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
                 )
@@ -212,10 +255,10 @@
 
             # If no match, try base model
             logger.info(
-                "No match in first 3 minutes, extending base model search to 10 minutes..."
+                "No match with tiny model, extending base model search to 10 minutes..."
            )
             match = self._try_match_with_model(
-                video_file, "base", duration, reference_files
+                video_file, "base", min(duration, 600), reference_files  # Limit to first 10 minutes
             )
             if match:
                 logger.info(
@@ -227,12 +270,30 @@
             return None
 
         finally:
-            # Cleanup temp files
-            for file in self.temp_dir.glob("chunk_*.wav"):
+            # Cleanup temp files - keep this limited to only files we know we created
+            for chunk_info in self.audio_chunks.values():
                 try:
-                    file.unlink()
+                    Path(chunk_info).unlink(missing_ok=True)
                 except Exception as e:
-                    logger.warning(f"Failed to delete temp file {file}: {e}")
+                    logger.warning(f"Failed to delete temp file {chunk_info}: {e}")
+
+
+@lru_cache(maxsize=100)
+def get_video_duration(video_file):
+    """Get video duration with caching."""
+    duration = float(
+        subprocess.check_output([
+            "ffprobe",
+            "-v",
+            "error",
+            "-show_entries",
+            "format=duration",
+            "-of",
+            "default=noprint_wrappers=1:nokey=1",
+            video_file,
+        ]).decode()
+    )
+    return int(np.ceil(duration))
 
 
 def detect_file_encoding(file_path):
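
The new module-level get_video_duration helper relies on functools.lru_cache, so the ffprobe call runs only once per distinct (hashable) path argument. A small standalone sketch of that behaviour, with the probe stubbed out:

from functools import lru_cache

probe_calls = {"count": 0}

@lru_cache(maxsize=100)
def fake_duration(video_file):
    """Stand-in for get_video_duration: pretend the ffprobe call is expensive."""
    probe_calls["count"] += 1
    return 1320   # seconds

fake_duration("episode1.mkv")
fake_duration("episode1.mkv")   # cache hit: the "probe" is not repeated
fake_duration("episode2.mkv")
print(probe_calls["count"])     # 2, one probe per distinct path
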
@@ -247,7 +308,7 @@ def detect_file_encoding(file_path):
     """
     try:
         with open(file_path, "rb") as f:
-            raw_data = f.read()
+            raw_data = f.read(min(1024 * 1024, Path(file_path).stat().st_size))  # Read up to 1MB
         result = chardet.detect(raw_data)
         encoding = result["encoding"]
         confidence = result["confidence"]
@@ -261,6 +322,7 @@ def detect_file_encoding(file_path):
     return "utf-8"
 
 
+@lru_cache(maxsize=100)
 def read_file_with_fallback(file_path, encodings=None):
     """
     Read a file trying multiple encodings in order of preference.
@@ -344,12 +406,16 @@ class SubtitleReader:
 
             try:
                 timestamp = lines[1]
-                text = " ".join(lines[2:])
-
-                end_stamp = timestamp.split(" --> ")[1].strip()
-                total_seconds = SubtitleReader.parse_timestamp(end_stamp)
-
-                if start_time <= total_seconds <= end_time:
+                time_parts = timestamp.split(" --> ")
+                start_stamp = time_parts[0].strip()
+                end_stamp = time_parts[1].strip()
+
+                subtitle_start = SubtitleReader.parse_timestamp(start_stamp)
+                subtitle_end = SubtitleReader.parse_timestamp(end_stamp)
+
+                # Check if this subtitle overlaps with our chunk
+                if subtitle_end >= start_time and subtitle_start <= end_time:
+                    text = " ".join(lines[2:])
                     text_lines.append(text)
 
             except (IndexError, ValueError) as e:
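
The rewritten condition is the standard interval-overlap test: ranges [a, b] and [c, d] intersect exactly when b >= c and a <= d, so subtitles that merely straddle a chunk boundary are now kept, not only those that end inside it. A standalone check:

def overlaps(sub_start, sub_end, chunk_start, chunk_end):
    """True when [sub_start, sub_end] intersects [chunk_start, chunk_end]."""
    return sub_end >= chunk_start and sub_start <= chunk_end

# A subtitle running 28s-33s against a 0s-30s chunk:
print(overlaps(28, 33, 0, 30))   # True, kept by the new overlap test
# The old test only checked whether the subtitle *ended* inside the chunk:
print(0 <= 33 <= 30)             # False, the same subtitle would have been dropped
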
@@ -359,9 +425,9 @@ class SubtitleReader:
         return text_lines
 
 
+# Global whisper model cache with better cache key
 _whisper_models = {}
 
-
 def get_whisper_model(model_name="tiny", device=None):
     """Cache whisper models to avoid reloading."""
     global _whisper_models
@@ -373,4 +439,4 @@ def get_whisper_model(model_name="tiny", device=None):
         _whisper_models[key] = whisper.load_model(model_name, device=device)
         logger.info(f"Loaded {model_name} model on {device}")
 
-    return _whisper_models[key]
+    return _whisper_models[key]
--- mkv_episode_matcher-0.7.2/mkv_episode_matcher.egg-info/PKG-INFO
+++ mkv_episode_matcher-0.8.0/mkv_episode_matcher.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mkv-episode-matcher
-Version: 0.7.2
+Version: 0.8.0
 Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
 Home-page: https://github.com/Jsakkos/mkv-episode-matcher
 Author: Jonathan Sakkos
--- mkv_episode_matcher-0.7.2/setup.cfg
+++ mkv_episode_matcher-0.8.0/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = mkv_episode_matcher
-version = 0.7.2
+version = 0.8.0
 author = Jonathan Sakkos
 author_email = jonathansakkos@gmail.com
 description = The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.