mkv-episode-matcher 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This release of mkv-episode-matcher was flagged as potentially problematic.

mkv_episode_matcher/episode_identification.py

@@ -10,19 +10,56 @@ import torch
  import whisper
  from loguru import logger
  from rapidfuzz import fuzz
- from utils import extract_season_episode
+ from mkv_episode_matcher.utils import extract_season_episode
+ from functools import lru_cache

  console = Console()

+ class SubtitleCache:
+     """Cache for storing parsed subtitle data to avoid repeated loading and parsing."""
+
+     def __init__(self):
+         self.subtitles = {}  # {file_path: parsed_content}
+         self.chunk_cache = {}  # {(file_path, chunk_idx): text}
+
+     def get_subtitle_content(self, srt_file):
+         """Get the full content of a subtitle file, loading it only once."""
+         srt_file = str(srt_file)
+         if srt_file not in self.subtitles:
+             reader = SubtitleReader()
+             self.subtitles[srt_file] = reader.read_srt_file(srt_file)
+         return self.subtitles[srt_file]
+
+     def get_chunk(self, srt_file, chunk_idx, chunk_start, chunk_end):
+         """Get a specific time chunk from a subtitle file, with caching."""
+         srt_file = str(srt_file)
+         cache_key = (srt_file, chunk_idx)
+
+         if cache_key not in self.chunk_cache:
+             content = self.get_subtitle_content(srt_file)
+             reader = SubtitleReader()
+             text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
+             self.chunk_cache[cache_key] = " ".join(text_lines)
+
+         return self.chunk_cache[cache_key]
+
+
  class EpisodeMatcher:
      def __init__(self, cache_dir, show_name, min_confidence=0.6):
          self.cache_dir = Path(cache_dir)
          self.min_confidence = min_confidence
          self.show_name = show_name
          self.chunk_duration = 30
+         self.skip_initial_duration = 300
          self.device = "cuda" if torch.cuda.is_available() else "cpu"
          self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
          self.temp_dir.mkdir(exist_ok=True)
+         # Initialize subtitle cache
+         self.subtitle_cache = SubtitleCache()
+         # Cache for extracted audio chunks
+         self.audio_chunks = {}
+         # Store reference files to avoid repeated glob operations
+         self.reference_files_cache = {}

      def clean_text(self, text):
          text = text.lower().strip()
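The SubtitleCache added above is a two-level memo: each file is parsed once, and each (file, chunk) pair is joined into a text blob once. Note that the chunk key deliberately omits chunk_start/chunk_end; that is safe here only because skip_initial_duration and chunk_duration are fixed, so chunk_idx fully determines the time window. A minimal, self-contained sketch of the same pattern (TwoLevelCache, load_fn, and slice_fn are hypothetical names, not part of the package):

    # Two-level memoization: parse each file once, slice each chunk once.
    class TwoLevelCache:
        def __init__(self, load_fn, slice_fn):
            self.load_fn = load_fn    # parses a whole file (read_srt_file here)
            self.slice_fn = slice_fn  # extracts one time window from parsed content
            self.files = {}           # {path: parsed_content}
            self.chunks = {}          # {(path, idx): text}

        def get_chunk(self, path, idx, start, end):
            key = (path, idx)
            if key not in self.chunks:
                if path not in self.files:
                    self.files[path] = self.load_fn(path)
                self.chunks[key] = self.slice_fn(self.files[path], start, end)
            return self.chunks[key]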
@@ -39,7 +76,12 @@ class EpisodeMatcher:
          ) / 100.0

      def extract_audio_chunk(self, mkv_file, start_time):
-         """Extract a chunk of audio from MKV file."""
+         """Extract a chunk of audio from MKV file with caching."""
+         cache_key = (str(mkv_file), start_time)
+
+         if cache_key in self.audio_chunks:
+             return self.audio_chunks[cache_key]
+
          chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
          if not chunk_path.exists():
              cmd = [
@@ -59,14 +101,18 @@ class EpisodeMatcher:
                  "16000",
                  "-ac",
                  "1",
+                 "-y",  # Overwrite output files without asking
                  str(chunk_path),
              ]
              subprocess.run(cmd, capture_output=True)
-         return str(chunk_path)
+
+         chunk_path_str = str(chunk_path)
+         self.audio_chunks[cache_key] = chunk_path_str
+         return chunk_path_str

      def load_reference_chunk(self, srt_file, chunk_idx):
          """
-         Load reference subtitles for a specific time chunk with robust encoding handling.
+         Load reference subtitles for a specific time chunk with caching.

          Args:
              srt_file (str or Path): Path to the SRT file
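The extraction path now reuses work at two levels: the in-memory audio_chunks dict maps (file, start_time) to a WAV path, and the chunk_path.exists() check skips re-encoding across runs. The new -y flag keeps ffmpeg from blocking on an interactive overwrite prompt if a stale chunk file survives a crash. A self-contained sketch of the invocation pattern; the flags ahead of "-ar" are truncated out of this hunk, so the seek and duration arguments below are assumptions:

    import subprocess

    def extract_wav_chunk(mkv_file, start_time, out_path, duration=30):
        cmd = [
            "ffmpeg",
            "-ss", str(start_time),  # assumed: seek to the chunk start
            "-t", str(duration),     # assumed: read one chunk_duration window
            "-i", str(mkv_file),
            "-ar", "16000",          # 16 kHz sample rate, as in the hunk
            "-ac", "1",              # mono
            "-y",                    # overwrite without prompting
            str(out_path),
        ]
        subprocess.run(cmd, capture_output=True)
        return str(out_path)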
@@ -75,28 +121,57 @@ class EpisodeMatcher:
          Returns:
              str: Combined text from the subtitle chunk
          """
-         chunk_start = chunk_idx * self.chunk_duration
-         chunk_end = chunk_start + self.chunk_duration
-
          try:
-             # Read the file content using our robust reader
-             reader = SubtitleReader()
-             content = reader.read_srt_file(srt_file)
-
-             # Extract subtitles for the time chunk
-             text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
-
-             return " ".join(text_lines)
-
+             # Apply the same offset as in _try_match_with_model
+             chunk_start = self.skip_initial_duration + (chunk_idx * self.chunk_duration)
+             chunk_end = chunk_start + self.chunk_duration
+
+             return self.subtitle_cache.get_chunk(srt_file, chunk_idx, chunk_start, chunk_end)
+
          except Exception as e:
              logger.error(f"Error loading reference chunk from {srt_file}: {e}")
              return ""

+     def get_reference_files(self, season_number):
+         """Get reference subtitle files with caching."""
+         cache_key = (self.show_name, season_number)
+         logger.debug(f"Reference cache key: {cache_key}")
+
+         if cache_key in self.reference_files_cache:
+             logger.debug("Returning cached reference files")
+             return self.reference_files_cache[cache_key]
+
+         reference_dir = self.cache_dir / "data" / self.show_name
+         patterns = [
+             f"S{season_number:02d}E",
+             f"S{season_number}E",
+             f"{season_number:02d}x",
+             f"{season_number}x",
+         ]
+
+         reference_files = []
+         for _pattern in patterns:
+             files = [
+                 f
+                 for f in reference_dir.glob("*.srt")
+                 if any(
+                     re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
+                 )
+             ]
+             reference_files.extend(files)
+
+         # Remove duplicates while preserving order
+         reference_files = list(dict.fromkeys(reference_files))
+         logger.debug(f"Found {len(reference_files)} reference files for season {season_number}")
+         self.reference_files_cache[cache_key] = reference_files
+         return reference_files
+
      def _try_match_with_model(
          self, video_file, model_name, max_duration, reference_files
      ):
          """
-         Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
+         Attempt to match using specified model, checking multiple chunks starting from skip_initial_duration
+         and continuing up to max_duration.

          Args:
              video_file: Path to the video file
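One quirk survives the refactor into get_reference_files: the loop variable _pattern is never used, because the inner comprehension already tests every pattern via any(). Each iteration therefore collects the same matches, and the dict.fromkeys() pass exists to strip the resulting duplicates; the 0.7.2 code carried a TODO about exactly this. A hypothetical single-pass equivalent, with sorted() added for deterministic ordering:

    import re

    def find_reference_files(reference_dir, patterns):
        # One pass suffices: any() already checks every pattern per file.
        return [
            f
            for f in sorted(reference_dir.glob("*.srt"))
            if any(re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns)
        ]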
@@ -107,18 +182,30 @@ class EpisodeMatcher:
          # Use cached model
          model = get_whisper_model(model_name, self.device)

-         # Calculate number of chunks to check (30 seconds each)
-         num_chunks = max_duration // self.chunk_duration
+         # Calculate number of chunks to check
+         num_chunks = min(max_duration // self.chunk_duration, 10)  # Limit to 10 chunks for initial check
+
+         # Pre-load all reference chunks for the chunks we'll check
+         for chunk_idx in range(num_chunks):
+             for ref_file in reference_files:
+                 self.load_reference_chunk(ref_file, chunk_idx)

          for chunk_idx in range(num_chunks):
-             start_time = chunk_idx * self.chunk_duration
+             # Start at self.skip_initial_duration and check subsequent chunks
+             start_time = self.skip_initial_duration + (chunk_idx * self.chunk_duration)
              logger.debug(f"Trying {model_name} model at {start_time} seconds")

              audio_path = self.extract_audio_chunk(video_file, start_time)
+             logger.debug(f"Extracted audio chunk: {audio_path}")

              result = model.transcribe(audio_path, task="transcribe", language="en")

+
              chunk_text = result["text"]
+             logger.debug(f"Transcription result: {chunk_text} ({len(chunk_text)} characters)")
+             if len(chunk_text) < 10:
+                 logger.debug(f"Transcription result too short: {chunk_text} ({len(chunk_text)} characters)")
+                 continue
              best_confidence = 0
              best_match = None

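Two behavioral changes sit in this hunk besides the 300-second offset: the pre-load pass warms SubtitleCache for every (reference file, chunk index) pair before transcription begins, so the scoring loop compares against already-joined strings, and the new length guard skips scoring when Whisper returns a near-empty transcript (silence or music). A sketch of the warm-up under the same assumptions (fixed offset and step; cache.get_chunk mirrors SubtitleCache.get_chunk):

    def warm_cache(cache, ref_files, num_chunks, offset=300, step=30):
        # Touch every (file, chunk) pair once so later lookups are dict hits.
        for idx in range(num_chunks):
            start = offset + idx * step
            for ref in ref_files:
                cache.get_chunk(ref, idx, start, start + step)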
@@ -128,14 +215,14 @@ class EpisodeMatcher:
                  confidence = self.chunk_score(chunk_text, ref_text)

                  if confidence > best_confidence:
-                     print(f"New best confidence: {confidence} for {ref_file}")
+                     logger.debug(f"New best confidence: {confidence} for {ref_file}")
                      best_confidence = confidence
                      best_match = Path(ref_file)

                  if confidence > self.min_confidence:
                      print(f"Matched with {best_match} (confidence: {best_confidence:.2f})")
                      try:
-                         season,episode = extract_season_episode(best_match.stem)
+                         season, episode = extract_season_episode(best_match.stem)
                      except Exception as e:
                          print(f"Error extracting season/episode: {e}")
                          continue
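chunk_score, whose tail ") / 100.0" is visible in an earlier hunk, normalizes a rapidfuzz score from the library's 0-100 scale to the 0.0-1.0 scale that min_confidence uses. The exact scorer is not shown in this diff, so partial_ratio below is only a plausible stand-in:

    from rapidfuzz import fuzz

    def chunk_score(transcribed, reference):
        # rapidfuzz scorers return 0-100; normalize to 0.0-1.0.
        return fuzz.partial_ratio(transcribed.lower(), reference.lower()) / 100.0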
@@ -157,54 +244,22 @@ class EpisodeMatcher:
      def identify_episode(self, video_file, temp_dir, season_number):
          """Progressive episode identification with faster initial attempt."""
          try:
-             # Get reference files first
-             reference_dir = self.cache_dir / "data" / self.show_name
-             patterns = [
-                 f"S{season_number:02d}E",
-                 f"S{season_number}E",
-                 f"{season_number:02d}x",
-                 f"{season_number}x",
-             ]
-
-             reference_files = []
-             # TODO Figure our why patterns is not being used
-             for _pattern in patterns:
-                 files = [
-                     f
-                     for f in reference_dir.glob("*.srt")
-                     if any(
-                         re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
-                     )
-                 ]
-                 reference_files.extend(files)
-
-             reference_files = list(dict.fromkeys(reference_files))
+             # Get reference files first with caching
+             reference_files = self.get_reference_files(season_number)

              if not reference_files:
                  logger.error(f"No reference files found for season {season_number}")
                  return None
-             duration = float(
-                 subprocess.check_output([
-                     "ffprobe",
-                     "-v",
-                     "error",
-                     "-show_entries",
-                     "format=duration",
-                     "-of",
-                     "default=noprint_wrappers=1:nokey=1",
-                     video_file,
-                 ]).decode()
-             )
+
+             # Cache video duration
+             duration = get_video_duration(video_file)

-             duration = int(np.ceil(duration))
              # Try with tiny model first (fastest)
              logger.info("Attempting match with tiny model...")
              match = self._try_match_with_model(
-                 video_file, "tiny", duration, reference_files
+                 video_file, "tiny.en", min(duration, 300), reference_files  # Limit to first 5 minutes
              )
-             if (
-                 match and match["confidence"] > 0.65
-             ):  # Slightly lower threshold for tiny
+             if match and match["confidence"] > 0.65:  # Slightly lower threshold for tiny
                  logger.info(
                      f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
                  )
@@ -212,10 +267,10 @@ class EpisodeMatcher:

          # If no match, try base model
          logger.info(
-             "No match in first 3 minutes, extending base model search to 10 minutes..."
+             "No match with tiny model, extending base model search to 5 minutes..."
          )
          match = self._try_match_with_model(
-             video_file, "base", duration, reference_files
+             video_file, "base.en", min(duration, 300), reference_files  # Limit to first 5 minutes
          )
          if match:
              logger.info(
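Both tiers now request the English-only ".en" Whisper checkpoints, which are trained solely on English audio and pair naturally with the language="en" transcribe call, and both cap the search budget at 300 seconds of chunks. (The in-code comment says "first 5 minutes", but combined with skip_initial_duration the scanned window actually begins 5 minutes in.) A hypothetical condensation of the two-tier flow, with try_match standing in for _try_match_with_model:

    def progressive_match(try_match, duration):
        # Cheapest model first, with a slightly looser confidence bar...
        match = try_match("tiny.en", min(duration, 300))
        if match and match["confidence"] > 0.65:
            return match
        # ...then fall back to the larger model over the same window.
        return try_match("base.en", min(duration, 300))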
@@ -227,12 +282,30 @@ class EpisodeMatcher:
              return None

          finally:
-             # Cleanup temp files
-             for file in self.temp_dir.glob("chunk_*.wav"):
+             # Cleanup temp files - keep this limited to only files we know we created
+             for chunk_info in self.audio_chunks.values():
                  try:
-                     file.unlink()
+                     Path(chunk_info).unlink(missing_ok=True)
                  except Exception as e:
-                     logger.warning(f"Failed to delete temp file {file}: {e}")
+                     logger.warning(f"Failed to delete temp file {chunk_info}: {e}")
+
+
+ @lru_cache(maxsize=100)
+ def get_video_duration(video_file):
+     """Get video duration with caching."""
+     duration = float(
+         subprocess.check_output([
+             "ffprobe",
+             "-v",
+             "error",
+             "-show_entries",
+             "format=duration",
+             "-of",
+             "default=noprint_wrappers=1:nokey=1",
+             video_file,
+         ]).decode()
+     )
+     return int(np.ceil(duration))


  def detect_file_encoding(file_path):
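get_video_duration moves the ffprobe call out of identify_episode to module level under functools.lru_cache, so each distinct path is probed once per process; keeping it a free function also avoids caching on self, since lru_cache keys on every argument. Arguments must be hashable, and str("x") and Path("x") would occupy separate cache slots. A self-contained sketch of the pattern:

    import subprocess
    from functools import lru_cache

    @lru_cache(maxsize=100)
    def probe_duration_seconds(video_file):
        # With these flags ffprobe prints only the container duration.
        out = subprocess.check_output([
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            str(video_file),
        ])
        return float(out.decode().strip())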
@@ -247,7 +320,7 @@ def detect_file_encoding(file_path):
      """
      try:
          with open(file_path, "rb") as f:
-             raw_data = f.read()
+             raw_data = f.read(min(1024 * 1024, Path(file_path).stat().st_size))  # Read up to 1MB
          result = chardet.detect(raw_data)
          encoding = result["encoding"]
          confidence = result["confidence"]
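Bounding the read keeps chardet from pulling an entire file into memory just to guess its encoding; a 1 MB prefix is ample signal for subtitle-sized text. Since f.read(n) already stops at end of file, the min() against st_size is belt-and-braces rather than required. A sketch of the bounded-detection idea (detect_encoding and sample_bytes are hypothetical names):

    import chardet

    def detect_encoding(path, sample_bytes=1024 * 1024):
        with open(path, "rb") as f:
            raw = f.read(sample_bytes)            # read() stops at EOF anyway
        result = chardet.detect(raw)              # {'encoding': ..., 'confidence': ...}
        return result["encoding"] or "utf-8"      # chardet may return None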
@@ -261,6 +334,7 @@ def detect_file_encoding(file_path):
          return "utf-8"


+ @lru_cache(maxsize=100)
  def read_file_with_fallback(file_path, encodings=None):
      """
      Read a file trying multiple encodings in order of preference.
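Caching read_file_with_fallback means each subtitle file is decoded at most once, but two lru_cache caveats apply: every argument must be hashable (a list passed for encodings raises TypeError, a tuple works), and up to 100 full file contents stay pinned in memory for the life of the process. A minimal reproduction of the hashability caveat:

    from functools import lru_cache

    @lru_cache(maxsize=100)
    def read_text(path, encodings=None):
        for enc in encodings or ("utf-8", "latin-1"):
            try:
                with open(path, encoding=enc) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        raise ValueError(f"could not decode {path}")

    # read_text("a.srt", ["utf-8"])   -> TypeError: unhashable type: 'list'
    # read_text("a.srt", ("utf-8",))  -> decoded once, then served from cache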
@@ -344,12 +418,16 @@ class SubtitleReader:

          try:
              timestamp = lines[1]
-             text = " ".join(lines[2:])
-
-             end_stamp = timestamp.split(" --> ")[1].strip()
-             total_seconds = SubtitleReader.parse_timestamp(end_stamp)
-
-             if start_time <= total_seconds <= end_time:
+             time_parts = timestamp.split(" --> ")
+             start_stamp = time_parts[0].strip()
+             end_stamp = time_parts[1].strip()
+
+             subtitle_start = SubtitleReader.parse_timestamp(start_stamp)
+             subtitle_end = SubtitleReader.parse_timestamp(end_stamp)
+
+             # Check if this subtitle overlaps with our chunk
+             if subtitle_end >= start_time and subtitle_start <= end_time:
+                 text = " ".join(lines[2:])
                  text_lines.append(text)

          except (IndexError, ValueError) as e:
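This replaces a containment test on the cue's end time with the standard interval-overlap test, so long cues that straddle a chunk boundary are no longer dropped. A small illustration:

    def overlaps(sub_start, sub_end, chunk_start, chunk_end):
        # Two closed intervals intersect iff each starts before the other ends.
        return sub_end >= chunk_start and sub_start <= chunk_end

    # A cue running 290-340 s, scored against the 300-330 s chunk:
    #   old test: 300 <= 340 <= 330          -> False, cue wrongly dropped
    #   new test: 340 >= 300 and 290 <= 330  -> True, cue kept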
@@ -359,9 +437,9 @@ class SubtitleReader:
          return text_lines


+ # Global whisper model cache with better cache key
  _whisper_models = {}

-
  def get_whisper_model(model_name="tiny", device=None):
      """Cache whisper models to avoid reloading."""
      global _whisper_models
@@ -373,4 +451,4 @@ def get_whisper_model(model_name="tiny", device=None):
          _whisper_models[key] = whisper.load_model(model_name, device=device)
          logger.info(f"Loaded {model_name} model on {device}")

-     return _whisper_models[key]
+     return _whisper_models[key]
mkv_episode_matcher-0.7.2.dist-info/METADATA → mkv_episode_matcher-0.8.1.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mkv-episode-matcher
- Version: 0.7.2
+ Version: 0.8.1
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
  Author: Jonathan Sakkos
mkv_episode_matcher-0.7.2.dist-info/RECORD → mkv_episode_matcher-0.8.1.dist-info/RECORD

@@ -2,13 +2,13 @@ mkv_episode_matcher/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb
  mkv_episode_matcher/__init__.py,sha256=u3yZcpuK0ICeUjxYKePvW-zS61E5ss5q2AvqnSHuz9E,240
  mkv_episode_matcher/__main__.py,sha256=O3GQk5R9BFuA-QNlqfBgDSS7G_W8IGSxiV8CFUbcaLc,10059
  mkv_episode_matcher/config.py,sha256=EcJJjkekQ7oWtarUkufCYON_QWbQvq55-zMqCTOqSa4,2265
- mkv_episode_matcher/episode_identification.py,sha256=H_cMURCzSSM227exB51o7WReJKl7q29hmgvE0aUzgn8,12967
+ mkv_episode_matcher/episode_identification.py,sha256=xH5HIa6oC4nXhlqzdqQn1XYQFNUrnbUVlW-R9RsBHq4,16745
  mkv_episode_matcher/episode_matcher.py,sha256=SxAbnXuTJITD1o0WohE9heE3Fm9zW_w0Nq3GzqtcIpQ,6329
  mkv_episode_matcher/subtitle_utils.py,sha256=Hz9b4CKPV07YKTY4dcN3WbvdbvH-S3J4zcb9CiyvPlE,2551
  mkv_episode_matcher/tmdb_client.py,sha256=LbMCgjmp7sCbrQo_CDlpcnryKPz5S7inE24YY9Pyjk4,4172
  mkv_episode_matcher/utils.py,sha256=modXMLmt2fpny8liXwqe4ylxnwwfg_98OLOacv5izps,14501
- mkv_episode_matcher-0.7.2.dist-info/METADATA,sha256=AvDPqs96T0pikdFHYVm0VFPvS7UbKrE0QCtsys9MMec,5384
- mkv_episode_matcher-0.7.2.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
- mkv_episode_matcher-0.7.2.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
- mkv_episode_matcher-0.7.2.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
- mkv_episode_matcher-0.7.2.dist-info/RECORD,,
+ mkv_episode_matcher-0.8.1.dist-info/METADATA,sha256=JpSdL1OU5UwQb6aPARqV9YzQWtoEdmoJZkmw_7FcUwM,5384
+ mkv_episode_matcher-0.8.1.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+ mkv_episode_matcher-0.8.1.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
+ mkv_episode_matcher-0.8.1.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
+ mkv_episode_matcher-0.8.1.dist-info/RECORD,,
mkv_episode_matcher-0.7.2.dist-info/WHEEL → mkv_episode_matcher-0.8.1.dist-info/WHEEL

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (79.0.1)
+ Generator: setuptools (80.8.0)
  Root-Is-Purelib: true
  Tag: py3-none-any