mkv_episode_matcher-0.5.0-py3-none-any.whl → mkv_episode_matcher-0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mkv-episode-matcher might be problematic.

Files changed (24):
  1. mkv_episode_matcher/__init__.py +2 -2
  2. mkv_episode_matcher/__main__.py +222 -76
  3. mkv_episode_matcher/config.py +0 -3
  4. mkv_episode_matcher/episode_identification.py +164 -124
  5. mkv_episode_matcher/episode_matcher.py +102 -55
  6. mkv_episode_matcher/subtitle_utils.py +26 -25
  7. mkv_episode_matcher/utils.py +74 -57
  8. {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.7.0.dist-info}/METADATA +10 -13
  9. mkv_episode_matcher-0.7.0.dist-info/RECORD +14 -0
  10. {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.7.0.dist-info}/WHEEL +1 -1
  11. mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
  12. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
  13. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
  14. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
  15. mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
  16. mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  17. mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
  18. mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
  19. mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
  20. mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
  21. mkv_episode_matcher/mkv_to_srt.py +0 -302
  22. mkv_episode_matcher-0.5.0.dist-info/RECORD +0 -25
  23. {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.7.0.dist-info}/entry_points.txt +0 -0
  24. {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.7.0.dist-info}/top_level.txt +0 -0
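
Of the 24 files, only mkv_episode_matcher/episode_identification.py (+164/-124) is reproduced below. The vendored pgs2srt library and mkv_to_srt.py were deleted outright, and the remaining files carry smaller edits.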
@@ -1,17 +1,17 @@
-import json
-import os
+import re
 import subprocess
 import tempfile
 from pathlib import Path
+from rich import print
+from rich.console import Console
+import chardet
+import numpy as np
 import torch
-from rapidfuzz import fuzz
-from loguru import logger
 import whisper
-import numpy as np
-import re
-from pathlib import Path
-import chardet
 from loguru import logger
+from rapidfuzz import fuzz
+
+console = Console()

 class EpisodeMatcher:
     def __init__(self, cache_dir, show_name, min_confidence=0.6):
@@ -22,35 +22,43 @@ class EpisodeMatcher:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
-
+
     def clean_text(self, text):
         text = text.lower().strip()
-        text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
-        text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
-        return ' '.join(text.split())
+        text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
+        text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
+        return " ".join(text.split())

     def chunk_score(self, whisper_chunk, ref_chunk):
         whisper_clean = self.clean_text(whisper_chunk)
         ref_clean = self.clean_text(ref_chunk)
-        return (fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7 +
-                fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3) / 100.0
+        return (
+            fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7
+            + fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3
+        ) / 100.0

     def extract_audio_chunk(self, mkv_file, start_time):
         """Extract a chunk of audio from MKV file."""
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
             cmd = [
-                'ffmpeg',
-                '-ss', str(start_time),
-                '-t', str(self.chunk_duration),
-                '-i', mkv_file,
-                '-vn',  # Disable video
-                '-sn',  # Disable subtitles
-                '-dn',  # Disable data streams
-                '-acodec', 'pcm_s16le',
-                '-ar', '16000',
-                '-ac', '1',
-                str(chunk_path)
+                "ffmpeg",
+                "-ss",
+                str(start_time),
+                "-t",
+                str(self.chunk_duration),
+                "-i",
+                mkv_file,
+                "-vn",  # Disable video
+                "-sn",  # Disable subtitles
+                "-dn",  # Disable data streams
+                "-acodec",
+                "pcm_s16le",
+                "-ar",
+                "16000",
+                "-ac",
+                "1",
+                str(chunk_path),
             ]
             subprocess.run(cmd, capture_output=True)
         return str(chunk_path)
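
For orientation, chunk_score() blends two rapidfuzz similarity measures into a 0-1 score. A minimal standalone sketch of the same blend (the sample strings are invented for illustration):

    import re

    from rapidfuzz import fuzz

    def clean_text(text):
        # Mirrors EpisodeMatcher.clean_text() in the hunk above.
        text = text.lower().strip()
        text = re.sub(r"\[.*?\]|\<.*?\>", "", text)  # drop [sound cues] and <i>markup</i>
        text = re.sub(r"([A-Za-z])-\1+", r"\1", text)  # collapse stutters like "w-what"
        return " ".join(text.split())

    def chunk_score(whisper_chunk, ref_chunk):
        a, b = clean_text(whisper_chunk), clean_text(ref_chunk)
        # token_sort_ratio ignores word order; partial_ratio rewards substring overlap.
        return (fuzz.token_sort_ratio(a, b) * 0.7 + fuzz.partial_ratio(a, b) * 0.3) / 100.0

    print(chunk_score("W-What are you [door slams] doing here?",
                      "What are you doing here?"))  # 1.0, identical once cleaned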
@@ -58,34 +66,37 @@
     def load_reference_chunk(self, srt_file, chunk_idx):
         """
         Load reference subtitles for a specific time chunk with robust encoding handling.
-
+
         Args:
             srt_file (str or Path): Path to the SRT file
            chunk_idx (int): Index of the chunk to load
-
+
         Returns:
             str: Combined text from the subtitle chunk
         """
         chunk_start = chunk_idx * self.chunk_duration
         chunk_end = chunk_start + self.chunk_duration
-
+
         try:
             # Read the file content using our robust reader
             reader = SubtitleReader()
             content = reader.read_srt_file(srt_file)
-
+
             # Extract subtitles for the time chunk
             text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
-
-            return ' '.join(text_lines)
-
+
+            return " ".join(text_lines)
+
         except Exception as e:
             logger.error(f"Error loading reference chunk from {srt_file}: {e}")
-            return ''
-    def _try_match_with_model(self, video_file, model_name, max_duration, reference_files):
+            return ""
+
+    def _try_match_with_model(
+        self, video_file, model_name, max_duration, reference_files
+    ):
         """
         Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
-
+
         Args:
             video_file: Path to the video file
             model_name: Name of the Whisper model to use
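
load_reference_chunk() delegates file reading to SubtitleReader.read_srt_file(), which guesses the file's encoding with chardet (newly imported above). A minimal sketch of that detection step, where "episode.srt" is a hypothetical path:

    import chardet

    # chardet.detect() returns the guessed encoding plus a confidence estimate.
    with open("episode.srt", "rb") as f:
        result = chardet.detect(f.read())
    print(result)  # e.g. {'encoding': 'UTF-8-SIG', 'confidence': 0.99, 'language': ''}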
@@ -94,49 +105,46 @@
         """
         # Use cached model
         model = get_whisper_model(model_name, self.device)
-
+
         # Calculate number of chunks to check (30 seconds each)
         num_chunks = max_duration // self.chunk_duration
-
+
         for chunk_idx in range(num_chunks):
             start_time = chunk_idx * self.chunk_duration
             logger.debug(f"Trying {model_name} model at {start_time} seconds")
-
+
             audio_path = self.extract_audio_chunk(video_file, start_time)
-
-            result = model.transcribe(
-                audio_path,
-                task="transcribe",
-                language="en"
-            )
-
+
+            result = model.transcribe(audio_path, task="transcribe", language="en")
+
             chunk_text = result["text"]
             best_confidence = 0
             best_match = None
-
+
             # Compare with reference chunks
             for ref_file in reference_files:
                 ref_text = self.load_reference_chunk(ref_file, chunk_idx)
                 confidence = self.chunk_score(chunk_text, ref_text)
-
+
                 if confidence > best_confidence:
                     best_confidence = confidence
                     best_match = ref_file
-
+
                 if confidence > self.min_confidence:
-                    season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
+                    season_ep = re.search(r"S(\d+)E(\d+)", best_match.stem)
                     if season_ep:
                         season, episode = map(int, season_ep.groups())
                         return {
-                            'season': season,
-                            'episode': episode,
-                            'confidence': best_confidence,
-                            'reference_file': str(best_match),
-                            'matched_at': start_time
+                            "season": season,
+                            "episode": episode,
+                            "confidence": best_confidence,
+                            "reference_file": str(best_match),
+                            "matched_at": start_time,
                         }
-
-            logger.debug(f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})")
-
+
+            logger.info(
+                f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})"
+            )
         return None

     def identify_episode(self, video_file, temp_dir, season_number):
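
The consolidated model.transcribe() call uses the standard openai-whisper API: it returns a dict whose "text" field is then scored against each reference chunk. A standalone sketch, with "chunk_0.wav" standing in for a file produced by extract_audio_chunk():

    import torch
    import whisper

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("tiny", device=device)  # the same call get_whisper_model() caches
    result = model.transcribe("chunk_0.wav", task="transcribe", language="en")
    print(result["text"])  # transcript text compared against the reference SRT chunks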
@@ -150,44 +158,67 @@
                 f"{season_number:02d}x",
                 f"{season_number}x",
             ]
-
+
             reference_files = []
-            for pattern in patterns:
-                files = [f for f in reference_dir.glob("*.srt")
-                         if any(re.search(f"{p}\\d+", f.name, re.IGNORECASE)
-                                for p in patterns)]
+            # TODO Figure our why patterns is not being used
+            for _pattern in patterns:
+                files = [
+                    f
+                    for f in reference_dir.glob("*.srt")
+                    if any(
+                        re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
+                    )
+                ]
                 reference_files.extend(files)
-
+
             reference_files = list(dict.fromkeys(reference_files))
-
+
             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
+            duration = float(
+                subprocess.check_output([
+                    "ffprobe",
+                    "-v",
+                    "error",
+                    "-show_entries",
+                    "format=duration",
+                    "-of",
+                    "default=noprint_wrappers=1:nokey=1",
+                    video_file,
+                ]).decode()
+            )

-            # Try with tiny model first (fastest) - check first 2 minutes
-            logger.info("Attempting match with tiny model (first 2 minutes)...")
-            match = self._try_match_with_model(video_file, "tiny", 120, reference_files)
-            if match and match['confidence'] > 0.65:  # Slightly lower threshold for tiny
-                logger.info(f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
-                return match
-
-            # If unsuccessful with tiny, try base model on first 3 minutes
-            logger.info("Tiny model match failed, trying base model (first 3 minutes)...")
-            match = self._try_match_with_model(video_file, "base", 180, reference_files)
-            if match and match['confidence'] > self.min_confidence:
-                logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
+            duration = int(np.ceil(duration))
+            # Try with tiny model first (fastest)
+            logger.info("Attempting match with tiny model...")
+            match = self._try_match_with_model(
+                video_file, "tiny", duration, reference_files
+            )
+            if (
+                match and match["confidence"] > 0.65
+            ):  # Slightly lower threshold for tiny
+                logger.info(
+                    f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                )
                 return match
-
-            # If still no match, try base model on up to 10 minutes
-            logger.info("No match in first 3 minutes, extending base model search to 10 minutes...")
-            match = self._try_match_with_model(video_file, "base", 600, reference_files)
+
+            # If no match, try base model
+            logger.info(
+                "No match in first 3 minutes, extending base model search to 10 minutes..."
+            )
+            match = self._try_match_with_model(
+                video_file, "base", duration, reference_files
+            )
             if match:
-                logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
+                logger.info(
+                    f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                )
                 return match
-
+
             logger.info("Speech recognition match failed")
             return None
-
+
         finally:
             # Cleanup temp files
             for file in self.temp_dir.glob("chunk_*.wav"):
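
The new ffprobe call replaces the fixed 120/180/600-second search windows, so both models now scan up to the file's full duration. The probe in isolation, with "episode.mkv" as a hypothetical input:

    import subprocess

    import numpy as np

    out = subprocess.check_output([
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        "episode.mkv",
    ])
    duration = int(np.ceil(float(out.decode())))  # e.g. "1295.47\n" -> 1296 seconds
    print(duration // 30)  # chunks _try_match_with_model() can try, assuming the 30s chunk_duration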
@@ -196,134 +227,143 @@ class EpisodeMatcher:
                 except Exception as e:
                     logger.warning(f"Failed to delete temp file {file}: {e}")

+
 def detect_file_encoding(file_path):
     """
     Detect the encoding of a file using chardet.
-
+
     Args:
         file_path (str or Path): Path to the file
-
+
     Returns:
         str: Detected encoding, defaults to 'utf-8' if detection fails
     """
     try:
-        with open(file_path, 'rb') as f:
+        with open(file_path, "rb") as f:
             raw_data = f.read()
             result = chardet.detect(raw_data)
-            encoding = result['encoding']
-            confidence = result['confidence']
-
-            logger.debug(f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}")
-            return encoding if encoding else 'utf-8'
+            encoding = result["encoding"]
+            confidence = result["confidence"]
+
+            logger.debug(
+                f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}"
+            )
+            return encoding if encoding else "utf-8"
     except Exception as e:
         logger.warning(f"Error detecting encoding for {file_path}: {e}")
-        return 'utf-8'
+        return "utf-8"
+

 def read_file_with_fallback(file_path, encodings=None):
     """
     Read a file trying multiple encodings in order of preference.
-
+
     Args:
         file_path (str or Path): Path to the file
         encodings (list): List of encodings to try, defaults to common subtitle encodings
-
+
     Returns:
         str: File contents
-
+
     Raises:
         ValueError: If file cannot be read with any encoding
     """
     if encodings is None:
         # First try detected encoding, then fallback to common subtitle encodings
         detected = detect_file_encoding(file_path)
-        encodings = [detected, 'utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
-
+        encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]
+
     file_path = Path(file_path)
     errors = []
-
+
     for encoding in encodings:
         try:
-            with open(file_path, 'r', encoding=encoding) as f:
+            with open(file_path, encoding=encoding) as f:
                 content = f.read()
                 logger.debug(f"Successfully read {file_path} using {encoding} encoding")
                 return content
         except UnicodeDecodeError as e:
             errors.append(f"{encoding}: {str(e)}")
             continue
-
-    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(errors)
+
+    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+        errors
+    )
     logger.error(error_msg)
     raise ValueError(error_msg)

+
 class SubtitleReader:
     """Helper class for reading and parsing subtitle files."""
-
+
     @staticmethod
     def parse_timestamp(timestamp):
         """Parse SRT timestamp into seconds."""
-        hours, minutes, seconds = timestamp.replace(',', '.').split(':')
+        hours, minutes, seconds = timestamp.replace(",", ".").split(":")
         return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
-
+
     @staticmethod
     def read_srt_file(file_path):
         """
         Read an SRT file and return its contents with robust encoding handling.
-
+
         Args:
             file_path (str or Path): Path to the SRT file
-
+
         Returns:
             str: Contents of the SRT file
         """
         return read_file_with_fallback(file_path)
-
+
     @staticmethod
     def extract_subtitle_chunk(content, start_time, end_time):
         """
         Extract subtitle text for a specific time window.
-
+
         Args:
             content (str): Full SRT file content
             start_time (float): Chunk start time in seconds
             end_time (float): Chunk end time in seconds
-
+
         Returns:
             list: List of subtitle texts within the time window
         """
         text_lines = []
-
-        for block in content.strip().split('\n\n'):
-            lines = block.split('\n')
-            if len(lines) < 3 or '-->' not in lines[1]:
+
+        for block in content.strip().split("\n\n"):
+            lines = block.split("\n")
+            if len(lines) < 3 or "-->" not in lines[1]:
                 continue
-
+
             try:
                 timestamp = lines[1]
-                text = ' '.join(lines[2:])
-
-                end_stamp = timestamp.split(' --> ')[1].strip()
+                text = " ".join(lines[2:])
+
+                end_stamp = timestamp.split(" --> ")[1].strip()
                 total_seconds = SubtitleReader.parse_timestamp(end_stamp)
-
+
                 if start_time <= total_seconds <= end_time:
                     text_lines.append(text)
-
+
             except (IndexError, ValueError) as e:
                 logger.warning(f"Error parsing subtitle block: {e}")
                 continue
-
+
         return text_lines
-
+
+
 _whisper_models = {}

+
 def get_whisper_model(model_name="tiny", device=None):
     """Cache whisper models to avoid reloading."""
     global _whisper_models
     if device is None:
         device = "cuda" if torch.cuda.is_available() else "cpu"
-
+
     key = f"{model_name}_{device}"
     if key not in _whisper_models:
         _whisper_models[key] = whisper.load_model(model_name, device=device)
         logger.info(f"Loaded {model_name} model on {device}")
-
-    return _whisper_models[key]
+
+    return _whisper_models[key]
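
For reference, SubtitleReader.parse_timestamp() converts an SRT timestamp to seconds with plain arithmetic; a worked example on an arbitrary timestamp:

    timestamp = "00:21:35,480"  # arbitrary SRT end timestamp
    hours, minutes, seconds = timestamp.replace(",", ".").split(":")
    total = float(hours) * 3600 + float(minutes) * 60 + float(seconds)
    print(total)  # 1295.48, checked against the chunk's [start_time, end_time] window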