mkv-episode-matcher 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic. Click here for more details.

@@ -9,6 +9,10 @@ from loguru import logger
9
9
  import whisper
10
10
  import numpy as np
11
11
  import re
12
+ from pathlib import Path
13
+ import chardet
14
+ from loguru import logger
15
+
12
16
  class EpisodeMatcher:
13
17
  def __init__(self, cache_dir, show_name, min_confidence=0.6):
14
18
  self.cache_dir = Path(cache_dir)
@@ -50,34 +54,32 @@ class EpisodeMatcher:
50
54
  return str(chunk_path)
51
55
 
52
56
  def load_reference_chunk(self, srt_file, chunk_idx):
53
- """Load reference subtitles for a specific time chunk."""
57
+ """
58
+ Load reference subtitles for a specific time chunk with robust encoding handling.
59
+
60
+ Args:
61
+ srt_file (str or Path): Path to the SRT file
62
+ chunk_idx (int): Index of the chunk to load
63
+
64
+ Returns:
65
+ str: Combined text from the subtitle chunk
66
+ """
54
67
  chunk_start = chunk_idx * self.chunk_duration
55
68
  chunk_end = chunk_start + self.chunk_duration
56
- text_lines = []
57
69
 
58
- with open(srt_file, 'r', encoding='utf-8') as f:
59
- content = f.read().strip()
70
+ try:
71
+ # Read the file content using our robust reader
72
+ reader = SubtitleReader()
73
+ content = reader.read_srt_file(srt_file)
60
74
 
61
- for block in content.split('\n\n'):
62
- lines = block.split('\n')
63
- if len(lines) < 3 or '-->' not in lines[1]: # Skip malformed blocks
64
- continue
65
-
66
- try:
67
- timestamp = lines[1]
68
- text = ' '.join(lines[2:])
69
-
70
- end_time = timestamp.split(' --> ')[1].strip()
71
- hours, minutes, seconds = map(float, end_time.replace(',','.').split(':'))
72
- total_seconds = hours * 3600 + minutes * 60 + seconds
73
-
74
- if chunk_start <= total_seconds <= chunk_end:
75
- text_lines.append(text)
76
-
77
- except (IndexError, ValueError):
78
- continue
79
-
80
- return ' '.join(text_lines)
75
+ # Extract subtitles for the time chunk
76
+ text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
77
+
78
+ return ' '.join(text_lines)
79
+
80
+ except Exception as e:
81
+ logger.error(f"Error loading reference chunk from {srt_file}: {e}")
82
+ return ''
81
83
 
82
84
  def identify_episode(self, video_file, temp_dir, season_number):
83
85
  try:
@@ -94,14 +96,27 @@ class EpisodeMatcher:
94
96
  # Load Whisper model
95
97
  model = whisper.load_model("base", device=self.device)
96
98
 
97
- # Get season-specific reference files
99
+ # Get season-specific reference files using multiple patterns
98
100
  reference_dir = self.cache_dir / "data" / self.show_name
99
- season_pattern = f"S{season_number:02d}E"
100
- reference_files = [
101
- f for f in reference_dir.glob("*.srt")
102
- if season_pattern in f.name
101
+
102
+ # Create season patterns for different formats
103
+ patterns = [
104
+ f"S{season_number:02d}E", # S01E01
105
+ f"S{season_number}E", # S1E01
106
+ f"{season_number:02d}x", # 01x01
107
+ f"{season_number}x", # 1x01
103
108
  ]
104
109
 
110
+ reference_files = []
111
+ for pattern in patterns:
112
+ files = [f for f in reference_dir.glob("*.srt")
113
+ if any(re.search(f"{p}\\d+", f.name, re.IGNORECASE)
114
+ for p in patterns)]
115
+ reference_files.extend(files)
116
+
117
+ # Remove duplicates while preserving order
118
+ reference_files = list(dict.fromkeys(reference_files))
119
+
105
120
  if not reference_files:
106
121
  logger.error(f"No reference files found for season {season_number}")
107
122
  return None
@@ -147,4 +162,121 @@ class EpisodeMatcher:
147
162
  finally:
148
163
  # Cleanup temp files
149
164
  for file in self.temp_dir.glob("chunk_*.wav"):
150
- file.unlink()
165
+ file.unlink()
166
+
167
+ def detect_file_encoding(file_path):
168
+ """
169
+ Detect the encoding of a file using chardet.
170
+
171
+ Args:
172
+ file_path (str or Path): Path to the file
173
+
174
+ Returns:
175
+ str: Detected encoding, defaults to 'utf-8' if detection fails
176
+ """
177
+ try:
178
+ with open(file_path, 'rb') as f:
179
+ raw_data = f.read()
180
+ result = chardet.detect(raw_data)
181
+ encoding = result['encoding']
182
+ confidence = result['confidence']
183
+
184
+ logger.debug(f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}")
185
+ return encoding if encoding else 'utf-8'
186
+ except Exception as e:
187
+ logger.warning(f"Error detecting encoding for {file_path}: {e}")
188
+ return 'utf-8'
189
+
190
+ def read_file_with_fallback(file_path, encodings=None):
191
+ """
192
+ Read a file trying multiple encodings in order of preference.
193
+
194
+ Args:
195
+ file_path (str or Path): Path to the file
196
+ encodings (list): List of encodings to try, defaults to common subtitle encodings
197
+
198
+ Returns:
199
+ str: File contents
200
+
201
+ Raises:
202
+ ValueError: If file cannot be read with any encoding
203
+ """
204
+ if encodings is None:
205
+ # First try detected encoding, then fallback to common subtitle encodings
206
+ detected = detect_file_encoding(file_path)
207
+ encodings = [detected, 'utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
208
+
209
+ file_path = Path(file_path)
210
+ errors = []
211
+
212
+ for encoding in encodings:
213
+ try:
214
+ with open(file_path, 'r', encoding=encoding) as f:
215
+ content = f.read()
216
+ logger.debug(f"Successfully read {file_path} using {encoding} encoding")
217
+ return content
218
+ except UnicodeDecodeError as e:
219
+ errors.append(f"{encoding}: {str(e)}")
220
+ continue
221
+
222
+ error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(errors)
223
+ logger.error(error_msg)
224
+ raise ValueError(error_msg)
225
+
226
+ class SubtitleReader:
227
+ """Helper class for reading and parsing subtitle files."""
228
+
229
+ @staticmethod
230
+ def parse_timestamp(timestamp):
231
+ """Parse SRT timestamp into seconds."""
232
+ hours, minutes, seconds = timestamp.replace(',', '.').split(':')
233
+ return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
234
+
235
+ @staticmethod
236
+ def read_srt_file(file_path):
237
+ """
238
+ Read an SRT file and return its contents with robust encoding handling.
239
+
240
+ Args:
241
+ file_path (str or Path): Path to the SRT file
242
+
243
+ Returns:
244
+ str: Contents of the SRT file
245
+ """
246
+ return read_file_with_fallback(file_path)
247
+
248
+ @staticmethod
249
+ def extract_subtitle_chunk(content, start_time, end_time):
250
+ """
251
+ Extract subtitle text for a specific time window.
252
+
253
+ Args:
254
+ content (str): Full SRT file content
255
+ start_time (float): Chunk start time in seconds
256
+ end_time (float): Chunk end time in seconds
257
+
258
+ Returns:
259
+ list: List of subtitle texts within the time window
260
+ """
261
+ text_lines = []
262
+
263
+ for block in content.strip().split('\n\n'):
264
+ lines = block.split('\n')
265
+ if len(lines) < 3 or '-->' not in lines[1]:
266
+ continue
267
+
268
+ try:
269
+ timestamp = lines[1]
270
+ text = ' '.join(lines[2:])
271
+
272
+ end_stamp = timestamp.split(' --> ')[1].strip()
273
+ total_seconds = SubtitleReader.parse_timestamp(end_stamp)
274
+
275
+ if start_time <= total_seconds <= end_time:
276
+ text_lines.append(text)
277
+
278
+ except (IndexError, ValueError) as e:
279
+ logger.warning(f"Error parsing subtitle block: {e}")
280
+ continue
281
+
282
+ return text_lines
@@ -29,6 +29,14 @@ def process_show(season=None, dry_run=False, get_subs=False):
29
29
  show_name = clean_text(os.path.basename(show_dir))
30
30
  matcher = EpisodeMatcher(CACHE_DIR, show_name)
31
31
 
32
+ # Early check for reference files
33
+ reference_dir = Path(CACHE_DIR) / "data" / show_name
34
+ reference_files = list(reference_dir.glob("*.srt"))
35
+ if not reference_files:
36
+ logger.error(f"No reference subtitle files found in {reference_dir}")
37
+ logger.info("Please download reference subtitles first")
38
+ return
39
+
32
40
  season_paths = get_valid_seasons(show_dir)
33
41
  if not season_paths:
34
42
  logger.warning(f"No seasons with .mkv files found")
@@ -300,7 +300,7 @@ def extract_srt_text(filepath):
300
300
 
301
301
  def extract_season_episode(filename):
302
302
  """
303
- Extract season and episode numbers from filename.
303
+ Extract season and episode numbers from filename with support for multiple formats.
304
304
 
305
305
  Args:
306
306
  filename (str): Filename to parse
@@ -308,10 +308,20 @@ def extract_season_episode(filename):
308
308
  Returns:
309
309
  tuple: (season_number, episode_number)
310
310
  """
311
- match = re.search(r'S(\d+)E(\d+)', filename)
312
- if match:
313
- return int(match.group(1)), int(match.group(2))
311
+ # List of patterns to try
312
+ patterns = [
313
+ r'S(\d+)E(\d+)', # S01E01
314
+ r'(\d+)x(\d+)', # 1x01 or 01x01
315
+ r'Season\s*(\d+).*?(\d+)' # Season 1 - 01
316
+ ]
317
+
318
+ for pattern in patterns:
319
+ match = re.search(pattern, filename, re.IGNORECASE)
320
+ if match:
321
+ return int(match.group(1)), int(match.group(2))
322
+
314
323
  return None, None
324
+
315
325
  def process_srt_files(show_dir):
316
326
  """
317
327
  Process all SRT files in the given directory and its subdirectories.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: mkv-episode-matcher
3
- Version: 0.3.4
3
+ Version: 0.3.6
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -51,46 +51,31 @@ Automatically match and rename your MKV TV episodes using The Movie Database (TM
51
51
  - ✨ **Bulk Processing**: Handle entire seasons at once
52
52
  - 🧪 **Dry Run Mode**: Test changes before applying
53
53
 
54
+ ## Prerequisites
55
+
56
+ - Python 3.9 or higher
57
+ - [FFmpeg](https://ffmpeg.org/download.html) installed and available in system PATH
58
+ - [Tesseract OCR](https://github.com/UB-Mannheim/tesseract/wiki) installed (required for image-based subtitle processing)
59
+ - TMDb API key
60
+ - OpenSubtitles account (optional, for subtitle downloads)
61
+
54
62
  ## Quick Start
55
63
 
56
64
  1. Install the package:
57
65
  ```bash
58
66
  pip install mkv-episode-matcher
59
67
  ```
68
+ 2. Download .srt subtitle files to ~/.mkv-episode-matcher/cache/data/Show Name/
60
69
 
61
- 2. Run on your show directory:
70
+ 3. Run on your show directory:
62
71
  ```bash
63
- mkv-match --show-dir "path/to/your/show" --season 1
72
+ mkv-match --show-dir "path/to/your/show"
64
73
  ```
65
74
 
66
- ## Requirements
67
-
68
- - Python 3.8 or higher
69
- - TMDb API key
70
- - OpenSubtitles account (optional, for subtitle downloads)
71
-
72
75
  ## Documentation
73
76
 
74
77
  Full documentation is available at [https://jsakkos.github.io/mkv-episode-matcher/](https://jsakkos.github.io/mkv-episode-matcher/)
75
78
 
76
- ## Basic Usage
77
-
78
- ```python
79
- from mkv_episode_matcher import process_show
80
-
81
- # Process all seasons
82
- process_show()
83
-
84
- # Process specific season
85
- process_show(season=1)
86
-
87
- # Test run without making changes
88
- process_show(season=1, dry_run=True)
89
-
90
- # Process and download subtitles
91
- process_show(get_subs=True)
92
- ```
93
-
94
79
  ## Directory Structure
95
80
 
96
81
  MKV Episode Matcher expects your TV shows to be organized as follows:
@@ -105,6 +90,23 @@ Show Name/
105
90
  │ └── episode2.mkv
106
91
  ```
107
92
 
93
+ ## Reference Subtitle File Structure
94
+
95
+ Subtitle files that are not automatically downloaded using the `--get-subs` flag should be placed in the cache directory and named as follows:
96
+
97
+ ```
98
+
99
+ ~/.mkv-episode-matcher/cache/data/Show Name/
100
+ ├── Show Name - S01E01.srt
101
+ ├── Show Name - S01E02.srt
102
+ └── ...
103
+ ```
104
+
105
+ On Windows, the cache directory is located at `C:\Users\{username}\.mkv-episode-matcher\cache\data\`
106
+
107
+ Reference subtitle files should follow this naming pattern:
108
+ `{show_name} - S{season:02d}E{episode:02d}.srt`
109
+
108
110
  ## Contributing
109
111
 
110
112
  1. Fork the repository
@@ -2,13 +2,13 @@ mkv_episode_matcher/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb
2
2
  mkv_episode_matcher/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
3
3
  mkv_episode_matcher/__main__.py,sha256=3ZcCUxeI7rUA-4oiCD2WXBiOFJAqLsVVWfZKN446FwQ,6792
4
4
  mkv_episode_matcher/config.py,sha256=zDDKBcsDt5fME9BRqiTi7yWKeast1pZh36BNYMvIBYM,2419
5
- mkv_episode_matcher/episode_identification.py,sha256=NopEkcBFFUjjrAujogeVcdISv8UZHFjYr5RJLM0j468,5875
6
- mkv_episode_matcher/episode_matcher.py,sha256=BJ76DPxsmZs-KfHZZ_0WvKSBZWXsUEO6lW34YdYEaxM,3979
5
+ mkv_episode_matcher/episode_identification.py,sha256=_6M1UJkq1RGfmLI32u9dNOVvgp5Vf2MjqW2MTx0Gl8E,10329
6
+ mkv_episode_matcher/episode_matcher.py,sha256=vunYpHQxyXo3l88BUScXa7_kMYMCV1pXpQxaLa-plZA,4325
7
7
  mkv_episode_matcher/mkv_to_srt.py,sha256=4yxBHRVhgVby0UtQ2aTXGuoQpid8pkgjMIaHU6GCdzc,10857
8
8
  mkv_episode_matcher/speech_to_text.py,sha256=-bnGvmtPCKyHFPEaXwIcEYTf_P13rNpAJA-2UFeRFrs,2806
9
9
  mkv_episode_matcher/subtitle_utils.py,sha256=rYSbd393pKYQW0w4sXgals02WFGqMYYYkQHDbEkWF8c,2666
10
10
  mkv_episode_matcher/tmdb_client.py,sha256=LbMCgjmp7sCbrQo_CDlpcnryKPz5S7inE24YY9Pyjk4,4172
11
- mkv_episode_matcher/utils.py,sha256=Txnn24ou7Pg3iMq9WrT3nwBRlRP8JEuZQ2ZYW7uesp4,13972
11
+ mkv_episode_matcher/utils.py,sha256=VASbougN3rb2iu40iZWkGjKIbahW713TOrFBo_TR9wo,14269
12
12
  mkv_episode_matcher/libraries/pgs2srt/.gitignore,sha256=mt3uxWYZaFurMw_yGE258gWhtGKPVR7e3Ll4ALJpyj4,23
13
13
  mkv_episode_matcher/libraries/pgs2srt/README.md,sha256=olb25G17tj0kxPgp_LcH5I2QWXjgP1m8JFyjYRGz4UU,1374
14
14
  mkv_episode_matcher/libraries/pgs2srt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,8 +19,8 @@ mkv_episode_matcher/libraries/pgs2srt/requirements.txt,sha256=sg87dqWw_qpbwciw-M
19
19
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py,sha256=geT1LXdVd8yED9zoJ9K1XfP2JzGcM7u1SslHYrJI09o,10061
20
20
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py,sha256=GKtVy_Lxv-z27mkRG8pJF2znKWXwZTot7jL6kN-zIxM,10503
21
21
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py,sha256=AlJHUYXl85J95OzGRik-AHVfzDd7Q8BJCvD4Nr8kRIk,938598
22
- mkv_episode_matcher-0.3.4.dist-info/METADATA,sha256=QPa0StsF0ADrzxSEswvQ4tTbBkztRjR82hNFsXnJwCc,4640
23
- mkv_episode_matcher-0.3.4.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
24
- mkv_episode_matcher-0.3.4.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
25
- mkv_episode_matcher-0.3.4.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
26
- mkv_episode_matcher-0.3.4.dist-info/RECORD,,
22
+ mkv_episode_matcher-0.3.6.dist-info/METADATA,sha256=gjlfeFtCjCCjf2qUWTGJ9zTvSSU_1KRhp1ZtIBISecI,5113
23
+ mkv_episode_matcher-0.3.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
24
+ mkv_episode_matcher-0.3.6.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
25
+ mkv_episode_matcher-0.3.6.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
26
+ mkv_episode_matcher-0.3.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.6.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5