mkv-episode-matcher 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic. Click here for more details.

@@ -9,6 +9,10 @@ from loguru import logger
9
9
  import whisper
10
10
  import numpy as np
11
11
  import re
12
+ from pathlib import Path
13
+ import chardet
14
+ from loguru import logger
15
+
12
16
  class EpisodeMatcher:
13
17
  def __init__(self, cache_dir, show_name, min_confidence=0.6):
14
18
  self.cache_dir = Path(cache_dir)
@@ -50,34 +54,32 @@ class EpisodeMatcher:
50
54
  return str(chunk_path)
51
55
 
52
56
  def load_reference_chunk(self, srt_file, chunk_idx):
53
- """Load reference subtitles for a specific time chunk."""
57
+ """
58
+ Load reference subtitles for a specific time chunk with robust encoding handling.
59
+
60
+ Args:
61
+ srt_file (str or Path): Path to the SRT file
62
+ chunk_idx (int): Index of the chunk to load
63
+
64
+ Returns:
65
+ str: Combined text from the subtitle chunk
66
+ """
54
67
  chunk_start = chunk_idx * self.chunk_duration
55
68
  chunk_end = chunk_start + self.chunk_duration
56
- text_lines = []
57
69
 
58
- with open(srt_file, 'r', encoding='utf-8') as f:
59
- content = f.read().strip()
70
+ try:
71
+ # Read the file content using our robust reader
72
+ reader = SubtitleReader()
73
+ content = reader.read_srt_file(srt_file)
60
74
 
61
- for block in content.split('\n\n'):
62
- lines = block.split('\n')
63
- if len(lines) < 3 or '-->' not in lines[1]: # Skip malformed blocks
64
- continue
65
-
66
- try:
67
- timestamp = lines[1]
68
- text = ' '.join(lines[2:])
69
-
70
- end_time = timestamp.split(' --> ')[1].strip()
71
- hours, minutes, seconds = map(float, end_time.replace(',','.').split(':'))
72
- total_seconds = hours * 3600 + minutes * 60 + seconds
73
-
74
- if chunk_start <= total_seconds <= chunk_end:
75
- text_lines.append(text)
76
-
77
- except (IndexError, ValueError):
78
- continue
79
-
80
- return ' '.join(text_lines)
75
+ # Extract subtitles for the time chunk
76
+ text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
77
+
78
+ return ' '.join(text_lines)
79
+
80
+ except Exception as e:
81
+ logger.error(f"Error loading reference chunk from {srt_file}: {e}")
82
+ return ''
81
83
 
82
84
  def identify_episode(self, video_file, temp_dir, season_number):
83
85
  try:
@@ -147,4 +149,121 @@ class EpisodeMatcher:
147
149
  finally:
148
150
  # Cleanup temp files
149
151
  for file in self.temp_dir.glob("chunk_*.wav"):
150
- file.unlink()
152
+ file.unlink()
153
+
154
def detect_file_encoding(file_path):
    """
    Guess the text encoding of a file with chardet.

    The whole file is read in binary mode and handed to ``chardet.detect``.

    Args:
        file_path (str or Path): Path to the file

    Returns:
        str: The encoding name reported by chardet, or 'utf-8' when chardet
        reports no encoding or when reading/detection raises any exception
    """
    try:
        with open(file_path, 'rb') as handle:
            guess = chardet.detect(handle.read())
        encoding, confidence = guess['encoding'], guess['confidence']
        logger.debug(f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}")
    except Exception as e:
        # Best effort: any failure (missing file, bad result, ...) falls back to UTF-8
        logger.warning(f"Error detecting encoding for {file_path}: {e}")
        return 'utf-8'
    return encoding or 'utf-8'
176
+
177
def read_file_with_fallback(file_path, encodings=None):
    """
    Read a text file, trying several encodings until one decodes it.

    Args:
        file_path (str or Path): Path to the file
        encodings (list): Encoding names to try, in order. When omitted, the
            encoding reported by detect_file_encoding() is tried first,
            followed by common subtitle encodings.

    Returns:
        str: File contents

    Raises:
        ValueError: If the file cannot be decoded with any of the encodings
    """
    if encodings is None:
        detected = detect_file_encoding(file_path)
        # cp1252 must come before latin-1: latin-1 maps every possible byte
        # and therefore never fails, so anything listed after it would never
        # be tried ('iso-8859-1' is just an alias of latin-1).
        encodings = [detected, 'utf-8', 'cp1252', 'latin-1']

    file_path = Path(file_path)
    errors = []
    tried = set()

    for encoding in encodings:
        # Skip empty entries and case-insensitive duplicates (e.g. when the
        # detected encoding is already one of the fallbacks)
        key = encoding.lower() if encoding else None
        if not key or key in tried:
            continue
        tried.add(key)

        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
        except (UnicodeError, LookupError) as e:
            # LookupError: the detector can report codec names that Python
            # does not provide; treat that like a failed decode and move on
            errors.append(f"{encoding}: {str(e)}")
            continue

        logger.debug(f"Successfully read {file_path} using {encoding} encoding")
        return content

    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(errors)
    logger.error(error_msg)
    raise ValueError(error_msg)
212
+
213
class SubtitleReader:
    """Helper class for reading and parsing subtitle files."""

    @staticmethod
    def parse_timestamp(timestamp):
        """
        Parse an SRT timestamp into seconds.

        Args:
            timestamp (str): Timestamp such as ``01:02:03,500``; a '.' is
                accepted in place of the ','

        Returns:
            float: Number of seconds represented by the timestamp

        Raises:
            ValueError: If the timestamp does not have three ':'-separated
                numeric fields
        """
        hours, minutes, seconds = timestamp.replace(',', '.').split(':')
        return float(hours) * 3600 + float(minutes) * 60 + float(seconds)

    @staticmethod
    def read_srt_file(file_path):
        """
        Read an SRT file and return its contents with robust encoding handling.

        Args:
            file_path (str or Path): Path to the SRT file

        Returns:
            str: Contents of the SRT file
        """
        return read_file_with_fallback(file_path)

    @staticmethod
    def extract_subtitle_chunk(content, start_time, end_time):
        """
        Extract subtitle text for a specific time window.

        A cue belongs to the window when its END timestamp lies within
        [start_time, end_time] (both bounds inclusive); the cue's start
        timestamp is not consulted. Malformed cues are skipped.

        Args:
            content (str): Full SRT file content
            start_time (float): Chunk start time in seconds
            end_time (float): Chunk end time in seconds

        Returns:
            list: List of subtitle texts within the time window
        """
        text_lines = []

        # Normalise Windows line endings, then split on any run of blank
        # (or whitespace-only) lines so that extra blank lines between cues
        # do not shift the timestamp off the second line of a block.
        normalised = content.replace('\r\n', '\n').strip()

        for block in re.split(r'\n\s*\n', normalised):
            lines = block.strip().split('\n')
            if len(lines) < 3 or '-->' not in lines[1]:
                continue

            try:
                # Take the first token after the arrow: tolerates missing
                # spaces around '-->' and trailing cue coordinates.
                end_stamp = lines[1].split('-->')[1].split()[0]
                total_seconds = SubtitleReader.parse_timestamp(end_stamp)
            except (IndexError, ValueError) as e:
                logger.warning(f"Error parsing subtitle block: {e}")
                continue

            if start_time <= total_seconds <= end_time:
                text_lines.append(' '.join(lines[2:]))

        return text_lines
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: mkv-episode-matcher
3
- Version: 0.3.4
3
+ Version: 0.3.5
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -51,6 +51,14 @@ Automatically match and rename your MKV TV episodes using The Movie Database (TM
51
51
  - ✨ **Bulk Processing**: Handle entire seasons at once
52
52
  - 🧪 **Dry Run Mode**: Test changes before applying
53
53
 
54
+ ## Prerequisites
55
+
56
+ - Python 3.9 or higher
57
+ - [FFmpeg](https://ffmpeg.org/download.html) installed and available in system PATH
58
+ - [Tesseract OCR](https://github.com/UB-Mannheim/tesseract/wiki) installed (required for image-based subtitle processing)
59
+ - TMDb API key
60
+ - OpenSubtitles account (optional, for subtitle downloads)
61
+
54
62
  ## Quick Start
55
63
 
56
64
  1. Install the package:
@@ -60,37 +68,13 @@ pip install mkv-episode-matcher
60
68
 
61
69
  2. Run on your show directory:
62
70
  ```bash
63
- mkv-match --show-dir "path/to/your/show" --season 1
71
+ mkv-match --show-dir "path/to/your/show" --get-subs true
64
72
  ```
65
73
 
66
- ## Requirements
67
-
68
- - Python 3.8 or higher
69
- - TMDb API key
70
- - OpenSubtitles account (optional, for subtitle downloads)
71
-
72
74
  ## Documentation
73
75
 
74
76
  Full documentation is available at [https://jsakkos.github.io/mkv-episode-matcher/](https://jsakkos.github.io/mkv-episode-matcher/)
75
77
 
76
- ## Basic Usage
77
-
78
- ```python
79
- from mkv_episode_matcher import process_show
80
-
81
- # Process all seasons
82
- process_show()
83
-
84
- # Process specific season
85
- process_show(season=1)
86
-
87
- # Test run without making changes
88
- process_show(season=1, dry_run=True)
89
-
90
- # Process and download subtitles
91
- process_show(get_subs=True)
92
- ```
93
-
94
78
  ## Directory Structure
95
79
 
96
80
  MKV Episode Matcher expects your TV shows to be organized as follows:
@@ -105,6 +89,23 @@ Show Name/
105
89
  │ └── episode2.mkv
106
90
  ```
107
91
 
92
+ ## Reference Subtitle File Structure
93
+
94
+ Subtitle files that you provide yourself (instead of downloading them with the `--get-subs` flag) should be placed in the cache directory and named as follows:
95
+
96
+ ```
97
+
98
+ ~/.mkv-episode-matcher/cache/data/Show Name/
99
+ ├── Show Name - S01E01.srt
100
+ ├── Show Name - S01E02.srt
101
+ └── ...
102
+ ```
103
+
104
+ On Windows, the cache directory is located at `C:\Users\{username}\.mkv-episode-matcher\cache\data\`
105
+
106
+ Reference subtitle files should follow this naming pattern:
107
+ `{show_name} - S{season:02d}E{episode:02d}.srt`
108
+
108
109
  ## Contributing
109
110
 
110
111
  1. Fork the repository
@@ -2,7 +2,7 @@ mkv_episode_matcher/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb
2
2
  mkv_episode_matcher/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
3
3
  mkv_episode_matcher/__main__.py,sha256=3ZcCUxeI7rUA-4oiCD2WXBiOFJAqLsVVWfZKN446FwQ,6792
4
4
  mkv_episode_matcher/config.py,sha256=zDDKBcsDt5fME9BRqiTi7yWKeast1pZh36BNYMvIBYM,2419
5
- mkv_episode_matcher/episode_identification.py,sha256=NopEkcBFFUjjrAujogeVcdISv8UZHFjYr5RJLM0j468,5875
5
+ mkv_episode_matcher/episode_identification.py,sha256=xYqHq1YFbZT8L1Gfa_DhSStrLblKTWxZte__B0qikQU,9739
6
6
  mkv_episode_matcher/episode_matcher.py,sha256=BJ76DPxsmZs-KfHZZ_0WvKSBZWXsUEO6lW34YdYEaxM,3979
7
7
  mkv_episode_matcher/mkv_to_srt.py,sha256=4yxBHRVhgVby0UtQ2aTXGuoQpid8pkgjMIaHU6GCdzc,10857
8
8
  mkv_episode_matcher/speech_to_text.py,sha256=-bnGvmtPCKyHFPEaXwIcEYTf_P13rNpAJA-2UFeRFrs,2806
@@ -19,8 +19,8 @@ mkv_episode_matcher/libraries/pgs2srt/requirements.txt,sha256=sg87dqWw_qpbwciw-M
19
19
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py,sha256=geT1LXdVd8yED9zoJ9K1XfP2JzGcM7u1SslHYrJI09o,10061
20
20
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py,sha256=GKtVy_Lxv-z27mkRG8pJF2znKWXwZTot7jL6kN-zIxM,10503
21
21
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py,sha256=AlJHUYXl85J95OzGRik-AHVfzDd7Q8BJCvD4Nr8kRIk,938598
22
- mkv_episode_matcher-0.3.4.dist-info/METADATA,sha256=QPa0StsF0ADrzxSEswvQ4tTbBkztRjR82hNFsXnJwCc,4640
23
- mkv_episode_matcher-0.3.4.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
24
- mkv_episode_matcher-0.3.4.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
25
- mkv_episode_matcher-0.3.4.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
26
- mkv_episode_matcher-0.3.4.dist-info/RECORD,,
22
+ mkv_episode_matcher-0.3.5.dist-info/METADATA,sha256=mTKSbM9Ai5UDKyj2K4AKgkdjdPVEaxylfHHp95wVZv4,5048
23
+ mkv_episode_matcher-0.3.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
24
+ mkv_episode_matcher-0.3.5.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
25
+ mkv_episode_matcher-0.3.5.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
26
+ mkv_episode_matcher-0.3.5.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.6.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5