mkv-episode-matcher 0.3.3__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic.

Files changed (53)
  1. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/.coverage +0 -0
  2. mkv_episode_matcher-0.3.5/.github/workflows/tests.yml +40 -0
  3. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/PKG-INFO +28 -27
  4. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/README.md +26 -25
  5. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/docs/quickstart.md +9 -15
  6. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/episode_identification.py +144 -25
  7. mkv_episode_matcher-0.3.5/mkv_episode_matcher/subtitle_utils.py +82 -0
  8. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/utils.py +43 -43
  9. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher.egg-info/PKG-INFO +28 -27
  10. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher.egg-info/SOURCES.txt +2 -1
  11. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/pyproject.toml +1 -0
  12. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/setup.cfg +1 -1
  13. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/tests/test_main.py +48 -18
  14. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/uv.lock +12 -1
  15. mkv_episode_matcher-0.3.3/tests/test_improvements.py +0 -59
  16. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/.gitattributes +0 -0
  17. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/.github/funding.yml +0 -0
  18. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/.github/workflows/documentation.yml +0 -0
  19. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/.github/workflows/python-publish.yml +0 -0
  20. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/.gitignore +0 -0
  21. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/.gitmodules +0 -0
  22. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/.python-version +0 -0
  23. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/.vscode/settings.json +0 -0
  24. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/docs/api/index.md +0 -0
  25. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/docs/cli.md +0 -0
  26. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/docs/configuration.md +0 -0
  27. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/docs/installation.md +0 -0
  28. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/docs/tips.md +0 -0
  29. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkdocs.yml +0 -0
  30. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/.gitattributes +0 -0
  31. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/__init__.py +0 -0
  32. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/__main__.py +0 -0
  33. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/config.py +0 -0
  34. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/episode_matcher.py +0 -0
  35. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -0
  36. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -0
  37. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -0
  38. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -0
  39. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/README.md +0 -0
  40. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  41. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -0
  42. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -0
  43. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -0
  44. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -0
  45. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/mkv_to_srt.py +0 -0
  46. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/speech_to_text.py +0 -0
  47. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher/tmdb_client.py +0 -0
  48. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher.egg-info/dependency_links.txt +0 -0
  49. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher.egg-info/entry_points.txt +0 -0
  50. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher.egg-info/requires.txt +0 -0
  51. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/mkv_episode_matcher.egg-info/top_level.txt +0 -0
  52. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/setup.py +0 -0
  53. {mkv_episode_matcher-0.3.3 → mkv_episode_matcher-0.3.5}/tests/__init__.py +0 -0
@@ -0,0 +1,40 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main, master]
6
+ pull_request:
7
+ branches: [main, master]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version:
15
+ - "3.9"
16
+ - "3.10"
17
+ - "3.11"
18
+ - "3.12"
19
+
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+
23
+ - name: Install uv and set the python version
24
+ uses: astral-sh/setup-uv@v4
25
+ with:
26
+ python-version: ${{ matrix.python-version }}
27
+
28
+ - name: Install dependencies
29
+ run: |
30
+ uv venv
31
+ uv pip install -e .
32
+
33
+ - name: Run tests with pytest and coverage
34
+ run: |
35
+ uv run --dev pytest --cov-branch --cov-report=xml
36
+
37
+ - name: Upload coverage reports to Codecov
38
+ uses: codecov/codecov-action@v5
39
+ with:
40
+ token: ${{ secrets.CODECOV_TOKEN }}
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: mkv-episode-matcher
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -51,6 +51,14 @@ Automatically match and rename your MKV TV episodes using The Movie Database (TM
51
51
  - ✨ **Bulk Processing**: Handle entire seasons at once
52
52
  - 🧪 **Dry Run Mode**: Test changes before applying
53
53
 
54
+ ## Prerequisites
55
+
56
+ - Python 3.9 or higher
57
+ - [FFmpeg](https://ffmpeg.org/download.html) installed and available in system PATH
58
+ - [Tesseract OCR](https://github.com/UB-Mannheim/tesseract/wiki) installed (required for image-based subtitle processing)
59
+ - TMDb API key
60
+ - OpenSubtitles account (optional, for subtitle downloads)
61
+
54
62
  ## Quick Start
55
63
 
56
64
  1. Install the package:
@@ -60,37 +68,13 @@ pip install mkv-episode-matcher
60
68
 
61
69
  2. Run on your show directory:
62
70
  ```bash
63
- mkv-match --show-dir "path/to/your/show" --season 1
71
+ mkv-match --show-dir "path/to/your/show" --get-subs true
64
72
  ```
65
73
 
66
- ## Requirements
67
-
68
- - Python 3.8 or higher
69
- - TMDb API key
70
- - OpenSubtitles account (optional, for subtitle downloads)
71
-
72
74
  ## Documentation
73
75
 
74
76
  Full documentation is available at [https://jsakkos.github.io/mkv-episode-matcher/](https://jsakkos.github.io/mkv-episode-matcher/)
75
77
 
76
- ## Basic Usage
77
-
78
- ```python
79
- from mkv_episode_matcher import process_show
80
-
81
- # Process all seasons
82
- process_show()
83
-
84
- # Process specific season
85
- process_show(season=1)
86
-
87
- # Test run without making changes
88
- process_show(season=1, dry_run=True)
89
-
90
- # Process and download subtitles
91
- process_show(get_subs=True)
92
- ```
93
-
94
78
  ## Directory Structure
95
79
 
96
80
  MKV Episode Matcher expects your TV shows to be organized as follows:
@@ -105,6 +89,23 @@ Show Name/
105
89
  │ └── episode2.mkv
106
90
  ```
107
91
 
92
+ ## Reference Subtitle File Structure
93
+
94
+ Subtitle files that are not automatically downloaded using the `--get-subs` flag should be named as follows:
95
+
96
+ ```
97
+
98
+ ~/.mkv-episode-matcher/cache/data/Show Name/
99
+ ├── Show Name - S01E01.srt
100
+ ├── Show Name - S01E02.srt
101
+ └── ...
102
+ ```
103
+
104
+ On Windows, the cache directory is located at `C:\Users\{username}\.mkv-episode-matcher\cache\data\`
105
+
106
+ Reference subtitle files should follow this naming pattern:
107
+ `{show_name} - S{season:02d}E{episode:02d}.srt`
108
+
108
109
  ## Contributing
109
110
 
110
111
  1. Fork the repository
@@ -22,6 +22,14 @@ Automatically match and rename your MKV TV episodes using The Movie Database (TM
22
22
  - ✨ **Bulk Processing**: Handle entire seasons at once
23
23
  - 🧪 **Dry Run Mode**: Test changes before applying
24
24
 
25
+ ## Prerequisites
26
+
27
+ - Python 3.9 or higher
28
+ - [FFmpeg](https://ffmpeg.org/download.html) installed and available in system PATH
29
+ - [Tesseract OCR](https://github.com/UB-Mannheim/tesseract/wiki) installed (required for image-based subtitle processing)
30
+ - TMDb API key
31
+ - OpenSubtitles account (optional, for subtitle downloads)
32
+
25
33
  ## Quick Start
26
34
 
27
35
  1. Install the package:
@@ -31,37 +39,13 @@ pip install mkv-episode-matcher
31
39
 
32
40
  2. Run on your show directory:
33
41
  ```bash
34
- mkv-match --show-dir "path/to/your/show" --season 1
42
+ mkv-match --show-dir "path/to/your/show" --get-subs true
35
43
  ```
36
44
 
37
- ## Requirements
38
-
39
- - Python 3.8 or higher
40
- - TMDb API key
41
- - OpenSubtitles account (optional, for subtitle downloads)
42
-
43
45
  ## Documentation
44
46
 
45
47
  Full documentation is available at [https://jsakkos.github.io/mkv-episode-matcher/](https://jsakkos.github.io/mkv-episode-matcher/)
46
48
 
47
- ## Basic Usage
48
-
49
- ```python
50
- from mkv_episode_matcher import process_show
51
-
52
- # Process all seasons
53
- process_show()
54
-
55
- # Process specific season
56
- process_show(season=1)
57
-
58
- # Test run without making changes
59
- process_show(season=1, dry_run=True)
60
-
61
- # Process and download subtitles
62
- process_show(get_subs=True)
63
- ```
64
-
65
49
  ## Directory Structure
66
50
 
67
51
  MKV Episode Matcher expects your TV shows to be organized as follows:
@@ -76,6 +60,23 @@ Show Name/
76
60
  │ └── episode2.mkv
77
61
  ```
78
62
 
63
+ ## Reference Subtitle File Structure
64
+
65
+ Subtitle files that are not automatically downloaded using the `--get-subs` flag should be named as follows:
66
+
67
+ ```
68
+
69
+ ~/.mkv-episode-matcher/cache/data/Show Name/
70
+ ├── Show Name - S01E01.srt
71
+ ├── Show Name - S01E02.srt
72
+ └── ...
73
+ ```
74
+
75
+ On Windows, the cache directory is located at `C:\Users\{username}\.mkv-episode-matcher\cache\data\`
76
+
77
+ Reference subtitle files should follow this naming pattern:
78
+ `{show_name} - S{season:02d}E{episode:02d}.srt`
79
+
79
80
  ## Contributing
80
81
 
81
82
  1. Fork the repository
@@ -41,28 +41,22 @@ Show Name/
41
41
  │ ├── episode1.mkv
42
42
  │ └── episode2.mkv
43
43
  ```
44
+ <!-- Add a note about the .srt reference files -->
44
45
 
45
- ## Python API Usage
46
+ ## Reference Subtitle File Structure
46
47
 
47
- ```python
48
- from mkv_episode_matcher import process_show
48
+ Subtitle files that are not automatically downloaded using the `--get-subs` flag should be named as follows:
49
49
 
50
- # Process all seasons
51
- process_show()
52
-
53
- # Process specific season
54
- process_show(season=1)
55
-
56
- # Test run
57
- process_show(season=1, dry_run=True)
58
-
59
- # With subtitles
60
- process_show(season=1, get_subs=True)
50
+ ```plaintext
51
+ ~/.mkv-episode-matcher/cache/data/Show Name/
52
+ ├── Show Name - S01E01.srt
53
+ ├── Show Name - S01E02.srt
54
+ └── ...
61
55
  ```
62
56
 
63
57
  ## Configuration
64
58
 
65
- Create a configuration file at `~/.mkv-episode-matcher/config.ini`:
59
+ The configuration file is automatically generated at `~/.mkv-episode-matcher/config.ini`:
66
60
 
67
61
  ```ini
68
62
  [Config]
@@ -9,6 +9,10 @@ from loguru import logger
9
9
  import whisper
10
10
  import numpy as np
11
11
  import re
12
+ from pathlib import Path
13
+ import chardet
14
+ from loguru import logger
15
+
12
16
  class EpisodeMatcher:
13
17
  def __init__(self, cache_dir, show_name, min_confidence=0.6):
14
18
  self.cache_dir = Path(cache_dir)
@@ -50,34 +54,32 @@ class EpisodeMatcher:
50
54
  return str(chunk_path)
51
55
 
52
56
  def load_reference_chunk(self, srt_file, chunk_idx):
53
- """Load reference subtitles for a specific time chunk."""
57
+ """
58
+ Load reference subtitles for a specific time chunk with robust encoding handling.
59
+
60
+ Args:
61
+ srt_file (str or Path): Path to the SRT file
62
+ chunk_idx (int): Index of the chunk to load
63
+
64
+ Returns:
65
+ str: Combined text from the subtitle chunk
66
+ """
54
67
  chunk_start = chunk_idx * self.chunk_duration
55
68
  chunk_end = chunk_start + self.chunk_duration
56
- text_lines = []
57
69
 
58
- with open(srt_file, 'r', encoding='utf-8') as f:
59
- content = f.read().strip()
70
+ try:
71
+ # Read the file content using our robust reader
72
+ reader = SubtitleReader()
73
+ content = reader.read_srt_file(srt_file)
60
74
 
61
- for block in content.split('\n\n'):
62
- lines = block.split('\n')
63
- if len(lines) < 3 or '-->' not in lines[1]: # Skip malformed blocks
64
- continue
65
-
66
- try:
67
- timestamp = lines[1]
68
- text = ' '.join(lines[2:])
69
-
70
- end_time = timestamp.split(' --> ')[1].strip()
71
- hours, minutes, seconds = map(float, end_time.replace(',','.').split(':'))
72
- total_seconds = hours * 3600 + minutes * 60 + seconds
73
-
74
- if chunk_start <= total_seconds <= chunk_end:
75
- text_lines.append(text)
76
-
77
- except (IndexError, ValueError):
78
- continue
79
-
80
- return ' '.join(text_lines)
75
+ # Extract subtitles for the time chunk
76
+ text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
77
+
78
+ return ' '.join(text_lines)
79
+
80
+ except Exception as e:
81
+ logger.error(f"Error loading reference chunk from {srt_file}: {e}")
82
+ return ''
81
83
 
82
84
  def identify_episode(self, video_file, temp_dir, season_number):
83
85
  try:
@@ -147,4 +149,121 @@ class EpisodeMatcher:
147
149
  finally:
148
150
  # Cleanup temp files
149
151
  for file in self.temp_dir.glob("chunk_*.wav"):
150
- file.unlink()
152
+ file.unlink()
153
+
154
+ def detect_file_encoding(file_path):
155
+ """
156
+ Detect the encoding of a file using chardet.
157
+
158
+ Args:
159
+ file_path (str or Path): Path to the file
160
+
161
+ Returns:
162
+ str: Detected encoding, defaults to 'utf-8' if detection fails
163
+ """
164
+ try:
165
+ with open(file_path, 'rb') as f:
166
+ raw_data = f.read()
167
+ result = chardet.detect(raw_data)
168
+ encoding = result['encoding']
169
+ confidence = result['confidence']
170
+
171
+ logger.debug(f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}")
172
+ return encoding if encoding else 'utf-8'
173
+ except Exception as e:
174
+ logger.warning(f"Error detecting encoding for {file_path}: {e}")
175
+ return 'utf-8'
176
+
177
+ def read_file_with_fallback(file_path, encodings=None):
178
+ """
179
+ Read a file trying multiple encodings in order of preference.
180
+
181
+ Args:
182
+ file_path (str or Path): Path to the file
183
+ encodings (list): List of encodings to try, defaults to common subtitle encodings
184
+
185
+ Returns:
186
+ str: File contents
187
+
188
+ Raises:
189
+ ValueError: If file cannot be read with any encoding
190
+ """
191
+ if encodings is None:
192
+ # First try detected encoding, then fallback to common subtitle encodings
193
+ detected = detect_file_encoding(file_path)
194
+ encodings = [detected, 'utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
195
+
196
+ file_path = Path(file_path)
197
+ errors = []
198
+
199
+ for encoding in encodings:
200
+ try:
201
+ with open(file_path, 'r', encoding=encoding) as f:
202
+ content = f.read()
203
+ logger.debug(f"Successfully read {file_path} using {encoding} encoding")
204
+ return content
205
+ except UnicodeDecodeError as e:
206
+ errors.append(f"{encoding}: {str(e)}")
207
+ continue
208
+
209
+ error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(errors)
210
+ logger.error(error_msg)
211
+ raise ValueError(error_msg)
212
+
213
+ class SubtitleReader:
214
+ """Helper class for reading and parsing subtitle files."""
215
+
216
+ @staticmethod
217
+ def parse_timestamp(timestamp):
218
+ """Parse SRT timestamp into seconds."""
219
+ hours, minutes, seconds = timestamp.replace(',', '.').split(':')
220
+ return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
221
+
222
+ @staticmethod
223
+ def read_srt_file(file_path):
224
+ """
225
+ Read an SRT file and return its contents with robust encoding handling.
226
+
227
+ Args:
228
+ file_path (str or Path): Path to the SRT file
229
+
230
+ Returns:
231
+ str: Contents of the SRT file
232
+ """
233
+ return read_file_with_fallback(file_path)
234
+
235
+ @staticmethod
236
+ def extract_subtitle_chunk(content, start_time, end_time):
237
+ """
238
+ Extract subtitle text for a specific time window.
239
+
240
+ Args:
241
+ content (str): Full SRT file content
242
+ start_time (float): Chunk start time in seconds
243
+ end_time (float): Chunk end time in seconds
244
+
245
+ Returns:
246
+ list: List of subtitle texts within the time window
247
+ """
248
+ text_lines = []
249
+
250
+ for block in content.strip().split('\n\n'):
251
+ lines = block.split('\n')
252
+ if len(lines) < 3 or '-->' not in lines[1]:
253
+ continue
254
+
255
+ try:
256
+ timestamp = lines[1]
257
+ text = ' '.join(lines[2:])
258
+
259
+ end_stamp = timestamp.split(' --> ')[1].strip()
260
+ total_seconds = SubtitleReader.parse_timestamp(end_stamp)
261
+
262
+ if start_time <= total_seconds <= end_time:
263
+ text_lines.append(text)
264
+
265
+ except (IndexError, ValueError) as e:
266
+ logger.warning(f"Error parsing subtitle block: {e}")
267
+ continue
268
+
269
+ return text_lines
@@ -0,0 +1,82 @@
1
+ from typing import List, Optional, Union
2
+ import os
3
+ import re
4
+
5
+ def generate_subtitle_patterns(series_name: str, season: int, episode: int) -> List[str]:
6
+ """
7
+ Generate various common subtitle filename patterns.
8
+
9
+ Args:
10
+ series_name (str): Name of the series
11
+ season (int): Season number
12
+ episode (int): Episode number
13
+
14
+ Returns:
15
+ List[str]: List of possible subtitle filenames
16
+ """
17
+ patterns = [
18
+ # Standard format: "Show Name - S01E02.srt"
19
+ f"{series_name} - S{season:02d}E{episode:02d}.srt",
20
+
21
+ # Season x Episode format: "Show Name - 1x02.srt"
22
+ f"{series_name} - {season}x{episode:02d}.srt",
23
+
24
+ # Separate season/episode: "Show Name - Season 1 Episode 02.srt"
25
+ f"{series_name} - Season {season} Episode {episode:02d}.srt",
26
+
27
+ # Compact format: "ShowName.S01E02.srt"
28
+ f"{series_name.replace(' ', '')}.S{season:02d}E{episode:02d}.srt",
29
+
30
+ # Numbered format: "Show Name 102.srt"
31
+ f"{series_name} {season:01d}{episode:02d}.srt",
32
+
33
+ # Dot format: "Show.Name.1x02.srt"
34
+ f"{series_name.replace(' ', '.')}.{season}x{episode:02d}.srt",
35
+
36
+ # Underscore format: "Show_Name_S01E02.srt"
37
+ f"{series_name.replace(' ', '_')}_S{season:02d}E{episode:02d}.srt",
38
+ ]
39
+
40
+ return patterns
41
+
42
+ def find_existing_subtitle(series_cache_dir: str, series_name: str, season: int, episode: int) -> Optional[str]:
43
+ """
44
+ Check for existing subtitle files in various naming formats.
45
+
46
+ Args:
47
+ series_cache_dir (str): Directory containing subtitle files
48
+ series_name (str): Name of the series
49
+ season (int): Season number
50
+ episode (int): Episode number
51
+
52
+ Returns:
53
+ Optional[str]: Path to existing subtitle file if found, None otherwise
54
+ """
55
+ patterns = generate_subtitle_patterns(series_name, season, episode)
56
+
57
+ for pattern in patterns:
58
+ filepath = os.path.join(series_cache_dir, pattern)
59
+ if os.path.exists(filepath):
60
+ return filepath
61
+
62
+ return None
63
+
64
+ def sanitize_filename(filename: str) -> str:
65
+ """
66
+ Sanitize filename by removing/replacing invalid characters.
67
+
68
+ Args:
69
+ filename (str): Original filename
70
+
71
+ Returns:
72
+ str: Sanitized filename
73
+ """
74
+ # Replace problematic characters
75
+ filename = filename.replace(':', ' -')
76
+ filename = filename.replace('/', '-')
77
+ filename = filename.replace('\\', '-')
78
+
79
+ # Remove any other invalid characters
80
+ filename = re.sub(r'[<>:"/\\|?*]', '', filename)
81
+
82
+ return filename.strip()
@@ -10,7 +10,7 @@ from opensubtitlescom import OpenSubtitles
10
10
  from mkv_episode_matcher.__main__ import CACHE_DIR, CONFIG_FILE
11
11
  from mkv_episode_matcher.config import get_config
12
12
  from mkv_episode_matcher.tmdb_client import fetch_season_details
13
-
13
+ from mkv_episode_matcher.subtitle_utils import find_existing_subtitle,sanitize_filename
14
14
  def get_valid_seasons(show_dir):
15
15
  """
16
16
  Get all season directories that contain MKV files.
@@ -128,20 +128,17 @@ def get_subtitles(show_id, seasons: set[int]):
128
128
  Args:
129
129
  show_id (int): The ID of the TV show.
130
130
  seasons (Set[int]): A set of season numbers for which subtitles should be retrieved.
131
-
132
- Returns:
133
- None
134
131
  """
135
-
136
132
  logger.info(f"Getting subtitles for show ID {show_id}")
137
133
  config = get_config(CONFIG_FILE)
138
134
  show_dir = config.get("show_dir")
139
- series_name = os.path.basename(show_dir)
135
+ series_name = sanitize_filename(os.path.basename(show_dir))
140
136
  tmdb_api_key = config.get("tmdb_api_key")
141
137
  open_subtitles_api_key = config.get("open_subtitles_api_key")
142
138
  open_subtitles_user_agent = config.get("open_subtitles_user_agent")
143
139
  open_subtitles_username = config.get("open_subtitles_username")
144
140
  open_subtitles_password = config.get("open_subtitles_password")
141
+
145
142
  if not all([
146
143
  show_dir,
147
144
  tmdb_api_key,
@@ -151,63 +148,66 @@ def get_subtitles(show_id, seasons: set[int]):
151
148
  open_subtitles_password,
152
149
  ]):
153
150
  logger.error("Missing configuration settings. Please run the setup script.")
151
+ return
152
+
154
153
  try:
155
- # Initialize the OpenSubtitles client
156
154
  subtitles = OpenSubtitles(open_subtitles_user_agent, open_subtitles_api_key)
157
-
158
- # Log in (retrieve auth token)
159
155
  subtitles.login(open_subtitles_username, open_subtitles_password)
160
156
  except Exception as e:
161
157
  logger.error(f"Failed to log in to OpenSubtitles: {e}")
162
158
  return
159
+
163
160
  for season in seasons:
164
161
  episodes = fetch_season_details(show_id, season)
165
162
  logger.info(f"Found {episodes} episodes in Season {season}")
166
163
 
167
164
  for episode in range(1, episodes + 1):
168
165
  logger.info(f"Processing Season {season}, Episode {episode}...")
166
+
169
167
  series_cache_dir = os.path.join(CACHE_DIR, "data", series_name)
170
168
  os.makedirs(series_cache_dir, exist_ok=True)
169
+
170
+ # Check for existing subtitle in any supported format
171
+ existing_subtitle = find_existing_subtitle(
172
+ series_cache_dir, series_name, season, episode
173
+ )
174
+
175
+ if existing_subtitle:
176
+ logger.info(f"Subtitle already exists: {os.path.basename(existing_subtitle)}")
177
+ continue
178
+
179
+ # Default to standard format for new downloads
171
180
  srt_filepath = os.path.join(
172
181
  series_cache_dir,
173
182
  f"{series_name} - S{season:02d}E{episode:02d}.srt",
174
183
  )
175
- if not os.path.exists(srt_filepath):
176
- # get the episode info from TMDB
177
- url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season}/episode/{episode}?api_key={tmdb_api_key}"
178
- response = requests.get(url)
179
- response.raise_for_status()
180
- episode_data = response.json()
181
- episode_data["name"]
182
- episode_id = episode_data["id"]
183
- # search for the subtitle
184
- response = subtitles.search(tmdb_id=episode_id, languages="en")
185
- if len(response.data) == 0:
186
- logger.warning(
187
- f"No subtitles found for {series_name} - S{season:02d}E{episode:02d}"
188
- )
189
-
190
- for subtitle in response.data:
191
- subtitle_dict = subtitle.to_dict()
192
- # Remove special characters and convert to uppercase
193
- filename_clean = re.sub(
194
- r"\W+", " ", subtitle_dict["file_name"]
195
- ).upper()
196
- if f"E{episode:02d}" in filename_clean:
197
- logger.info(f"Original filename: {subtitle_dict['file_name']}")
198
- srt_file = subtitles.download_and_save(subtitle)
199
- series_name = series_name.replace(":", " -")
200
- shutil.move(srt_file, srt_filepath)
201
- logger.info(f"Subtitle saved to {srt_filepath}")
202
- break
203
- else:
204
- continue
205
- else:
206
- logger.info(
207
- f"Subtitle already exists for {series_name} - S{season:02d}E{episode:02d}"
184
+
185
+ # get the episode info from TMDB
186
+ url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season}/episode/{episode}?api_key={tmdb_api_key}"
187
+ response = requests.get(url)
188
+ response.raise_for_status()
189
+ episode_data = response.json()
190
+ episode_id = episode_data["id"]
191
+
192
+ # search for the subtitle
193
+ response = subtitles.search(tmdb_id=episode_id, languages="en")
194
+ if len(response.data) == 0:
195
+ logger.warning(
196
+ f"No subtitles found for {series_name} - S{season:02d}E{episode:02d}"
208
197
  )
209
198
  continue
210
199
 
200
+ for subtitle in response.data:
201
+ subtitle_dict = subtitle.to_dict()
202
+ # Remove special characters and convert to uppercase
203
+ filename_clean = re.sub(r"\W+", " ", subtitle_dict["file_name"]).upper()
204
+ if f"E{episode:02d}" in filename_clean:
205
+ logger.info(f"Original filename: {subtitle_dict['file_name']}")
206
+ srt_file = subtitles.download_and_save(subtitle)
207
+ shutil.move(srt_file, srt_filepath)
208
+ logger.info(f"Subtitle saved to {srt_filepath}")
209
+ break
210
+
211
211
 
212
212
  def cleanup_ocr_files(show_dir):
213
213
  """
@@ -236,7 +236,7 @@ def clean_text(text):
236
236
  # Strip leading/trailing whitespace
237
237
  return cleaned_text.strip()
238
238
 
239
-
239
+ @logger.catch
240
240
  def process_reference_srt_files(series_name):
241
241
  """
242
242
  Process reference SRT files for a given series.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: mkv-episode-matcher
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -51,6 +51,14 @@ Automatically match and rename your MKV TV episodes using The Movie Database (TM
51
51
  - ✨ **Bulk Processing**: Handle entire seasons at once
52
52
  - 🧪 **Dry Run Mode**: Test changes before applying
53
53
 
54
+ ## Prerequisites
55
+
56
+ - Python 3.9 or higher
57
+ - [FFmpeg](https://ffmpeg.org/download.html) installed and available in system PATH
58
+ - [Tesseract OCR](https://github.com/UB-Mannheim/tesseract/wiki) installed (required for image-based subtitle processing)
59
+ - TMDb API key
60
+ - OpenSubtitles account (optional, for subtitle downloads)
61
+
54
62
  ## Quick Start
55
63
 
56
64
  1. Install the package:
@@ -60,37 +68,13 @@ pip install mkv-episode-matcher
60
68
 
61
69
  2. Run on your show directory:
62
70
  ```bash
63
- mkv-match --show-dir "path/to/your/show" --season 1
71
+ mkv-match --show-dir "path/to/your/show" --get-subs true
64
72
  ```
65
73
 
66
- ## Requirements
67
-
68
- - Python 3.8 or higher
69
- - TMDb API key
70
- - OpenSubtitles account (optional, for subtitle downloads)
71
-
72
74
  ## Documentation
73
75
 
74
76
  Full documentation is available at [https://jsakkos.github.io/mkv-episode-matcher/](https://jsakkos.github.io/mkv-episode-matcher/)
75
77
 
76
- ## Basic Usage
77
-
78
- ```python
79
- from mkv_episode_matcher import process_show
80
-
81
- # Process all seasons
82
- process_show()
83
-
84
- # Process specific season
85
- process_show(season=1)
86
-
87
- # Test run without making changes
88
- process_show(season=1, dry_run=True)
89
-
90
- # Process and download subtitles
91
- process_show(get_subs=True)
92
- ```
93
-
94
78
  ## Directory Structure
95
79
 
96
80
  MKV Episode Matcher expects your TV shows to be organized as follows:
@@ -105,6 +89,23 @@ Show Name/
105
89
  │ └── episode2.mkv
106
90
  ```
107
91
 
92
+ ## Reference Subtitle File Structure
93
+
94
+ Subtitle files that are not automatically downloaded using the `--get-subs` flag should be named as follows:
95
+
96
+ ```
97
+
98
+ ~/.mkv-episode-matcher/cache/data/Show Name/
99
+ ├── Show Name - S01E01.srt
100
+ ├── Show Name - S01E02.srt
101
+ └── ...
102
+ ```
103
+
104
+ On Windows, the cache directory is located at `C:\Users\{username}\.mkv-episode-matcher\cache\data\`
105
+
106
+ Reference subtitle files should follow this naming pattern:
107
+ `{show_name} - S{season:02d}E{episode:02d}.srt`
108
+
108
109
  ## Contributing
109
110
 
110
111
  1. Fork the repository
@@ -12,6 +12,7 @@ uv.lock
12
12
  .github/funding.yml
13
13
  .github/workflows/documentation.yml
14
14
  .github/workflows/python-publish.yml
15
+ .github/workflows/tests.yml
15
16
  .vscode/settings.json
16
17
  docs/cli.md
17
18
  docs/configuration.md
@@ -27,6 +28,7 @@ mkv_episode_matcher/episode_identification.py
27
28
  mkv_episode_matcher/episode_matcher.py
28
29
  mkv_episode_matcher/mkv_to_srt.py
29
30
  mkv_episode_matcher/speech_to_text.py
31
+ mkv_episode_matcher/subtitle_utils.py
30
32
  mkv_episode_matcher/tmdb_client.py
31
33
  mkv_episode_matcher/utils.py
32
34
  mkv_episode_matcher.egg-info/PKG-INFO
@@ -46,5 +48,4 @@ mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py
46
48
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py
47
49
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py
48
50
  tests/__init__.py
49
- tests/test_improvements.py
50
51
  tests/test_main.py
@@ -47,6 +47,7 @@ dev = [
47
47
  "pytest-cov>=6.0.0",
48
48
  "pytest>=8.3.3",
49
49
  "ruff>=0.8.0",
50
+ "chardet>=5.2.0",
50
51
  ]
51
52
 
52
53
 
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = mkv_episode_matcher
3
- version = 0.3.3
3
+ version = 0.3.5
4
4
  author = Jonathan Sakkos
5
5
  author_email = jonathansakkos@gmail.com
6
6
  description = The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
@@ -13,6 +13,31 @@ from mkv_episode_matcher.utils import (
13
13
  )
14
14
  from mkv_episode_matcher.episode_identification import EpisodeMatcher
15
15
  from mkv_episode_matcher.config import get_config, set_config
16
+ from unittest.mock import Mock, patch
17
+
18
+
19
+ # @pytest.fixture
20
+ # def mock_config():
21
+ # return {
22
+ # "tmdb_api_key": "test_key",
23
+ # "show_dir": "/test/path",
24
+ # "max_threads": 4,
25
+ # "tesseract_path": "/usr/bin/tesseract",
26
+ # }
27
+
28
+
29
+ @pytest.fixture
30
+ def mock_episode_data():
31
+ return {
32
+ "name": "Test Episode",
33
+ "season_number": 1,
34
+ "episode_number": 1,
35
+ "overview": "Test overview",
36
+ }
37
+
38
+ @pytest.fixture
39
+ def mock_seasons():
40
+ return ["/test/path/Season 1"]
16
41
 
17
42
  @pytest.fixture
18
43
  def temp_show_dir(tmp_path):
@@ -101,8 +126,8 @@ class TestEpisodeMatcher:
101
126
  return EpisodeMatcher(tmp_path, "Test Show")
102
127
 
103
128
  def test_clean_text(self, matcher):
104
- text = "Test [action] <tag> T-t-test"
105
- assert matcher.clean_text(text) == "test action tag test"
129
+ text = "Test [action] T-t-test"
130
+ assert matcher.clean_text(text) == "test action test"
106
131
 
107
132
  def test_chunk_score(self, matcher):
108
133
  score = matcher.chunk_score("Test dialogue", "test dialog")
@@ -116,22 +141,27 @@ class TestEpisodeMatcher:
116
141
  assert isinstance(chunk, str)
117
142
  assert mock_run.called
118
143
 
119
- class TestProcessShow:
120
- @patch('mkv_episode_matcher.episode_matcher.get_valid_seasons')
121
- @patch('mkv_episode_matcher.episode_matcher.get_config')
122
- def test_process_show_no_seasons(self, mock_config, mock_seasons, mock_config_data):
123
- mock_seasons.return_value = []
124
- mock_config.return_value = mock_config_data
125
- process_show()
126
- mock_seasons.assert_called_once()
127
-
128
- @patch('mkv_episode_matcher.episode_matcher.get_valid_seasons')
129
- @patch('mkv_episode_matcher.episode_matcher.get_config')
130
- def test_process_show_with_season(self, mock_config, mock_seasons, temp_show_dir, mock_config_data):
131
- mock_seasons.return_value = [str(temp_show_dir / "Season 1")]
132
- mock_config.return_value = mock_config_data
133
- process_show(season=1)
134
- mock_seasons.assert_called_once()
144
+ class TestEpisodeMatcher:
145
+ def test_extract_season_episode(self):
146
+ from mkv_episode_matcher.utils import extract_season_episode
147
+
148
+ # Test valid filename
149
+ assert extract_season_episode("Show - S01E02.mkv") == (1, 2)
150
+
151
+ # Test invalid filename
152
+ assert extract_season_episode("invalid.mkv") == (None, None)
153
+
154
+ @patch("mkv_episode_matcher.tmdb_client.requests.get")
155
+ def test_fetch_show_id(self, mock_get):
156
+ from mkv_episode_matcher.tmdb_client import fetch_show_id
157
+
158
+ mock_response = Mock()
159
+ mock_response.status_code = 200
160
+ mock_response.json.return_value = {"results": [{"id": 12345}]}
161
+ mock_get.return_value = mock_response
162
+
163
+ assert fetch_show_id("Test Show") == "12345"
164
+
135
165
 
136
166
  if __name__ == '__main__':
137
167
  pytest.main(['-v'])
@@ -24,6 +24,15 @@ wheels = [
24
24
  { url = "https://files.pythonhosted.org/packages/12/90/3c9ff0512038035f59d279fddeb79f5f1eccd8859f06d6163c58798b9487/certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8", size = 167321 },
25
25
  ]
26
26
 
27
+ [[package]]
28
+ name = "chardet"
29
+ version = "5.2.0"
30
+ source = { registry = "https://pypi.org/simple" }
31
+ sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618 }
32
+ wheels = [
33
+ { url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385 },
34
+ ]
35
+
27
36
  [[package]]
28
37
  name = "charset-normalizer"
29
38
  version = "3.4.0"
@@ -374,7 +383,7 @@ wheels = [
374
383
 
375
384
  [[package]]
376
385
  name = "mkv-episode-matcher"
377
- version = "0.3.2.post1.dev0+g2c513fa.d20241126"
386
+ version = "0.3.4.post1.dev3+g95f005b.d20250112"
378
387
  source = { editable = "." }
379
388
  dependencies = [
380
389
  { name = "configparser" },
@@ -391,6 +400,7 @@ dependencies = [
391
400
 
392
401
  [package.dev-dependencies]
393
402
  dev = [
403
+ { name = "chardet" },
394
404
  { name = "pytest" },
395
405
  { name = "pytest-cov" },
396
406
  { name = "ruff" },
@@ -412,6 +422,7 @@ requires-dist = [
412
422
 
413
423
  [package.metadata.requires-dev]
414
424
  dev = [
425
+ { name = "chardet", specifier = ">=5.2.0" },
415
426
  { name = "pytest", specifier = ">=8.3.3" },
416
427
  { name = "pytest-cov", specifier = ">=6.0.0" },
417
428
  { name = "ruff", specifier = ">=0.8.0" },
@@ -1,59 +0,0 @@
1
- from unittest.mock import Mock, patch
2
-
3
- import pytest
4
-
5
-
6
- @pytest.fixture
7
- def mock_config():
8
- return {
9
- "tmdb_api_key": "test_key",
10
- "show_dir": "/test/path",
11
- "max_threads": 4,
12
- "tesseract_path": "/usr/bin/tesseract",
13
- }
14
-
15
-
16
- @pytest.fixture
17
- def mock_episode_data():
18
- return {
19
- "name": "Test Episode",
20
- "season_number": 1,
21
- "episode_number": 1,
22
- "overview": "Test overview",
23
- }
24
-
25
-
26
- class TestEpisodeMatcher:
27
- def test_extract_season_episode(self):
28
- from mkv_episode_matcher.episode_matcher import extract_season_episode
29
-
30
- # Test valid filename
31
- assert extract_season_episode("Show - S01E02.mkv") == (1, 2)
32
-
33
- # Test invalid filename
34
- assert extract_season_episode("invalid.mkv") == (None, None)
35
-
36
- @patch("mkv_episode_matcher.tmdb_client.requests.get")
37
- def test_fetch_show_id(self, mock_get):
38
- from mkv_episode_matcher.tmdb_client import fetch_show_id
39
-
40
- mock_response = Mock()
41
- mock_response.status_code = 200
42
- mock_response.json.return_value = {"results": [{"id": 12345}]}
43
- mock_get.return_value = mock_response
44
-
45
- assert fetch_show_id("Test Show") == "12345"
46
-
47
- @patch("mkv_episode_matcher.utils.OpenSubtitles")
48
- def test_get_subtitles(self, mock_subtitles):
49
- from mkv_episode_matcher.utils import get_subtitles
50
-
51
- # Test subtitle download
52
- mock_subtitles.return_value.search.return_value.data = [
53
- {"file_name": "Test.Show.S01E01.srt"}
54
- ]
55
-
56
- with patch("pathlib.Path.exists", return_value=False):
57
- get_subtitles(12345, {1})
58
-
59
- mock_subtitles.return_value.download_and_save.assert_called_once()