mkv-episode-matcher 0.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic. Click here for more details.

@@ -0,0 +1,2 @@
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
File without changes
@@ -0,0 +1,177 @@
1
+ # __main__.py
2
+ import argparse
3
+ import os
4
+ from loguru import logger
5
+ from .config import set_config, get_config
6
+
7
+
8
# Log directory in the current working directory. exist_ok=True replaces the
# racy "check, then create" sequence.
os.makedirs("./logs", exist_ok=True)

# Verbose log: every message from DEBUG upwards, rotated at 10 MB.
logger.add(
    "./logs/file_stdout.log",
    format="{time} {level} {message}",
    level="DEBUG",
    rotation="10 MB",
)

# Error-only log, rotated at 10 MB.
logger.add("./logs/file_errors.log", level="ERROR", rotation="10 MB")

# Per-user application directory that holds the configuration and the cache.
_APP_DIR = os.path.join(os.path.expanduser("~"), ".mkv-episode-matcher")

# Paths for the configuration file and the cache directory.
CONFIG_FILE = os.path.join(_APP_DIR, "config.ini")
CACHE_DIR = os.path.join(_APP_DIR, "cache")

# Creating the cache directory also creates the application directory above it.
os.makedirs(CACHE_DIR, exist_ok=True)
31
+
32
+
33
def _str_to_bool(value):
    """Convert a command-line token such as ``true``, ``0`` or ``yes`` to a bool."""
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in {"1", "true", "t", "yes", "y", "on"}:
        return True
    if token in {"0", "false", "f", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


@logger.catch
def main():
    """
    Entry point of the application.

    Parses the command-line arguments, fills in missing settings from the saved
    configuration (or by prompting the user), writes the configuration back and
    then processes the show.

    Command-line arguments:
    --tmdb-api-key: The API key for the TMDb API. Falls back to the saved configuration, then to a prompt.
    --show-dir: The main directory of the show. Falls back to the saved configuration, then to a prompt,
        and finally to the current working directory when the prompt is left empty.
    --season: The season number to be processed. If not provided, no season is passed on to the processing step.
    --dry-run: Do not rename any files. May be given bare (``--dry-run``) or with an explicit value
        (``--dry-run true`` / ``--dry-run false``).
    --get-subs: Download subtitles for the show. Accepts the same forms as --dry-run.
    --tesseract-path: The path to the tesseract executable. Falls back to the saved configuration, then to a prompt.
    """

    # Log the start of the application
    logger.info("Starting the application")

    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Process shows with TMDb API")
    parser.add_argument("--tmdb-api-key", help="TMDb API key")
    parser.add_argument("--show-dir", help="Main directory of the show")
    parser.add_argument(
        "--season",
        type=int,
        default=None,
        nargs="?",
        help="Specify the season number to be processed (default: None)",
    )
    # type=bool would turn any non-empty text (including "False") into True, and
    # without const a bare flag would stay None. const=True plus an explicit
    # parser makes "--dry-run", "--dry-run true" and "--dry-run false" all work.
    parser.add_argument(
        "--dry-run",
        type=_str_to_bool,
        default=None,
        const=True,
        nargs="?",
        help="Don't rename any files (default: None)",
    )
    parser.add_argument(
        "--get-subs",
        type=_str_to_bool,
        default=None,
        const=True,
        nargs="?",
        help="Download subtitles for the show (default: None)",
    )
    parser.add_argument(
        "--tesseract-path",
        type=str,
        default=None,
        nargs="?",
        help="Path to the tesseract executable (default: None)",
    )
    args = parser.parse_args()
    logger.debug(f"Command-line arguments: {args}")

    # Load the saved configuration once, up front, so every fallback below can
    # rely on it. get_config may return None or an empty mapping.
    cached_config = get_config(CONFIG_FILE) or {}

    # Command-line value first, then the saved value, then a prompt.
    tmdb_api_key = args.tmdb_api_key or cached_config.get("tmdb_api_key")
    if not tmdb_api_key:
        tmdb_api_key = input("Enter your TMDb API key: ")
    logger.debug(f"TMDb API Key: {tmdb_api_key}")

    # Start from the saved OpenSubtitles settings so that a run without
    # --get-subs writes them back unchanged instead of blanking them.
    open_subtitles_api_key = cached_config.get("open_subtitles_api_key") or ""
    open_subtitles_user_agent = cached_config.get("open_subtitles_user_agent") or ""
    open_subtitles_username = cached_config.get("open_subtitles_username") or ""
    open_subtitles_password = cached_config.get("open_subtitles_password") or ""

    if args.get_subs:
        logger.debug("Getting OpenSubtitles API key")
        if not open_subtitles_api_key:
            open_subtitles_api_key = input("Enter your OpenSubtitles API key: ")
        if not open_subtitles_user_agent:
            open_subtitles_user_agent = input("Enter your OpenSubtitles User Agent: ")
        if not open_subtitles_username:
            open_subtitles_username = input("Enter your OpenSubtitles Username: ")
        if not open_subtitles_password:
            open_subtitles_password = input("Enter your OpenSubtitles Password: ")

    # Show directory: command line, saved value, prompt, current directory.
    show_dir = args.show_dir or cached_config.get("show_dir")
    if not show_dir:
        show_dir = input("Enter the main directory of the show:")
    logger.info(f"Show Directory: {show_dir}")
    if not show_dir:
        show_dir = os.getcwd()

    # Tesseract executable: command line, saved value, prompt.
    tesseract_path = args.tesseract_path or cached_config.get("tesseract_path")
    if not tesseract_path:
        tesseract_path = input(
            r"Enter the path to the tesseract executable: ['C:\Program Files\Tesseract-OCR\tesseract.exe']"
        )
    logger.debug(f"Teesseract Path: {tesseract_path}")
    logger.debug(f"Show Directory: {show_dir}")

    # Set the configuration
    set_config(
        tmdb_api_key,
        open_subtitles_api_key,
        open_subtitles_user_agent,
        open_subtitles_username,
        open_subtitles_password,
        show_dir,
        CONFIG_FILE,
        tesseract_path=tesseract_path,
    )
    logger.info("Configuration set")

    # Imported here rather than at module level: episode_matcher itself imports
    # names from this module.
    from .episode_matcher import process_show

    process_show(args.season, dry_run=args.dry_run, get_subs=args.get_subs)
    logger.info("Show processing completed")
173
+
174
+
175
+ # Run the main function if the script is run directly
176
+ if __name__ == "__main__":
177
+ main()
@@ -0,0 +1,79 @@
1
+ # config.py
2
+ import os
3
+ import configparser
4
+ import multiprocessing
5
+ from loguru import logger
6
+
7
+ MAX_THREADS = 4
8
+
9
+
10
def get_total_threads():
    """
    Report the CPU count as seen by ``multiprocessing``.

    Returns:
        int: The value of ``multiprocessing.cpu_count()``.
    """
    cpu_total = multiprocessing.cpu_count()
    return cpu_total
12
+
13
+
14
+ total_threads = get_total_threads()
15
+
16
+ logger.info(f"Total available threads: {total_threads} -> Setting max to {MAX_THREADS}")
17
+
18
+
19
def set_config(
    tmdb_api_key,
    open_subtitles_api_key,
    open_subtitles_user_agent,
    open_subtitles_username,
    open_subtitles_password,
    show_dir,
    file,
    tesseract_path=None,
):
    """
    Sets the configuration values and writes them to a file.

    All values are stored in a single ``[Config]`` section. ``None`` is stored
    as an empty value, and literal ``%`` characters are escaped so that values
    such as passwords survive configparser's default interpolation.

    Args:
        tmdb_api_key (str): The API key for TMDB (The Movie Database).
        open_subtitles_api_key (str): The API key for OpenSubtitles.
        open_subtitles_user_agent (str): The user agent for OpenSubtitles.
        open_subtitles_username (str): The username for OpenSubtitles.
        open_subtitles_password (str): The password for OpenSubtitles.
        show_dir (str): The directory where the TV show episodes are located.
        file (str): The path to the configuration file.
        tesseract_path (str, optional): The path to the Tesseract OCR executable.

    Returns:
        None
    """

    def _as_option(value):
        # None becomes "" so a missing setting is not read back as the truthy
        # text "None". "%" is doubled because a default ConfigParser treats a
        # single "%" as the start of an interpolation and rejects the value.
        if value is None:
            return ""
        return str(value).replace("%", "%%")

    config = configparser.ConfigParser()
    config["Config"] = {
        "tmdb_api_key": _as_option(tmdb_api_key),
        "show_dir": _as_option(show_dir),
        "max_threads": str(int(MAX_THREADS)),
        "open_subtitles_api_key": _as_option(open_subtitles_api_key),
        "open_subtitles_user_agent": _as_option(open_subtitles_user_agent),
        "open_subtitles_username": _as_option(open_subtitles_username),
        "open_subtitles_password": _as_option(open_subtitles_password),
        "tesseract_path": _as_option(tesseract_path),
    }
    logger.info(
        f"Setting config with API:{tmdb_api_key}, show_dir: {show_dir}, and max_threads: {MAX_THREADS}"
    )
    with open(file, "w") as configfile:
        config.write(configfile)
61
+
62
+
63
def get_config(file):
    """
    Read and return the configuration from the specified file.

    Args:
        file (str): The path to the configuration file.

    Returns:
        Mapping: The ``[Config]`` section of the file, or an empty dict when the
        file does not exist or has no such section. The result always supports
        ``.get(key)``, so callers never receive ``None``.
    """
    logger.info(f"Loading config from {file}")
    config = configparser.ConfigParser()
    if os.path.exists(file):
        config.read(file)
        if "Config" in config:
            return config["Config"]
    # Missing file and missing section are reported the same way.
    return {}
@@ -0,0 +1,235 @@
1
+ # episode_matcher.py
2
+ import os
3
+ from mkv_episode_matcher.config import get_config
4
+ from mkv_episode_matcher.tmdb_client import fetch_show_id
5
+ from mkv_episode_matcher.utils import get_subtitles, cleanup_ocr_files,check_filename
6
+ from loguru import logger
7
+ from mkv_episode_matcher.__main__ import CONFIG_FILE, CACHE_DIR
8
+ from mkv_episode_matcher.mkv_to_srt import convert_mkv_to_srt
9
+ import re
10
+
11
+
12
+ # hash_data = {}
13
def _season_number(dirname):
    """Return the trailing integer of a season folder name ("Season 2" -> 2), or None."""
    tokens = dirname.split()
    try:
        return int(tokens[-1])
    except (IndexError, ValueError):
        return None


@logger.catch
def process_show(season=None, dry_run=False, get_subs=False):
    """
    Process the show: extract subtitles from new MKV files and rename matching episodes.

    Args:
        season (int, optional): The season number to process. If provided, only that season will be processed.
            Defaults to None, in which case every season folder is processed.
        dry_run (bool, optional): Whether to perform a dry run without actually renaming files. Defaults to False.
        get_subs (bool, optional): Whether to download reference subtitles first. Defaults to False.
    """
    config = get_config(CONFIG_FILE)
    show_dir = config.get("show_dir")
    if not show_dir:
        logger.error("No show directory configured")
        return
    # normpath drops a trailing separator, which would otherwise yield an empty name.
    show_name = os.path.basename(os.path.normpath(show_dir))
    logger.info(f"Processing show '{show_name}'...")
    show_id = fetch_show_id(show_name)

    if show_id is None:
        logger.error(f"Could not find show '{show_name}' on TMDb.")
        return

    # Map season number -> folder. Folders whose name does not end in a number
    # (extras, artwork, ...) are ignored instead of aborting the run.
    season_dirs = {}
    for entry in sorted(os.listdir(show_dir)):
        entry_path = os.path.join(show_dir, entry)
        number = _season_number(entry)
        if number is not None and os.path.isdir(entry_path):
            season_dirs[number] = entry_path
    logger.info(f"Found {len(season_dirs)} seasons for show '{show_name}'")

    if get_subs:
        get_subtitles(show_id, seasons=set(season_dirs))

    if season is not None:
        # Prefer the folder actually found on disk; fall back to "Season <n>".
        targets = [season_dirs.get(season, os.path.join(show_dir, f"Season {season}"))]
    else:
        targets = list(season_dirs.values())

    converted_any = False
    for season_path in targets:
        if not os.path.isdir(season_path):
            logger.warning(f"Season directory {season_path} does not exist, skipping")
            continue
        # Build the list of unprocessed files directly rather than removing
        # entries from a list that is being iterated.
        mkv_files = []
        for name in sorted(os.listdir(season_path)):
            if not name.endswith(".mkv"):
                continue
            mkv_path = os.path.join(season_path, name)
            if check_filename(name):
                logger.info(f"Skipping {mkv_path}, already processed")
                continue
            mkv_files.append(mkv_path)
        if mkv_files:
            convert_mkv_to_srt(season_path, mkv_files)
            converted_any = True

    if not converted_any:
        logger.info("No new files to process")
        return

    reference_text_dict = process_reference_srt_files(show_name)
    srt_text_dict = process_srt_files(show_dir)
    compare_and_rename_files(srt_text_dict, reference_text_dict, dry_run=dry_run)
    cleanup_ocr_files(show_dir)
75
+
76
def check_filename(filename):
    """
    Report whether a filename already carries a season/episode tag.

    Args:
        filename (str): The filename to inspect.

    Returns:
        bool: True when the name contains an ``S<digits>E<digits>`` tag, False otherwise.
    """
    return re.match(r".*S\d+E\d+", filename) is not None
89
def extract_srt_text(filepath):
    """
    Extracts the text from an SRT file.

    The file is split into blocks on blank lines. In every block the first two
    lines (index and timestamp) are dropped, empty lines are removed and
    ``<i>``/``</i>`` tags are stripped from the remaining lines.

    Args:
        filepath (str): The path to the SRT file.

    Returns:
        list: A list of lists, where each inner list represents a block of text from the SRT file.
              Each inner list contains the lines of text for that block. Blocks without text are omitted.
    """
    # errors="replace" keeps a single undecodable byte from aborting the read.
    with open(filepath, "r", errors="replace") as f:
        content = f.read()

    text_lines = []
    # Split once; the content was previously re-split for every block.
    for block in content.split("\n\n"):
        lines = [
            re.sub(r"</?i>", "", line)
            for line in block.split("\n")[2:]
            if line
        ]
        if lines:
            text_lines.append(lines)
    return text_lines
116
+
117
+
118
def compare_text(text1, text2):
    """
    Count the distinct text lines shared by two subtitle sources.

    Args:
        text1 (list): Blocks of text lines from the first source (a list of lists).
        text2 (list): Blocks of text lines from the second source (a list of lists).

    Returns:
        int: Number of distinct lines that occur in both sources.
    """
    # Flatten each source into the set of its lines, then intersect.
    first_lines = {line for block in text1 for line in block}
    second_lines = {line for block in text2 for line in block}
    return len(first_lines & second_lines)
136
+
137
+
138
def extract_season_episode(filename):
    """
    Pull the season and episode numbers out of an ``S<season>E<episode>`` tag.

    Args:
        filename (str): The filename to search for the tag.

    Returns:
        tuple: ``(season, episode)`` as integers, or ``(None, None)`` when no tag is found.
    """
    found = re.search(r"S(\d+)E(\d+)", filename)
    if found is None:
        return None, None
    season_text, episode_text = found.groups()
    return int(season_text), int(episode_text)
156
+
157
+
158
def process_reference_srt_files(series_name):
    """
    Process reference SRT files for a given series.

    Walks ``<CACHE_DIR>/data/<series_name>`` and reads every ``.srt`` file whose
    name contains an ``S<season>E<episode>`` tag. Files without such a tag are
    skipped, because no target filename can be built for them.

    Args:
        series_name (str): The name of the series.

    Returns:
        dict: A dictionary containing the reference files where the keys are the MKV filenames
              and the values are the corresponding SRT texts.
    """
    reference_files = {}
    reference_dir = os.path.join(CACHE_DIR, "data", series_name)
    for dirpath, _, filenames in os.walk(reference_dir):
        for filename in filenames:
            if not filename.lower().endswith(".srt"):
                continue
            srt_file = os.path.join(dirpath, filename)
            season, episode = extract_season_episode(filename)
            if season is None or episode is None:
                # Formatting None with ":02" would raise TypeError below.
                logger.warning(f"Skipping {srt_file}: no season/episode tag in filename")
                continue
            print(f"Processing {srt_file}")
            srt_text = extract_srt_text(srt_file)
            mkv_filename = f"{series_name} - S{season:02}E{episode:02}.mkv"
            reference_files[mkv_filename] = srt_text
    return reference_files
181
+
182
+
183
def process_srt_files(show_dir):
    """
    Collect the text of every SRT file below a directory.

    Args:
        show_dir (str): The directory to walk, including all of its subdirectories.

    Returns:
        dict: Maps each SRT file path to the text extracted from that file.
    """
    collected = {}
    for root, _subdirs, names in os.walk(show_dir):
        # The extension check is case-insensitive.
        srt_names = (name for name in names if name.lower().endswith(".srt"))
        for name in srt_names:
            path = os.path.join(root, name)
            print(f"Processing {path}")
            collected[path] = extract_srt_text(path)
    return collected
202
+
203
+
204
def compare_and_rename_files(srt_files, reference_files, dry_run=False):
    """
    Compare the srt files with the reference files and rename the matching mkv files.

    For every srt file the corresponding mkv file is looked up two directory
    levels above the srt file (the srt's parent folder's parent). A reference
    matches when the number of shared lines reaches 10% of the number of
    reference blocks, and at least one line.

    Args:
        srt_files (dict): A dictionary containing the srt files as keys and their contents as values.
        reference_files (dict): A dictionary containing the reference files as keys and their contents as values.
        dry_run (bool, optional): If True, the function will only log the matches without actually renaming the files. Defaults to False.
    """
    logger.info(
        f"Comparing {len(srt_files)} srt files with {len(reference_files)} reference files"
    )
    for srt_path, srt_text in srt_files.items():
        parent_dir = os.path.dirname(os.path.dirname(srt_path))
        # splitext swaps only the real extension and is not case-sensitive,
        # unlike str.replace(".srt", ".mkv").
        stem = os.path.splitext(os.path.basename(srt_path))[0]
        mkv_file = os.path.join(parent_dir, stem + ".mkv")
        for reference, reference_text in reference_files.items():
            matching_lines = compare_text(reference_text, srt_text)
            # Require at least one shared line: for short or empty references
            # 10% truncates to 0, which every file would satisfy.
            threshold = max(1, int(len(reference_text) * 0.1))
            if matching_lines < threshold:
                continue
            logger.info(f"Matching lines: {matching_lines}")
            logger.info(f"Found matching file: {mkv_file} ->{reference}")
            new_filename = os.path.join(parent_dir, reference)
            if os.path.exists(new_filename):
                logger.info(f"File {new_filename} already exists, skipping")
                continue
            if os.path.exists(mkv_file) and not dry_run:
                logger.info(f"Renaming {mkv_file} to {new_filename}")
                os.rename(mkv_file, new_filename)
                # The source file is gone; no further reference can apply.
                break
@@ -0,0 +1,178 @@
1
+ import os
2
+ import subprocess
3
+
4
+ import sys
5
+
6
+ # Get the absolute path of the parent directory of the current script.
7
+ parent_dir = os.path.dirname(os.path.abspath(__file__))
8
+
9
+ # Add the parent directory to the Python path.
10
+ sys.path.append(parent_dir)
11
+ # Add the 'libraries' directory to the Python path.
12
+ sys.path.append(os.path.join(parent_dir, "libraries"))
13
+ # Add the 'libraries' directory to the Python path.
14
+ sys.path.append(os.path.join(parent_dir, "..", "libraries", "pgs2srt"))
15
+ import pytesseract
16
+ import re
17
+ from PIL import Image, ImageOps
18
+ from mkv_episode_matcher.__main__ import CONFIG_FILE
19
+ from mkv_episode_matcher.config import get_config
20
+ from datetime import datetime, timedelta
21
+ from concurrent.futures import ThreadPoolExecutor
22
+ from pgsreader import PGSReader
23
+ from imagemaker import make_image
24
+ from loguru import logger
25
+
26
+
27
def convert_mkv_to_sup(mkv_file, output_dir):
    """
    Extract the first subtitle stream of an .mkv file into a .sup file using FFmpeg.

    An existing .sup file is reused. When FFmpeg fails or cannot be started the
    error is logged and any partially written .sup file is removed, so that the
    next run retries instead of reusing a broken file.

    Args:
        mkv_file (str): Path to the .mkv file.
        output_dir (str): Path to the directory where the .sup file will be saved.

    Returns:
        str: Path of the .sup file. The file may not exist if the conversion failed.
    """
    # Get the base name of the .mkv file without the extension
    base_name = os.path.splitext(os.path.basename(mkv_file))[0]

    # Construct the output .sup file path
    sup_file = os.path.join(output_dir, f"{base_name}.sup")
    if os.path.exists(sup_file):
        logger.info(f"File {sup_file} already exists, skipping")
        return sup_file

    logger.info(f"Processing {mkv_file} to {sup_file}")
    # "-map 0:s:0" selects the first subtitle stream; "-c copy" copies it unchanged.
    ffmpeg_cmd = ["ffmpeg", "-i", mkv_file, "-map", "0:s:0", "-c", "copy", sup_file]
    try:
        subprocess.run(ffmpeg_cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ffmpeg binary.
        logger.error(f"Error converting {mkv_file}: {e}")
        if os.path.exists(sup_file):
            try:
                os.remove(sup_file)
            except OSError as cleanup_error:
                logger.error(f"Could not remove partial file {sup_file}: {cleanup_error}")
    else:
        logger.info(f"Converted {mkv_file} to {sup_file}")
    return sup_file
55
+
56
+
57
def _format_srt_timestamp(milliseconds):
    """Format an elapsed time in milliseconds as an SRT ``HH:MM:SS,mmm`` stamp."""
    total_ms = max(0, int(milliseconds))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


@logger.catch
def perform_ocr(sup_file_path):
    """
    Perform OCR on a .sup file and save the extracted text to a .srt file.

    The .srt file is written next to the .sup file with the same base name. If
    it already exists nothing is done. Display sets that carry an image supply
    the subtitle text and its start time; the following display set without an
    image supplies the end time, at which point the entry is emitted.

    Args:
        sup_file_path (str): Path to the .sup file.
    """
    # Get the base name of the .sup file without the extension
    base_name = os.path.splitext(os.path.basename(sup_file_path))[0]
    output_dir = os.path.dirname(sup_file_path)
    logger.info(f"Performing OCR on {sup_file_path}")
    # Construct the output .srt file path
    srt_file = os.path.join(output_dir, f"{base_name}.srt")

    if os.path.exists(srt_file):
        logger.info(f"File {srt_file} already exists, skipping")
        return

    # Load a PGS/SUP file.
    pgs = PGSReader(sup_file_path)

    tesseract_lang = "eng"
    tesseract_config = "-c tessedit_char_blacklist=[] --psm 6 --oem 1"

    config = get_config(CONFIG_FILE)
    tesseract_path = config.get("tesseract_path")
    logger.debug(f"Setting Teesseract Path to {tesseract_path}")
    pytesseract.pytesseract.tesseract_cmd = str(tesseract_path)

    # Finished SubRip entries; joined once at the end.
    entries = []

    # State of the subtitle currently being assembled. Initialised up front so
    # a stream that starts with an image-less display set does not hit an
    # unassigned variable.
    start_ms = end_ms = text = None

    # Iterate the pgs generator
    for ds in pgs.iter_displaysets():
        if ds.has_image:
            # Get Palette Display Segment
            pds = ds.pds[0]
            # Get Object Display Segment
            ods = ds.ods[0]

            if pds and ods:
                # Create the bitmap image and convert it to RGBA
                src = make_image(ods, pds).convert("RGBA")

                # Create grayscale image with black background
                img = Image.new("L", src.size, "BLACK")
                # Paste the subtitle bitmap
                img.paste(src, (0, 0), src)
                # Invert images so the text is readable by Tesseract
                img = ImageOps.invert(img)

                # Parse the image with tesseract
                text = pytesseract.image_to_string(
                    img, lang=tesseract_lang, config=tesseract_config
                ).strip()

                # Replace "|" with "I"
                # Works better than blacklisting "|" in Tesseract,
                # which results in I becoming "!" "i" and "1"
                text = re.sub(r"[|/\\]", "I", text)
                text = re.sub(r"[_]", "L", text)
                # The timestamp is used as milliseconds, as in the original
                # "/ 1000" conversion. It is formatted directly as elapsed time
                # instead of going through a local-time datetime with a fixed
                # one-hour correction, which was only right in UTC+1.
                start_ms = ods.presentation_timestamp
        else:
            # Get Presentation Composition Segment
            pcs = ds.pcs[0]

            if pcs:
                end_ms = pcs.presentation_timestamp

            if start_ms is not None and end_ms is not None and text:
                index = len(entries) + 1
                entries.append(
                    f"{index}\n"
                    f"{_format_srt_timestamp(start_ms)} --> {_format_srt_timestamp(end_ms)}\n"
                    f"{text}\n\n"
                )
                start_ms = end_ms = text = None

    with open(srt_file, "w") as f:
        f.write("".join(entries))
    logger.info(f"Saved to: {srt_file}")
156
+
157
+
158
def convert_mkv_to_srt(season_path, mkv_files):
    """
    Converts MKV files to SRT format.

    Each MKV file is first converted to a .sup file inside ``<season_path>/ocr``;
    OCR is then run on the resulting .sup files in a thread pool. Files for
    which no .sup file was produced are skipped with a warning.

    Args:
        season_path (str): The path to the season directory.
        mkv_files (list): List of MKV files to convert.

    Returns:
        None
    """
    logger.info(f"Converting {len(mkv_files)} files to SRT")
    output_dir = os.path.join(season_path, "ocr")
    os.makedirs(output_dir, exist_ok=True)
    sup_files = []
    for mkv_file in mkv_files:
        sup_file = convert_mkv_to_sup(mkv_file, output_dir)
        # The converter returns the target path even when extraction failed,
        # so check that the file is really there before queuing OCR for it.
        if sup_file and os.path.exists(sup_file):
            sup_files.append(sup_file)
        else:
            logger.warning(f"No subtitle file produced for {mkv_file}, skipping OCR")
    # Leaving the "with" block waits for all submitted OCR jobs to finish.
    with ThreadPoolExecutor() as executor:
        for sup_file in sup_files:
            executor.submit(perform_ocr, sup_file)
@@ -0,0 +1,8 @@
1
+ requests
2
+ loguru
3
+ pillow
4
+ imagehash
5
+ configparser
6
+ tmdb_client
7
+ pytesseract
8
+ opensubtitlescom
@@ -0,0 +1,132 @@
1
+ # tmdb_client.py
2
+ import requests
3
+ from loguru import logger
4
+ from mkv_episode_matcher.config import get_config
5
+ from mkv_episode_matcher.__main__ import CONFIG_FILE
6
+ from threading import Lock
7
+ import time
8
+
9
+ BASE_IMAGE_URL = "https://image.tmdb.org/t/p/original"
10
+
11
+
12
class RateLimitedRequest:
    """
    A class that represents a rate-limited request object.

    At most ``rate_limit`` requests are started per ``period`` seconds. A
    caller that would exceed the limit sleeps until the current period ends.

    Attributes:
        rate_limit (int): Maximum number of requests allowed per period.
        period (int): Period in seconds.
        requests_made (int): Counter for requests made in the current period.
        start_time (float): Start of the current period, taken from a monotonic clock.
        lock (Lock): Lock for synchronization.
    """

    def __init__(self, rate_limit=30, period=1):
        self.rate_limit = rate_limit
        self.period = period
        self.requests_made = 0
        # Monotonic clock: unaffected by wall-clock adjustments.
        self.start_time = time.monotonic()
        self.lock = Lock()

    def get(self, url, timeout=None):
        """
        Sends a rate-limited GET request to the specified URL.

        Args:
            url (str): The URL to send the request to.
            timeout (float, optional): Passed to ``requests.get``; None (the default) means no timeout.

        Returns:
            Response: The response object returned by the request.
        """
        with self.lock:
            now = time.monotonic()
            elapsed = now - self.start_time
            if elapsed >= self.period:
                # The previous period is over: start a fresh one.
                self.requests_made = 0
                self.start_time = now
            elif self.requests_made >= self.rate_limit:
                # Budget exhausted: wait out the rest of the period. The lock
                # is held on purpose so other callers wait as well.
                time.sleep(self.period - elapsed)
                self.requests_made = 0
                self.start_time = time.monotonic()

            self.requests_made += 1

        # The request itself runs outside the lock.
        response = requests.get(url, timeout=timeout)
        return response
53
+
54
+
55
+ # Initialize rate-limited request
56
+ rate_limited_request = RateLimitedRequest(rate_limit=30, period=1)
57
+
58
+
59
def fetch_show_id(show_name):
    """
    Fetch the TMDb ID for a given show name.

    The show name is sent as a properly encoded query parameter, so names
    containing spaces, ``&`` or ``#`` are searched for as written.

    Args:
        show_name (str): The name of the show.

    Returns:
        str: The TMDb ID of the first search result, or None if the request
        did not return status 200 or there were no results.
    """
    config = get_config(CONFIG_FILE)
    tmdb_api_key = config.get("tmdb_api_key")
    # Let requests build the query string; interpolating the name into the URL
    # leaves reserved characters unescaped.
    response = requests.get(
        "https://api.themoviedb.org/3/search/tv",
        params={"query": show_name, "api_key": tmdb_api_key},
    )
    if response.status_code != 200:
        return None
    results = response.json().get("results", [])
    if not results:
        return None
    return str(results[0]["id"])
78
+
79
+
80
def fetch_season_details(show_id, season_number):
    """
    Fetch the total number of episodes for a given show and season from the TMDb API.

    Args:
        show_id (str): The ID of the show on TMDb.
        season_number (int): The season number to fetch details for.

    Returns:
        int: The total number of episodes in the season, or 0 if the API request
        failed or the response could not be parsed.
    """
    logger.info(f"Fetching season details for Season {season_number}...")
    config = get_config(CONFIG_FILE)
    tmdb_api_key = config.get("tmdb_api_key")
    url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season_number}?api_key={tmdb_api_key}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        season_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        logger.error(f"Failed to fetch season details for Season {season_number}: {e}")
        return 0
    # "or []" also covers an explicit null for "episodes".
    return len(season_data.get("episodes") or [])
109
+
110
+
111
def get_number_of_seasons(show_id):
    """
    Look up how many seasons a TV show has according to the TMDB API.

    Parameters:
    - show_id (int): The ID of the TV show.

    Returns:
    - num_seasons (int): The ``number_of_seasons`` value of the response, or 0 when it is absent.

    Raises:
    - requests.HTTPError: If the API answers with an error status.
    """
    api_key = get_config(CONFIG_FILE).get("tmdb_api_key")
    response = requests.get(f"https://api.themoviedb.org/3/tv/{show_id}?api_key={api_key}")
    response.raise_for_status()
    season_count = response.json().get("number_of_seasons", 0)
    logger.info(f"Found {season_count} seasons")
    return season_count
@@ -0,0 +1,226 @@
1
+ # utils.py
2
+ import os
3
+ from typing import Set
4
+ from loguru import logger
5
+ import re
6
+ from mkv_episode_matcher.__main__ import CONFIG_FILE, CACHE_DIR
7
+ from mkv_episode_matcher.config import get_config
8
+ from mkv_episode_matcher.tmdb_client import fetch_season_details
9
+ import requests
10
+ from opensubtitlescom import OpenSubtitles
11
+ import shutil
12
+
13
+
14
def check_filename(filename, series_title, season_number, episode_number):
    """
    Check if a filename matches the expected naming convention for a series episode.

    Args:
        filename (str): The filename to be checked (base name, not a full path).
        series_title (str): The title of the series.
        season_number (int): The season number of the episode.
        episode_number (int): The episode number of the episode.

    Returns:
        bool: True if the filename matches the expected naming convention, False otherwise.

    The expected naming convention is
    '{series_title} - S{season_number:02d}E{episode_number:02d}.mkv'.
    The whole filename must equal that name: the '.mkv' extension is compared
    literally and trailing text (for example '.mkv.part') is rejected.

    Example:
        If filename = 'Example - S01E03.mkv', series_title = 'Example', season_number = 1, and episode_number = 3,
        the function will return True because the filename matches the expected pattern.
    """
    # Every component is literal, so a direct comparison is exact and avoids
    # the regex pitfalls of an unescaped '.' and an unanchored end.
    expected = f"{series_title} - S{season_number:02d}E{episode_number:02d}.mkv"
    return filename == expected
39
+
40
+
41
def scramble_filename(original_file_path, file_number):
    """
    Rename a file to '<series title> - <NNN><extension>' within its own directory.

    The series title is the name of the directory two levels above the file.
    If a file with the target name already exists, the file is left untouched.

    Args:
        original_file_path (str): The original file path.
        file_number (int): The file number to embed (zero-padded to three digits).

    Returns:
        None
    """
    logger.info(f"Scrambling {original_file_path}")
    containing_dir = os.path.dirname(original_file_path)
    show_name = os.path.basename(os.path.dirname(containing_dir))
    current_name = os.path.basename(original_file_path)
    _, ext = os.path.splitext(original_file_path)

    target_name = f"{show_name} - {file_number:03d}{ext}"
    target_path = os.path.join(containing_dir, target_name)

    # Never clobber an existing file: skip the rename when the name is taken.
    if os.path.exists(target_path):
        return
    logger.info(f"Renaming {current_name} -> {target_name}")
    os.rename(original_file_path, target_path)
63
+
64
+
65
def rename_episode_file(original_file_path, season_number, episode_number):
    """
    Rename an episode file to '<series title> - SxxEyy<extension>'.

    The series title is the name of the directory two levels above the file.
    When the target name is already taken, a numeric suffix starting at 2 is
    appended ('..._2', '..._3', ...) until an unused name is found.

    Args:
        original_file_path (str): The original file path of the episode.
        season_number (int): The season number of the episode.
        episode_number (int): The episode number of the episode.

    Returns:
        None

    Example:
        With original_file_path = '/path/to/episode.mkv', season_number = 1,
        episode_number = 3 and a series title of 'Example', the file becomes
        'Example - S01E03.mkv', or 'Example - S01E03_2.mkv' if that name exists.
    """
    containing_dir = os.path.dirname(original_file_path)
    show_name = os.path.basename(os.path.dirname(containing_dir))
    current_name = os.path.basename(original_file_path)
    ext = os.path.splitext(original_file_path)[-1]
    stem = f"{show_name} - S{season_number:02d}E{episode_number:02d}"

    target_name = f"{stem}{ext}"
    target_path = os.path.join(containing_dir, target_name)

    if os.path.exists(target_path):
        logger.warning(f"Filename already exists: {target_name}.")

        # Probe '_2', '_3', ... until a free name turns up.
        counter = 2
        target_name = f"{stem}_{counter}{ext}"
        target_path = os.path.join(containing_dir, target_name)
        while os.path.exists(target_path):
            counter += 1
            target_name = f"{stem}_{counter}{ext}"
            target_path = os.path.join(containing_dir, target_name)

    logger.info(f"Renaming {current_name} -> {target_name}")
    os.rename(original_file_path, target_path)
116
+
117
+
118
def get_subtitles(show_id, seasons: Set[int]):
    """
    Retrieves and saves subtitles for a given TV show and seasons.

    For every episode of each requested season, the episode is looked up on
    TMDb, OpenSubtitles is searched for English subtitles by the episode's
    TMDb ID, and the first result whose file name contains the episode tag
    ('Exx') is downloaded into '<CACHE_DIR>/data/<series name>/'. Episodes
    whose subtitle file is already cached are skipped.

    Args:
        show_id (int): The ID of the TV show.
        seasons (Set[int]): A set of season numbers for which subtitles should be retrieved.

    Returns:
        None

    Raises:
        requests.RequestException: If a TMDb episode lookup fails or times out.
    """

    logger.info(f"Getting subtitles for show ID {show_id}")
    config = get_config(CONFIG_FILE)
    show_dir = config.get("show_dir")
    tmdb_api_key = config.get("tmdb_api_key")
    open_subtitles_api_key = config.get("open_subtitles_api_key")
    open_subtitles_user_agent = config.get("open_subtitles_user_agent")
    open_subtitles_username = config.get("open_subtitles_username")
    open_subtitles_password = config.get("open_subtitles_password")
    if not all(
        [
            show_dir,
            tmdb_api_key,
            open_subtitles_api_key,
            open_subtitles_user_agent,
            open_subtitles_username,
            open_subtitles_password,
        ]
    ):
        logger.error("Missing configuration settings. Please run the setup script.")
        # Nothing below can work without these settings, so stop here.
        return

    # Derive the series name only after show_dir is known to be set, and
    # sanitise it once so every episode lands in the same cache directory.
    series_name = os.path.basename(show_dir).replace(":", " -")
    subtitle_dir = os.path.join(CACHE_DIR, "data", series_name)

    try:
        # Initialize the OpenSubtitles client
        subtitles = OpenSubtitles(open_subtitles_user_agent, open_subtitles_api_key)

        # Log in (retrieve auth token)
        subtitles.login(open_subtitles_username, open_subtitles_password)
    except Exception as e:
        logger.error(f"Failed to log in to OpenSubtitles: {e}")
        return

    for season in seasons:
        episodes = fetch_season_details(show_id, season)
        logger.info(f"Found {episodes} episodes in Season {season}")

        for episode in range(1, episodes + 1):
            logger.info(f"Processing Season {season}, Episode {episode}...")
            episode_label = f"{series_name} - S{season:02d}E{episode:02d}"
            srt_filepath = os.path.join(subtitle_dir, f"{episode_label}.srt")
            if os.path.exists(srt_filepath):
                logger.info(f"Subtitle already exists for {episode_label}")
                continue

            # get the episode info from TMDB
            url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season}/episode/{episode}?api_key={tmdb_api_key}"
            # Requests never times out by default; bound the wait.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            episode_id = response.json()["id"]

            # search for the subtitle
            response = subtitles.search(tmdb_id=episode_id, languages="en")
            if not response.data:
                logger.warning(f"No subtitles found for {episode_label}")
                continue

            for subtitle in response.data:
                subtitle_dict = subtitle.to_dict()
                # Remove special characters and convert to uppercase
                filename_clean = re.sub(
                    r"\W+", " ", subtitle_dict["file_name"]
                ).upper()
                if f"E{episode:02d}" not in filename_clean:
                    continue
                logger.info(f"Original filename: {subtitle_dict['file_name']}")
                srt_file = subtitles.download_and_save(subtitle)
                # The cache directory may not exist yet on a first run.
                os.makedirs(subtitle_dir, exist_ok=True)
                shutil.move(srt_file, srt_filepath)
                logger.info(f"Subtitle saved to {srt_filepath}")
                break
206
+
207
+
208
def cleanup_ocr_files(show_dir):
    """
    Remove the 'ocr' directory from every entry directly under the show directory.

    Each entry of `show_dir` is treated as a season directory; if it contains
    an 'ocr' path, that path and all of its contents are deleted.

    Args:
        show_dir (str): The directory containing the show files.

    Returns:
        None
    """
    for entry in os.listdir(show_dir):
        ocr_path = os.path.join(show_dir, entry, "ocr")
        if not os.path.exists(ocr_path):
            continue
        logger.info(f"Cleaning up OCR files in {ocr_path}")
        shutil.rmtree(ocr_path)
@@ -0,0 +1,90 @@
1
+ Metadata-Version: 2.3
2
+ Name: mkv-episode-matcher
3
+ Version: 0.1.0
4
+ Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
+ Author-email: Jonathan Sakkos <jonathansakkos@protonmail.com>
6
+ Description-Content-Type: text/markdown
7
+
8
+ # MKV Episode Matcher
9
+
10
+ The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
11
+
12
+ ## Quick start
13
+
14
+ To use the MKV Episode Matcher, follow these steps:
15
+
16
+ 1. Clone this repository `git clone https://github.com/Jsakkos/mkv-episode-matcher`
17
+ 2. Obtain an API key from TMDb (https://developers.themoviedb.org/authentication/getting-a-apikey).
18
+ 3. (Optional) Obtain an API key from Opensubtitles.com by creating an API consumer (https://www.opensubtitles.com/en/consumers).
19
+ 4. Provide a filepath to your show directory. This is the main directory that contains all of the episodes for a specific show.
20
+ The directory and subfolders must be arranged in the following structure:
21
+
22
+ - Show name
23
+ - Season 1
24
+ - Season 2
25
+ - ...
26
+ - Season n
27
+ 5. Call `python __main__.py` with the TMDB_API_KEY and SHOW_DIR as arguments or in environment variables from your command line:
28
+
29
+ ```
30
+ python __main__.py --api-key your-api-key --show-dir /path/to/show
31
+ ```
32
+
33
+ ## How it works
34
+
35
+ MKV Episode Matcher compares reference images from TMDb with frames from the mkv content using image hashing.
36
+
37
+ ## Caveats (WIP)
38
+
39
+ Currently, MKV Episode Matcher is slow (several minutes per episode), CPU intensive, and error-prone.
40
+
41
+ # Known issues
42
+
43
+ When reading BluRay files, the following warning pops up in the terminal:
44
+ ```
45
+ Could not find codec parameters for stream 3 (Subtitle: hdmv_pgs_subtitle (pgssub)): unspecified size
46
+ Consider increasing the value for the 'analyzeduration' (0) and 'probesize' (5000000) options
47
+ ```
48
+
49
+ # Contributing
50
+
51
+ Contributions are welcome! If you would like to contribute to the MKV Episode Matcher project, please follow these steps:
52
+
53
+ 1. Fork the repository.
54
+ 2. Clone the repository.
55
+ 3. Create a new branch for your contribution.
56
+ 4. Make your changes and commit them to your branch.
57
+ 5. Push your branch to your forked repository.
58
+ 6. Open a pull request to the main repository.
59
+
60
+ Please ensure that your code follows the project's coding conventions and standards. Additionally, provide a clear and detailed description of your changes in the pull request.
61
+
62
+ Thank you for your contribution!
63
+
64
+ # License
65
+
66
+ MIT License
67
+
68
+ Copyright (c) 2024 Jonathan Sakkos
69
+
70
+ Permission is hereby granted, free of charge, to any person obtaining a copy
71
+ of this software and associated documentation files (the "Software"), to deal
72
+ in the Software without restriction, including without limitation the rights
73
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
74
+ copies of the Software, and to permit persons to whom the Software is
75
+ furnished to do so, subject to the following conditions:
76
+
77
+ The above copyright notice and this permission notice shall be included in all
78
+ copies or substantial portions of the Software.
79
+
80
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
81
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
83
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
84
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
85
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
86
+ SOFTWARE.
87
+
88
+ # Acknowledgments
89
+ This product uses the TMDB API but is not endorsed or certified by TMDB.
90
+ ![The Movie DB Logo](https://www.themoviedb.org/assets/2/v4/logos/v2/blue_long_2-9665a76b1ae401a510ec1e0ca40ddcb3b0cfe45f1d51b77a308fea0845885648.svg)
@@ -0,0 +1,12 @@
1
+ mkv_episode_matcher/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb6ZAFs,66
2
+ mkv_episode_matcher/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ mkv_episode_matcher/__main__.py,sha256=kCsrekSaNOkrfaY8Lm-VzVzALsxcmuaEWeeyCE5deEQ,6678
4
+ mkv_episode_matcher/config.py,sha256=2Ui0f9LUc0r6pmdRUmcopdykotHoFxjqJavLKLOzy5w,2354
5
+ mkv_episode_matcher/episode_matcher.py,sha256=IAUDOHyMzmoqBDilA5GIXiQdomfEWR7mn4xXU5XtvsM,8904
6
+ mkv_episode_matcher/mkv_to_srt.py,sha256=BSDgNCgrkpr451X-P0A3-Q4bENfItv2A43yp5dtB430,6468
7
+ mkv_episode_matcher/requirements.txt,sha256=0JLuUm69lLp8anUgtW48CuULZ_lSwd-1XL3eoShVWjI,93
8
+ mkv_episode_matcher/tmdb_client.py,sha256=3sWC0tHvsW2XAYA4ndXh3PjUFCobQRpXzykNP-Z4rAA,4170
9
+ mkv_episode_matcher/utils.py,sha256=ZkqGV3ZNPwpTvN1dHNZb-iLwJnk4ldk6w-Znh3TPH70,9297
10
+ mkv_episode_matcher-0.1.0.dist-info/METADATA,sha256=Z6_kFPF6S49njL8CzfhrvxMuuXH9qNueD3FyZrTHx5c,3759
11
+ mkv_episode_matcher-0.1.0.dist-info/WHEEL,sha256=cDcbFFSNXOE-241I5PFuLkIYfR_FM7WTlPEi33njInY,105
12
+ mkv_episode_matcher-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.24.2
3
+ Root-Is-Purelib: true
4
+ Tag: py2-none-any
5
+ Tag: py3-none-any