mkv-episode-matcher 0.1.13__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic. Click here for more details.

@@ -9,15 +9,28 @@ sys.path.append(os.path.join(parent_dir, "libraries", "pgs2srt"))
9
9
  import re
10
10
  from concurrent.futures import ThreadPoolExecutor
11
11
  from datetime import datetime, timedelta
12
-
12
+ from pathlib import Path
13
13
  import pytesseract
14
14
  from imagemaker import make_image
15
15
  from loguru import logger
16
16
  from pgsreader import PGSReader
17
17
  from PIL import Image, ImageOps
18
-
18
+ from typing import Optional
19
19
  from mkv_episode_matcher.__main__ import CONFIG_FILE
20
20
  from mkv_episode_matcher.config import get_config
21
def check_if_processed(filename: str) -> bool:
    """
    Report whether a filename already carries an ``SxxExx`` episode tag.

    Args:
        filename (str): File name (or path) to inspect.

    Returns:
        bool: True if ``S<digits>E<digits>`` occurs anywhere in the name
            (case-sensitive), i.e. the file is considered already processed.
    """
    # ``re`` is imported at module level, so no function-local import is needed.
    return re.search(r"S\d+E\d+", filename) is not None
21
34
 
22
35
 
23
36
  def convert_mkv_to_sup(mkv_file, output_dir):
@@ -51,21 +64,23 @@ def convert_mkv_to_sup(mkv_file, output_dir):
51
64
 
52
65
 
53
66
  @logger.catch
54
- def perform_ocr(sup_file_path):
67
+ def perform_ocr(sup_file_path: str) -> Optional[str]:
55
68
  """
56
69
  Perform OCR on a .sup file and save the extracted text to a .srt file.
57
-
58
- Args:
59
- sup_file_path (str): Path to the .sup file.
70
+ Returns the path to the created SRT file.
60
71
  """
61
-
62
72
  # Get the base name of the .sup file without the extension
63
73
  base_name = os.path.splitext(os.path.basename(sup_file_path))[0]
64
74
  output_dir = os.path.dirname(sup_file_path)
65
75
  logger.info(f"Performing OCR on {sup_file_path}")
76
+
66
77
  # Construct the output .srt file path
67
78
  srt_file = os.path.join(output_dir, f"{base_name}.srt")
68
79
 
80
+ if os.path.exists(srt_file):
81
+ logger.info(f"SRT file {srt_file} already exists, skipping OCR")
82
+ return srt_file
83
+
69
84
  # Load a PGS/SUP file.
70
85
  pgs = PGSReader(sup_file_path)
71
86
 
@@ -151,24 +166,137 @@ def perform_ocr(sup_file_path):
151
166
  logger.info(f"Saved to: {srt_file}")
152
167
 
153
168
 
154
- def convert_mkv_to_srt(season_path, mkv_files):
155
- """
156
- Converts MKV files to SRT format.
169
+ # def convert_mkv_to_srt(season_path, mkv_files):
170
+ # """
171
+ # Converts MKV files to SRT format.
157
172
 
158
- Args:
159
- season_path (str): The path to the season directory.
160
- mkv_files (list): List of MKV files to convert.
173
+ # Args:
174
+ # season_path (str): The path to the season directory.
175
+ # mkv_files (list): List of MKV files to convert.
161
176
 
162
- Returns:
163
- None
177
+ # Returns:
178
+ # None
179
+ # """
180
+ # logger.info(f"Converting {len(mkv_files)} files to SRT")
181
+ # output_dir = os.path.join(season_path, "ocr")
182
+ # os.makedirs(output_dir, exist_ok=True)
183
+ # sup_files = []
184
+ # for mkv_file in mkv_files:
185
+ # sup_file = convert_mkv_to_sup(mkv_file, output_dir)
186
+ # sup_files.append(sup_file)
187
+ # with ThreadPoolExecutor() as executor:
188
+ # for sup_file in sup_files:
189
+ # executor.submit(perform_ocr, sup_file)
190
+
191
+
192
+
193
def extract_subtitles(mkv_file: str, output_dir: str) -> Optional[str]:
    """
    Extract the preferred subtitle stream of an MKV file into ``output_dir``.

    The stream is chosen by ``detect_subtitle_type``. SubRip streams are
    written directly as ``<stem>.srt``; every other supported type (DVD or
    PGS) is stream-copied to ``<stem>.sup``.

    Args:
        mkv_file (str): Path to the MKV file.
        output_dir (str): Directory that receives the extracted file.

    Returns:
        Optional[str]: Path to the extracted (or already existing) subtitle
            file, or None if no supported stream exists or ffmpeg failed.
    """
    subtitle_type, stream_index = detect_subtitle_type(mkv_file)
    if not subtitle_type:
        logger.error(f"No supported subtitle streams found in {mkv_file}")
        return None

    base_name = Path(mkv_file).stem
    if subtitle_type == "subrip":
        # For SRT subtitles, extract directly to .srt
        output_file = os.path.join(output_dir, f"{base_name}.srt")
        codec_args = []
    else:
        # For DVD or PGS subtitles, extract to SUP format first
        output_file = os.path.join(output_dir, f"{base_name}.sup")
        codec_args = ["-c", "copy"]

    # Single existence check: the command is only built when it will be run.
    if os.path.exists(output_file):
        logger.info(f"Subtitle file {output_file} already exists, skipping extraction")
        return output_file

    cmd = [
        "ffmpeg", "-i", mkv_file,
        "-map", f"0:{stream_index}",
        *codec_args,
        output_file,
    ]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ffmpeg binary.
        logger.error(f"Error extracting subtitles: {e}")
        # The output did not exist before this call, so anything present now
        # is a partial result; remove it so a later run does not reuse it.
        if os.path.exists(output_file):
            try:
                os.remove(output_file)
            except OSError as cleanup_error:
                logger.warning(
                    f"Could not remove partial file {output_file}: {cleanup_error}"
                )
        return None

    logger.info(f"Extracted subtitles from {mkv_file} to {output_file}")
    return output_file
235
+
236
def convert_mkv_to_srt(season_path: str, mkv_files: list[str]) -> None:
    """
    Produce SRT subtitles for every not-yet-processed MKV file of a season.

    Files whose base name already looks processed (per ``check_if_processed``)
    are skipped. For the rest, subtitles are extracted into
    ``<season_path>/ocr``; non-SRT extractions are passed to ``perform_ocr``.

    Args:
        season_path (str): Path to the season directory.
        mkv_files (list[str]): Paths of the MKV files to convert.
    """
    logger.info(f"Converting {len(mkv_files)} files to SRT")

    # First pass: separate pending files from already processed ones.
    pending = []
    for candidate in mkv_files:
        if not check_if_processed(os.path.basename(candidate)):
            pending.append(candidate)
        else:
            logger.info(f"Skipping {candidate} - already processed")

    if not pending:
        logger.info("No new files to process")
        return

    # The OCR directory is only created once there is work to do.
    output_dir = os.path.join(season_path, "ocr")
    os.makedirs(output_dir, exist_ok=True)

    # Second pass: extract, then OCR whatever is not already text.
    for candidate in pending:
        extracted = extract_subtitles(candidate, output_dir)
        if extracted:
            if extracted.endswith('.srt'):
                # Text subtitles need no OCR; the file stays in the OCR directory.
                logger.info(f"Extracted SRT subtitle to {extracted}")
            else:
                # SUP files (DVD or PGS) are bitmap subtitles and need OCR.
                ocr_result = perform_ocr(extracted)
                if ocr_result:
                    logger.info(f"Created SRT from OCR: {ocr_result}")
271
+
272
# Supported subtitle codecs in order of preference: SRT > DVD > PGS.
_SUBTITLE_PRIORITY = ("subrip", "dvd_subtitle", "hdmv_pgs_subtitle")

# Captures the stream index and codec from an ffmpeg stream listing line such
# as "Stream #0:2(eng): Subtitle: subrip (default)". Anything between the
# index and "Subtitle:" (a language tag, or nothing at all) is skipped.
_SUBTITLE_STREAM_RE = re.compile(r"Stream #0:(\d+).*?Subtitle:\s*(\w+)")


def _parse_subtitle_streams(ffmpeg_output: str) -> list[tuple[str, int]]:
    """Return ``(codec, stream_index)`` for each supported subtitle stream line."""
    streams = []
    for line in ffmpeg_output.split("\n"):
        match = _SUBTITLE_STREAM_RE.search(line)
        if match and match.group(2) in _SUBTITLE_PRIORITY:
            streams.append((match.group(2), int(match.group(1))))
    return streams


def detect_subtitle_type(mkv_file: str) -> tuple[Optional[str], Optional[int]]:
    """
    Detect the type and index of the preferred subtitle stream in an MKV file.

    Runs ``ffmpeg -i`` on the file and scans its stderr for subtitle stream
    lines. When several supported streams exist, the first stream of the most
    preferred codec wins (SRT > DVD > PGS).

    Args:
        mkv_file (str): Path to the MKV file.

    Returns:
        tuple[Optional[str], Optional[int]]: ``(codec_name, stream_index)``,
            or ``(None, None)`` if no supported subtitle stream is found or
            ffmpeg could not be started.
    """
    try:
        # The exit status is not checked: only the stream listing printed to
        # stderr is needed. Undecodable metadata bytes are replaced rather
        # than raising.
        result = subprocess.run(
            ["ffmpeg", "-i", mkv_file],
            capture_output=True,
            text=True,
            errors="replace",
        )
    except OSError as e:
        # Raised when the ffmpeg binary is missing or cannot be executed.
        logger.error(f"Error detecting subtitle type: {e}")
        return None, None

    subtitle_streams = _parse_subtitle_streams(result.stderr)

    # Prioritize subtitle formats: SRT > DVD > PGS
    for format_priority in _SUBTITLE_PRIORITY:
        for format_type, index in subtitle_streams:
            if format_type == format_priority:
                return format_type, index

    return None, None
@@ -0,0 +1,90 @@
1
+ # mkv_episode_matcher/speech_to_text.py
2
+
3
+ import os
4
+ import subprocess
5
+ from pathlib import Path
6
+ import whisper
7
+ import torch
8
+ from loguru import logger
9
+
10
def process_speech_to_text(mkv_file, output_dir):
    """
    Convert MKV file to transcript using Whisper.

    The audio track is extracted via ``extract_audio`` and transcribed with
    the Whisper "base" model (English). The resulting segments are stored as
    ``<stem>.segments.json`` in ``output_dir``. An existing segments file is
    reused without loading the model.

    Args:
        mkv_file (str): Path to MKV file
        output_dir (str): Directory to save transcript files

    Returns:
        str: Path to the segments JSON file, or None if audio extraction or
            transcription failed.
    """
    # Local import: this module does not import json at file level.
    import json

    # Extract audio if not already done
    wav_file = extract_audio(mkv_file, output_dir)
    if not wav_file:
        return None

    segments_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.segments.json")
    if os.path.exists(segments_file):
        # Checked before loading the model, so a cached transcript costs nothing.
        logger.info(f"Using existing transcript: {segments_file}")
        return segments_file

    # Load model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        logger.info("CUDA not available. Using CPU.")

    model = whisper.load_model("base", device=device)

    # Generate transcript
    try:
        result = model.transcribe(
            wav_file,
            task="transcribe",
            language="en",
        )

        # Serialize first so a serialization error cannot leave a truncated
        # file that a later run would mistake for a finished transcript.
        payload = json.dumps(result["segments"], indent=2)
        with open(segments_file, 'w', encoding='utf-8') as f:
            f.write(payload)

        logger.info(f"Transcript saved to {segments_file}")
    except Exception as e:
        logger.error(f"Error during transcription: {e}")
        return None

    return segments_file
56
+
57
def extract_audio(mkv_file, output_dir):
    """
    Extract audio from MKV file using FFmpeg.

    The audio is written as 16 kHz mono 16-bit PCM to ``<stem>.wav`` in
    ``output_dir``. An existing WAV file is reused.

    Args:
        mkv_file (str): Path to MKV file
        output_dir (str): Directory to save WAV file

    Returns:
        str: Path to extracted WAV file, or None if extraction failed.
    """
    wav_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.wav")

    if os.path.exists(wav_file):
        logger.info(f"Audio file {wav_file} already exists, skipping extraction")
        return wav_file

    logger.info(f"Extracting audio from {mkv_file}")
    cmd = [
        'ffmpeg',
        '-i', mkv_file,
        '-vn',                   # Disable video
        '-acodec', 'pcm_s16le',  # Convert to PCM format
        '-ar', '16000',          # Set sample rate to 16kHz
        '-ac', '1',              # Convert to mono
        wav_file,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Error extracting audio: {e}")
        # stderr was captured (as bytes); surface its tail so the failure is
        # diagnosable instead of silently discarding it.
        if e.stderr:
            tail = e.stderr.decode("utf-8", errors="replace").strip().splitlines()[-5:]
            if tail:
                logger.error("ffmpeg output: " + " | ".join(tail))
        # The WAV did not exist before this call, so anything present now is a
        # partial result; remove it so a later run does not reuse it.
        if os.path.exists(wav_file):
            try:
                os.remove(wav_file)
            except OSError as cleanup_error:
                logger.warning(
                    f"Could not remove partial file {wav_file}: {cleanup_error}"
                )
        return None
    except OSError as e:
        # Raised when the ffmpeg binary is missing or cannot be executed.
        logger.error(f"Error extracting audio: {e}")
        return None

    logger.info(f"Audio extracted to {wav_file}")
    return wav_file
@@ -2,7 +2,6 @@
2
2
  import os
3
3
  import re
4
4
  import shutil
5
- from typing import Set
6
5
 
7
6
  import requests
8
7
  from loguru import logger
@@ -12,32 +11,51 @@ from mkv_episode_matcher.__main__ import CACHE_DIR, CONFIG_FILE
12
11
  from mkv_episode_matcher.config import get_config
13
12
  from mkv_episode_matcher.tmdb_client import fetch_season_details
14
13
 
15
-
16
- def check_filename(filename, series_title, season_number, episode_number):
14
def get_valid_seasons(show_dir):
    """
    Collect the season directories of a show that hold at least one MKV file.

    Args:
        show_dir (str): Base directory for the TV show

    Returns:
        list: Paths of the direct subdirectories of ``show_dir`` that contain
            a file whose name ends in ``.mkv``
    """
    # Every direct subdirectory is a candidate season directory.
    candidates = []
    for entry in os.listdir(show_dir):
        entry_path = os.path.join(show_dir, entry)
        if os.path.isdir(entry_path):
            candidates.append(entry_path)

    # Keep only the candidates that actually hold .mkv files.
    valid_season_paths = [
        candidate
        for candidate in candidates
        if [name for name in os.listdir(candidate) if name.endswith(".mkv")]
    ]

    show_name = os.path.basename(show_dir)
    if valid_season_paths:
        logger.info(
            f"Found {len(valid_season_paths)} seasons with .mkv files in '{show_name}'"
        )
    else:
        logger.warning(f"No seasons with .mkv files found in show '{show_name}'")

    return valid_season_paths
46
def check_filename(filename):
    """
    Check if the filename is in the correct format (S01E02).

    Args:
        filename (str): The filename to check.

    Returns:
        bool: True if ``S<digits>E<digits>`` occurs anywhere in the filename
            (case-sensitive).
    """
    # re.search already scans every start position, so a leading ".*" would
    # add backtracking without changing the result.
    return re.search(r"S\d+E\d+", filename) is not None
41
59
 
42
60
 
43
61
  def scramble_filename(original_file_path, file_number):
@@ -64,60 +82,44 @@ def scramble_filename(original_file_path, file_number):
64
82
  os.rename(original_file_path, new_file_path)
65
83
 
66
84
 
67
- def rename_episode_file(original_file_path, season_number, episode_number):
85
def rename_episode_file(original_file_path, new_filename):
    """
    Rename an episode file with a standardized naming convention.

    The file stays in its directory. If another file already uses the target
    name, a numeric suffix (``_2``, ``_3``, ...) is appended before the
    extension until a free name is found. If the file already has the target
    name, it is left untouched.

    Args:
        original_file_path (str): The original file path of the episode.
        new_filename (str): The new filename including season/episode info.

    Returns:
        str: Path to the renamed file, or None if rename failed.
    """
    original_dir = os.path.dirname(original_file_path)
    new_file_path = os.path.join(original_dir, new_filename)

    # Check if new filepath already exists
    if os.path.exists(new_file_path):
        # The existing file may be the source itself; renaming it to a "_2"
        # variant would corrupt an already correct name.
        if os.path.abspath(new_file_path) == os.path.abspath(original_file_path):
            return new_file_path

        logger.warning(f"File already exists: {new_filename}")

        # Add numeric suffix if file exists
        base, ext = os.path.splitext(new_filename)
        suffix = 2
        while True:
            new_filename = f"{base}_{suffix}{ext}"
            new_file_path = os.path.join(original_dir, new_filename)
            if not os.path.exists(new_file_path):
                break
            suffix += 1

    try:
        os.rename(original_file_path, new_file_path)
    except OSError as e:
        logger.error(f"Failed to rename file: {e}")
        return None

    logger.info(f"Renamed {os.path.basename(original_file_path)} -> {new_filename}")
    return new_file_path
118
120
 
119
121
 
120
- def get_subtitles(show_id, seasons: Set[int]):
122
+ def get_subtitles(show_id, seasons: set[int]):
121
123
  """
122
124
  Retrieves and saves subtitles for a given TV show and seasons.
123
125
 
@@ -138,16 +140,14 @@ def get_subtitles(show_id, seasons: Set[int]):
138
140
  open_subtitles_user_agent = config.get("open_subtitles_user_agent")
139
141
  open_subtitles_username = config.get("open_subtitles_username")
140
142
  open_subtitles_password = config.get("open_subtitles_password")
141
- if not all(
142
- [
143
- show_dir,
144
- tmdb_api_key,
145
- open_subtitles_api_key,
146
- open_subtitles_user_agent,
147
- open_subtitles_username,
148
- open_subtitles_password,
149
- ]
150
- ):
143
+ if not all([
144
+ show_dir,
145
+ tmdb_api_key,
146
+ open_subtitles_api_key,
147
+ open_subtitles_user_agent,
148
+ open_subtitles_username,
149
+ open_subtitles_password,
150
+ ]):
151
151
  logger.error("Missing configuration settings. Please run the setup script.")
152
152
  try:
153
153
  # Initialize the OpenSubtitles client
@@ -164,11 +164,8 @@ def get_subtitles(show_id, seasons: Set[int]):
164
164
 
165
165
  for episode in range(1, episodes + 1):
166
166
  logger.info(f"Processing Season {season}, Episode {episode}...")
167
- series_cache_dir =os.path.join(
168
- CACHE_DIR,
169
- "data",
170
- series_name)
171
- os.makedirs(series_cache_dir,exist_ok=True)
167
+ series_cache_dir = os.path.join(CACHE_DIR, "data", series_name)
168
+ os.makedirs(series_cache_dir, exist_ok=True)
172
169
  srt_filepath = os.path.join(
173
170
  series_cache_dir,
174
171
  f"{series_name} - S{season:02d}E{episode:02d}.srt",
@@ -179,7 +176,7 @@ def get_subtitles(show_id, seasons: Set[int]):
179
176
  response = requests.get(url)
180
177
  response.raise_for_status()
181
178
  episode_data = response.json()
182
- episode_name = episode_data["name"]
179
+ episode_data["name"]
183
180
  episode_id = episode_data["id"]
184
181
  # search for the subtitle
185
182
  response = subtitles.search(tmdb_id=episode_id, languages="en")
@@ -229,8 +226,159 @@ def cleanup_ocr_files(show_dir):
229
226
  if os.path.exists(ocr_dir_path):
230
227
  logger.info(f"Cleaning up OCR files in {ocr_dir_path}")
231
228
  shutil.rmtree(ocr_dir_path)
229
+
230
+
232
231
def clean_text(text):
    """
    Strip bracketed asides from a line of text.

    Removes every ``[...]``, ``(...)`` and ``{...}`` group (non-greedy, within
    a single line) and then trims leading/trailing whitespace.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text.
    """
    bracketed = re.compile(r"\[.*?\]|\(.*?\)|\{.*?\}")
    without_asides = bracketed.sub("", text)
    return without_asides.strip()
236
+ # mkv_episode_matcher/utils.py
237
+
238
+ # Add this to your existing utils.py, keeping all other functions
239
+
240
def process_reference_srt_files(series_name):
    """
    Process reference SRT files for a given series.

    Walks ``CACHE_DIR/data/<series_name>`` and reads every ``.srt`` file whose
    name carries an ``SxxExx`` tag. Files without such a tag are skipped with
    a warning, because no target episode name can be derived from them.

    Args:
        series_name (str): The name of the series.

    Returns:
        dict: A dictionary containing the reference files where the keys are the MKV filenames
            and the values are the corresponding SRT texts.
    """
    reference_files = {}
    reference_dir = os.path.join(CACHE_DIR, "data", series_name)

    for dirpath, _, filenames in os.walk(reference_dir):
        for filename in filenames:
            if not filename.lower().endswith(".srt"):
                continue

            season, episode = extract_season_episode(filename)
            if season is None or episode is None:
                # Formatting None with ":02" would raise TypeError.
                logger.warning(f"Skipping {filename}: no SxxExx tag in filename")
                continue

            srt_file = os.path.join(dirpath, filename)
            logger.info(f"Processing {srt_file}")
            mkv_filename = f"{series_name} - S{season:02}E{episode:02}.mkv"
            reference_files[mkv_filename] = extract_srt_text(srt_file)

    return reference_files
268
+
269
def extract_srt_text(filepath):
    """
    Extracts text content from an SRT file.

    The file is split into subtitle blocks at blank lines. For each block the
    first two lines (index and timestamp) are dropped, the remaining lines are
    joined with spaces, and ``[...]`` / ``<...>`` groups are removed. Blocks
    left without text are skipped.

    Args:
        filepath (str): Path to the SRT file.

    Returns:
        list: List of text lines from the SRT file, one entry per subtitle block.
    """
    # Explicit encoding: the platform default is not reliable for subtitle
    # files. "utf-8-sig" also drops a leading BOM, and undecodable bytes are
    # replaced instead of aborting the whole file.
    with open(filepath, encoding="utf-8-sig", errors="replace") as f:
        content = f.read()

    # Split into subtitle blocks; separator lines may contain stray whitespace.
    blocks = re.split(r"\n\s*\n", content.strip())

    text_lines = []
    for block in blocks:
        lines = block.split("\n")
        if len(lines) < 3:
            continue

        # Skip index and timestamp, get all remaining lines as text
        text = " ".join(lines[2:])
        # Remove stage directions and tags; strip so that a block consisting
        # only of such markup does not yield a whitespace-only entry.
        text = re.sub(r"\[.*?\]|<.*?>", "", text).strip()
        if text:
            text_lines.append(text)

    return text_lines
300
+
301
def extract_season_episode(filename):
    """
    Parse the season and episode numbers out of an ``SxxExx`` tag.

    Args:
        filename (str): Filename to parse

    Returns:
        tuple: ``(season_number, episode_number)`` as ints, taken from the
            first ``S<digits>E<digits>`` occurrence, or ``(None, None)`` if
            the filename contains no such tag.
    """
    found = re.search(r'S(\d+)E(\d+)', filename)
    if found is None:
        return None, None

    season_digits, episode_digits = found.groups()
    return int(season_digits), int(episode_digits)
315
def process_srt_files(show_dir):
    """
    Read every SRT file found below a directory.

    Args:
        show_dir (str): The directory path where the SRT files are located.

    Returns:
        dict: Maps each SRT file path (extension matched case-insensitively)
            to the text extracted from it by ``extract_srt_text``.
    """
    collected = {}
    for root, _subdirs, names in os.walk(show_dir):
        for name in names:
            if not name.lower().endswith(".srt"):
                continue
            path = os.path.join(root, name)
            logger.info(f"Processing {path}")
            collected[path] = extract_srt_text(path)
    return collected
334
def compare_and_rename_files(srt_files, reference_files, dry_run=False):
    """
    Compare the srt files with the reference files and rename the matching mkv files.

    For each SRT file, the MKV file of the same base name is expected two
    directory levels above the SRT file's name (i.e. next to the directory
    holding the SRT). A reference matches when the number of matching lines
    reaches 10% of the reference's line count, and at least one line.

    Args:
        srt_files (dict): A dictionary containing the srt file paths as keys and their contents as values.
        reference_files (dict): A dictionary containing the reference (target MKV) filenames as keys and their contents as values.
        dry_run (bool, optional): If True, the function will only log the renaming actions without actually renaming the files. Defaults to False.
    """
    logger.info(
        f"Comparing {len(srt_files)} srt files with {len(reference_files)} reference files"
    )
    for srt_path, srt_lines in srt_files.items():
        # Both values depend only on the SRT file, so compute them once.
        parent_dir = os.path.dirname(os.path.dirname(srt_path))
        # splitext swaps only the final extension and also handles ".SRT".
        stem = os.path.splitext(os.path.basename(srt_path))[0]
        mkv_file = os.path.join(parent_dir, f"{stem}.mkv")

        for reference, reference_lines in reference_files.items():
            # A threshold of 0 (short or empty reference) would match every
            # file, so at least one matching line is required.
            threshold = max(1, int(len(reference_lines) * 0.1))
            matching_lines = compare_text(reference_lines, srt_lines)
            if matching_lines < threshold:
                continue

            logger.info(f"Matching lines: {matching_lines}")
            logger.info(f"Found matching file: {mkv_file} ->{reference}")
            new_filename = os.path.join(parent_dir, reference)
            if os.path.exists(new_filename):
                logger.info(f"File {new_filename} already exists, skipping")
                continue
            if not os.path.exists(mkv_file):
                continue
            if dry_run:
                logger.info(f"Dry run: would rename {mkv_file} to {new_filename}")
                continue

            logger.info(f"Renaming {mkv_file} to {new_filename}")
            os.rename(mkv_file, new_filename)
            # The source file is gone now; further references cannot apply.
            break
366
+
367
def _flatten_text_lines(text):
    """Return a flat list of lines from a list of strings and/or lists of strings."""
    flat = []
    for item in text:
        if isinstance(item, str):
            # A string is already one line; iterating it would yield characters.
            flat.append(item)
        else:
            flat.extend(item)
    return flat


def compare_text(text1, text2):
    """
    Compare two lists of text lines and return the number of matching lines.

    Each argument may be a flat list of strings or a list of lists of strings
    (one inner list per subtitle block); nested input is flattened first.
    Lines are compared for exact equality and duplicates count once.

    Args:
        text1 (list): List of text lines from the first source.
        text2 (list): List of text lines from the second source.

    Returns:
        int: Number of distinct lines present in both sources.
    """
    lines1 = set(_flatten_text_lines(text1))
    lines2 = set(_flatten_text_lines(text2))
    return len(lines1 & lines2)
+ return len(matching_lines)