mkv-episode-matcher 0.1.13__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mkv-episode-matcher might be problematic. Click here for more details.
- mkv_episode_matcher/__main__.py +8 -4
- mkv_episode_matcher/episode_identification.py +208 -0
- mkv_episode_matcher/episode_matcher.py +98 -242
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +38 -12
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +16644 -193
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +125 -80
- mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +7 -5
- mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +49 -20
- mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +53 -49
- mkv_episode_matcher/mkv_to_srt.py +150 -22
- mkv_episode_matcher/speech_to_text.py +90 -0
- mkv_episode_matcher/utils.py +222 -74
- mkv_episode_matcher-0.3.0.dist-info/METADATA +119 -0
- mkv_episode_matcher-0.3.0.dist-info/RECORD +25 -0
- mkv_episode_matcher/notebooks/get_subtitles_test.ipynb +0 -252
- mkv_episode_matcher/notebooks/whisper.ipynb +0 -122
- mkv_episode_matcher-0.1.13.dist-info/METADATA +0 -113
- mkv_episode_matcher-0.1.13.dist-info/RECORD +0 -25
- {mkv_episode_matcher-0.1.13.dist-info → mkv_episode_matcher-0.3.0.dist-info}/WHEEL +0 -0
- {mkv_episode_matcher-0.1.13.dist-info → mkv_episode_matcher-0.3.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.1.13.dist-info → mkv_episode_matcher-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -9,15 +9,28 @@ sys.path.append(os.path.join(parent_dir, "libraries", "pgs2srt"))
|
|
|
9
9
|
import re
|
|
10
10
|
from concurrent.futures import ThreadPoolExecutor
|
|
11
11
|
from datetime import datetime, timedelta
|
|
12
|
-
|
|
12
|
+
from pathlib import Path
|
|
13
13
|
import pytesseract
|
|
14
14
|
from imagemaker import make_image
|
|
15
15
|
from loguru import logger
|
|
16
16
|
from pgsreader import PGSReader
|
|
17
17
|
from PIL import Image, ImageOps
|
|
18
|
-
|
|
18
|
+
from typing import Optional
|
|
19
19
|
from mkv_episode_matcher.__main__ import CONFIG_FILE
|
|
20
20
|
from mkv_episode_matcher.config import get_config
|
|
21
|
+
def check_if_processed(filename: str) -> bool:
    """
    Check if the file has already been processed (has SxxExx format).

    The check is a case-sensitive search for an upper-case ``S<digits>E<digits>``
    tag anywhere in the name.

    Args:
        filename (str): Filename to check

    Returns:
        bool: True if file is already processed
    """
    # `re` is imported at module level; no function-local import is needed.
    return re.search(r"S\d+E\d+", filename) is not None
|
|
21
34
|
|
|
22
35
|
|
|
23
36
|
def convert_mkv_to_sup(mkv_file, output_dir):
|
|
@@ -51,21 +64,23 @@ def convert_mkv_to_sup(mkv_file, output_dir):
|
|
|
51
64
|
|
|
52
65
|
|
|
53
66
|
@logger.catch
|
|
54
|
-
def perform_ocr(sup_file_path):
|
|
67
|
+
def perform_ocr(sup_file_path: str) -> Optional[str]:
|
|
55
68
|
"""
|
|
56
69
|
Perform OCR on a .sup file and save the extracted text to a .srt file.
|
|
57
|
-
|
|
58
|
-
Args:
|
|
59
|
-
sup_file_path (str): Path to the .sup file.
|
|
70
|
+
Returns the path to the created SRT file.
|
|
60
71
|
"""
|
|
61
|
-
|
|
62
72
|
# Get the base name of the .sup file without the extension
|
|
63
73
|
base_name = os.path.splitext(os.path.basename(sup_file_path))[0]
|
|
64
74
|
output_dir = os.path.dirname(sup_file_path)
|
|
65
75
|
logger.info(f"Performing OCR on {sup_file_path}")
|
|
76
|
+
|
|
66
77
|
# Construct the output .srt file path
|
|
67
78
|
srt_file = os.path.join(output_dir, f"{base_name}.srt")
|
|
68
79
|
|
|
80
|
+
if os.path.exists(srt_file):
|
|
81
|
+
logger.info(f"SRT file {srt_file} already exists, skipping OCR")
|
|
82
|
+
return srt_file
|
|
83
|
+
|
|
69
84
|
# Load a PGS/SUP file.
|
|
70
85
|
pgs = PGSReader(sup_file_path)
|
|
71
86
|
|
|
@@ -151,24 +166,137 @@ def perform_ocr(sup_file_path):
|
|
|
151
166
|
logger.info(f"Saved to: {srt_file}")
|
|
152
167
|
|
|
153
168
|
|
|
154
|
-
def convert_mkv_to_srt(season_path, mkv_files):
|
|
155
|
-
|
|
156
|
-
|
|
169
|
+
# def convert_mkv_to_srt(season_path, mkv_files):
|
|
170
|
+
# """
|
|
171
|
+
# Converts MKV files to SRT format.
|
|
157
172
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
173
|
+
# Args:
|
|
174
|
+
# season_path (str): The path to the season directory.
|
|
175
|
+
# mkv_files (list): List of MKV files to convert.
|
|
161
176
|
|
|
162
|
-
|
|
163
|
-
|
|
177
|
+
# Returns:
|
|
178
|
+
# None
|
|
179
|
+
# """
|
|
180
|
+
# logger.info(f"Converting {len(mkv_files)} files to SRT")
|
|
181
|
+
# output_dir = os.path.join(season_path, "ocr")
|
|
182
|
+
# os.makedirs(output_dir, exist_ok=True)
|
|
183
|
+
# sup_files = []
|
|
184
|
+
# for mkv_file in mkv_files:
|
|
185
|
+
# sup_file = convert_mkv_to_sup(mkv_file, output_dir)
|
|
186
|
+
# sup_files.append(sup_file)
|
|
187
|
+
# with ThreadPoolExecutor() as executor:
|
|
188
|
+
# for sup_file in sup_files:
|
|
189
|
+
# executor.submit(perform_ocr, sup_file)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def extract_subtitles(mkv_file: str, output_dir: str) -> Optional[str]:
    """
    Extract subtitles from MKV file based on detected subtitle type.

    A ``subrip`` stream is written directly to ``<stem>.srt``; any other detected
    type (DVD or PGS) is stream-copied to ``<stem>.sup``. If the target file
    already exists, extraction is skipped and the existing path is returned.

    Args:
        mkv_file (str): Path to the MKV file.
        output_dir (str): Directory the subtitle file is written to.

    Returns:
        Optional[str]: Path to the subtitle file, or None if no supported
        subtitle stream was found or ffmpeg failed.
    """
    subtitle_type, stream_index = detect_subtitle_type(mkv_file)
    if not subtitle_type:
        logger.error(f"No supported subtitle streams found in {mkv_file}")
        return None

    base_name = Path(mkv_file).stem

    if subtitle_type == 'subrip':
        # For SRT subtitles, extract directly to .srt
        output_file = os.path.join(output_dir, f"{base_name}.srt")
        codec_args = []
    else:
        # For DVD or PGS subtitles, extract to SUP format first
        output_file = os.path.join(output_dir, f"{base_name}.sup")
        codec_args = ["-c", "copy"]

    # Single existence check, so the command is always defined when it is run.
    if os.path.exists(output_file):
        logger.info(f"Subtitle file {output_file} already exists, skipping extraction")
        return output_file

    cmd = [
        "ffmpeg", "-i", mkv_file,
        "-map", f"0:{stream_index}",
        *codec_args,
        output_file,
    ]
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Error extracting subtitles: {e}")
        # A failed run may leave a truncated file behind; remove it so the
        # "already exists" shortcut above does not reuse it on the next run.
        if os.path.exists(output_file):
            try:
                os.remove(output_file)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove partial file {output_file}: {cleanup_error}")
        return None

    logger.info(f"Extracted subtitles from {mkv_file} to {output_file}")
    return output_file
|
|
235
|
+
|
|
236
|
+
def convert_mkv_to_srt(season_path: str, mkv_files: list[str]) -> None:
|
|
237
|
+
"""
|
|
238
|
+
Convert subtitles from MKV files to SRT format.
|
|
164
239
|
"""
|
|
165
240
|
logger.info(f"Converting {len(mkv_files)} files to SRT")
|
|
241
|
+
|
|
242
|
+
# Filter out already processed files
|
|
243
|
+
unprocessed_files = []
|
|
244
|
+
for mkv_file in mkv_files:
|
|
245
|
+
if check_if_processed(os.path.basename(mkv_file)):
|
|
246
|
+
logger.info(f"Skipping {mkv_file} - already processed")
|
|
247
|
+
continue
|
|
248
|
+
unprocessed_files.append(mkv_file)
|
|
249
|
+
|
|
250
|
+
if not unprocessed_files:
|
|
251
|
+
logger.info("No new files to process")
|
|
252
|
+
return
|
|
253
|
+
|
|
254
|
+
# Create OCR directory
|
|
166
255
|
output_dir = os.path.join(season_path, "ocr")
|
|
167
256
|
os.makedirs(output_dir, exist_ok=True)
|
|
168
|
-
|
|
169
|
-
for mkv_file in
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
257
|
+
|
|
258
|
+
for mkv_file in unprocessed_files:
|
|
259
|
+
subtitle_file = extract_subtitles(mkv_file, output_dir)
|
|
260
|
+
if not subtitle_file:
|
|
261
|
+
continue
|
|
262
|
+
|
|
263
|
+
if subtitle_file.endswith('.srt'):
|
|
264
|
+
# Already have SRT, keep it in OCR directory
|
|
265
|
+
logger.info(f"Extracted SRT subtitle to {subtitle_file}")
|
|
266
|
+
else:
|
|
267
|
+
# For SUP files (DVD or PGS), perform OCR
|
|
268
|
+
srt_file = perform_ocr(subtitle_file)
|
|
269
|
+
if srt_file:
|
|
270
|
+
logger.info(f"Created SRT from OCR: {srt_file}")
|
|
271
|
+
|
|
272
|
+
# Matches ffmpeg stream lines such as
#   "Stream #0:2(eng): Subtitle: subrip (default)"
#   "Stream #0:3: Subtitle: hdmv_pgs_subtitle"
# capturing the stream index and the codec name that follows "Subtitle:".
_SUBTITLE_STREAM_RE = re.compile(r"Stream #0:(\d+).*?Subtitle:\s*(\w+)")

# Supported formats in order of preference: SRT > DVD > PGS
_SUBTITLE_PRIORITY = ('subrip', 'dvd_subtitle', 'hdmv_pgs_subtitle')


def _parse_subtitle_streams(ffmpeg_output: str) -> list[tuple[str, int]]:
    """Return (codec, stream_index) for every supported subtitle stream line."""
    streams = []
    for line in ffmpeg_output.split('\n'):
        match = _SUBTITLE_STREAM_RE.search(line)
        if match and match.group(2) in _SUBTITLE_PRIORITY:
            streams.append((match.group(2), int(match.group(1))))
    return streams


def detect_subtitle_type(mkv_file: str) -> tuple[Optional[str], Optional[int]]:
    """
    Detect the type and index of subtitle streams in an MKV file.

    Runs ``ffmpeg -i <file>`` and scans its stderr for subtitle stream lines of
    the first input (``#0:``). Lines that do not look like a stream description
    are ignored rather than raising.

    Args:
        mkv_file (str): Path to the MKV file.

    Returns:
        tuple: ``(codec_name, stream_index)`` of the first stream of the
        highest-priority supported format (subrip, then dvd_subtitle, then
        hdmv_pgs_subtitle), or ``(None, None)`` if none is found or ffmpeg
        could not be run.
    """
    cmd = ["ffmpeg", "-i", mkv_file]

    try:
        # No check=True: ffmpeg is expected to exit non-zero here because no
        # output file is given; only its stderr banner is of interest.
        # errors="replace" keeps undecodable metadata from raising.
        result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ffmpeg binary.
        logger.error(f"Error detecting subtitle type: {e}")
        return None, None

    subtitle_streams = _parse_subtitle_streams(result.stderr)

    # Prioritize subtitle formats: SRT > DVD > PGS
    for format_priority in _SUBTITLE_PRIORITY:
        for format_type, index in subtitle_streams:
            if format_type == format_priority:
                return format_type, index

    return None, None
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# mkv_episode_matcher/speech_to_text.py
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import subprocess
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import whisper
|
|
7
|
+
import torch
|
|
8
|
+
from loguru import logger
|
|
9
|
+
|
|
10
|
+
def process_speech_to_text(mkv_file, output_dir):
    """
    Convert MKV file to transcript using Whisper.

    The transcript segments are stored as ``<stem>.segments.json`` in
    ``output_dir``. An existing segments file is reused, and in that case the
    Whisper model is not loaded at all.

    Args:
        mkv_file (str): Path to MKV file
        output_dir (str): Directory to save transcript files

    Returns:
        str: Path to the segments JSON file, or None if audio extraction or
        transcription failed.
    """
    import json

    # Extract audio if not already done
    wav_file = extract_audio(mkv_file, output_dir)
    if not wav_file:
        return None

    # Check for a cached transcript before paying for the model load.
    segments_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.segments.json")
    if os.path.exists(segments_file):
        logger.info(f"Using existing transcript: {segments_file}")
        return segments_file

    # Load model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        logger.info("CUDA not available. Using CPU.")

    model = whisper.load_model("base", device=device)

    # Generate transcript
    try:
        result = model.transcribe(
            wav_file,
            task="transcribe",
            language="en",
        )

        # Save segments
        with open(segments_file, 'w', encoding='utf-8') as f:
            json.dump(result["segments"], f, indent=2)

        logger.info(f"Transcript saved to {segments_file}")

    except Exception as e:
        logger.error(f"Error during transcription: {e}")
        return None

    return segments_file
|
|
56
|
+
|
|
57
|
+
def extract_audio(mkv_file, output_dir):
    """
    Extract audio from MKV file using FFmpeg.

    The audio is written as 16 kHz mono 16-bit PCM to ``<stem>.wav`` in
    ``output_dir``. An existing WAV file is reused without running ffmpeg.

    Args:
        mkv_file (str): Path to MKV file
        output_dir (str): Directory to save WAV file

    Returns:
        str: Path to extracted WAV file, or None if ffmpeg failed.
    """
    wav_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.wav")

    if os.path.exists(wav_file):
        logger.info(f"Audio file {wav_file} already exists, skipping extraction")
        return wav_file

    logger.info(f"Extracting audio from {mkv_file}")
    cmd = [
        'ffmpeg',
        '-i', mkv_file,
        '-vn',  # Disable video
        '-acodec', 'pcm_s16le',  # Convert to PCM format
        '-ar', '16000',  # Set sample rate to 16kHz
        '-ac', '1',  # Convert to mono
        wav_file,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Error extracting audio: {e}")
        # stderr was captured (as bytes); surface it instead of discarding it.
        if e.stderr:
            logger.error(f"ffmpeg output: {e.stderr.decode(errors='replace').strip()}")
        # A failed run may leave a truncated WAV behind; remove it so the
        # existence shortcut above does not reuse it on the next run.
        if os.path.exists(wav_file):
            try:
                os.remove(wav_file)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove partial file {wav_file}: {cleanup_error}")
        return None

    logger.info(f"Audio extracted to {wav_file}")
    return wav_file
|
mkv_episode_matcher/utils.py
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
import os
|
|
3
3
|
import re
|
|
4
4
|
import shutil
|
|
5
|
-
from typing import Set
|
|
6
5
|
|
|
7
6
|
import requests
|
|
8
7
|
from loguru import logger
|
|
@@ -12,32 +11,51 @@ from mkv_episode_matcher.__main__ import CACHE_DIR, CONFIG_FILE
|
|
|
12
11
|
from mkv_episode_matcher.config import get_config
|
|
13
12
|
from mkv_episode_matcher.tmdb_client import fetch_season_details
|
|
14
13
|
|
|
15
|
-
|
|
16
|
-
def check_filename(filename, series_title, season_number, episode_number):
|
|
14
|
+
def get_valid_seasons(show_dir):
|
|
17
15
|
"""
|
|
18
|
-
|
|
16
|
+
Get all season directories that contain MKV files.
|
|
19
17
|
|
|
20
18
|
Args:
|
|
21
|
-
|
|
22
|
-
series_title (str): The title of the series.
|
|
23
|
-
season_number (int): The season number of the episode.
|
|
24
|
-
episode_number (int): The episode number of the episode.
|
|
19
|
+
show_dir (str): Base directory for the TV show
|
|
25
20
|
|
|
26
21
|
Returns:
|
|
27
|
-
|
|
22
|
+
list: List of paths to valid season directories
|
|
23
|
+
"""
|
|
24
|
+
# Get all season directories
|
|
25
|
+
season_paths = [
|
|
26
|
+
os.path.join(show_dir, d)
|
|
27
|
+
for d in os.listdir(show_dir)
|
|
28
|
+
if os.path.isdir(os.path.join(show_dir, d))
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
# Filter seasons to only include those with .mkv files
|
|
32
|
+
valid_season_paths = []
|
|
33
|
+
for season_path in season_paths:
|
|
34
|
+
mkv_files = [f for f in os.listdir(season_path) if f.endswith(".mkv")]
|
|
35
|
+
if mkv_files:
|
|
36
|
+
valid_season_paths.append(season_path)
|
|
37
|
+
|
|
38
|
+
if not valid_season_paths:
|
|
39
|
+
logger.warning(f"No seasons with .mkv files found in show '{os.path.basename(show_dir)}'")
|
|
40
|
+
else:
|
|
41
|
+
logger.info(
|
|
42
|
+
f"Found {len(valid_season_paths)} seasons with .mkv files in '{os.path.basename(show_dir)}'"
|
|
43
|
+
)
|
|
28
44
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
45
|
+
return valid_season_paths
|
|
46
|
+
def check_filename(filename):
|
|
47
|
+
"""
|
|
48
|
+
Check if the filename is in the correct format (S01E02).
|
|
32
49
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
50
|
+
Args:
|
|
51
|
+
filename (str): The filename to check.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
bool: True if the filename matches the expected pattern.
|
|
36
55
|
"""
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
)
|
|
40
|
-
return bool(pattern.match(filename))
|
|
56
|
+
# Check if the filename matches the expected format
|
|
57
|
+
match = re.search(r'.*S\d+E\d+', filename)
|
|
58
|
+
return bool(match)
|
|
41
59
|
|
|
42
60
|
|
|
43
61
|
def scramble_filename(original_file_path, file_number):
|
|
@@ -64,60 +82,44 @@ def scramble_filename(original_file_path, file_number):
|
|
|
64
82
|
os.rename(original_file_path, new_file_path)
|
|
65
83
|
|
|
66
84
|
|
|
67
|
-
def rename_episode_file(original_file_path,
|
|
85
|
+
def rename_episode_file(original_file_path, new_filename):
|
|
68
86
|
"""
|
|
69
87
|
Rename an episode file with a standardized naming convention.
|
|
70
88
|
|
|
71
89
|
Args:
|
|
72
90
|
original_file_path (str): The original file path of the episode.
|
|
73
|
-
|
|
74
|
-
episode_number (int): The episode number of the episode.
|
|
91
|
+
new_filename (str): The new filename including season/episode info.
|
|
75
92
|
|
|
76
93
|
Returns:
|
|
77
|
-
None
|
|
78
|
-
|
|
79
|
-
This function renames an episode file with a standardized naming convention based on the series title, season number,
|
|
80
|
-
and episode number. If a file with the intended new name already exists, it appends a numerical suffix to the filename
|
|
81
|
-
until it finds a unique name.
|
|
82
|
-
|
|
83
|
-
Example:
|
|
84
|
-
If original_file_path = '/path/to/episode.mkv', season_number = 1, and episode_number = 3, and the series title is 'Example',
|
|
85
|
-
the function will rename the file to 'Example - S01E03.mkv' if no file with that name already exists. If a file with that
|
|
86
|
-
name already exists, it will be renamed to 'Example - S01E03_2.mkv', and so on.
|
|
94
|
+
str: Path to the renamed file, or None if rename failed.
|
|
87
95
|
"""
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
extension = os.path.splitext(original_file_path)[-1]
|
|
93
|
-
new_file_name = (
|
|
94
|
-
f"{series_title} - S{season_number:02d}E{episode_number:02d}{extension}"
|
|
95
|
-
)
|
|
96
|
-
new_file_path = os.path.join(os.path.dirname(original_file_path), new_file_name)
|
|
97
|
-
|
|
98
|
-
# Check if the new file path already exists
|
|
96
|
+
original_dir = os.path.dirname(original_file_path)
|
|
97
|
+
new_file_path = os.path.join(original_dir, new_filename)
|
|
98
|
+
|
|
99
|
+
# Check if new filepath already exists
|
|
99
100
|
if os.path.exists(new_file_path):
|
|
100
|
-
logger.warning(f"
|
|
101
|
-
|
|
102
|
-
#
|
|
101
|
+
logger.warning(f"File already exists: {new_filename}")
|
|
102
|
+
|
|
103
|
+
# Add numeric suffix if file exists
|
|
104
|
+
base, ext = os.path.splitext(new_filename)
|
|
103
105
|
suffix = 2
|
|
104
106
|
while True:
|
|
105
|
-
|
|
106
|
-
new_file_path = os.path.join(
|
|
107
|
-
os.path.dirname(original_file_path), new_file_name
|
|
108
|
-
)
|
|
107
|
+
new_filename = f"{base}_{suffix}{ext}"
|
|
108
|
+
new_file_path = os.path.join(original_dir, new_filename)
|
|
109
109
|
if not os.path.exists(new_file_path):
|
|
110
110
|
break
|
|
111
111
|
suffix += 1
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
os.rename(original_file_path, new_file_path)
|
|
115
|
-
else:
|
|
116
|
-
logger.info(f"Renaming {original_file_name} -> {new_file_name}")
|
|
112
|
+
|
|
113
|
+
try:
|
|
117
114
|
os.rename(original_file_path, new_file_path)
|
|
115
|
+
logger.info(f"Renamed {os.path.basename(original_file_path)} -> {new_filename}")
|
|
116
|
+
return new_file_path
|
|
117
|
+
except OSError as e:
|
|
118
|
+
logger.error(f"Failed to rename file: {e}")
|
|
119
|
+
return None
|
|
118
120
|
|
|
119
121
|
|
|
120
|
-
def get_subtitles(show_id, seasons:
|
|
122
|
+
def get_subtitles(show_id, seasons: set[int]):
|
|
121
123
|
"""
|
|
122
124
|
Retrieves and saves subtitles for a given TV show and seasons.
|
|
123
125
|
|
|
@@ -138,16 +140,14 @@ def get_subtitles(show_id, seasons: Set[int]):
|
|
|
138
140
|
open_subtitles_user_agent = config.get("open_subtitles_user_agent")
|
|
139
141
|
open_subtitles_username = config.get("open_subtitles_username")
|
|
140
142
|
open_subtitles_password = config.get("open_subtitles_password")
|
|
141
|
-
if not all(
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
]
|
|
150
|
-
):
|
|
143
|
+
if not all([
|
|
144
|
+
show_dir,
|
|
145
|
+
tmdb_api_key,
|
|
146
|
+
open_subtitles_api_key,
|
|
147
|
+
open_subtitles_user_agent,
|
|
148
|
+
open_subtitles_username,
|
|
149
|
+
open_subtitles_password,
|
|
150
|
+
]):
|
|
151
151
|
logger.error("Missing configuration settings. Please run the setup script.")
|
|
152
152
|
try:
|
|
153
153
|
# Initialize the OpenSubtitles client
|
|
@@ -164,11 +164,8 @@ def get_subtitles(show_id, seasons: Set[int]):
|
|
|
164
164
|
|
|
165
165
|
for episode in range(1, episodes + 1):
|
|
166
166
|
logger.info(f"Processing Season {season}, Episode {episode}...")
|
|
167
|
-
series_cache_dir =os.path.join(
|
|
168
|
-
|
|
169
|
-
"data",
|
|
170
|
-
series_name)
|
|
171
|
-
os.makedirs(series_cache_dir,exist_ok=True)
|
|
167
|
+
series_cache_dir = os.path.join(CACHE_DIR, "data", series_name)
|
|
168
|
+
os.makedirs(series_cache_dir, exist_ok=True)
|
|
172
169
|
srt_filepath = os.path.join(
|
|
173
170
|
series_cache_dir,
|
|
174
171
|
f"{series_name} - S{season:02d}E{episode:02d}.srt",
|
|
@@ -179,7 +176,7 @@ def get_subtitles(show_id, seasons: Set[int]):
|
|
|
179
176
|
response = requests.get(url)
|
|
180
177
|
response.raise_for_status()
|
|
181
178
|
episode_data = response.json()
|
|
182
|
-
|
|
179
|
+
episode_data["name"]
|
|
183
180
|
episode_id = episode_data["id"]
|
|
184
181
|
# search for the subtitle
|
|
185
182
|
response = subtitles.search(tmdb_id=episode_id, languages="en")
|
|
@@ -229,8 +226,159 @@ def cleanup_ocr_files(show_dir):
|
|
|
229
226
|
if os.path.exists(ocr_dir_path):
|
|
230
227
|
logger.info(f"Cleaning up OCR files in {ocr_dir_path}")
|
|
231
228
|
shutil.rmtree(ocr_dir_path)
|
|
229
|
+
|
|
230
|
+
|
|
232
231
|
def clean_text(text):
|
|
233
232
|
# Remove brackets, parentheses, and their content
|
|
234
|
-
cleaned_text = re.sub(r
|
|
233
|
+
cleaned_text = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}", "", text)
|
|
235
234
|
# Strip leading/trailing whitespace
|
|
236
|
-
return cleaned_text.strip()
|
|
235
|
+
return cleaned_text.strip()
|
|
236
|
+
# mkv_episode_matcher/utils.py
|
|
237
|
+
|
|
238
|
+
# Add this to your existing utils.py, keeping all other functions
|
|
239
|
+
|
|
240
|
+
def process_reference_srt_files(series_name):
    """
    Process reference SRT files for a given series.

    Walks ``CACHE_DIR/data/<series_name>`` and reads every ``.srt`` file whose
    name carries an ``SxxExx`` tag. Files without such a tag are skipped with a
    warning, since no target MKV name can be derived from them.

    Args:
        series_name (str): The name of the series.

    Returns:
        dict: A dictionary containing the reference files where the keys are the MKV filenames
            and the values are the corresponding SRT texts.
    """
    # `os` and `CACHE_DIR` are module-level names in this file.
    reference_files = {}
    reference_dir = os.path.join(CACHE_DIR, "data", series_name)

    for dirpath, _, filenames in os.walk(reference_dir):
        for filename in filenames:
            if not filename.lower().endswith(".srt"):
                continue

            srt_file = os.path.join(dirpath, filename)
            season, episode = extract_season_episode(filename)
            if season is None or episode is None:
                # Formatting None with ":02" would raise TypeError.
                logger.warning(f"Skipping {srt_file}: no SxxExx tag in filename")
                continue

            logger.info(f"Processing {srt_file}")
            srt_text = extract_srt_text(srt_file)
            mkv_filename = f"{series_name} - S{season:02}E{episode:02}.mkv"
            reference_files[mkv_filename] = srt_text

    return reference_files
|
|
268
|
+
|
|
269
|
+
def extract_srt_text(filepath):
    """
    Extracts text content from an SRT file.

    Each subtitle block is expected to be an index line, a timestamp line and
    one or more text lines. The text lines of a block are joined with spaces,
    ``[...]`` and ``<...>`` spans are removed, and blocks left empty are dropped.

    Args:
        filepath (str): Path to the SRT file.

    Returns:
        list: List of text lines from the SRT file.
    """
    # Decode as UTF-8 regardless of the platform default, drop a leading BOM,
    # and replace undecodable bytes instead of raising.
    with open(filepath, encoding="utf-8-sig", errors="replace") as f:
        content = f.read()

    # Split into subtitle blocks; tolerate separator lines that hold whitespace
    blocks = re.split(r"\n\s*\n", content.strip())

    text_lines = []
    for block in blocks:
        lines = block.split('\n')
        if len(lines) < 3:
            continue

        # Skip index and timestamp, get all remaining lines as text
        text = ' '.join(lines[2:])
        # Remove stage directions and tags
        text = re.sub(r'\[.*?\]|\<.*?\>', '', text).strip()
        if text:
            text_lines.append(text)

    return text_lines
|
|
300
|
+
|
|
301
|
+
def extract_season_episode(filename):
    """
    Pull the season and episode numbers out of an ``SxxExx`` tag.

    Args:
        filename (str): Filename to parse

    Returns:
        tuple: (season_number, episode_number) as ints, or (None, None) when
        the filename contains no upper-case ``S<digits>E<digits>`` tag.
    """
    found = re.search(r'S(\d+)E(\d+)', filename)
    if found is None:
        return None, None

    season_digits, episode_digits = found.groups()
    return int(season_digits), int(episode_digits)
|
|
315
|
+
def process_srt_files(show_dir):
    """
    Collect the text of every SRT file found under a directory tree.

    Args:
        show_dir (str): Root directory to walk for ``.srt`` files (the
            extension check is case-insensitive).

    Returns:
        dict: Maps each SRT file path to the text returned by
        ``extract_srt_text`` for that file.
    """
    collected = {}
    for dirpath, _, filenames in os.walk(show_dir):
        for name in filenames:
            # Guard clause: ignore anything that is not an SRT file.
            if not name.lower().endswith(".srt"):
                continue
            srt_path = os.path.join(dirpath, name)
            logger.info(f"Processing {srt_path}")
            collected[srt_path] = extract_srt_text(srt_path)
    return collected
|
|
334
|
+
def compare_and_rename_files(srt_files, reference_files, dry_run=False):
    """
    Compare the srt files with the reference files and rename the matching mkv files.

    For each SRT path, the MKV is expected two directory levels up from the SRT
    file's location (the parent of the SRT's directory), under the same base
    name. An SRT matches a reference when the number of matching lines reaches
    10% of the reference's line count, with a minimum of one matching line.

    Args:
        srt_files (dict): A dictionary containing the srt files as keys and their contents as values.
        reference_files (dict): A dictionary containing the reference files as keys and their contents as values.
        dry_run (bool, optional): If True, the function will only log the renaming actions without actually renaming the files. Defaults to False.
    """
    logger.info(
        f"Comparing {len(srt_files)} srt files with {len(reference_files)} reference files"
    )
    for srt_path, srt_lines in srt_files.items():
        parent_dir = os.path.dirname(os.path.dirname(srt_path))
        # splitext swaps only the real extension, whatever its case.
        mkv_name = os.path.splitext(os.path.basename(srt_path))[0] + ".mkv"
        mkv_file = os.path.join(parent_dir, mkv_name)

        for reference, reference_lines in reference_files.items():
            matching_lines = compare_text(reference_lines, srt_lines)
            # Without the floor of 1, a short or empty reference gives a
            # threshold of 0 and every file would "match" it.
            threshold = max(1, int(len(reference_lines) * 0.1))
            if matching_lines < threshold:
                continue

            logger.info(f"Matching lines: {matching_lines}")
            logger.info(f"Found matching file: {mkv_file} ->{reference}")
            new_filename = os.path.join(parent_dir, reference)
            if os.path.exists(new_filename):
                logger.info(f"File {new_filename} already exists, skipping")
                continue

            if os.path.exists(mkv_file) and not dry_run:
                logger.info(f"Renaming {mkv_file} to {new_filename}")
                os.rename(mkv_file, new_filename)
                # The source file is gone; no further reference can apply.
                break
|
|
366
|
+
|
|
367
|
+
def _as_line_set(text):
    """Return the distinct lines in *text*, flattening one level of nested lists."""
    lines = set()
    for item in text:
        if isinstance(item, str):
            # A string is one line; iterating it would yield characters.
            lines.add(item)
        else:
            lines.update(item)
    return lines


def compare_text(text1, text2):
    """
    Compare two lists of text lines and return the number of matching lines.

    Each argument may be a flat list of strings or a list of lists of strings;
    nested lists are flattened one level. Lines are compared by exact equality
    and duplicates are counted once.

    Args:
        text1 (list): List of text lines from the first source.
        text2 (list): List of text lines from the second source.

    Returns:
        int: Number of matching lines between the two sources.
    """
    return len(_as_line_set(text1) & _as_line_set(text2))
|