mkv-episode-matcher 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff compares the contents of publicly released package versions as published to their public registry; it is provided for informational purposes only.
Potentially problematic release. This version of mkv-episode-matcher might be problematic.
- mkv_episode_matcher/__init__.py +2 -2
- mkv_episode_matcher/__main__.py +222 -76
- mkv_episode_matcher/config.py +0 -3
- mkv_episode_matcher/episode_identification.py +164 -124
- mkv_episode_matcher/episode_matcher.py +102 -55
- mkv_episode_matcher/subtitle_utils.py +26 -25
- mkv_episode_matcher/utils.py +74 -57
- {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.7.0.dist-info}/METADATA +10 -13
- mkv_episode_matcher-0.7.0.dist-info/RECORD +14 -0
- {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.7.0.dist-info}/WHEEL +1 -1
- mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
- mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
- mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
- mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
- mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
- mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
- mkv_episode_matcher/mkv_to_srt.py +0 -302
- mkv_episode_matcher-0.5.0.dist-info/RECORD +0 -25
- {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.7.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.7.0.dist-info}/top_level.txt +0 -0
The rendered diff below covers mkv_episode_matcher/episode_identification.py (+164 −124 in the summary above). On deletion lines, the upstream renderer truncated string literals; those lines are reproduced as shown rather than guessed at.

```diff
@@ -1,17 +1,17 @@
-import
-import os
+import re
 import subprocess
 import tempfile
 from pathlib import Path
+from rich import print
+from rich.console import Console
+import chardet
+import numpy as np
 import torch
-from rapidfuzz import fuzz
-from loguru import logger
 import whisper
-import numpy as np
-import re
-from pathlib import Path
-import chardet
 from loguru import logger
+from rapidfuzz import fuzz
+
+console = Console()
 
 class EpisodeMatcher:
     def __init__(self, cache_dir, show_name, min_confidence=0.6):
```
```diff
@@ -22,35 +22,43 @@ class EpisodeMatcher:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
-
+
     def clean_text(self, text):
         text = text.lower().strip()
-        text = re.sub(r
-        text = re.sub(r
-        return
+        text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
+        text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
+        return " ".join(text.split())
 
     def chunk_score(self, whisper_chunk, ref_chunk):
         whisper_clean = self.clean_text(whisper_chunk)
         ref_clean = self.clean_text(ref_chunk)
-        return (
-
+        return (
+            fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7
+            + fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3
+        ) / 100.0
 
     def extract_audio_chunk(self, mkv_file, start_time):
         """Extract a chunk of audio from MKV file."""
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
             cmd = [
-
-
-
-
-
-
-
-
-
-
-
+                "ffmpeg",
+                "-ss",
+                str(start_time),
+                "-t",
+                str(self.chunk_duration),
+                "-i",
+                mkv_file,
+                "-vn",  # Disable video
+                "-sn",  # Disable subtitles
+                "-dn",  # Disable data streams
+                "-acodec",
+                "pcm_s16le",
+                "-ar",
+                "16000",
+                "-ac",
+                "1",
+                str(chunk_path),
             ]
             subprocess.run(cmd, capture_output=True)
         return str(chunk_path)
```
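The new chunk_score blends two rapidfuzz measures into a single confidence in [0.0, 1.0]. A minimal sketch of that arithmetic, with made-up sample strings:

```python
# Sketch of the blended score above: token_sort_ratio tolerates word-order
# differences, partial_ratio rewards substring overlap; both return 0-100.
from rapidfuzz import fuzz

whisper_text = "previously on the show we saw"     # hypothetical transcript
reference_text = "Previously on the show, we saw"  # hypothetical subtitle

score = (
    fuzz.token_sort_ratio(whisper_text.lower(), reference_text.lower()) * 0.7
    + fuzz.partial_ratio(whisper_text.lower(), reference_text.lower()) * 0.3
) / 100.0
print(f"confidence: {score:.2f}")  # a value in [0.0, 1.0]
```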
```diff
@@ -58,34 +66,37 @@ class EpisodeMatcher:
     def load_reference_chunk(self, srt_file, chunk_idx):
         """
         Load reference subtitles for a specific time chunk with robust encoding handling.
-
+
         Args:
             srt_file (str or Path): Path to the SRT file
             chunk_idx (int): Index of the chunk to load
-
+
         Returns:
             str: Combined text from the subtitle chunk
         """
         chunk_start = chunk_idx * self.chunk_duration
         chunk_end = chunk_start + self.chunk_duration
-
+
         try:
             # Read the file content using our robust reader
             reader = SubtitleReader()
             content = reader.read_srt_file(srt_file)
-
+
             # Extract subtitles for the time chunk
             text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
-
-            return
-
+
+            return " ".join(text_lines)
+
         except Exception as e:
             logger.error(f"Error loading reference chunk from {srt_file}: {e}")
-            return
-
+            return ""
+
+    def _try_match_with_model(
+        self, video_file, model_name, max_duration, reference_files
+    ):
         """
         Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
-
+
         Args:
             video_file: Path to the video file
             model_name: Name of the Whisper model to use
```
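load_reference_chunk delegates to the SubtitleReader helpers defined later in this file. A hypothetical usage sketch of the same windowing, assuming the 30-second chunks named in the docstrings and an installed package (the file path is made up):

```python
# Hypothetical sketch: load the text of one 30-second reference window the
# same way load_reference_chunk() does.
from mkv_episode_matcher.episode_identification import SubtitleReader

chunk_duration = 30  # seconds, per the "30-second chunks" in the docstring
chunk_idx = 2        # third window: 60 s .. 90 s

content = SubtitleReader.read_srt_file("refs/Example Show - S01E01.srt")
lines = SubtitleReader.extract_subtitle_chunk(
    content,
    chunk_idx * chunk_duration,        # chunk_start
    (chunk_idx + 1) * chunk_duration,  # chunk_end
)
reference_text = " ".join(lines)  # what load_reference_chunk() returns
```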
```diff
@@ -94,49 +105,46 @@ class EpisodeMatcher:
         """
         # Use cached model
         model = get_whisper_model(model_name, self.device)
-
+
         # Calculate number of chunks to check (30 seconds each)
         num_chunks = max_duration // self.chunk_duration
-
+
         for chunk_idx in range(num_chunks):
             start_time = chunk_idx * self.chunk_duration
             logger.debug(f"Trying {model_name} model at {start_time} seconds")
-
+
             audio_path = self.extract_audio_chunk(video_file, start_time)
-
-            result = model.transcribe(
-
-                task="transcribe",
-                language="en"
-            )
-
+
+            result = model.transcribe(audio_path, task="transcribe", language="en")
+
             chunk_text = result["text"]
             best_confidence = 0
             best_match = None
-
+
             # Compare with reference chunks
             for ref_file in reference_files:
                 ref_text = self.load_reference_chunk(ref_file, chunk_idx)
                 confidence = self.chunk_score(chunk_text, ref_text)
-
+
                 if confidence > best_confidence:
                     best_confidence = confidence
                     best_match = ref_file
-
+
                 if confidence > self.min_confidence:
-                    season_ep = re.search(r
+                    season_ep = re.search(r"S(\d+)E(\d+)", best_match.stem)
                     if season_ep:
                         season, episode = map(int, season_ep.groups())
                         return {
-
-
-
-
-
+                            "season": season,
+                            "episode": episode,
+                            "confidence": best_confidence,
+                            "reference_file": str(best_match),
+                            "matched_at": start_time,
                         }
-
-            logger.
-
+
+            logger.info(
+                f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})"
+            )
         return None
 
     def identify_episode(self, video_file, temp_dir, season_number):
```
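Once a chunk clears min_confidence, the season and episode numbers come straight from the matched reference filename. A small sketch of that extraction (the filename is hypothetical):

```python
# The SxxExx pattern from the diff, applied to a made-up reference filename.
import re
from pathlib import Path

best_match = Path("refs/Example Show - S01E07.srt")
season_ep = re.search(r"S(\d+)E(\d+)", best_match.stem)
if season_ep:
    season, episode = map(int, season_ep.groups())
    print(season, episode)  # -> 1 7
```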
```diff
@@ -150,44 +158,67 @@ class EpisodeMatcher:
                 f"{season_number:02d}x",
                 f"{season_number}x",
             ]
-
+
             reference_files = []
-
-
-
-
+            # TODO Figure our why patterns is not being used
+            for _pattern in patterns:
+                files = [
+                    f
+                    for f in reference_dir.glob("*.srt")
+                    if any(
+                        re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
+                    )
+                ]
                 reference_files.extend(files)
-
+
             reference_files = list(dict.fromkeys(reference_files))
-
+
             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
+            duration = float(
+                subprocess.check_output([
+                    "ffprobe",
+                    "-v",
+                    "error",
+                    "-show_entries",
+                    "format=duration",
+                    "-of",
+                    "default=noprint_wrappers=1:nokey=1",
+                    video_file,
+                ]).decode()
+            )
 
-
-
-        match
-
-
-
-
-
-
-
-
-
+            duration = int(np.ceil(duration))
+            # Try with tiny model first (fastest)
+            logger.info("Attempting match with tiny model...")
+            match = self._try_match_with_model(
+                video_file, "tiny", duration, reference_files
+            )
+            if (
+                match and match["confidence"] > 0.65
+            ):  # Slightly lower threshold for tiny
+                logger.info(
+                    f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                )
                 return match
-
-        # If
-        logger.info(
-
+
+            # If no match, try base model
+            logger.info(
+                "No match in first 3 minutes, extending base model search to 10 minutes..."
+            )
+            match = self._try_match_with_model(
+                video_file, "base", duration, reference_files
+            )
             if match:
-            logger.info(
+                logger.info(
+                    f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                )
                 return match
-
+
             logger.info("Speech recognition match failed")
             return None
-
+
         finally:
             # Cleanup temp files
             for file in self.temp_dir.glob("chunk_*.wav"):
```
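0.7.0 also probes the container duration with ffprobe before matching, so the chunk loop can cover the whole file instead of a fixed window. The same probe as a standalone sketch (the video path is hypothetical):

```python
# Standalone version of the ffprobe call added above; these flags print only
# the container duration in seconds.
import subprocess
import numpy as np

video_file = "Example Show - S01E07.mkv"  # hypothetical path
duration = float(
    subprocess.check_output([
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        video_file,
    ]).decode()
)
num_chunks = int(np.ceil(duration)) // 30  # 30-second windows, as in the matcher
```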
```diff
@@ -196,134 +227,143 @@ class EpisodeMatcher:
             except Exception as e:
                 logger.warning(f"Failed to delete temp file {file}: {e}")
 
+
 def detect_file_encoding(file_path):
     """
     Detect the encoding of a file using chardet.
-
+
     Args:
         file_path (str or Path): Path to the file
-
+
     Returns:
         str: Detected encoding, defaults to 'utf-8' if detection fails
     """
     try:
-        with open(file_path,
+        with open(file_path, "rb") as f:
             raw_data = f.read()
             result = chardet.detect(raw_data)
-            encoding = result[
-            confidence = result[
-
-            logger.debug(
-
+            encoding = result["encoding"]
+            confidence = result["confidence"]
+
+            logger.debug(
+                f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}"
+            )
+            return encoding if encoding else "utf-8"
     except Exception as e:
         logger.warning(f"Error detecting encoding for {file_path}: {e}")
-        return
+        return "utf-8"
+
 
 def read_file_with_fallback(file_path, encodings=None):
     """
     Read a file trying multiple encodings in order of preference.
-
+
     Args:
         file_path (str or Path): Path to the file
         encodings (list): List of encodings to try, defaults to common subtitle encodings
-
+
     Returns:
         str: File contents
-
+
     Raises:
         ValueError: If file cannot be read with any encoding
     """
     if encodings is None:
         # First try detected encoding, then fallback to common subtitle encodings
         detected = detect_file_encoding(file_path)
-        encodings = [detected,
-
+        encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]
+
     file_path = Path(file_path)
     errors = []
-
+
     for encoding in encodings:
         try:
-            with open(file_path,
+            with open(file_path, encoding=encoding) as f:
                 content = f.read()
                 logger.debug(f"Successfully read {file_path} using {encoding} encoding")
                 return content
         except UnicodeDecodeError as e:
             errors.append(f"{encoding}: {str(e)}")
             continue
-
-    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+
+    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+        errors
+    )
     logger.error(error_msg)
     raise ValueError(error_msg)
 
+
 class SubtitleReader:
     """Helper class for reading and parsing subtitle files."""
-
+
     @staticmethod
     def parse_timestamp(timestamp):
         """Parse SRT timestamp into seconds."""
-        hours, minutes, seconds = timestamp.replace(
+        hours, minutes, seconds = timestamp.replace(",", ".").split(":")
         return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
-
+
     @staticmethod
     def read_srt_file(file_path):
         """
         Read an SRT file and return its contents with robust encoding handling.
-
+
         Args:
             file_path (str or Path): Path to the SRT file
-
+
         Returns:
             str: Contents of the SRT file
         """
         return read_file_with_fallback(file_path)
-
+
     @staticmethod
     def extract_subtitle_chunk(content, start_time, end_time):
         """
         Extract subtitle text for a specific time window.
-
+
         Args:
             content (str): Full SRT file content
             start_time (float): Chunk start time in seconds
             end_time (float): Chunk end time in seconds
-
+
         Returns:
             list: List of subtitle texts within the time window
         """
         text_lines = []
-
-        for block in content.strip().split(
-            lines = block.split(
-            if len(lines) < 3 or
+
+        for block in content.strip().split("\n\n"):
+            lines = block.split("\n")
+            if len(lines) < 3 or "-->" not in lines[1]:
                 continue
-
+
             try:
                 timestamp = lines[1]
-                text =
-
-                end_stamp = timestamp.split(
+                text = " ".join(lines[2:])
+
+                end_stamp = timestamp.split(" --> ")[1].strip()
                 total_seconds = SubtitleReader.parse_timestamp(end_stamp)
-
+
                 if start_time <= total_seconds <= end_time:
                     text_lines.append(text)
-
+
             except (IndexError, ValueError) as e:
                 logger.warning(f"Error parsing subtitle block: {e}")
                 continue
-
+
         return text_lines
-
+
+
 _whisper_models = {}
 
+
 def get_whisper_model(model_name="tiny", device=None):
     """Cache whisper models to avoid reloading."""
     global _whisper_models
     if device is None:
         device = "cuda" if torch.cuda.is_available() else "cpu"
-
+
     key = f"{model_name}_{device}"
     if key not in _whisper_models:
         _whisper_models[key] = whisper.load_model(model_name, device=device)
         logger.info(f"Loaded {model_name} model on {device}")
-
-    return _whisper_models[key]
+
+    return _whisper_models[key]
```