mkv-episode-matcher 0.4.5__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mkv-episode-matcher might be problematic.
- mkv_episode_matcher/__init__.py +2 -2
- mkv_episode_matcher/__main__.py +14 -29
- mkv_episode_matcher/config.py +0 -3
- mkv_episode_matcher/episode_identification.py +222 -136
- mkv_episode_matcher/episode_matcher.py +19 -42
- mkv_episode_matcher/subtitle_utils.py +26 -25
- mkv_episode_matcher/utils.py +61 -54
- {mkv_episode_matcher-0.4.5.dist-info → mkv_episode_matcher-0.6.0.dist-info}/METADATA +7 -13
- mkv_episode_matcher-0.6.0.dist-info/RECORD +14 -0
- {mkv_episode_matcher-0.4.5.dist-info → mkv_episode_matcher-0.6.0.dist-info}/WHEEL +1 -1
- mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
- mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
- mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
- mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
- mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
- mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
- mkv_episode_matcher/mkv_to_srt.py +0 -302
- mkv_episode_matcher/speech_to_text.py +0 -96
- mkv_episode_matcher-0.4.5.dist-info/RECORD +0 -26
- {mkv_episode_matcher-0.4.5.dist-info → mkv_episode_matcher-0.6.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.4.5.dist-info → mkv_episode_matcher-0.6.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/__init__.py
CHANGED
@@ -1,9 +1,9 @@
 """MKV Episode Matcher package."""
-from importlib.metadata import PackageNotFoundError, version
+
+from importlib.metadata import PackageNotFoundError, version
 
 try:
     __version__ = version("mkv-episode-matcher")
 except PackageNotFoundError:
     # package is not installed
     __version__ = "unknown"
-
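The rewritten __init__.py keeps version resolution in importlib.metadata. A minimal usage sketch (the printed value is illustrative):

    # Reading the resolved package version after installation
    from mkv_episode_matcher import __version__

    print(__version__)  # e.g. "0.6.0"; falls back to "unknown" if metadata is absent
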
mkv_episode_matcher/__main__.py
CHANGED
@@ -1,9 +1,9 @@
 # __main__.py
 import argparse
 import os
-import sys
 
 from loguru import logger
+
 from mkv_episode_matcher import __version__
 from mkv_episode_matcher.config import get_config, set_config
 
@@ -34,7 +34,7 @@ if not os.path.exists(log_dir):
 logger.add(
     os.path.join(log_dir, "stdout.log"),
     format="{time} {level} {message}",
-    level="
+    level="INFO",
     rotation="10 MB",
 )
 
@@ -56,7 +56,6 @@ def main():
     --season: The season number to be processed. If not provided, all seasons will be processed.
     --dry-run: A boolean flag indicating whether to perform a dry run (i.e., not rename any files). If not provided, the function will rename files.
     --get-subs: A boolean flag indicating whether to download subtitles for the show. If not provided, the function will not download subtitles.
-    --tesseract-path: The path to the tesseract executable. If not provided, the function will try to get it from the cache or prompt the user to input it.
 
     The function logs its progress to two separate log files: one for standard output and one for errors.
     """
@@ -67,7 +66,7 @@ def main():
         "--version",
         action="version",
         version=f"%(prog)s {__version__}",
-        help="Show the version number and exit"
+        help="Show the version number and exit",
     )
     parser.add_argument("--tmdb-api-key", help="TMDb API key")
     parser.add_argument("--show-dir", help="Main directory of the show")
@@ -92,13 +91,6 @@ def main():
         nargs="?",
         help="Download subtitles for the show (default: None)",
     )
-    parser.add_argument(
-        "--tesseract-path",
-        type=str,
-        default=None,
-        nargs="?",
-        help="Path to the tesseract executable (default: None)",
-    )
     parser.add_argument(
         "--check-gpu",
         type=bool,
@@ -108,7 +100,8 @@ def main():
     )
     args = parser.parse_args()
     if args.check_gpu:
-        from mkv_episode_matcher.
+        from mkv_episode_matcher.utils import check_gpu_support
+
         check_gpu_support()
         return
     logger.debug(f"Command-line arguments: {args}")
@@ -118,17 +111,17 @@ def main():
 
     # Get TMDb API key
     tmdb_api_key = args.tmdb_api_key or config.get("tmdb_api_key")
-
-    tmdb_api_key = input("Enter your TMDb API key: ")
-    logger.debug(f"TMDb API Key: {tmdb_api_key}")
-
+
     logger.debug("Getting OpenSubtitles API key")
     open_subtitles_api_key = config.get("open_subtitles_api_key")
     open_subtitles_user_agent = config.get("open_subtitles_user_agent")
     open_subtitles_username = config.get("open_subtitles_username")
     open_subtitles_password = config.get("open_subtitles_password")
-
+
     if args.get_subs:
+        if not tmdb_api_key:
+            tmdb_api_key = input("Enter your TMDb API key: ")
+        logger.debug(f"TMDb API Key: {tmdb_api_key}")
         if not open_subtitles_api_key:
             open_subtitles_api_key = input("Enter your OpenSubtitles API key: ")
         if not open_subtitles_user_agent:
@@ -137,24 +130,17 @@ def main():
             open_subtitles_username = input("Enter your OpenSubtitles Username: ")
         if not open_subtitles_password:
             open_subtitles_password = input("Enter your OpenSubtitles Password: ")
-
-    # Use config for show directory
+
+    # Use config for show directory
     show_dir = args.show_dir or config.get("show_dir")
     if not show_dir:
        show_dir = input("Enter the main directory of the show:")
     logger.info(f"Show Directory: {show_dir}")
     if not show_dir:
         show_dir = os.getcwd()
-
-    if not args.tesseract_path:
-        tesseract_path = config.get("tesseract_path")
-        if not tesseract_path:
-            tesseract_path = input(r"Enter the path to the tesseract executable: ['C:\Program Files\Tesseract-OCR\tesseract.exe']")
-    else:
-        tesseract_path = args.tesseract_path
-    logger.debug(f"Teesseract Path: {tesseract_path}")
+
     logger.debug(f"Show Directory: {show_dir}")
-
+
     # Set the configuration
     set_config(
         tmdb_api_key,
@@ -164,7 +150,6 @@ def main():
         open_subtitles_password,
         show_dir,
         CONFIG_FILE,
-        tesseract_path=tesseract_path,
     )
     logger.info("Configuration set")
 
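With the Tesseract/OCR options removed, the CLI surface is reduced to the TMDb/OpenSubtitles settings and the show directory. A hedged invocation sketch (the console-script name is not visible in this diff, so the module entry point is used; the path and key are placeholders):

    # Hypothetical invocation of the updated CLI via the package's __main__ module.
    # Flag names are taken from the argparse definitions above; values are placeholders.
    import subprocess

    subprocess.run(
        [
            "python", "-m", "mkv_episode_matcher",
            "--show-dir", "/media/TV/Some Show",
            "--tmdb-api-key", "YOUR_TMDB_KEY",
        ],
        check=True,
    )
    # Version check only: python -m mkv_episode_matcher --version
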
mkv_episode_matcher/config.py
CHANGED
@@ -27,7 +27,6 @@ def set_config(
     open_subtitles_password,
     show_dir,
     file,
-    tesseract_path=None,
 ):
     """
     Sets the configuration values and writes them to a file.
@@ -40,7 +39,6 @@ def set_config(
         open_subtitles_password (str): The password for OpenSubtitles.
         show_dir (str): The directory where the TV show episodes are located.
         file (str): The path to the configuration file.
-        tesseract_path (str, optional): The path to the Tesseract OCR executable.
 
     Returns:
         None
@@ -54,7 +52,6 @@ def set_config(
         "open_subtitles_user_agent": str(open_subtitles_user_agent),
         "open_subtitles_username": str(open_subtitles_username),
         "open_subtitles_password": str(open_subtitles_password),
-        "tesseract_path": str(tesseract_path),
     }
     logger.info(
         f"Setting config with API:{tmdb_api_key}, show_dir: {show_dir}, and max_threads: {MAX_THREADS}"
mkv_episode_matcher/episode_identification.py
CHANGED

@@ -1,54 +1,62 @@
-import
-import os
+import re
 import subprocess
 import tempfile
 from pathlib import Path
+
+import chardet
+import numpy as np
 import torch
-from rapidfuzz import fuzz
-from loguru import logger
 import whisper
-import numpy as np
-import re
-from pathlib import Path
-import chardet
 from loguru import logger
+from rapidfuzz import fuzz
+
 
 class EpisodeMatcher:
     def __init__(self, cache_dir, show_name, min_confidence=0.6):
         self.cache_dir = Path(cache_dir)
         self.min_confidence = min_confidence
         self.show_name = show_name
-        self.chunk_duration =
+        self.chunk_duration = 30
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
-
+
     def clean_text(self, text):
         text = text.lower().strip()
-        text = re.sub(r
-        text = re.sub(r
-        return
+        text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
+        text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
+        return " ".join(text.split())
 
     def chunk_score(self, whisper_chunk, ref_chunk):
         whisper_clean = self.clean_text(whisper_chunk)
         ref_clean = self.clean_text(ref_chunk)
-        return (
-
+        return (
+            fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7
+            + fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3
+        ) / 100.0
 
     def extract_audio_chunk(self, mkv_file, start_time):
         """Extract a chunk of audio from MKV file."""
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
             cmd = [
-
-
-
-
-
-
-
-
-
+                "ffmpeg",
+                "-ss",
+                str(start_time),
+                "-t",
+                str(self.chunk_duration),
+                "-i",
+                mkv_file,
+                "-vn",  # Disable video
+                "-sn",  # Disable subtitles
+                "-dn",  # Disable data streams
+                "-acodec",
+                "pcm_s16le",
+                "-ar",
+                "16000",
+                "-ac",
+                "1",
+                str(chunk_path),
             ]
             subprocess.run(cmd, capture_output=True)
         return str(chunk_path)
@@ -56,227 +64,305 @@ class EpisodeMatcher:
     def load_reference_chunk(self, srt_file, chunk_idx):
         """
         Load reference subtitles for a specific time chunk with robust encoding handling.
-
+
         Args:
             srt_file (str or Path): Path to the SRT file
             chunk_idx (int): Index of the chunk to load
-
+
         Returns:
             str: Combined text from the subtitle chunk
         """
         chunk_start = chunk_idx * self.chunk_duration
         chunk_end = chunk_start + self.chunk_duration
-
+
         try:
             # Read the file content using our robust reader
             reader = SubtitleReader()
             content = reader.read_srt_file(srt_file)
-
+
             # Extract subtitles for the time chunk
             text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
-
-            return
-
+
+            return " ".join(text_lines)
+
         except Exception as e:
             logger.error(f"Error loading reference chunk from {srt_file}: {e}")
-            return
+            return ""
+
+    def _try_match_with_model(
+        self, video_file, model_name, max_duration, reference_files
+    ):
+        """
+        Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
+
+        Args:
+            video_file: Path to the video file
+            model_name: Name of the Whisper model to use
+            max_duration: Maximum duration in seconds to check
+            reference_files: List of reference subtitle files
+        """
+        # Use cached model
+        model = get_whisper_model(model_name, self.device)
+
+        # Calculate number of chunks to check (30 seconds each)
+        num_chunks = max_duration // self.chunk_duration
+
+        for chunk_idx in range(num_chunks):
+            start_time = chunk_idx * self.chunk_duration
+            logger.debug(f"Trying {model_name} model at {start_time} seconds")
+
+            audio_path = self.extract_audio_chunk(video_file, start_time)
+
+            result = model.transcribe(audio_path, task="transcribe", language="en")
+
+            chunk_text = result["text"]
+            best_confidence = 0
+            best_match = None
+
+            # Compare with reference chunks
+            for ref_file in reference_files:
+                ref_text = self.load_reference_chunk(ref_file, chunk_idx)
+                confidence = self.chunk_score(chunk_text, ref_text)
+
+                if confidence > best_confidence:
+                    best_confidence = confidence
+                    best_match = ref_file
+
+                if confidence > self.min_confidence:
+                    season_ep = re.search(r"S(\d+)E(\d+)", best_match.stem)
+                    if season_ep:
+                        season, episode = map(int, season_ep.groups())
+                        return {
+                            "season": season,
+                            "episode": episode,
+                            "confidence": best_confidence,
+                            "reference_file": str(best_match),
+                            "matched_at": start_time,
+                        }
+
+            logger.info(
+                f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})"
+            )
+
+        return None
 
     def identify_episode(self, video_file, temp_dir, season_number):
+        """Progressive episode identification with faster initial attempt."""
         try:
-            # Get
-            duration = float(subprocess.check_output([
-                'ffprobe', '-v', 'error',
-                '-show_entries', 'format=duration',
-                '-of', 'default=noprint_wrappers=1:nokey=1',
-                video_file
-            ]).decode())
-
-            total_chunks = int(np.ceil(duration / self.chunk_duration))
-
-            # Load Whisper model
-            model = whisper.load_model("base", device=self.device)
-
-            # Get season-specific reference files using multiple patterns
+            # Get reference files first
             reference_dir = self.cache_dir / "data" / self.show_name
-
-            # Create season patterns for different formats
             patterns = [
-                f"S{season_number:02d}E",
-                f"S{season_number}E",
-                f"{season_number:02d}x",
-                f"{season_number}x",
+                f"S{season_number:02d}E",
+                f"S{season_number}E",
+                f"{season_number:02d}x",
+                f"{season_number}x",
             ]
-
+
             reference_files = []
-
-
-
-
+            # TODO Figure our why patterns is not being used
+            for _pattern in patterns:
+                files = [
+                    f
+                    for f in reference_dir.glob("*.srt")
+                    if any(
+                        re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
+                    )
+                ]
                 reference_files.extend(files)
-
-            # Remove duplicates while preserving order
+
             reference_files = list(dict.fromkeys(reference_files))
-
+
             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
-
-
-
-
-
-
-
-
-
-
-
+            duration = float(
+                subprocess.check_output([
+                    "ffprobe",
+                    "-v",
+                    "error",
+                    "-show_entries",
+                    "format=duration",
+                    "-of",
+                    "default=noprint_wrappers=1:nokey=1",
+                    video_file,
+                ]).decode()
+            )
+
+            duration = int(np.ceil(duration))
+            # Try with tiny model first (fastest)
+            logger.info("Attempting match with tiny model...")
+            match = self._try_match_with_model(
+                video_file, "tiny", duration, reference_files
+            )
+            if (
+                match and match["confidence"] > 0.65
+            ):  # Slightly lower threshold for tiny
+                logger.info(
+                    f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                )
+                return match
+
+            # If no match, try base model
+            logger.info(
+                "No match in first 3 minutes, extending base model search to 10 minutes..."
+            )
+            match = self._try_match_with_model(
+                video_file, "base", duration, reference_files
+            )
+            if match:
+                logger.info(
+                    f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
                 )
-
-
-
-                best_match = None
-
-                # Compare with reference chunks
-                for ref_file in reference_files:
-                    ref_text = self.load_reference_chunk(ref_file, chunk_idx)
-                    confidence = self.chunk_score(chunk_text, ref_text)
-
-                    if confidence > best_confidence:
-                        best_confidence = confidence
-                        best_match = ref_file
-
-                    if confidence > self.min_confidence:
-                        season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
-                        if season_ep:
-                            season, episode = map(int, season_ep.groups())
-                            return {
-                                'season': season,
-                                'episode': episode,
-                                'confidence': best_confidence,
-                                'reference_file': str(best_match),
-                            }
-
+                return match
+
+            logger.info("Speech recognition match failed")
             return None
-
+
         finally:
             # Cleanup temp files
             for file in self.temp_dir.glob("chunk_*.wav"):
-
+                try:
+                    file.unlink()
+                except Exception as e:
+                    logger.warning(f"Failed to delete temp file {file}: {e}")
+
 
 def detect_file_encoding(file_path):
     """
     Detect the encoding of a file using chardet.
-
+
     Args:
         file_path (str or Path): Path to the file
-
+
     Returns:
         str: Detected encoding, defaults to 'utf-8' if detection fails
     """
     try:
-        with open(file_path,
+        with open(file_path, "rb") as f:
             raw_data = f.read()
             result = chardet.detect(raw_data)
-            encoding = result[
-            confidence = result[
-
-            logger.debug(
-
+            encoding = result["encoding"]
+            confidence = result["confidence"]
+
+            logger.debug(
+                f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}"
+            )
+            return encoding if encoding else "utf-8"
     except Exception as e:
         logger.warning(f"Error detecting encoding for {file_path}: {e}")
-        return
+        return "utf-8"
+
 
 def read_file_with_fallback(file_path, encodings=None):
     """
     Read a file trying multiple encodings in order of preference.
-
+
     Args:
         file_path (str or Path): Path to the file
         encodings (list): List of encodings to try, defaults to common subtitle encodings
-
+
     Returns:
         str: File contents
-
+
     Raises:
         ValueError: If file cannot be read with any encoding
     """
     if encodings is None:
         # First try detected encoding, then fallback to common subtitle encodings
         detected = detect_file_encoding(file_path)
-        encodings = [detected,
-
+        encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]
+
     file_path = Path(file_path)
     errors = []
-
+
     for encoding in encodings:
         try:
-            with open(file_path,
+            with open(file_path, encoding=encoding) as f:
                 content = f.read()
                 logger.debug(f"Successfully read {file_path} using {encoding} encoding")
                 return content
         except UnicodeDecodeError as e:
             errors.append(f"{encoding}: {str(e)}")
             continue
-
-    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+
+    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+        errors
+    )
     logger.error(error_msg)
     raise ValueError(error_msg)
 
+
 class SubtitleReader:
     """Helper class for reading and parsing subtitle files."""
-
+
     @staticmethod
     def parse_timestamp(timestamp):
         """Parse SRT timestamp into seconds."""
-        hours, minutes, seconds = timestamp.replace(
+        hours, minutes, seconds = timestamp.replace(",", ".").split(":")
         return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
-
+
     @staticmethod
     def read_srt_file(file_path):
         """
         Read an SRT file and return its contents with robust encoding handling.
-
+
         Args:
             file_path (str or Path): Path to the SRT file
-
+
         Returns:
             str: Contents of the SRT file
         """
         return read_file_with_fallback(file_path)
-
+
     @staticmethod
     def extract_subtitle_chunk(content, start_time, end_time):
         """
         Extract subtitle text for a specific time window.
-
+
         Args:
             content (str): Full SRT file content
             start_time (float): Chunk start time in seconds
             end_time (float): Chunk end time in seconds
-
+
         Returns:
             list: List of subtitle texts within the time window
         """
         text_lines = []
-
-        for block in content.strip().split(
-            lines = block.split(
-            if len(lines) < 3 or
+
+        for block in content.strip().split("\n\n"):
+            lines = block.split("\n")
+            if len(lines) < 3 or "-->" not in lines[1]:
                 continue
-
+
             try:
                 timestamp = lines[1]
-                text =
-
-                end_stamp = timestamp.split(
+                text = " ".join(lines[2:])
+
+                end_stamp = timestamp.split(" --> ")[1].strip()
                 total_seconds = SubtitleReader.parse_timestamp(end_stamp)
-
+
                 if start_time <= total_seconds <= end_time:
                     text_lines.append(text)
-
+
             except (IndexError, ValueError) as e:
                 logger.warning(f"Error parsing subtitle block: {e}")
                 continue
-
-        return text_lines
+
+        return text_lines
+
+
+_whisper_models = {}
+
+
+def get_whisper_model(model_name="tiny", device=None):
+    """Cache whisper models to avoid reloading."""
+    global _whisper_models
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    key = f"{model_name}_{device}"
+    if key not in _whisper_models:
+        _whisper_models[key] = whisper.load_model(model_name, device=device)
+        logger.info(f"Loaded {model_name} model on {device}")
+
+    return _whisper_models[key]