mkv-episode-matcher 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mkv-episode-matcher might be problematic. Click here for more details.
- mkv_episode_matcher/__init__.py +2 -2
- mkv_episode_matcher/__main__.py +13 -28
- mkv_episode_matcher/config.py +0 -3
- mkv_episode_matcher/episode_identification.py +163 -124
- mkv_episode_matcher/episode_matcher.py +19 -39
- mkv_episode_matcher/subtitle_utils.py +26 -25
- mkv_episode_matcher/utils.py +56 -56
- {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.6.0.dist-info}/METADATA +7 -13
- mkv_episode_matcher-0.6.0.dist-info/RECORD +14 -0
- {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.6.0.dist-info}/WHEEL +1 -1
- mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
- mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
- mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
- mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
- mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
- mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
- mkv_episode_matcher/mkv_to_srt.py +0 -302
- mkv_episode_matcher-0.5.0.dist-info/RECORD +0 -25
- {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.6.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.6.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/__init__.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""MKV Episode Matcher package."""
|
|
2
|
-
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
3
4
|
|
|
4
5
|
try:
|
|
5
6
|
__version__ = version("mkv-episode-matcher")
|
|
6
7
|
except PackageNotFoundError:
|
|
7
8
|
# package is not installed
|
|
8
9
|
__version__ = "unknown"
|
|
9
|
-
|
mkv_episode_matcher/__main__.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# __main__.py
|
|
2
2
|
import argparse
|
|
3
3
|
import os
|
|
4
|
-
import sys
|
|
5
4
|
|
|
6
5
|
from loguru import logger
|
|
6
|
+
|
|
7
7
|
from mkv_episode_matcher import __version__
|
|
8
8
|
from mkv_episode_matcher.config import get_config, set_config
|
|
9
9
|
|
|
@@ -34,7 +34,7 @@ if not os.path.exists(log_dir):
|
|
|
34
34
|
logger.add(
|
|
35
35
|
os.path.join(log_dir, "stdout.log"),
|
|
36
36
|
format="{time} {level} {message}",
|
|
37
|
-
level="
|
|
37
|
+
level="INFO",
|
|
38
38
|
rotation="10 MB",
|
|
39
39
|
)
|
|
40
40
|
|
|
@@ -56,7 +56,6 @@ def main():
|
|
|
56
56
|
--season: The season number to be processed. If not provided, all seasons will be processed.
|
|
57
57
|
--dry-run: A boolean flag indicating whether to perform a dry run (i.e., not rename any files). If not provided, the function will rename files.
|
|
58
58
|
--get-subs: A boolean flag indicating whether to download subtitles for the show. If not provided, the function will not download subtitles.
|
|
59
|
-
--tesseract-path: The path to the tesseract executable. If not provided, the function will try to get it from the cache or prompt the user to input it.
|
|
60
59
|
|
|
61
60
|
The function logs its progress to two separate log files: one for standard output and one for errors.
|
|
62
61
|
"""
|
|
@@ -67,7 +66,7 @@ def main():
|
|
|
67
66
|
"--version",
|
|
68
67
|
action="version",
|
|
69
68
|
version=f"%(prog)s {__version__}",
|
|
70
|
-
help="Show the version number and exit"
|
|
69
|
+
help="Show the version number and exit",
|
|
71
70
|
)
|
|
72
71
|
parser.add_argument("--tmdb-api-key", help="TMDb API key")
|
|
73
72
|
parser.add_argument("--show-dir", help="Main directory of the show")
|
|
@@ -92,13 +91,6 @@ def main():
|
|
|
92
91
|
nargs="?",
|
|
93
92
|
help="Download subtitles for the show (default: None)",
|
|
94
93
|
)
|
|
95
|
-
parser.add_argument(
|
|
96
|
-
"--tesseract-path",
|
|
97
|
-
type=str,
|
|
98
|
-
default=None,
|
|
99
|
-
nargs="?",
|
|
100
|
-
help="Path to the tesseract executable (default: None)",
|
|
101
|
-
)
|
|
102
94
|
parser.add_argument(
|
|
103
95
|
"--check-gpu",
|
|
104
96
|
type=bool,
|
|
@@ -109,6 +101,7 @@ def main():
|
|
|
109
101
|
args = parser.parse_args()
|
|
110
102
|
if args.check_gpu:
|
|
111
103
|
from mkv_episode_matcher.utils import check_gpu_support
|
|
104
|
+
|
|
112
105
|
check_gpu_support()
|
|
113
106
|
return
|
|
114
107
|
logger.debug(f"Command-line arguments: {args}")
|
|
@@ -118,17 +111,17 @@ def main():
|
|
|
118
111
|
|
|
119
112
|
# Get TMDb API key
|
|
120
113
|
tmdb_api_key = args.tmdb_api_key or config.get("tmdb_api_key")
|
|
121
|
-
|
|
122
|
-
tmdb_api_key = input("Enter your TMDb API key: ")
|
|
123
|
-
logger.debug(f"TMDb API Key: {tmdb_api_key}")
|
|
124
|
-
|
|
114
|
+
|
|
125
115
|
logger.debug("Getting OpenSubtitles API key")
|
|
126
116
|
open_subtitles_api_key = config.get("open_subtitles_api_key")
|
|
127
117
|
open_subtitles_user_agent = config.get("open_subtitles_user_agent")
|
|
128
118
|
open_subtitles_username = config.get("open_subtitles_username")
|
|
129
119
|
open_subtitles_password = config.get("open_subtitles_password")
|
|
130
|
-
|
|
120
|
+
|
|
131
121
|
if args.get_subs:
|
|
122
|
+
if not tmdb_api_key:
|
|
123
|
+
tmdb_api_key = input("Enter your TMDb API key: ")
|
|
124
|
+
logger.debug(f"TMDb API Key: {tmdb_api_key}")
|
|
132
125
|
if not open_subtitles_api_key:
|
|
133
126
|
open_subtitles_api_key = input("Enter your OpenSubtitles API key: ")
|
|
134
127
|
if not open_subtitles_user_agent:
|
|
@@ -137,24 +130,17 @@ def main():
|
|
|
137
130
|
open_subtitles_username = input("Enter your OpenSubtitles Username: ")
|
|
138
131
|
if not open_subtitles_password:
|
|
139
132
|
open_subtitles_password = input("Enter your OpenSubtitles Password: ")
|
|
140
|
-
|
|
141
|
-
# Use config for show directory
|
|
133
|
+
|
|
134
|
+
# Use config for show directory
|
|
142
135
|
show_dir = args.show_dir or config.get("show_dir")
|
|
143
136
|
if not show_dir:
|
|
144
137
|
show_dir = input("Enter the main directory of the show:")
|
|
145
138
|
logger.info(f"Show Directory: {show_dir}")
|
|
146
139
|
if not show_dir:
|
|
147
140
|
show_dir = os.getcwd()
|
|
148
|
-
|
|
149
|
-
if not args.tesseract_path:
|
|
150
|
-
tesseract_path = config.get("tesseract_path")
|
|
151
|
-
if not tesseract_path:
|
|
152
|
-
tesseract_path = input(r"Enter the path to the tesseract executable: ['C:\Program Files\Tesseract-OCR\tesseract.exe']")
|
|
153
|
-
else:
|
|
154
|
-
tesseract_path = args.tesseract_path
|
|
155
|
-
logger.debug(f"Teesseract Path: {tesseract_path}")
|
|
141
|
+
|
|
156
142
|
logger.debug(f"Show Directory: {show_dir}")
|
|
157
|
-
|
|
143
|
+
|
|
158
144
|
# Set the configuration
|
|
159
145
|
set_config(
|
|
160
146
|
tmdb_api_key,
|
|
@@ -164,7 +150,6 @@ def main():
|
|
|
164
150
|
open_subtitles_password,
|
|
165
151
|
show_dir,
|
|
166
152
|
CONFIG_FILE,
|
|
167
|
-
tesseract_path=tesseract_path,
|
|
168
153
|
)
|
|
169
154
|
logger.info("Configuration set")
|
|
170
155
|
|
mkv_episode_matcher/config.py
CHANGED
|
@@ -27,7 +27,6 @@ def set_config(
|
|
|
27
27
|
open_subtitles_password,
|
|
28
28
|
show_dir,
|
|
29
29
|
file,
|
|
30
|
-
tesseract_path=None,
|
|
31
30
|
):
|
|
32
31
|
"""
|
|
33
32
|
Sets the configuration values and writes them to a file.
|
|
@@ -40,7 +39,6 @@ def set_config(
|
|
|
40
39
|
open_subtitles_password (str): The password for OpenSubtitles.
|
|
41
40
|
show_dir (str): The directory where the TV show episodes are located.
|
|
42
41
|
file (str): The path to the configuration file.
|
|
43
|
-
tesseract_path (str, optional): The path to the Tesseract OCR executable.
|
|
44
42
|
|
|
45
43
|
Returns:
|
|
46
44
|
None
|
|
@@ -54,7 +52,6 @@ def set_config(
|
|
|
54
52
|
"open_subtitles_user_agent": str(open_subtitles_user_agent),
|
|
55
53
|
"open_subtitles_username": str(open_subtitles_username),
|
|
56
54
|
"open_subtitles_password": str(open_subtitles_password),
|
|
57
|
-
"tesseract_path": str(tesseract_path),
|
|
58
55
|
}
|
|
59
56
|
logger.info(
|
|
60
57
|
f"Setting config with API:{tmdb_api_key}, show_dir: {show_dir}, and max_threads: {MAX_THREADS}"
|
|
@@ -1,17 +1,15 @@
|
|
|
1
|
-
import
|
|
2
|
-
import os
|
|
1
|
+
import re
|
|
3
2
|
import subprocess
|
|
4
3
|
import tempfile
|
|
5
4
|
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import chardet
|
|
7
|
+
import numpy as np
|
|
6
8
|
import torch
|
|
7
|
-
from rapidfuzz import fuzz
|
|
8
|
-
from loguru import logger
|
|
9
9
|
import whisper
|
|
10
|
-
import numpy as np
|
|
11
|
-
import re
|
|
12
|
-
from pathlib import Path
|
|
13
|
-
import chardet
|
|
14
10
|
from loguru import logger
|
|
11
|
+
from rapidfuzz import fuzz
|
|
12
|
+
|
|
15
13
|
|
|
16
14
|
class EpisodeMatcher:
|
|
17
15
|
def __init__(self, cache_dir, show_name, min_confidence=0.6):
|
|
@@ -22,35 +20,43 @@ class EpisodeMatcher:
|
|
|
22
20
|
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
23
21
|
self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
|
|
24
22
|
self.temp_dir.mkdir(exist_ok=True)
|
|
25
|
-
|
|
23
|
+
|
|
26
24
|
def clean_text(self, text):
|
|
27
25
|
text = text.lower().strip()
|
|
28
|
-
text = re.sub(r
|
|
29
|
-
text = re.sub(r
|
|
30
|
-
return
|
|
26
|
+
text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
|
|
27
|
+
text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
|
|
28
|
+
return " ".join(text.split())
|
|
31
29
|
|
|
32
30
|
def chunk_score(self, whisper_chunk, ref_chunk):
|
|
33
31
|
whisper_clean = self.clean_text(whisper_chunk)
|
|
34
32
|
ref_clean = self.clean_text(ref_chunk)
|
|
35
|
-
return (
|
|
36
|
-
|
|
33
|
+
return (
|
|
34
|
+
fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7
|
|
35
|
+
+ fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3
|
|
36
|
+
) / 100.0
|
|
37
37
|
|
|
38
38
|
def extract_audio_chunk(self, mkv_file, start_time):
|
|
39
39
|
"""Extract a chunk of audio from MKV file."""
|
|
40
40
|
chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
|
|
41
41
|
if not chunk_path.exists():
|
|
42
42
|
cmd = [
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
43
|
+
"ffmpeg",
|
|
44
|
+
"-ss",
|
|
45
|
+
str(start_time),
|
|
46
|
+
"-t",
|
|
47
|
+
str(self.chunk_duration),
|
|
48
|
+
"-i",
|
|
49
|
+
mkv_file,
|
|
50
|
+
"-vn", # Disable video
|
|
51
|
+
"-sn", # Disable subtitles
|
|
52
|
+
"-dn", # Disable data streams
|
|
53
|
+
"-acodec",
|
|
54
|
+
"pcm_s16le",
|
|
55
|
+
"-ar",
|
|
56
|
+
"16000",
|
|
57
|
+
"-ac",
|
|
58
|
+
"1",
|
|
59
|
+
str(chunk_path),
|
|
54
60
|
]
|
|
55
61
|
subprocess.run(cmd, capture_output=True)
|
|
56
62
|
return str(chunk_path)
|
|
@@ -58,34 +64,37 @@ class EpisodeMatcher:
|
|
|
58
64
|
def load_reference_chunk(self, srt_file, chunk_idx):
|
|
59
65
|
"""
|
|
60
66
|
Load reference subtitles for a specific time chunk with robust encoding handling.
|
|
61
|
-
|
|
67
|
+
|
|
62
68
|
Args:
|
|
63
69
|
srt_file (str or Path): Path to the SRT file
|
|
64
70
|
chunk_idx (int): Index of the chunk to load
|
|
65
|
-
|
|
71
|
+
|
|
66
72
|
Returns:
|
|
67
73
|
str: Combined text from the subtitle chunk
|
|
68
74
|
"""
|
|
69
75
|
chunk_start = chunk_idx * self.chunk_duration
|
|
70
76
|
chunk_end = chunk_start + self.chunk_duration
|
|
71
|
-
|
|
77
|
+
|
|
72
78
|
try:
|
|
73
79
|
# Read the file content using our robust reader
|
|
74
80
|
reader = SubtitleReader()
|
|
75
81
|
content = reader.read_srt_file(srt_file)
|
|
76
|
-
|
|
82
|
+
|
|
77
83
|
# Extract subtitles for the time chunk
|
|
78
84
|
text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
|
|
79
|
-
|
|
80
|
-
return
|
|
81
|
-
|
|
85
|
+
|
|
86
|
+
return " ".join(text_lines)
|
|
87
|
+
|
|
82
88
|
except Exception as e:
|
|
83
89
|
logger.error(f"Error loading reference chunk from {srt_file}: {e}")
|
|
84
|
-
return
|
|
85
|
-
|
|
90
|
+
return ""
|
|
91
|
+
|
|
92
|
+
def _try_match_with_model(
|
|
93
|
+
self, video_file, model_name, max_duration, reference_files
|
|
94
|
+
):
|
|
86
95
|
"""
|
|
87
96
|
Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
|
|
88
|
-
|
|
97
|
+
|
|
89
98
|
Args:
|
|
90
99
|
video_file: Path to the video file
|
|
91
100
|
model_name: Name of the Whisper model to use
|
|
@@ -94,49 +103,47 @@ class EpisodeMatcher:
|
|
|
94
103
|
"""
|
|
95
104
|
# Use cached model
|
|
96
105
|
model = get_whisper_model(model_name, self.device)
|
|
97
|
-
|
|
106
|
+
|
|
98
107
|
# Calculate number of chunks to check (30 seconds each)
|
|
99
108
|
num_chunks = max_duration // self.chunk_duration
|
|
100
|
-
|
|
109
|
+
|
|
101
110
|
for chunk_idx in range(num_chunks):
|
|
102
111
|
start_time = chunk_idx * self.chunk_duration
|
|
103
112
|
logger.debug(f"Trying {model_name} model at {start_time} seconds")
|
|
104
|
-
|
|
113
|
+
|
|
105
114
|
audio_path = self.extract_audio_chunk(video_file, start_time)
|
|
106
|
-
|
|
107
|
-
result = model.transcribe(
|
|
108
|
-
|
|
109
|
-
task="transcribe",
|
|
110
|
-
language="en"
|
|
111
|
-
)
|
|
112
|
-
|
|
115
|
+
|
|
116
|
+
result = model.transcribe(audio_path, task="transcribe", language="en")
|
|
117
|
+
|
|
113
118
|
chunk_text = result["text"]
|
|
114
119
|
best_confidence = 0
|
|
115
120
|
best_match = None
|
|
116
|
-
|
|
121
|
+
|
|
117
122
|
# Compare with reference chunks
|
|
118
123
|
for ref_file in reference_files:
|
|
119
124
|
ref_text = self.load_reference_chunk(ref_file, chunk_idx)
|
|
120
125
|
confidence = self.chunk_score(chunk_text, ref_text)
|
|
121
|
-
|
|
126
|
+
|
|
122
127
|
if confidence > best_confidence:
|
|
123
128
|
best_confidence = confidence
|
|
124
129
|
best_match = ref_file
|
|
125
|
-
|
|
130
|
+
|
|
126
131
|
if confidence > self.min_confidence:
|
|
127
|
-
season_ep = re.search(r
|
|
132
|
+
season_ep = re.search(r"S(\d+)E(\d+)", best_match.stem)
|
|
128
133
|
if season_ep:
|
|
129
134
|
season, episode = map(int, season_ep.groups())
|
|
130
135
|
return {
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
+
"season": season,
|
|
137
|
+
"episode": episode,
|
|
138
|
+
"confidence": best_confidence,
|
|
139
|
+
"reference_file": str(best_match),
|
|
140
|
+
"matched_at": start_time,
|
|
136
141
|
}
|
|
137
|
-
|
|
138
|
-
logger.
|
|
139
|
-
|
|
142
|
+
|
|
143
|
+
logger.info(
|
|
144
|
+
f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})"
|
|
145
|
+
)
|
|
146
|
+
|
|
140
147
|
return None
|
|
141
148
|
|
|
142
149
|
def identify_episode(self, video_file, temp_dir, season_number):
|
|
@@ -150,44 +157,67 @@ class EpisodeMatcher:
|
|
|
150
157
|
f"{season_number:02d}x",
|
|
151
158
|
f"{season_number}x",
|
|
152
159
|
]
|
|
153
|
-
|
|
160
|
+
|
|
154
161
|
reference_files = []
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
162
|
+
# TODO Figure our why patterns is not being used
|
|
163
|
+
for _pattern in patterns:
|
|
164
|
+
files = [
|
|
165
|
+
f
|
|
166
|
+
for f in reference_dir.glob("*.srt")
|
|
167
|
+
if any(
|
|
168
|
+
re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
|
|
169
|
+
)
|
|
170
|
+
]
|
|
159
171
|
reference_files.extend(files)
|
|
160
|
-
|
|
172
|
+
|
|
161
173
|
reference_files = list(dict.fromkeys(reference_files))
|
|
162
|
-
|
|
174
|
+
|
|
163
175
|
if not reference_files:
|
|
164
176
|
logger.error(f"No reference files found for season {season_number}")
|
|
165
177
|
return None
|
|
178
|
+
duration = float(
|
|
179
|
+
subprocess.check_output([
|
|
180
|
+
"ffprobe",
|
|
181
|
+
"-v",
|
|
182
|
+
"error",
|
|
183
|
+
"-show_entries",
|
|
184
|
+
"format=duration",
|
|
185
|
+
"-of",
|
|
186
|
+
"default=noprint_wrappers=1:nokey=1",
|
|
187
|
+
video_file,
|
|
188
|
+
]).decode()
|
|
189
|
+
)
|
|
166
190
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
match
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
191
|
+
duration = int(np.ceil(duration))
|
|
192
|
+
# Try with tiny model first (fastest)
|
|
193
|
+
logger.info("Attempting match with tiny model...")
|
|
194
|
+
match = self._try_match_with_model(
|
|
195
|
+
video_file, "tiny", duration, reference_files
|
|
196
|
+
)
|
|
197
|
+
if (
|
|
198
|
+
match and match["confidence"] > 0.65
|
|
199
|
+
): # Slightly lower threshold for tiny
|
|
200
|
+
logger.info(
|
|
201
|
+
f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
|
|
202
|
+
)
|
|
179
203
|
return match
|
|
180
|
-
|
|
181
|
-
# If
|
|
182
|
-
logger.info(
|
|
183
|
-
|
|
204
|
+
|
|
205
|
+
# If no match, try base model
|
|
206
|
+
logger.info(
|
|
207
|
+
"No match in first 3 minutes, extending base model search to 10 minutes..."
|
|
208
|
+
)
|
|
209
|
+
match = self._try_match_with_model(
|
|
210
|
+
video_file, "base", duration, reference_files
|
|
211
|
+
)
|
|
184
212
|
if match:
|
|
185
|
-
logger.info(
|
|
213
|
+
logger.info(
|
|
214
|
+
f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
|
|
215
|
+
)
|
|
186
216
|
return match
|
|
187
|
-
|
|
217
|
+
|
|
188
218
|
logger.info("Speech recognition match failed")
|
|
189
219
|
return None
|
|
190
|
-
|
|
220
|
+
|
|
191
221
|
finally:
|
|
192
222
|
# Cleanup temp files
|
|
193
223
|
for file in self.temp_dir.glob("chunk_*.wav"):
|
|
@@ -196,134 +226,143 @@ class EpisodeMatcher:
|
|
|
196
226
|
except Exception as e:
|
|
197
227
|
logger.warning(f"Failed to delete temp file {file}: {e}")
|
|
198
228
|
|
|
229
|
+
|
|
199
230
|
def detect_file_encoding(file_path):
|
|
200
231
|
"""
|
|
201
232
|
Detect the encoding of a file using chardet.
|
|
202
|
-
|
|
233
|
+
|
|
203
234
|
Args:
|
|
204
235
|
file_path (str or Path): Path to the file
|
|
205
|
-
|
|
236
|
+
|
|
206
237
|
Returns:
|
|
207
238
|
str: Detected encoding, defaults to 'utf-8' if detection fails
|
|
208
239
|
"""
|
|
209
240
|
try:
|
|
210
|
-
with open(file_path,
|
|
241
|
+
with open(file_path, "rb") as f:
|
|
211
242
|
raw_data = f.read()
|
|
212
243
|
result = chardet.detect(raw_data)
|
|
213
|
-
encoding = result[
|
|
214
|
-
confidence = result[
|
|
215
|
-
|
|
216
|
-
logger.debug(
|
|
217
|
-
|
|
244
|
+
encoding = result["encoding"]
|
|
245
|
+
confidence = result["confidence"]
|
|
246
|
+
|
|
247
|
+
logger.debug(
|
|
248
|
+
f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}"
|
|
249
|
+
)
|
|
250
|
+
return encoding if encoding else "utf-8"
|
|
218
251
|
except Exception as e:
|
|
219
252
|
logger.warning(f"Error detecting encoding for {file_path}: {e}")
|
|
220
|
-
return
|
|
253
|
+
return "utf-8"
|
|
254
|
+
|
|
221
255
|
|
|
222
256
|
def read_file_with_fallback(file_path, encodings=None):
|
|
223
257
|
"""
|
|
224
258
|
Read a file trying multiple encodings in order of preference.
|
|
225
|
-
|
|
259
|
+
|
|
226
260
|
Args:
|
|
227
261
|
file_path (str or Path): Path to the file
|
|
228
262
|
encodings (list): List of encodings to try, defaults to common subtitle encodings
|
|
229
|
-
|
|
263
|
+
|
|
230
264
|
Returns:
|
|
231
265
|
str: File contents
|
|
232
|
-
|
|
266
|
+
|
|
233
267
|
Raises:
|
|
234
268
|
ValueError: If file cannot be read with any encoding
|
|
235
269
|
"""
|
|
236
270
|
if encodings is None:
|
|
237
271
|
# First try detected encoding, then fallback to common subtitle encodings
|
|
238
272
|
detected = detect_file_encoding(file_path)
|
|
239
|
-
encodings = [detected,
|
|
240
|
-
|
|
273
|
+
encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]
|
|
274
|
+
|
|
241
275
|
file_path = Path(file_path)
|
|
242
276
|
errors = []
|
|
243
|
-
|
|
277
|
+
|
|
244
278
|
for encoding in encodings:
|
|
245
279
|
try:
|
|
246
|
-
with open(file_path,
|
|
280
|
+
with open(file_path, encoding=encoding) as f:
|
|
247
281
|
content = f.read()
|
|
248
282
|
logger.debug(f"Successfully read {file_path} using {encoding} encoding")
|
|
249
283
|
return content
|
|
250
284
|
except UnicodeDecodeError as e:
|
|
251
285
|
errors.append(f"{encoding}: {str(e)}")
|
|
252
286
|
continue
|
|
253
|
-
|
|
254
|
-
error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
|
|
287
|
+
|
|
288
|
+
error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
|
|
289
|
+
errors
|
|
290
|
+
)
|
|
255
291
|
logger.error(error_msg)
|
|
256
292
|
raise ValueError(error_msg)
|
|
257
293
|
|
|
294
|
+
|
|
258
295
|
class SubtitleReader:
|
|
259
296
|
"""Helper class for reading and parsing subtitle files."""
|
|
260
|
-
|
|
297
|
+
|
|
261
298
|
@staticmethod
|
|
262
299
|
def parse_timestamp(timestamp):
|
|
263
300
|
"""Parse SRT timestamp into seconds."""
|
|
264
|
-
hours, minutes, seconds = timestamp.replace(
|
|
301
|
+
hours, minutes, seconds = timestamp.replace(",", ".").split(":")
|
|
265
302
|
return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
|
|
266
|
-
|
|
303
|
+
|
|
267
304
|
@staticmethod
|
|
268
305
|
def read_srt_file(file_path):
|
|
269
306
|
"""
|
|
270
307
|
Read an SRT file and return its contents with robust encoding handling.
|
|
271
|
-
|
|
308
|
+
|
|
272
309
|
Args:
|
|
273
310
|
file_path (str or Path): Path to the SRT file
|
|
274
|
-
|
|
311
|
+
|
|
275
312
|
Returns:
|
|
276
313
|
str: Contents of the SRT file
|
|
277
314
|
"""
|
|
278
315
|
return read_file_with_fallback(file_path)
|
|
279
|
-
|
|
316
|
+
|
|
280
317
|
@staticmethod
|
|
281
318
|
def extract_subtitle_chunk(content, start_time, end_time):
|
|
282
319
|
"""
|
|
283
320
|
Extract subtitle text for a specific time window.
|
|
284
|
-
|
|
321
|
+
|
|
285
322
|
Args:
|
|
286
323
|
content (str): Full SRT file content
|
|
287
324
|
start_time (float): Chunk start time in seconds
|
|
288
325
|
end_time (float): Chunk end time in seconds
|
|
289
|
-
|
|
326
|
+
|
|
290
327
|
Returns:
|
|
291
328
|
list: List of subtitle texts within the time window
|
|
292
329
|
"""
|
|
293
330
|
text_lines = []
|
|
294
|
-
|
|
295
|
-
for block in content.strip().split(
|
|
296
|
-
lines = block.split(
|
|
297
|
-
if len(lines) < 3 or
|
|
331
|
+
|
|
332
|
+
for block in content.strip().split("\n\n"):
|
|
333
|
+
lines = block.split("\n")
|
|
334
|
+
if len(lines) < 3 or "-->" not in lines[1]:
|
|
298
335
|
continue
|
|
299
|
-
|
|
336
|
+
|
|
300
337
|
try:
|
|
301
338
|
timestamp = lines[1]
|
|
302
|
-
text =
|
|
303
|
-
|
|
304
|
-
end_stamp = timestamp.split(
|
|
339
|
+
text = " ".join(lines[2:])
|
|
340
|
+
|
|
341
|
+
end_stamp = timestamp.split(" --> ")[1].strip()
|
|
305
342
|
total_seconds = SubtitleReader.parse_timestamp(end_stamp)
|
|
306
|
-
|
|
343
|
+
|
|
307
344
|
if start_time <= total_seconds <= end_time:
|
|
308
345
|
text_lines.append(text)
|
|
309
|
-
|
|
346
|
+
|
|
310
347
|
except (IndexError, ValueError) as e:
|
|
311
348
|
logger.warning(f"Error parsing subtitle block: {e}")
|
|
312
349
|
continue
|
|
313
|
-
|
|
350
|
+
|
|
314
351
|
return text_lines
|
|
315
|
-
|
|
352
|
+
|
|
353
|
+
|
|
316
354
|
_whisper_models = {}
|
|
317
355
|
|
|
356
|
+
|
|
318
357
|
def get_whisper_model(model_name="tiny", device=None):
|
|
319
358
|
"""Cache whisper models to avoid reloading."""
|
|
320
359
|
global _whisper_models
|
|
321
360
|
if device is None:
|
|
322
361
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
323
|
-
|
|
362
|
+
|
|
324
363
|
key = f"{model_name}_{device}"
|
|
325
364
|
if key not in _whisper_models:
|
|
326
365
|
_whisper_models[key] = whisper.load_model(model_name, device=device)
|
|
327
366
|
logger.info(f"Loaded {model_name} model on {device}")
|
|
328
|
-
|
|
329
|
-
return _whisper_models[key]
|
|
367
|
+
|
|
368
|
+
return _whisper_models[key]
|