mkv-episode-matcher 0.3.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- mkv_episode_matcher/__init__.py +8 -0
- mkv_episode_matcher/__main__.py +2 -177
- mkv_episode_matcher/asr_models.py +506 -0
- mkv_episode_matcher/cli.py +558 -0
- mkv_episode_matcher/core/config_manager.py +100 -0
- mkv_episode_matcher/core/engine.py +577 -0
- mkv_episode_matcher/core/matcher.py +214 -0
- mkv_episode_matcher/core/models.py +91 -0
- mkv_episode_matcher/core/providers/asr.py +85 -0
- mkv_episode_matcher/core/providers/subtitles.py +341 -0
- mkv_episode_matcher/core/utils.py +148 -0
- mkv_episode_matcher/episode_identification.py +550 -118
- mkv_episode_matcher/subtitle_utils.py +82 -0
- mkv_episode_matcher/tmdb_client.py +56 -14
- mkv_episode_matcher/ui/flet_app.py +708 -0
- mkv_episode_matcher/utils.py +262 -139
- mkv_episode_matcher-1.0.0.dist-info/METADATA +242 -0
- mkv_episode_matcher-1.0.0.dist-info/RECORD +23 -0
- {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/WHEEL +1 -1
- mkv_episode_matcher-1.0.0.dist-info/licenses/LICENSE +21 -0
- mkv_episode_matcher/config.py +0 -82
- mkv_episode_matcher/episode_matcher.py +0 -100
- mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
- mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
- mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
- mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
- mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
- mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
- mkv_episode_matcher/mkv_to_srt.py +0 -302
- mkv_episode_matcher/speech_to_text.py +0 -90
- mkv_episode_matcher-0.3.3.dist-info/METADATA +0 -125
- mkv_episode_matcher-0.3.3.dist-info/RECORD +0 -25
- {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/core/providers/subtitles.py (new file)

@@ -0,0 +1,341 @@
```python
import abc
import re
import shutil
import time
from functools import wraps
from pathlib import Path
from typing import Any, Callable, TypeVar

from loguru import logger
from opensubtitlescom import OpenSubtitles

F = TypeVar("F", bound=Callable[..., Any])

from mkv_episode_matcher.core.config_manager import get_config_manager
from mkv_episode_matcher.core.models import EpisodeInfo, SubtitleFile


def retry_with_backoff(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    backoff_factor: float = 2.0,
) -> Callable[[F], F]:
    """Decorator for retrying operations with exponential backoff."""

    def decorator(func: F) -> F:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            last_exception = None
            delay = base_delay

            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
                    if attempt == max_retries:
                        logger.error(
                            f"Max retries ({max_retries}) exceeded for {func.__name__}: {e}"
                        )
                        raise e

                    logger.warning(
                        f"Attempt {attempt + 1}/{max_retries + 1} failed for {func.__name__}: {e}, retrying in {delay:.1f}s..."
                    )
                    time.sleep(delay)
                    delay = min(delay * backoff_factor, max_delay)

            raise last_exception

        return wrapper  # type: ignore

    return decorator


def parse_season_episode(filename: str) -> EpisodeInfo | None:
    """Parse season and episode from filename using regex."""
    # S01E01
    match = re.search(r"[Ss](\d{1,2})[Ee](\d{1,2})", filename)
    if match:
        return EpisodeInfo(
            series_name="",  # Placeholder
            season=int(match.group(1)),
            episode=int(match.group(2)),
        )
    # 1x01
    match = re.search(r"(\d{1,2})x(\d{1,2})", filename)
    if match:
        return EpisodeInfo(
            series_name="", season=int(match.group(1)), episode=int(match.group(2))
        )
    return None


class SubtitleProvider(abc.ABC):
    @abc.abstractmethod
    def get_subtitles(
        self, show_name: str, season: int, video_files: list[Path] = None
    ) -> list[SubtitleFile]:
        pass


class LocalSubtitleProvider(SubtitleProvider):
    """Provider that scans a local directory for subtitle files."""

    def __init__(self, cache_dir: Path):
        self.cache_dir = cache_dir / "data"

    def get_subtitles(
        self, show_name: str, season: int, video_files: list[Path] = None
    ) -> list[SubtitleFile]:
        """Get all subtitle files for a specific show and season."""
        show_dir = self.cache_dir / show_name
        if not show_dir.exists():
            # logger.warning(f"No subtitle cache found at {show_dir}")
            return []

        subtitles = []
        # Case insensitive glob
        files = list(show_dir.glob("*.srt")) + list(show_dir.glob("*.SRT"))

        for f in files:
            info = parse_season_episode(f.name)
            if info:
                if info.season == season:
                    info.series_name = show_name
                    subtitles.append(SubtitleFile(path=f, episode_info=info))

        # Deduplicate by path
        seen = set()
        unique_subs = []
        for sub in subtitles:
            if sub.path not in seen:
                seen.add(sub.path)
                unique_subs.append(sub)

        return unique_subs


class OpenSubtitlesProvider(SubtitleProvider):
    """Provider that downloads subtitles using OpenSubtitles.com."""

    def __init__(self):
        cm = get_config_manager()
        self.config = cm.load()
        self.client = None
        self.network_timeout = 30  # seconds
        self._authenticate()

    def _authenticate(self):
        if not self.config.open_subtitles_api_key:
            logger.warning("OpenSubtitles API key not configured")
            return

        try:
            self.client = OpenSubtitles(
                self.config.open_subtitles_user_agent,
                self.config.open_subtitles_api_key,
            )
            if (
                self.config.open_subtitles_username
                and self.config.open_subtitles_password
            ):
                self.client.login(
                    self.config.open_subtitles_username,
                    self.config.open_subtitles_password,
                )
                logger.debug("Logged in to OpenSubtitles")
            else:
                logger.debug("Initialized OpenSubtitles (no login)")
        except Exception as e:
            logger.error(f"Failed to initialize OpenSubtitles: {e}")
            self.client = None

    @retry_with_backoff(max_retries=3, base_delay=1.0)
    def _search_with_retry(self, query: str, languages: str = "en"):
        """Search for subtitles with retry logic."""
        if not self.client:
            raise RuntimeError("OpenSubtitles client not initialized")

        import signal

        def timeout_handler(signum, frame):
            raise TimeoutError(
                f"Search operation timed out after {self.network_timeout}s"
            )

        # Set timeout for search operation (Unix-like systems only)
        if hasattr(signal, "SIGALRM"):
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(self.network_timeout)

        try:
            return self.client.search(query=query, languages=languages)
        finally:
            if hasattr(signal, "SIGALRM"):
                signal.alarm(0)  # Cancel the alarm

    @retry_with_backoff(max_retries=2, base_delay=0.5)
    def _download_with_retry(self, subtitle):
        """Download subtitle file with retry logic."""
        if not self.client:
            raise RuntimeError("OpenSubtitles client not initialized")

        import signal

        def timeout_handler(signum, frame):
            raise TimeoutError(
                f"Download operation timed out after {self.network_timeout}s"
            )

        # Set timeout for download operation (Unix-like systems only)
        if hasattr(signal, "SIGALRM"):
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(self.network_timeout)

        try:
            return self.client.download_and_save(subtitle)
        finally:
            if hasattr(signal, "SIGALRM"):
                signal.alarm(0)  # Cancel the alarm

    def get_subtitles(
        self, show_name: str, season: int, video_files: list[Path] = None
    ) -> list[SubtitleFile]:
        """Get subtitles for a show/season by downloading them."""
        if not self.client:
            logger.error("OpenSubtitles client not available")
            return []

        # We need video files to do specific searching usually, but if we just want to bulk match
        # we might want to search by query.
        # However, the engine usually passes a list of video files for the season.

        # If we have video files, we can try to find subs for them specifically?
        # Or just search for "Show Name S01" to get a bunch?
        # OpenSubtitles API allows searching by query "Show Name S01".

        logger.info(f"Searching OpenSubtitles for {show_name} S{season:02d}")

        # Prepare cache directory
        cache_dir = self.config.cache_dir / "data" / show_name
        cache_dir.mkdir(parents=True, exist_ok=True)

        downloaded_subtitles = []

        try:
            # Search by query with retry logic
            query = f"{show_name} S{season:02d}"
            response = self._search_with_retry(query)

            if not response.data:
                logger.warning(f"No subtitles found for query: {query}")
                return []

            logger.info(f"Found {len(response.data)} potential subtitles")

            # Limit downloads to a reasonable number or try to match specifically?
            # For now, let's download unique episodes for this season.

            seen_episodes = set()

            for subtitle in response.data:
                # Use API provided metadata first
                api_season = getattr(subtitle, "season_number", None)
                api_episode = getattr(subtitle, "episode_number", None)

                # Get filename from files list or top level
                sub_filename = subtitle.file_name
                if not sub_filename and subtitle.files:
                    # files is a list of dicts based on debug output
                    if isinstance(subtitle.files[0], dict):
                        sub_filename = subtitle.files[0].get("file_name", "")
                    else:
                        # Fallback if it somehow changes to object
                        sub_filename = getattr(subtitle.files[0], "file_name", "")

                # Check match
                if api_season and api_episode:
                    if api_season != season:
                        continue
                    ep_num = api_episode
                else:
                    # Fallback to parsing filename
                    info = parse_season_episode(sub_filename or "")
                    if not info or info.season != season:
                        continue
                    ep_num = info.episode

                if ep_num in seen_episodes:
                    continue

                # Download with retry
                try:
                    logger.info(f"Downloading subtitle for S{season:02d}E{ep_num:02d}")
                    srt_file = self._download_with_retry(subtitle)

                    # Move to cache
                    target_name = f"{show_name} - S{season:02d}E{ep_num:02d}.srt"
                    target_path = cache_dir / target_name

                    shutil.move(srt_file, target_path)

                    downloaded_subtitles.append(
                        SubtitleFile(
                            path=target_path,
                            language="en",
                            episode_info=EpisodeInfo(
                                series_name=show_name, season=season, episode=ep_num
                            ),
                        )
                    )
                    seen_episodes.add(ep_num)

                except Exception as e:
                    logger.error(f"Failed to download/save subtitle: {e}")

            return downloaded_subtitles

        except Exception as e:
            logger.error(f"OpenSubtitles search failed: {e}")
            return []


class CompositeSubtitleProvider(SubtitleProvider):
    def __init__(self, providers: list[SubtitleProvider]):
        self.providers = providers

    def get_subtitles(
        self, show_name: str, season: int, video_files: list[Path] = None
    ) -> list[SubtitleFile]:
        results = []

        # Try each provider in order, but prioritize cached results
        for i, provider in enumerate(self.providers):
            provider_results = provider.get_subtitles(show_name, season, video_files)

            # If this is the local provider and we have results, prefer them
            if isinstance(provider, LocalSubtitleProvider) and provider_results:
                logger.info(
                    f"Found {len(provider_results)} cached subtitles for {show_name} S{season:02d}"
                )
                results.extend(provider_results)
                # Return early if we have enough cached subtitles
                if (
                    len(provider_results) >= 3
                ):  # Arbitrary threshold for "enough" episodes
                    logger.info("Using cached subtitles, skipping download")
                    return results
            else:
                # For non-local providers, only use if we don't have cached results
                if not results:
                    logger.info(f"No cached subtitles found, trying provider {i + 1}")
                    results.extend(provider_results)
                else:
                    logger.info(
                        "Skipping additional providers since cached subtitles are available"
                    )
                    break

        return results
```
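The composite provider prefers the local cache and only falls back to the network provider when nothing is cached. The snippet below is a minimal sketch of how these classes could be wired together; it is not taken from core/engine.py (not shown in this hunk), the cache directory and show name are placeholders, and constructing `OpenSubtitlesProvider()` assumes a saved configuration with an OpenSubtitles API key.

```python
# Hypothetical wiring of the providers added above; illustration only.
from pathlib import Path

from mkv_episode_matcher.core.providers.subtitles import (
    CompositeSubtitleProvider,
    LocalSubtitleProvider,
    OpenSubtitlesProvider,
)

cache_dir = Path.home() / ".mkv-episode-matcher"  # assumed location, not from this diff

provider = CompositeSubtitleProvider(
    providers=[
        LocalSubtitleProvider(cache_dir),  # checked first: cached .srt files win
        OpenSubtitlesProvider(),           # network fallback with retry/backoff
    ]
)

subs = provider.get_subtitles("Some Show", season=1)
for sub in subs:
    print(sub.path, sub.episode_info.season, sub.episode_info.episode)
```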
mkv_episode_matcher/core/utils.py (new file)

@@ -0,0 +1,148 @@
```python
import re
from pathlib import Path

import chardet
from loguru import logger


def detect_file_encoding(file_path: Path) -> str:
    """Detect the encoding of a file using chardet."""
    try:
        with open(file_path, "rb") as f:
            raw_data = f.read(min(1024 * 1024, file_path.stat().st_size))
        result = chardet.detect(raw_data)
        encoding = result["encoding"]
        return encoding if encoding else "utf-8"
    except Exception as e:
        logger.warning(f"Error detecting encoding for {file_path}: {e}")
        return "utf-8"


def read_file_with_fallback(file_path: Path, encodings: list[str] | None = None) -> str:
    """Read a file trying multiple encodings."""
    if encodings is None:
        detected = detect_file_encoding(file_path)
        encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]

    errors = []
    for encoding in encodings:
        try:
            with open(file_path, encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError as e:
            errors.append(f"{encoding}: {str(e)}")
            continue

    raise ValueError(f"Failed to read {file_path} with any encoding. Errors: {errors}")


class SubtitleReader:
    """Helper class for reading and parsing subtitle files."""

    @staticmethod
    def parse_timestamp(timestamp: str) -> float:
        """Parse SRT timestamp into seconds."""
        hours, minutes, seconds = timestamp.replace(",", ".").split(":")
        return float(hours) * 3600 + float(minutes) * 60 + float(seconds)

    @staticmethod
    def read_srt_file(file_path: Path) -> str:
        return read_file_with_fallback(file_path)

    @staticmethod
    def extract_subtitle_chunk(
        content: str, start_time: float, end_time: float
    ) -> list[str]:
        """Extract subtitle text for a specific time window."""
        text_lines = []
        for block in content.strip().split("\n\n"):
            lines = block.split("\n")
            if len(lines) < 3 or "-->" not in lines[1]:
                continue
            try:
                timestamp = lines[1]
                time_parts = timestamp.split(" --> ")
                s_stamp = SubtitleReader.parse_timestamp(time_parts[0].strip())
                e_stamp = SubtitleReader.parse_timestamp(time_parts[1].strip())

                if e_stamp >= start_time and s_stamp <= end_time:
                    text_lines.append(" ".join(lines[2:]))
            except (IndexError, ValueError):
                continue
        return text_lines


def clean_text(text: str) -> str:
    """Clean and normalize text for matching."""
    text = text.lower().strip()
    text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
    text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
    return " ".join(text.split())


import subprocess


def get_video_duration(video_file: Path) -> float:
    """Get video duration using ffprobe."""
    try:
        result = subprocess.run(
            [
                "ffprobe",
                "-v",
                "error",
                "-show_entries",
                "format=duration",
                "-of",
                "default=noprint_wrappers=1:nokey=1",
                str(video_file),
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )

        if result.returncode != 0:
            raise RuntimeError(f"ffprobe error: {result.stderr}")

        return float(result.stdout.strip())
    except Exception as e:
        logger.error(f"Failed to get duration for {video_file}: {e}")
        return 0.0


def extract_audio_chunk(
    video_file: Path, start_time: float, duration: float, output_path: Path
) -> Path:
    """Extract audio chunk using ffmpeg."""
    cmd = [
        "ffmpeg",
        "-ss",
        str(start_time),
        "-t",
        str(duration),
        "-i",
        str(video_file),
        "-vn",
        "-sn",
        "-dn",
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        str(output_path),
    ]
    try:
        subprocess.run(cmd, capture_output=True, check=True, timeout=30)
        if not output_path.exists() or output_path.stat().st_size < 1024:
            raise RuntimeError("Output file too small or missing")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"FFmpeg failed: {e.stderr}")
        raise
    except Exception as e:
        logger.error(f"Extraction failed: {e}")
        raise
```