lattifai 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +9 -1
  3. lattifai/alignment/lattice1_aligner.py +175 -54
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +3 -2
  7. lattifai/alignment/text_align.py +441 -0
  8. lattifai/alignment/tokenizer.py +134 -65
  9. lattifai/audio2.py +162 -183
  10. lattifai/cli/__init__.py +2 -1
  11. lattifai/cli/alignment.py +5 -0
  12. lattifai/cli/caption.py +111 -4
  13. lattifai/cli/transcribe.py +2 -6
  14. lattifai/cli/youtube.py +7 -1
  15. lattifai/client.py +72 -123
  16. lattifai/config/__init__.py +28 -0
  17. lattifai/config/alignment.py +14 -0
  18. lattifai/config/caption.py +45 -31
  19. lattifai/config/client.py +16 -0
  20. lattifai/config/event.py +102 -0
  21. lattifai/config/media.py +20 -0
  22. lattifai/config/transcription.py +25 -1
  23. lattifai/data/__init__.py +8 -0
  24. lattifai/data/caption.py +228 -0
  25. lattifai/diarization/__init__.py +41 -1
  26. lattifai/errors.py +78 -53
  27. lattifai/event/__init__.py +65 -0
  28. lattifai/event/lattifai.py +166 -0
  29. lattifai/mixin.py +49 -32
  30. lattifai/transcription/base.py +8 -2
  31. lattifai/transcription/gemini.py +147 -16
  32. lattifai/transcription/lattifai.py +25 -63
  33. lattifai/types.py +1 -1
  34. lattifai/utils.py +7 -13
  35. lattifai/workflow/__init__.py +28 -4
  36. lattifai/workflow/file_manager.py +2 -5
  37. lattifai/youtube/__init__.py +43 -0
  38. lattifai/youtube/client.py +1265 -0
  39. lattifai/youtube/types.py +23 -0
  40. lattifai-1.3.0.dist-info/METADATA +678 -0
  41. lattifai-1.3.0.dist-info/RECORD +57 -0
  42. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +1 -2
  43. lattifai/__init__.py +0 -88
  44. lattifai/alignment/sentence_splitter.py +0 -219
  45. lattifai/caption/__init__.py +0 -20
  46. lattifai/caption/caption.py +0 -1467
  47. lattifai/caption/gemini_reader.py +0 -462
  48. lattifai/caption/gemini_writer.py +0 -173
  49. lattifai/caption/supervision.py +0 -34
  50. lattifai/caption/text_parser.py +0 -145
  51. lattifai/cli/app_installer.py +0 -142
  52. lattifai/cli/server.py +0 -44
  53. lattifai/server/app.py +0 -427
  54. lattifai/workflow/youtube.py +0 -577
  55. lattifai-1.2.1.dist-info/METADATA +0 -1134
  56. lattifai-1.2.1.dist-info/RECORD +0 -58
  57. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  58. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {lattifai-1.2.1.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1265 @@
1
+ """
2
+ YouTube client for metadata extraction and media download using yt-dlp
3
+ """
4
+
5
+ import asyncio
6
+ import logging
7
+ import os
8
+ import re
9
+ import tempfile
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional
12
+
13
+ try:
14
+ import yt_dlp
15
+ except ImportError:
16
+ yt_dlp = None
17
+
18
+ from lattifai.caption.config import CAPTION_FORMATS
19
+
20
+ from ..errors import LattifAIError
21
+ from ..workflow.base import setup_workflow_logger
22
+ from ..workflow.file_manager import TRANSCRIBE_CHOICE, FileExistenceManager
23
+ from .types import CaptionTrack, VideoMetadata
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class YouTubeError(LattifAIError):
    """Common parent for every error raised by the YouTube helpers in this module."""
32
+
33
+
34
class VideoUnavailableError(YouTubeError):
    """Raised when a video cannot be accessed (for example private or deleted)."""
38
+
39
+
40
class YoutubeLoader:
    """Lightweight YouTube metadata and caption content loader.

    Use this class when you need to:
    - Fetch video metadata quickly
    - Get caption content in memory (not save to disk)
    - Support proxy and cookies configuration
    """

    # Values of ``cookies`` that are interpreted as a browser name rather than a file path.
    _BROWSER_NAMES = ("chrome", "firefox", "safari", "edge", "opera", "brave")
    # URL fragments that identify an HLS manifest instead of a direct stream URL.
    _HLS_URL_MARKERS = ("manifest.googlevideo.com", "/hls_playlist/", ".m3u8")
    # Caption formats, most preferred first.
    _CAPTION_FORMAT_PRIORITY = ("json3", "srv3", "vtt", "ttml", "srv2", "srv1")
    # Seconds to wait for the caption HTTP request before giving up.
    _CAPTION_FETCH_TIMEOUT = 30

    def __init__(self, proxy: Optional[str] = None, cookies: Optional[str] = None):
        """Build the base yt-dlp options from the proxy and cookie configuration.

        Args:
            proxy: Proxy URL. Falls back to the ``YOUTUBE_PROXY`` environment variable.
            cookies: Browser name (see ``_BROWSER_NAMES``) or path to a cookie file.
                Falls back to ``YOUTUBE_COOKIE_FILE`` and then ``YOUTUBE_COOKIE_BROWSER``.

        Raises:
            ImportError: If yt-dlp is not installed.
        """
        if yt_dlp is None:
            raise ImportError("yt-dlp is required. Install with `pip install yt-dlp`")

        # Auto-load from environment if not specified
        if proxy is None:
            proxy = os.getenv("YOUTUBE_PROXY")
        if cookies is None:
            cookies = os.getenv("YOUTUBE_COOKIE_FILE") or os.getenv("YOUTUBE_COOKIE_BROWSER")

        self.proxy = proxy
        self.cookies = cookies

        # Base configuration for metadata extraction
        self._base_opts: Dict[str, Any] = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
            "extract_flat": False,  # Need full info for captions
            "youtube_include_dash_manifest": False,
            "youtube_include_hls_manifest": False,
        }

        if self.proxy:
            self._base_opts["proxy"] = self.proxy
            logger.info(f"🌐 Using proxy: {self.proxy}")

        # Cookie configuration
        if self.cookies:
            if self.cookies.lower() in self._BROWSER_NAMES:
                # Use cookies from browser directly
                self._base_opts["cookiesfrombrowser"] = (self.cookies.lower(),)
                logger.info(f"🍪 Using cookies from browser: {self.cookies}")
            else:
                # Use cookie file
                cookie_path = Path(self.cookies).expanduser()
                if cookie_path.exists():
                    self._base_opts["cookiefile"] = str(cookie_path)
                    logger.info(f"🍪 Using cookie file: {cookie_path}")
                else:
                    logger.warning(f"⚠️ Cookie file not found: {cookie_path}")
                    logger.warning("💡 Tip: Run 'yt-dlp --cookies-from-browser chrome' to extract cookies")

        # Note: player_client configuration is removed to avoid format availability issues
        # with certain videos. Let yt-dlp automatically select the best client.
        # Previous config caused "Requested format is not available" errors for some videos.

    def get_video_info(self, video_id: str) -> Dict[str, Any]:
        """Fetch basic video metadata and the list of available captions.

        Args:
            video_id: YouTube video ID.

        Returns:
            Dict with 'metadata' (VideoMetadata) and 'captions' (List[CaptionTrack]).

        Raises:
            VideoUnavailableError: On bot detection or a private video.
            YouTubeError: On any other failure.
        """
        url = f"https://www.youtube.com/watch?v={video_id}"
        opts = {
            **self._base_opts,
            "writesubtitles": True,
            "writeautomaticsub": True,
        }

        try:
            with yt_dlp.YoutubeDL(opts) as ydl:
                info = ydl.extract_info(url, download=False)

            metadata = VideoMetadata(
                video_id=info.get("id", video_id),
                title=info.get("title", "Unknown"),
                description=info.get("description", ""),
                # The key may be present with a None value; float(None) would raise.
                duration=float(info.get("duration") or 0),
                thumbnail_url=info.get("thumbnail", ""),
                channel_name=info.get("uploader", "Unknown"),
                view_count=info.get("view_count", 0),
                upload_date=info.get("upload_date"),
            )

            tracks = self._collect_tracks(info.get("subtitles"), "manual")
            tracks.extend(self._collect_tracks(info.get("automatic_captions"), "asr"))

            return {"metadata": metadata, "captions": tracks}

        except yt_dlp.utils.DownloadError as e:
            msg = str(e)
            if "Sign in to confirm" in msg or "not a bot" in msg:
                # Bot detection error - provide helpful guidance
                error_msg = (
                    f"🤖 YouTube Bot Detection: Video {video_id} requires authentication.\n\n"
                    "Solutions:\n"
                    "1. Use browser cookies (recommended):\n"
                    "   loader = YoutubeLoader(cookies='chrome')  # or 'firefox', 'safari'\n\n"
                    "2. Export cookie file:\n"
                    "   yt-dlp --cookies-from-browser chrome --cookies cookies.txt <video_url>\n"
                    "   loader = YoutubeLoader(cookies='cookies.txt')\n\n"
                    "3. Environment variable:\n"
                    "   export YOUTUBE_COOKIE_BROWSER=chrome\n\n"
                    f"Original error: {msg}"
                )
                raise VideoUnavailableError(error_msg) from e
            elif "Private video" in msg:
                raise VideoUnavailableError(f"Video {video_id} is private") from e
            raise YouTubeError(f"yt-dlp failed: {msg}") from e
        except Exception as e:
            raise YouTubeError(f"Unexpected error: {str(e)}") from e

    def _collect_tracks(self, tracks_by_lang: Optional[Dict[str, List[Dict]]], kind: str) -> List["CaptionTrack"]:
        """Flatten a ``{language: [format, ...]}`` mapping into CaptionTrack objects of the given kind."""
        collected = []
        for lang, formats in (tracks_by_lang or {}).items():
            formats = formats or []
            # The display name is the same for every format of one language.
            language_name = self._get_lang_name(formats)
            for fmt in formats:
                collected.append(
                    CaptionTrack(
                        language_code=lang,
                        language_name=language_name,
                        kind=kind,
                        ext=fmt.get("ext", ""),
                        url=fmt.get("url"),
                    )
                )
        return collected

    def get_caption(self, video_id: str, lang: str = "en") -> Dict[str, str]:
        """Fetch the transcript for a specific language.

        Manual subtitles are preferred; automatic captions are used when no manual
        track exists for ``lang``.

        Args:
            video_id: YouTube video ID.
            lang: Caption language code.

        Returns:
            Dict with 'content' (raw string) and 'fmt' (format extension).

        Raises:
            YouTubeError: If no caption is available or the download fails.
        """
        url = f"https://www.youtube.com/watch?v={video_id}"

        opts = {
            **self._base_opts,
            "writesubtitles": True,
            "writeautomaticsub": True,
            "subtitleslangs": [lang],
            "skip_download": True,
        }

        try:
            with yt_dlp.YoutubeDL(opts) as ydl:
                info = ydl.extract_info(url, download=False)

            # Look for the requested language in subtitles or automatic_captions
            subs = (info.get("subtitles") or {}).get(lang)
            if not subs:
                subs = (info.get("automatic_captions") or {}).get(lang)

            if not subs:
                raise YouTubeError(f"No captions found for language: {lang}")

            # Pick the best format (json3 > srv3 > vtt > ttml > srv2 > srv1)
            best_fmt = self._find_best_format(subs)
            if not best_fmt or not best_fmt.get("url"):
                raise YouTubeError("Could not find a download URL for captions")

            content = self._fetch_caption(best_fmt["url"])
            return {"content": content, "fmt": best_fmt.get("ext")}

        except YouTubeError:
            # Already a precise domain error; do not wrap it a second time.
            raise
        except Exception as e:
            raise YouTubeError(f"Failed to fetch transcript: {str(e)}") from e

    def _get_lang_name(self, formats: List[Dict]) -> str:
        """Return the 'name' of the first format entry, or "Unknown" when absent."""
        if formats and "name" in formats[0]:
            return formats[0]["name"]
        return "Unknown"

    def _find_best_format(self, formats: List[Dict]) -> Optional[Dict]:
        """Return the entry whose 'ext' ranks highest in the caption priority list.

        Falls back to the first entry when no extension is in the list, and to None
        for an empty list.
        """
        for wanted_ext in self._CAPTION_FORMAT_PRIORITY:
            for candidate in formats:
                if candidate.get("ext") == wanted_ext:
                    return candidate
        return formats[0] if formats else None

    def _fetch_caption(self, url: str) -> str:
        """Download the caption body from ``url`` (through the proxy when configured).

        Raises:
            YouTubeError: If the HTTP request fails or returns an error status.
        """
        import requests

        try:
            resp = requests.get(
                url,
                proxies={"https": self.proxy} if self.proxy else None,
                # Without a timeout a stalled connection would block forever.
                timeout=self._CAPTION_FETCH_TIMEOUT,
            )
            resp.raise_for_status()
            return resp.text
        except Exception as e:
            logger.error(f"Error fetching caption: {e}")
            raise YouTubeError("Failed to fetch caption content") from e

    @staticmethod
    def _is_direct_url(url: Optional[str]) -> bool:
        """Return True when ``url`` is non-empty and contains no HLS manifest marker."""
        if not url:
            return False
        return not any(marker in url for marker in YoutubeLoader._HLS_URL_MARKERS)

    def _select_audio_format(
        self,
        formats: List[Dict],
        format_preference: str,
        quality: str,
        audio_track_id: Optional[str],
    ) -> Dict:
        """Pick the format entry that ``get_audio_url`` should return.

        Candidate pools are tried in order: direct audio-only streams, direct streams
        with both audio and video, then anything with audio and a URL (HLS).

        Raises:
            YouTubeError: If no format carries audio.
        """

        def has_audio(f: Dict) -> bool:
            return f.get("acodec") not in (None, "none")

        def has_video(f: Dict) -> bool:
            return f.get("vcodec") not in (None, "none")

        def resolution(f: Dict) -> int:
            return f.get("height") or f.get("width") or 9999

        candidates = [
            f for f in formats if has_audio(f) and not has_video(f) and self._is_direct_url(f.get("url"))
        ]

        if not candidates:
            # Fallback: If no audio-only formats, use lowest resolution video with audio
            # This happens with HLS-only videos (e.g., protected content)
            logger.warning("No audio-only formats found. Falling back to lowest resolution video with audio.")
            candidates = [
                f for f in formats if has_audio(f) and has_video(f) and self._is_direct_url(f.get("url"))
            ]
            # NOTE(review): the final ranking below orders by ext/bitrate first, so this
            # resolution ordering only decides between equally scored formats - confirm intent.
            candidates.sort(key=resolution)

        if not candidates:
            # Check if there are HLS-only formats (common for Shorts)
            # HLS can still work with server-side streaming (same IP)
            candidates = [f for f in formats if has_audio(f) and f.get("url")]
            if not candidates:
                raise YouTubeError(
                    "No formats with audio available. YouTube may require authentication for this video."
                )
            logger.warning("Only HLS streams available. Returning HLS URL for server-side streaming.")
            # Sort: prefer audio-only, then by resolution (lowest first)
            candidates.sort(key=lambda f: (1 if has_video(f) else 0, resolution(f)))

        # Filter by audio_track_id if specified (for multi-language audio)
        if audio_track_id:
            language_prefix = audio_track_id.split(".")[0]  # e.g., "en" from "en.2"
            matching = [
                f
                for f in candidates
                # "audio_track" may be present with a None value, hence the `or {}`.
                if (f.get("audio_track") or {}).get("id") == audio_track_id
                or audio_track_id in (f.get("format_id") or "")
                or f.get("language") == language_prefix
            ]
            if matching:
                candidates = matching
                logger.info(f"Filtered to {len(candidates)} formats for audio_track_id={audio_track_id}")

        # "best" = highest bitrate, "medium" ~128kbps, "low" ~50kbps
        quality_tier = quality.lower()
        if quality_tier == "medium":
            max_bitrate = 160  # Allow up to 160kbps for "medium"
        elif quality_tier == "low":
            max_bitrate = 70  # Allow up to 70kbps for "low"
        elif quality_tier.isdigit():
            max_bitrate = int(quality_tier) + 20  # Allow some tolerance
        else:
            max_bitrate = float("inf")  # "best" and unknown values

        def score(f: Dict) -> tuple:
            ext = f.get("ext", "")
            ext_match = 2 if ext == format_preference else 0
            common_format = 1 if ext in ("m4a", "webm", "opus") else 0
            bitrate = f.get("abr") or f.get("tbr") or 0
            # Within the limit higher is better; above the limit is heavily penalised.
            quality_score = bitrate if bitrate <= max_bitrate else -1000
            return (ext_match, common_format, quality_score)

        # max() returns the first best-scoring entry, same as a stable descending sort + [0].
        return max(candidates, key=score)

    def get_audio_url(
        self,
        video_id: str,
        format_preference: str = "m4a",
        quality: str = "best",
        audio_track_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Get a direct audio-only stream URL for a YouTube video.

        Args:
            video_id: YouTube video ID
            format_preference: Preferred audio format (m4a, webm, opus)
            quality: Audio quality - "best" (highest bitrate), "medium" (~128kbps),
                "low" (~50kbps), or specific bitrate like "128", "64"
            audio_track_id: Specific audio track ID for multi-language videos (e.g., "en.2")

        Returns:
            Dict with url, mime_type, bitrate, sample_rate, content_length, format_id,
            ext and is_hls.

        Raises:
            YouTubeError: If extraction fails or no format with audio exists.
        """
        url = f"https://www.youtube.com/watch?v={video_id}"

        # Use base opts (includes proxy and cookie config) + DASH manifest
        opts = {
            **self._base_opts,
            "youtube_include_dash_manifest": True,
        }

        try:
            with yt_dlp.YoutubeDL(opts) as ydl:
                info = ydl.extract_info(url, download=False)

            best = self._select_audio_format(
                info.get("formats") or [], format_preference, quality, audio_track_id
            )
            best_url = best.get("url", "")

            return {
                "url": best_url,
                "mime_type": best.get("ext", format_preference),
                "bitrate": best.get("abr") or best.get("tbr"),
                "sample_rate": best.get("asr"),  # Audio sample rate
                "content_length": best.get("filesize") or best.get("filesize_approx"),
                "format_id": best.get("format_id"),
                "ext": best.get("ext"),
                # True = use server streaming, False = use proxy
                "is_hls": not self._is_direct_url(best_url),
            }

        except YouTubeError:
            # e.g. "No formats with audio available" - not an unexpected error.
            raise
        except yt_dlp.utils.DownloadError as e:
            msg = str(e)
            if "Sign in to confirm" in msg or "not a bot" in msg:
                raise YouTubeError(
                    f"🤖 YouTube Bot Detection: Cookie configuration required to access this video. "
                    f"Reference: YoutubeLoader(cookies='chrome') or set environment variable YOUTUBE_COOKIE_BROWSER=chrome. "
                    f"Original error: {msg}"
                ) from e
            raise YouTubeError(f"Failed to get audio URL: {msg}") from e
        except Exception as e:
            raise YouTubeError(f"Unexpected error getting audio URL: {str(e)}") from e

    def _select_video_format(self, formats: List[Dict], format_preference: str, quality: str) -> Dict:
        """Pick the format entry that ``get_video_url`` should return.

        Direct URLs are preferred; HLS entries are used only when no direct video
        format exists.

        Raises:
            YouTubeError: If no format has both a video codec and a URL.
        """
        usable = [f for f in formats if f.get("vcodec") not in (None, "none") and f.get("url")]

        # First try: direct URLs only (exclude HLS)
        candidates = [f for f in usable if self._is_direct_url(f.get("url", ""))]
        if not candidates:
            logger.warning("No direct video URLs found. Falling back to HLS formats.")
            candidates = usable

        if not candidates:
            raise YouTubeError("No video formats available")

        # Parse target height from quality parameter
        target_height = int(quality) if quality != "best" and quality.isdigit() else None

        def score(f: Dict) -> tuple:
            # has_audio ranks first: YouTube's high-quality streams are often video-only.
            has_audio = 10 if f.get("acodec") not in (None, "none") else 0
            ext_match = 1 if f.get("ext", "") == format_preference else 0
            height = f.get("height") or 0
            bitrate = f.get("tbr") or f.get("vbr") or 0
            height_score = height
            if target_height and height > target_height:
                height_score = -1000  # Heavily penalize exceeding target
            return (has_audio, ext_match, height_score, bitrate)

        # max() returns the first best-scoring entry, same as a stable descending sort + [0].
        return max(candidates, key=score)

    def get_video_url(self, video_id: str, format_preference: str = "mp4", quality: str = "best") -> Dict[str, Any]:
        """Get a direct video stream URL for a YouTube video.

        Args:
            video_id: YouTube video ID
            format_preference: Preferred video format (mp4, webm)
            quality: Video quality (best, 1080, 720, 480, 360)

        Returns:
            Dict with url, mime_type, width, height, fps, vcodec, acodec, bitrate,
            content_length, format_id, ext and is_hls.

        Raises:
            YouTubeError: If extraction fails or no video format exists.

        Note:
            Prioritizes formats that include both video AND audio to avoid silent videos.
            YouTube separates high-quality video and audio streams; we prefer pre-muxed formats.
        """
        url = f"https://www.youtube.com/watch?v={video_id}"

        # Use base opts (includes proxy and cookie config) + DASH and HLS manifests
        opts = {
            **self._base_opts,
            "youtube_include_dash_manifest": True,
            "youtube_include_hls_manifest": True,
        }

        try:
            with yt_dlp.YoutubeDL(opts) as ydl:
                info = ydl.extract_info(url, download=False)

            best = self._select_video_format(info.get("formats") or [], format_preference, quality)
            best_url = best.get("url", "")
            is_hls = not self._is_direct_url(best_url)

            # Log selection for debugging
            logger.info(
                f"Selected video format: {best.get('format_id')} "
                f"({best.get('width')}x{best.get('height')}, "
                f"vcodec={best.get('vcodec')}, acodec={best.get('acodec')}, is_hls={is_hls})"
            )

            return {
                "url": best_url,
                "mime_type": best.get("ext", format_preference),
                "width": best.get("width"),
                "height": best.get("height"),
                "fps": best.get("fps"),
                "vcodec": best.get("vcodec"),
                "acodec": best.get("acodec"),
                "bitrate": best.get("tbr") or best.get("vbr"),
                "content_length": best.get("filesize") or best.get("filesize_approx"),
                "format_id": best.get("format_id"),
                "ext": best.get("ext"),
                "is_hls": is_hls,
            }

        except YouTubeError:
            # e.g. "No video formats available" - not an unexpected error.
            raise
        except yt_dlp.utils.DownloadError as e:
            msg = str(e)
            if "Sign in to confirm" in msg or "not a bot" in msg:
                raise YouTubeError(
                    f"🤖 YouTube Bot Detection: Cookie configuration required to access this video. "
                    f"Reference: YoutubeLoader(cookies='chrome') or set environment variable YOUTUBE_COOKIE_BROWSER=chrome. "
                    f"Original error: {msg}"
                ) from e
            raise YouTubeError(f"Failed to get video URL: {msg}") from e
        except Exception as e:
            raise YouTubeError(f"Unexpected error getting video URL: {str(e)}") from e
536
+
537
+
538
+ class YouTubeDownloader:
539
+ """YouTube media and caption file downloader using yt-dlp
540
+
541
+ Use this class when you need to:
542
+ - Download audio/video files to disk
543
+ - Download caption files to disk
544
+ - Manage file existence and overwrite options
545
+ - Async download support
546
+ """
547
+
548
def __init__(self):
    """Require yt-dlp, create the "youtube" workflow logger and log the yt-dlp version.

    Raises:
        ImportError: If yt-dlp could not be imported.
    """
    if yt_dlp is None:
        raise ImportError("yt-dlp is required. Install with `pip install yt-dlp`")

    workflow_logger = setup_workflow_logger("youtube")
    self.logger = workflow_logger
    workflow_logger.info(f"yt-dlp version: {yt_dlp.version.__version__}")
554
+
555
+ def _normalize_audio_quality(self, quality: str) -> str:
556
+ """
557
+ Normalize quality parameter for audio downloads.
558
+
559
+ Handles cross-type quality values (e.g., video resolution used for audio).
560
+
561
+ Args:
562
+ quality: Raw quality string
563
+
564
+ Returns:
565
+ Normalized audio quality string
566
+ """
567
+ quality_lower = quality.lower()
568
+
569
+ # Direct audio quality values
570
+ if quality_lower in ("best", "medium", "low"):
571
+ return quality_lower
572
+
573
+ # Numeric values need interpretation
574
+ if quality_lower.isdigit():
575
+ value = int(quality_lower)
576
+ # Values > 320 are likely video resolutions, not audio bitrates
577
+ if value > 320:
578
+ self.logger.warning(f"⚠️ Quality '{quality}' looks like video resolution, using 'best' for audio")
579
+ return "best"
580
+ # Values <= 320 are reasonable audio bitrates
581
+ return quality_lower
582
+
583
+ # Unknown value, default to best
584
+ return "best"
585
+
586
+ def _normalize_video_quality(self, quality: str) -> str:
587
+ """
588
+ Normalize quality parameter for video downloads.
589
+
590
+ Handles cross-type quality values (e.g., audio bitrate/quality used for video).
591
+
592
+ Args:
593
+ quality: Raw quality string
594
+
595
+ Returns:
596
+ Normalized video quality string
597
+ """
598
+ quality_lower = quality.lower()
599
+
600
+ # Map audio quality terms to video equivalents
601
+ if quality_lower == "low":
602
+ self.logger.info("🎬 Mapping audio quality 'low' to video 360p")
603
+ return "360"
604
+ elif quality_lower == "medium":
605
+ self.logger.info("🎬 Mapping audio quality 'medium' to video 720p")
606
+ return "720"
607
+ elif quality_lower == "best":
608
+ return "best"
609
+
610
+ # Numeric values
611
+ if quality_lower.isdigit():
612
+ value = int(quality_lower)
613
+ # Values <= 320 are likely audio bitrates, not video resolutions
614
+ if value <= 320:
615
+ self.logger.warning(f"⚠️ Quality '{quality}' looks like audio bitrate, using 'best' for video")
616
+ return "best"
617
+ # Values > 320 are reasonable video resolutions
618
+ return quality_lower
619
+
620
+ # Unknown value, default to best
621
+ return "best"
622
+
623
+ def _build_audio_format_selector(self, audio_track_id: Optional[str], quality: str = "best") -> str:
624
+ """
625
+ Build yt-dlp format selector string for audio track and quality selection.
626
+
627
+ Args:
628
+ audio_track_id: Audio track selection:
629
+ - "original": Select the original audio track (format_id contains "drc")
630
+ - Language code (e.g., "en", "ja"): Select by language
631
+ - Format ID (e.g., "251-drc"): Select specific format
632
+ - None: No filtering
633
+ quality: Audio quality:
634
+ - "best": Highest bitrate (default)
635
+ - "medium": ~128 kbps
636
+ - "low": ~50 kbps
637
+ - Numeric string (e.g., "128"): Target bitrate in kbps
638
+
639
+ Returns:
640
+ yt-dlp format selector string
641
+ """
642
+ # Normalize quality for audio context
643
+ quality_lower = self._normalize_audio_quality(quality)
644
+
645
+ # Build quality filter
646
+ quality_filter = ""
647
+ if quality_lower == "medium":
648
+ quality_filter = "[abr<=160]"
649
+ self.logger.info("🎵 Audio quality: medium (~128 kbps)")
650
+ elif quality_lower == "low":
651
+ quality_filter = "[abr<=70]"
652
+ self.logger.info("🎵 Audio quality: low (~50 kbps)")
653
+ elif quality_lower.isdigit():
654
+ max_bitrate = int(quality_lower) + 20 # Allow some tolerance
655
+ quality_filter = f"[abr<={max_bitrate}]"
656
+ self.logger.info(f"🎵 Audio quality: ~{quality_lower} kbps")
657
+ # "best" = no filter, use bestaudio
658
+
659
+ # Build track filter
660
+ if audio_track_id is None:
661
+ return f"bestaudio{quality_filter}/bestaudio/best"
662
+
663
+ if audio_track_id.lower() == "original":
664
+ self.logger.info("🎵 Selecting original audio track (format_id contains 'drc')")
665
+ return f"bestaudio[format_id*=drc]{quality_filter}/bestaudio{quality_filter}/bestaudio/best"
666
+
667
+ # Check if it looks like a format_id (contains hyphen or is numeric)
668
+ if "-" in audio_track_id or audio_track_id.isdigit():
669
+ self.logger.info(f"🎵 Selecting audio by format_id: {audio_track_id}")
670
+ return f"bestaudio[format_id={audio_track_id}]{quality_filter}/bestaudio{quality_filter}/bestaudio/best"
671
+
672
+ # Assume it's a language code
673
+ self.logger.info(f"🎵 Selecting audio by language: {audio_track_id}")
674
+ return f"bestaudio[language^={audio_track_id}]{quality_filter}/bestaudio{quality_filter}/bestaudio/best"
675
+
676
+ def _build_video_format_selector(self, audio_format_selector: str, quality: str = "best") -> str:
677
+ """
678
+ Build yt-dlp format selector string for video with quality selection.
679
+
680
+ Args:
681
+ audio_format_selector: Audio format selector from _build_audio_format_selector
682
+ quality: Video quality:
683
+ - "best": Highest resolution (default)
684
+ - "low": 360p
685
+ - "medium": 720p
686
+ - "1080", "720", "480", "360": Target resolution
687
+
688
+ Returns:
689
+ yt-dlp format selector string
690
+ """
691
+ # Normalize quality for video context
692
+ quality_lower = self._normalize_video_quality(quality)
693
+
694
+ if quality_lower.isdigit():
695
+ height = int(quality_lower)
696
+ self.logger.info(f"🎬 Video quality: {height}p")
697
+ return f"bestvideo[height<={height}]+{audio_format_selector}/best[height<={height}]/best"
698
+
699
+ # "best" or fallback
700
+ return f"bestvideo*+{audio_format_selector}/best"
701
+
702
@staticmethod
def extract_video_id(url: str) -> str:
    """Extract the 11-character video ID from a YouTube URL.

    Supports various YouTube URL formats:
    - https://www.youtube.com/watch?v=VIDEO_ID (``v`` may appear anywhere in the query)
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/shorts/VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID
    - https://www.youtube.com/v/VIDEO_ID
    - https://www.youtube.com/live/VIDEO_ID
    - https://m.youtube.com/watch?v=VIDEO_ID

    Args:
        url: URL (or any text containing one) to search.

    Returns:
        Video ID (e.g., 'cprOj8PWepY'), or the placeholder "youtube_media" when no
        supported pattern matches.
    """
    video_id = r"([a-zA-Z0-9_-]{11})"
    patterns = [
        # "v" is not always the first query parameter (e.g. ?feature=share&v=...).
        r"youtube\.com/watch\?(?:[^#\s]*&)?v=" + video_id,
        r"youtu\.be/" + video_id,
        r"youtube\.com/(?:shorts|embed|v|live)/" + video_id,
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return "youtube_media"
727
+
728
async def get_video_info(self, url: str) -> Dict[str, Any]:
    """Get video metadata without downloading.

    The blocking yt-dlp extraction runs in the default executor so the event loop
    is not blocked.

    Args:
        url: Video URL passed to yt-dlp.

    Returns:
        Dict with title, duration, uploader, upload_date, view_count, description,
        thumbnail and webpage_url.

    Raises:
        RuntimeError: If yt-dlp fails or the metadata cannot be processed; the
            original exception is attached as the cause.
    """
    self.logger.info(f"🔍 Extracting video info for: {url}")

    opts = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
    }

    def _extract_info():
        with yt_dlp.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=False)

    try:
        # Inside a coroutine there is always a running loop; get_running_loop()
        # is the non-deprecated way to obtain it.
        loop = asyncio.get_running_loop()
        metadata = await loop.run_in_executor(None, _extract_info)

        # Extract relevant info
        info = {
            "title": metadata.get("title", "Unknown"),
            "duration": metadata.get("duration", 0),
            "uploader": metadata.get("uploader", "Unknown"),
            "upload_date": metadata.get("upload_date", "Unknown"),
            "view_count": metadata.get("view_count", 0),
            "description": metadata.get("description", ""),
            "thumbnail": metadata.get("thumbnail", ""),
            "webpage_url": metadata.get("webpage_url", url),
        }

        self.logger.info(f'✅ Video info extracted: {info["title"]}')
        return info

    except yt_dlp.utils.DownloadError as e:
        self.logger.error(f"Failed to extract video info: {str(e)}")
        raise RuntimeError(f"Failed to extract video info: {str(e)}") from e
    except Exception as e:
        self.logger.error(f"Failed to parse video metadata: {str(e)}")
        raise RuntimeError(f"Failed to parse video metadata: {str(e)}") from e
769
+
770
async def download_media(
    self,
    url: str,
    output_dir: Optional[str] = None,
    media_format: Optional[str] = None,
    force_overwrite: bool = False,
    audio_track_id: Optional[str] = "original",
    quality: str = "best",
) -> str:
    """Download audio or video from a YouTube URL, chosen by the format extension.

    Dispatches to ``download_audio`` when ``media_format`` (case-insensitive) is a
    known audio extension and to ``download_video`` otherwise.

    Args:
        url: YouTube URL
        output_dir: Output directory, forwarded unchanged
        media_format: Audio (mp3, wav, m4a, aac, opus, ogg, flac, aiff) or video
            (mp4, webm, mkv, avi, mov, etc.) extension (default: mp3)
        force_overwrite: Skip user confirmation and overwrite existing files
        audio_track_id: Audio track selection for multi-language videos:
            - "original": Select the original audio track (default)
            - Language code (e.g., "en", "ja"): Select by language
            - Format ID (e.g., "251-drc"): Select specific format
            - None: No filtering, use yt-dlp default
        quality: Media quality selection:
            For audio: "best", "medium", "low", or bitrate like "128"
            For video: "best", "1080", "720", "480", "360"

    Returns:
        Path to downloaded media file
    """
    chosen_format = media_format or "mp3"
    audio_extensions = ("mp3", "wav", "m4a", "aac", "opus", "ogg", "flac", "aiff")

    # Arguments that both download paths receive unchanged.
    shared_kwargs = {
        "url": url,
        "output_dir": output_dir,
        "force_overwrite": force_overwrite,
        "audio_track_id": audio_track_id,
        "quality": quality,
    }

    if chosen_format.lower() not in audio_extensions:
        self.logger.info(f"🎬 Detected video format: {chosen_format}")
        return await self.download_video(video_format=chosen_format, **shared_kwargs)

    self.logger.info(f"🎵 Detected audio format: {chosen_format}")
    return await self.download_audio(media_format=chosen_format, **shared_kwargs)
829
+
830
async def _download_media_internal(
    self,
    url: str,
    output_dir: str,
    media_format: str,
    is_audio: bool,
    force_overwrite: bool = False,
    audio_track_id: Optional[str] = "original",
    quality: str = "best",
) -> str:
    """
    Internal unified method for downloading audio or video from YouTube

    Args:
        url: YouTube URL
        output_dir: Output directory
        media_format: Media format (audio or video extension)
        is_audio: True for audio download, False for video download
        force_overwrite: Skip user confirmation and overwrite existing files
        audio_track_id: Audio track selection for multi-language videos:
            - "original": Select the original audio track (default)
            - Language code (e.g., "en", "ja"): Select by language
            - Format ID (e.g., "251-drc"): Select specific format
            - None: No filtering, use yt-dlp default
        quality: Media quality selection:
            For audio: "best", "medium", "low", or bitrate like "128"
            For video: "best", "1080", "720", "480", "360"

    Returns:
        Path to downloaded media file

    Raises:
        RuntimeError: If the user cancels at the confirmation prompt, if the
            download fails, or if no output file can be located afterwards.
    """
    target_dir = Path(output_dir).expanduser()
    media_type = "audio" if is_audio else "video"
    emoji = "🎵" if is_audio else "🎬"

    self.logger.info(f"{emoji} Downloading {media_type} from: {url}")
    self.logger.info(f"📁 Output directory: {target_dir}")
    self.logger.info(f'{"🎶" if is_audio else "🎥"} Media format: {media_format}')

    # Create output directory if it doesn't exist
    target_dir.mkdir(parents=True, exist_ok=True)

    # Extract video ID and check for existing files
    video_id = self.extract_video_id(url)
    existing_files = FileExistenceManager.check_existing_files(video_id, str(target_dir), [media_format])
    existing_media = existing_files["media"]

    # Reuse an existing file unless the caller (or the user) asks to overwrite
    if existing_media and not force_overwrite:
        if not FileExistenceManager.is_interactive_mode():
            # Non-interactive mode: use existing file
            self.logger.info(f"✅ Using existing media file: {existing_media[0]}")
            return existing_media[0]

        user_choice = FileExistenceManager.prompt_user_confirmation({"media": existing_media}, "media download")
        if user_choice == "cancel":
            raise RuntimeError("Media download cancelled by user")
        if user_choice != "overwrite":
            if user_choice in existing_media:
                # User selected a specific file
                return user_choice
            # Fallback: use first file
            self.logger.info(f"✅ Using existing media file: {existing_media[0]}")
            return existing_media[0]
        # "overwrite": fall through and download again

    # Build format selector with audio track and quality filtering
    audio_format_selector = self._build_audio_format_selector(audio_track_id, quality)

    # Options shared by both media types
    opts = {
        "outtmpl": str(target_dir / f"{video_id}.%(ext)s"),
        "noplaylist": True,
        "quiet": False,
        "no_warnings": True,
    }
    if is_audio:
        opts["format"] = audio_format_selector
        opts["postprocessors"] = [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": media_format,
                "preferredquality": "0",  # Best quality for conversion
            }
        ]
    else:
        # For video, combine video with selected audio track
        opts["format"] = self._build_video_format_selector(audio_format_selector, quality)
        opts["merge_output_format"] = media_format

    def _download() -> None:
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])

    try:
        # yt-dlp is blocking: run it in the default executor so the event loop stays free
        await asyncio.get_running_loop().run_in_executor(None, _download)

        self.logger.info(f"✅ {media_type.capitalize()} download completed")

        # Check for expected file format
        expected_file = target_dir / f"{video_id}.{media_format}"
        if expected_file.exists():
            self.logger.info(f"{emoji} Downloaded {media_type}: {expected_file}")
            return str(expected_file)

        # Fallback: search for media files with this video_id. The requested
        # format is tried first; dict.fromkeys drops it if it repeats below.
        common_extensions = ["mp3", "wav", "m4a", "aac"] if is_audio else ["mp4", "webm", "mkv"]
        for ext in dict.fromkeys([media_format, *common_extensions]):
            files = list(target_dir.glob(f"{video_id}*.{ext}"))
            if files:
                latest_file = max(files, key=os.path.getctime)
                self.logger.info(f"{emoji} Found {media_type} file: {latest_file}")
                return str(latest_file)

        raise RuntimeError(f"Downloaded {media_type} file not found")

    except Exception as e:
        # Covers yt_dlp.utils.DownloadError as well as any other failure; the
        # original cause stays attached through explicit chaining.
        self.logger.error(f"Failed to download {media_type}: {e}")
        raise RuntimeError(f"Failed to download {media_type}: {e}") from e
973
+
974
async def download_audio(
    self,
    url: str,
    output_dir: Optional[str] = None,
    media_format: Optional[str] = None,
    force_overwrite: bool = False,
    audio_track_id: Optional[str] = "original",
    quality: str = "best",
) -> str:
    """
    Fetch the audio of a YouTube video.

    Thin wrapper that fills in defaults and delegates to
    ``_download_media_internal`` with ``is_audio=True``.

    Args:
        url: YouTube URL
        output_dir: Destination directory; the system temp directory is used when omitted
        media_format: Audio format; "mp3" is used when omitted
        force_overwrite: Skip user confirmation and overwrite existing files
        audio_track_id: Audio track selection for multi-language videos
        quality: Audio quality ("best", "medium", "low", or bitrate like "128")

    Returns:
        Path to downloaded audio file
    """
    destination = output_dir if output_dir else tempfile.gettempdir()
    audio_ext = media_format if media_format else "mp3"

    audio_path = await self._download_media_internal(
        url,
        destination,
        audio_ext,
        is_audio=True,
        force_overwrite=force_overwrite,
        audio_track_id=audio_track_id,
        quality=quality,
    )
    return audio_path
1008
+
1009
async def download_video(
    self,
    url: str,
    output_dir: Optional[str] = None,
    video_format: str = "mp4",
    force_overwrite: bool = False,
    audio_track_id: Optional[str] = "original",
    quality: str = "best",
) -> str:
    """
    Fetch a YouTube video.

    Thin wrapper that fills in the default output directory and delegates to
    ``_download_media_internal`` with ``is_audio=False``.

    Args:
        url: YouTube URL
        output_dir: Destination directory; the system temp directory is used when omitted
        video_format: Video format
        force_overwrite: Skip user confirmation and overwrite existing files
        audio_track_id: Audio track selection for multi-language videos
        quality: Video quality ("best", "1080", "720", "480", "360")

    Returns:
        Path to downloaded video file
    """
    destination = output_dir if output_dir else tempfile.gettempdir()

    video_path = await self._download_media_internal(
        url,
        destination,
        video_format,
        is_audio=False,
        force_overwrite=force_overwrite,
        audio_track_id=audio_track_id,
        quality=quality,
    )
    return video_path
1042
+
1043
async def download_captions(
    self,
    url: str,
    output_dir: str,
    force_overwrite: bool = False,
    source_lang: Optional[str] = None,
    transcriber_name: Optional[str] = None,
) -> Optional[str]:
    """
    Download video captions using yt-dlp

    Existing caption files for the video are reused (after asking the user
    when running interactively) unless ``force_overwrite`` is set. Download
    errors are logged rather than raised; whatever caption files are on disk
    afterwards are then offered or returned.

    Args:
        url: YouTube URL
        output_dir: Output directory
        force_overwrite: Skip user confirmation and overwrite existing files
        source_lang: Specific caption language/track to download (e.g., 'en')
            If None, downloads all available captions
        transcriber_name: Name of the transcriber (for user prompts)

    Returns:
        Path to the caption file, ``TRANSCRIBE_CHOICE`` when the user picks
        transcription at a prompt, or None if no caption file is available

    Raises:
        RuntimeError: If the user cancels at one of the prompts.
    """
    target_dir = Path(output_dir).expanduser()

    # Create output directory if it doesn't exist
    target_dir.mkdir(parents=True, exist_ok=True)

    video_id = self.extract_video_id(url)

    # Existing files only matter when we are not forced to overwrite; keeping
    # the lookup and its use in one branch avoids reading an unbound name.
    if not force_overwrite:
        existing_files = FileExistenceManager.check_existing_files(
            video_id, str(target_dir), caption_formats=CAPTION_FORMATS
        )
        existing_captions = existing_files["caption"]

        if existing_captions:
            if not FileExistenceManager.is_interactive_mode():
                caption_file = Path(existing_captions[0])
                self.logger.info(f"🔍 Found existing caption: {caption_file}")
                return str(caption_file)

            user_choice = FileExistenceManager.prompt_user_confirmation(
                {"caption": existing_captions}, "caption download", transcriber_name=transcriber_name
            )

            if user_choice == "cancel":
                raise RuntimeError("Caption download cancelled by user")
            elif user_choice == "overwrite":
                # Continue with download
                pass
            elif user_choice == TRANSCRIBE_CHOICE:
                return TRANSCRIBE_CHOICE
            elif user_choice in existing_captions:
                # User selected a specific file
                caption_file = Path(user_choice)
                self.logger.info(f"✅ Using selected caption file: {caption_file}")
                return str(caption_file)
            else:
                # Fallback: use first file
                caption_file = Path(existing_captions[0])
                self.logger.info(f"✅ Using existing caption file: {caption_file}")
                return str(caption_file)

    self.logger.info(f"📥 Downloading caption for: {url}")
    if source_lang:
        self.logger.info(f"🎯 Targeting specific caption track: {source_lang}")

    # Configure yt-dlp options for caption download
    opts = {
        "skip_download": True,  # Don't download video/audio
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitlesformat": "best",
        "outtmpl": str(target_dir / f"{video_id}.%(ext)s"),
        "quiet": False,
        "no_warnings": True,
    }

    # Add caption language selection if specified
    if source_lang:
        opts["subtitleslangs"] = [f"{source_lang}*"]

    def _download_subs() -> None:
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])

    try:
        # yt-dlp is blocking: run it in the default executor
        await asyncio.get_running_loop().run_in_executor(None, _download_subs)
    except yt_dlp.utils.DownloadError as e:
        error_msg = str(e)

        # Errors are logged, not raised: the directory is still scanned below.
        if "No automatic or manual captions found" in error_msg:
            self.logger.warning("No captions available for this video")
        elif "HTTP Error 429" in error_msg or "Too Many Requests" in error_msg:
            # Reported once (this condition used to be logged twice)
            self.logger.error(
                "YouTube rate limit exceeded (HTTP 429). "
                "Try again later or use --cookies option with authenticated cookies. "
                "See: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"
            )
        else:
            self.logger.error(f"Failed to download transcript: {error_msg}")
    except Exception as e:
        self.logger.error(f"Failed to download transcript: {e}")

    # Find the downloaded transcript file(s); the "*" absorbs language tags
    # such as "<id>.en.vtt".
    caption_files = []
    for suffix in ("vtt", "srt", "sub", "sbv", "ssa", "ass"):
        matches = list(target_dir.glob(f"{video_id}.*{suffix}"))
        for caption_file in matches:
            self.logger.info(f"📥 Downloaded caption: {caption_file}")
        caption_files.extend(matches)

    # If only one caption file, return it directly
    if len(caption_files) == 1:
        self.logger.info(f"✅ Using caption: {caption_files[0]}")
        return str(caption_files[0])

    if FileExistenceManager.is_interactive_mode():
        # NOTE(review): the prompt is also shown when no file was found —
        # presumably so the user can still pick transcription; confirm.
        self.logger.info(f"📋 Found {len(caption_files)} caption files")
        caption_choice = FileExistenceManager.prompt_file_selection(
            file_type="caption",
            files=[str(f) for f in caption_files],
            operation="use",
            transcriber_name=transcriber_name,
        )

        if caption_choice == "cancel":
            raise RuntimeError("Caption selection cancelled by user")
        if caption_choice == TRANSCRIBE_CHOICE:
            return caption_choice
        if caption_choice:
            self.logger.info(f"✅ Selected caption: {caption_choice}")
            return caption_choice

    if caption_files:
        # No explicit selection (or non-interactive mode): use first file
        self.logger.info(f"✅ Using first caption: {caption_files[0]}")
        return str(caption_files[0])

    self.logger.warning("No caption files available after download")
    return None
1207
+
1208
async def list_available_captions(self, url: str) -> List[Dict[str, Any]]:
    """
    List all available caption tracks for a YouTube video

    Args:
        url: YouTube URL

    Returns:
        List of caption track information dictionaries. Each entry has the
        keys ``language``, ``name``, ``formats`` (list of extensions) and
        ``kind`` (``"manual"`` for uploaded subtitles, ``"asr"`` for
        automatic captions). Manual tracks are listed first.

    Raises:
        RuntimeError: If the metadata cannot be fetched or parsed.
    """
    self.logger.info(f"📋 Listing available captions for: {url}")

    opts = {
        "skip_download": True,
        "listsubtitles": True,
        "quiet": True,
        "no_warnings": True,
    }

    def _get_info():
        with yt_dlp.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=False)

    try:
        # yt-dlp is blocking: run it in the default executor
        info = await asyncio.get_running_loop().run_in_executor(None, _get_info)

        caption_info = []
        # Manual and automatic tracks share one shape; only the source key
        # and the reported kind differ.
        for info_key, kind in (("subtitles", "manual"), ("automatic_captions", "asr")):
            # `or {}` also covers a key that is present but set to None
            tracks = info.get(info_key) or {}
            for lang, formats in tracks.items():
                if not formats:
                    continue
                caption_info.append(
                    {
                        "language": lang,
                        "name": formats[0].get("name", lang),
                        "formats": [f.get("ext", "") for f in formats],
                        "kind": kind,
                    }
                )

        self.logger.info(f"✅ Found {len(caption_info)} caption tracks")
        return caption_info

    except Exception as e:
        # Covers yt_dlp.utils.DownloadError and parsing failures alike
        self.logger.error(f"Failed to list captions: {e}")
        raise RuntimeError(f"Failed to list captions: {e}") from e