lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64):
  1. lattifai/__init__.py +0 -24
  2. lattifai/alignment/__init__.py +10 -1
  3. lattifai/alignment/lattice1_aligner.py +66 -58
  4. lattifai/alignment/lattice1_worker.py +1 -6
  5. lattifai/alignment/punctuation.py +38 -0
  6. lattifai/alignment/segmenter.py +1 -1
  7. lattifai/alignment/sentence_splitter.py +350 -0
  8. lattifai/alignment/text_align.py +440 -0
  9. lattifai/alignment/tokenizer.py +91 -220
  10. lattifai/caption/__init__.py +82 -6
  11. lattifai/caption/caption.py +335 -1143
  12. lattifai/caption/formats/__init__.py +199 -0
  13. lattifai/caption/formats/base.py +211 -0
  14. lattifai/caption/formats/gemini.py +722 -0
  15. lattifai/caption/formats/json.py +194 -0
  16. lattifai/caption/formats/lrc.py +309 -0
  17. lattifai/caption/formats/nle/__init__.py +9 -0
  18. lattifai/caption/formats/nle/audition.py +561 -0
  19. lattifai/caption/formats/nle/avid.py +423 -0
  20. lattifai/caption/formats/nle/fcpxml.py +549 -0
  21. lattifai/caption/formats/nle/premiere.py +589 -0
  22. lattifai/caption/formats/pysubs2.py +642 -0
  23. lattifai/caption/formats/sbv.py +147 -0
  24. lattifai/caption/formats/tabular.py +338 -0
  25. lattifai/caption/formats/textgrid.py +193 -0
  26. lattifai/caption/formats/ttml.py +652 -0
  27. lattifai/caption/formats/vtt.py +469 -0
  28. lattifai/caption/parsers/__init__.py +9 -0
  29. lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
  30. lattifai/caption/standardize.py +636 -0
  31. lattifai/caption/utils.py +474 -0
  32. lattifai/cli/__init__.py +2 -1
  33. lattifai/cli/caption.py +108 -1
  34. lattifai/cli/transcribe.py +4 -9
  35. lattifai/cli/youtube.py +4 -1
  36. lattifai/client.py +48 -84
  37. lattifai/config/__init__.py +11 -1
  38. lattifai/config/alignment.py +9 -2
  39. lattifai/config/caption.py +267 -23
  40. lattifai/config/media.py +20 -0
  41. lattifai/diarization/__init__.py +41 -1
  42. lattifai/mixin.py +36 -18
  43. lattifai/transcription/base.py +6 -1
  44. lattifai/transcription/lattifai.py +19 -54
  45. lattifai/utils.py +81 -13
  46. lattifai/workflow/__init__.py +28 -4
  47. lattifai/workflow/file_manager.py +2 -5
  48. lattifai/youtube/__init__.py +43 -0
  49. lattifai/youtube/client.py +1170 -0
  50. lattifai/youtube/types.py +23 -0
  51. lattifai-1.2.2.dist-info/METADATA +615 -0
  52. lattifai-1.2.2.dist-info/RECORD +76 -0
  53. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
  54. lattifai/caption/gemini_reader.py +0 -371
  55. lattifai/caption/gemini_writer.py +0 -173
  56. lattifai/cli/app_installer.py +0 -142
  57. lattifai/cli/server.py +0 -44
  58. lattifai/server/app.py +0 -427
  59. lattifai/workflow/youtube.py +0 -577
  60. lattifai-1.2.0.dist-info/METADATA +0 -1133
  61. lattifai-1.2.0.dist-info/RECORD +0 -57
  62. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
  63. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
  64. {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
@@ -1,577 +0,0 @@
1
- """
2
- YouTube downloader module using yt-dlp and Agent
3
- """
4
-
5
- import asyncio
6
- import os
7
- import re
8
- import subprocess
9
- import tempfile
10
- from pathlib import Path
11
- from typing import Any, Dict, List, Optional
12
-
13
- from ..config.caption import CAPTION_FORMATS
14
- from .base import setup_workflow_logger
15
- from .file_manager import TRANSCRIBE_CHOICE, FileExistenceManager
16
-
17
-
18
class YouTubeDownloader:
    """YouTube video/audio/caption downloader built on the ``yt-dlp`` command line tool.

    Configuration (in __init__):
        - media_format: optional default format used by ``download_media`` /
          ``download_audio`` when the caller does not pass one.

    Runtime parameters (in methods):
        - url: YouTube URL to download
        - output_dir: Where to save files
        - media_format: Format to download (mp3, mp4, etc.)
        - force_overwrite: Whether to overwrite existing files
    """

    # Extensions handled as audio by download_media(); every other format is treated as video.
    AUDIO_FORMATS = ("mp3", "wav", "m4a", "aac", "opus", "ogg", "flac", "aiff")

    # Patterns tried in order by extract_video_id(); group 1 captures the 11-character video ID.
    # The optional "(?:[^#\s]*?&)?" lets "v=" appear anywhere in the query string, not only first.
    _VIDEO_ID_PATTERNS = (
        r"(?:youtube\.com/watch\?(?:[^#\s]*?&)?v=|youtu\.be/|youtube\.com/shorts/)([a-zA-Z0-9_-]{11})",
        r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
        r"youtube\.com/v/([a-zA-Z0-9_-]{11})",
    )

    # Caption file extensions searched for after a caption download.
    _CAPTION_EXTENSIONS = ("vtt", "srt", "sub", "sbv", "ssa", "ass")

    def __init__(self, media_format: Optional[str] = None):
        """Create the downloader and verify that ``yt-dlp`` can be executed.

        Args:
            media_format: Default media format for calls that omit one. May be None,
                in which case those calls must pass ``media_format`` explicitly.

        Raises:
            RuntimeError: If ``yt-dlp`` is not installed or not on PATH.
        """
        self.logger = setup_workflow_logger("youtube")
        self.media_format = media_format
        # Check if yt-dlp is available
        self._check_ytdlp()

    @staticmethod
    def extract_video_id(url: str) -> str:
        """
        Extract video ID from YouTube URL

        Supports various YouTube URL formats:
        - https://www.youtube.com/watch?v=VIDEO_ID (``v`` may be any query parameter)
        - https://youtu.be/VIDEO_ID
        - https://www.youtube.com/shorts/VIDEO_ID
        - https://m.youtube.com/watch?v=VIDEO_ID
        - https://www.youtube.com/embed/VIDEO_ID
        - https://www.youtube.com/v/VIDEO_ID

        Returns:
            Video ID (e.g., 'cprOj8PWepY'), or the placeholder 'youtube_media'
            when no known pattern matches.
        """
        for pattern in YouTubeDownloader._VIDEO_ID_PATTERNS:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return "youtube_media"

    def _check_ytdlp(self):
        """Check if yt-dlp is installed; raise RuntimeError when it cannot be run."""
        try:
            result = subprocess.run(["yt-dlp", "--version"], capture_output=True, text=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as err:
            raise RuntimeError(
                "yt-dlp is not installed or not found in PATH. Please install it with: pip install yt-dlp"
            ) from err
        self.logger.info(f"yt-dlp version: {result.stdout.strip()}")

    def _resolve_media_format(self, media_format: Optional[str]) -> str:
        """Return the explicit format, else the instance default; raise ValueError if neither is set."""
        resolved = media_format or getattr(self, "media_format", None)
        if not resolved:
            raise ValueError(
                "media_format is required: pass it to the call or set a default on the downloader instance"
            )
        return resolved

    async def _run_ytdlp(self, cmd: List[str]) -> "subprocess.CompletedProcess":
        """Run a yt-dlp command in the default executor so the event loop is not blocked.

        Raises:
            subprocess.CalledProcessError: If the command exits with a non-zero status.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, lambda: subprocess.run(cmd, capture_output=True, text=True, check=True)
        )

    @staticmethod
    def _locate_reported_file(output: str, target_dir: Path) -> Optional[Path]:
        """Find the most recently reported output file in yt-dlp's log text that exists on disk.

        Looks at "Destination: <path>" and "<path> has already been downloaded" lines,
        newest first. Intermediate files that were removed after conversion are skipped
        because only existing files are returned.
        """
        for line in reversed(output.splitlines()):
            if "Destination:" in line:
                reported = line.split("Destination:", 1)[1].strip()
            elif "has already been downloaded" in line:
                reported = line.split("has already been downloaded", 1)[0]
                # Drop the leading "[download]"-style tag that precedes the path.
                reported = re.sub(r"^\s*\[[^\]]*\]\s*", "", reported).strip()
            else:
                continue
            if not reported:
                continue
            reported_path = Path(reported)
            # The path may be printed as given in the template or relative to another cwd.
            for candidate in (reported_path, target_dir / reported_path.name):
                if candidate.is_file():
                    return candidate
        return None

    @staticmethod
    def _parse_caption_row(line: str) -> Optional[Dict[str, Any]]:
        """Parse one row of the ``--list-subs`` table into language / name / formats.

        The first token is the language code, the trailing comma-separated word list is
        the formats, and whatever sits in between is the track name (possibly empty).
        Returns None for rows that do not have that shape.
        """
        match = re.match(r"^(\S+)\s+(.*?)\s*((?:\w+,\s*)*\w+)$", line.strip())
        if not match:
            return None
        language, name, formats_str = match.groups()
        formats = [fmt.strip() for fmt in formats_str.split(",") if fmt.strip()]
        return {"language": language, "name": " ".join(name.split()), "formats": formats}

    async def get_video_info(self, url: str) -> Dict[str, Any]:
        """Get video metadata without downloading

        Returns:
            Dict with title, duration, uploader, upload_date, view_count,
            description, thumbnail and webpage_url (defaults fill missing fields).

        Raises:
            RuntimeError: If yt-dlp fails or its output is not valid JSON.
        """
        # Imported before the try block so the except clause below can always resolve the name.
        import json

        self.logger.info(f"🔍 Extracting video info for: {url}")

        cmd = ["yt-dlp", "--dump-json", "--no-download", url]

        try:
            result = await self._run_ytdlp(cmd)
            metadata = json.loads(result.stdout)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Failed to extract video info: {e.stderr}")
            raise RuntimeError(f"Failed to extract video info: {e.stderr}") from e
        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse video metadata: {e}")
            raise RuntimeError(f"Failed to parse video metadata: {e}") from e

        # Extract relevant info
        info = {
            "title": metadata.get("title", "Unknown"),
            "duration": metadata.get("duration", 0),
            "uploader": metadata.get("uploader", "Unknown"),
            "upload_date": metadata.get("upload_date", "Unknown"),
            "view_count": metadata.get("view_count", 0),
            "description": metadata.get("description", ""),
            "thumbnail": metadata.get("thumbnail", ""),
            "webpage_url": metadata.get("webpage_url", url),
        }

        self.logger.info(f'✅ Video info extracted: {info["title"]}')
        return info

    async def download_media(
        self,
        url: str,
        output_dir: Optional[str] = None,
        media_format: Optional[str] = None,
        force_overwrite: bool = False,
    ) -> str:
        """
        Download media (audio or video) from YouTube URL based on format

        This is a unified method that automatically selects between audio and video
        download based on the media format extension.

        Args:
            url: YouTube URL
            output_dir: Output directory (default: temp directory)
            media_format: Media format - audio (mp3, wav, m4a, aac, opus, ogg, flac, aiff)
                or video (mp4, webm, mkv, avi, mov, etc.) (default: instance format)
            force_overwrite: Skip user confirmation and overwrite existing files

        Returns:
            Path to downloaded media file

        Raises:
            ValueError: If no format is given and the instance has no default format.
            RuntimeError: If the download fails or is cancelled by the user.
        """
        media_format = self._resolve_media_format(media_format)

        # Determine if format is audio or video
        if media_format.lower() in self.AUDIO_FORMATS:
            self.logger.info(f"🎵 Detected audio format: {media_format}")
            return await self.download_audio(
                url=url, output_dir=output_dir, media_format=media_format, force_overwrite=force_overwrite
            )

        self.logger.info(f"🎬 Detected video format: {media_format}")
        return await self.download_video(
            url=url, output_dir=output_dir, video_format=media_format, force_overwrite=force_overwrite
        )

    async def _download_media_internal(
        self,
        url: str,
        output_dir: str,
        media_format: str,
        is_audio: bool,
        force_overwrite: bool = False,
    ) -> str:
        """
        Internal unified method for downloading audio or video from YouTube

        Args:
            url: YouTube URL
            output_dir: Output directory
            media_format: Media format (audio or video extension)
            is_audio: True for audio download, False for video download
            force_overwrite: Skip user confirmation and overwrite existing files

        Returns:
            Path to downloaded media file

        Raises:
            RuntimeError: If the user cancels, yt-dlp fails, or no output file is found.
        """
        target_dir = Path(output_dir).expanduser()
        media_type = "audio" if is_audio else "video"
        emoji = "🎵" if is_audio else "🎬"

        self.logger.info(f"{emoji} Downloading {media_type} from: {url}")
        self.logger.info(f"📁 Output directory: {target_dir}")
        self.logger.info(f'{"🎶" if is_audio else "🎥"} Media format: {media_format}')

        # Create output directory if it doesn't exist
        target_dir.mkdir(parents=True, exist_ok=True)

        # Extract video ID and check for existing files
        video_id = self.extract_video_id(url)
        existing_files = FileExistenceManager.check_existing_files(video_id, str(target_dir), [media_format])
        existing_media = existing_files["media"]

        # Handle existing files
        if existing_media and not force_overwrite:
            if not FileExistenceManager.is_interactive_mode():
                # Non-interactive mode: use existing file
                self.logger.info(f"✅ Using existing media file: {existing_media[0]}")
                return existing_media[0]

            user_choice = FileExistenceManager.prompt_user_confirmation({"media": existing_media}, "media download")
            if user_choice == "cancel":
                raise RuntimeError("Media download cancelled by user")
            if user_choice != "overwrite":
                if user_choice in existing_media:
                    # User selected a specific file
                    return user_choice
                # Fallback: use first file
                self.logger.info(f"✅ Using existing media file: {existing_media[0]}")
                return existing_media[0]
            # "overwrite": fall through and download again

        # Generate output filename template
        output_template = str(target_dir / f"{video_id}.%(ext)s")

        # Build yt-dlp command based on media type
        if is_audio:
            cmd = [
                "yt-dlp",
                "--extract-audio",
                "--audio-format",
                media_format,
                "--audio-quality",
                "0",  # Best quality
                "--output",
                output_template,
                "--no-playlist",
                url,
            ]
        else:
            cmd = [
                "yt-dlp",
                "--format",
                "bestvideo*+bestaudio/best",
                "--merge-output-format",
                media_format,
                "--output",
                output_template,
                "--no-playlist",
                url,
            ]

        try:
            result = await self._run_ytdlp(cmd)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Failed to download {media_type}: {e.stderr}")
            raise RuntimeError(f"Failed to download {media_type}: {e.stderr}") from e

        self.logger.info(f"✅ {media_type.capitalize()} download completed")

        # Find the downloaded file: try what yt-dlp reported first (audio only),
        # scanning both streams because progress lines may land on either.
        if is_audio:
            combined_output = f"{result.stdout or ''}\n{result.stderr or ''}"
            reported = self._locate_reported_file(combined_output, target_dir)
            if reported is not None:
                self.logger.info(f"{emoji} Downloaded {media_type} file: {reported}")
                return str(reported)

        # Check for expected file format
        expected_file = target_dir / f"{video_id}.{media_format}"
        if expected_file.exists():
            self.logger.info(f"{emoji} Downloaded {media_type}: {expected_file}")
            return str(expected_file)

        # Fallback: search for media files with this video_id
        if is_audio:
            fallback_extensions = [media_format, "mp3", "wav", "m4a", "aac"]
        else:
            fallback_extensions = [media_format, "mp4", "webm", "mkv"]

        for ext in fallback_extensions:
            files = list(target_dir.glob(f"{video_id}*.{ext}"))
            if files:
                latest_file = max(files, key=os.path.getctime)
                self.logger.info(f"{emoji} Found {media_type} file: {latest_file}")
                return str(latest_file)

        raise RuntimeError(f"Downloaded {media_type} file not found")

    async def download_audio(
        self,
        url: str,
        output_dir: Optional[str] = None,
        media_format: Optional[str] = None,
        force_overwrite: bool = False,
    ) -> str:
        """
        Download audio from YouTube URL

        Args:
            url: YouTube URL
            output_dir: Output directory (default: temp directory)
            media_format: Audio format (default: instance format)
            force_overwrite: Skip user confirmation and overwrite existing files

        Returns:
            Path to downloaded audio file

        Raises:
            ValueError: If no format is given and the instance has no default format.
        """
        media_format = self._resolve_media_format(media_format)
        target_dir = output_dir or tempfile.gettempdir()
        return await self._download_media_internal(
            url, target_dir, media_format, is_audio=True, force_overwrite=force_overwrite
        )

    async def download_video(
        self, url: str, output_dir: Optional[str] = None, video_format: str = "mp4", force_overwrite: bool = False
    ) -> str:
        """
        Download video from YouTube URL

        Args:
            url: YouTube URL
            output_dir: Output directory (default: temp directory)
            video_format: Video format
            force_overwrite: Skip user confirmation and overwrite existing files

        Returns:
            Path to downloaded video file
        """
        target_dir = output_dir or tempfile.gettempdir()
        return await self._download_media_internal(
            url, target_dir, video_format, is_audio=False, force_overwrite=force_overwrite
        )

    async def download_captions(
        self,
        url: str,
        output_dir: str,
        force_overwrite: bool = False,
        source_lang: Optional[str] = None,
        transcriber_name: Optional[str] = None,
    ) -> Optional[str]:
        """
        Download video captions using yt-dlp

        Args:
            url: YouTube URL
            output_dir: Output directory
            force_overwrite: Skip user confirmation and overwrite existing files
            source_lang: Specific caption language/track to download (e.g., 'en').
                If None, no language filter is passed to yt-dlp.
            transcriber_name: Name of the transcriber (for user prompts)

        Returns:
            Path to the caption file to use, TRANSCRIBE_CHOICE when the user asks to
            transcribe instead, or None if no caption file is available.

        Raises:
            RuntimeError: If the user cancels at a prompt.
        """
        target_dir = Path(output_dir).expanduser()

        # Create output directory if it doesn't exist
        target_dir.mkdir(parents=True, exist_ok=True)

        video_id = self.extract_video_id(url)

        # Existing caption files are only consulted when overwriting was not requested,
        # so the lookup result is never read on the force_overwrite path.
        if not force_overwrite:
            existing_files = FileExistenceManager.check_existing_files(
                video_id, str(target_dir), caption_formats=CAPTION_FORMATS
            )
            existing_captions = existing_files["caption"]

            if existing_captions:
                if not FileExistenceManager.is_interactive_mode():
                    caption_file = Path(existing_captions[0])
                    self.logger.info(f"🔍 Found existing caption: {caption_file}")
                    return str(caption_file)

                user_choice = FileExistenceManager.prompt_user_confirmation(
                    {"caption": existing_captions}, "caption download", transcriber_name=transcriber_name
                )
                if user_choice == "cancel":
                    raise RuntimeError("Caption download cancelled by user")
                if user_choice != "overwrite":
                    if user_choice == TRANSCRIBE_CHOICE:
                        return TRANSCRIBE_CHOICE
                    if user_choice in existing_captions:
                        # User selected a specific file
                        caption_file = Path(user_choice)
                        self.logger.info(f"✅ Using selected caption file: {caption_file}")
                    else:
                        # Fallback: use first file
                        caption_file = Path(existing_captions[0])
                        self.logger.info(f"✅ Using existing caption file: {caption_file}")
                    return str(caption_file)
                # "overwrite": fall through and download again

        self.logger.info(f"📥 Downloading caption for: {url}")
        if source_lang:
            self.logger.info(f"🎯 Targeting specific caption track: {source_lang}")

        output_template = str(target_dir / f"{video_id}.%(ext)s")

        # Configure yt-dlp options for caption download
        ytdlp_options = [
            "yt-dlp",
            "--skip-download",  # Don't download video/audio
            "--output",
            output_template,
            "--sub-format",
            "best",  # Prefer best available format
            "--no-warnings",  # Suppress warnings for cleaner output
            "--extractor-retries",
            "3",  # Retry on errors
            "--sleep-requests",
            "1",  # Sleep between requests to avoid rate limiting
            # Both manual and auto-generated tracks are always requested.
            "--write-sub",
            "--write-auto-sub",
        ]

        # Restrict to the requested language (and its variants) when one was given;
        # otherwise no --sub-langs filter is passed and yt-dlp's own selection applies.
        if source_lang:
            ytdlp_options.extend(["--sub-langs", f"{source_lang}*"])

        ytdlp_options.append(url)

        try:
            result = await self._run_ytdlp(ytdlp_options)
            # Only log success message, not full yt-dlp output
            self.logger.debug(f"yt-dlp output: {result.stdout.strip()}")
        except subprocess.CalledProcessError as e:
            # Best effort: a failed run may still have written some caption files,
            # so the error is logged and the directory is searched anyway.
            error_msg = e.stderr.strip() if e.stderr else str(e)

            if "No automatic or manual captions found" in error_msg:
                self.logger.warning("No captions available for this video")
            elif "HTTP Error 429" in error_msg or "Too Many Requests" in error_msg:
                self.logger.error(
                    "YouTube rate limit exceeded (HTTP 429). "
                    "Try again later or use --cookies option with authenticated cookies. "
                    "See: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"
                )
            else:
                self.logger.error(f"Failed to download transcript: {error_msg}")

        # Find the downloaded transcript file
        caption_files = []
        for ext in self._CAPTION_EXTENSIONS:
            matches = list(target_dir.glob(f"{video_id}.*{ext}"))
            for caption_file in matches:
                self.logger.info(f"📥 Downloaded caption: {caption_file}")
            caption_files.extend(matches)

        # If only one caption file, return it directly
        if len(caption_files) == 1:
            self.logger.info(f"✅ Using caption: {caption_files[0]}")
            return str(caption_files[0])

        # Zero or several caption files: let the user choose when a prompt is possible
        if FileExistenceManager.is_interactive_mode():
            self.logger.info(f"📋 Found {len(caption_files)} caption files")
            caption_choice = FileExistenceManager.prompt_file_selection(
                file_type="caption",
                files=[str(f) for f in caption_files],
                operation="use",
                transcriber_name=transcriber_name,
            )

            if caption_choice == "cancel":
                raise RuntimeError("Caption selection cancelled by user")
            if caption_choice == TRANSCRIBE_CHOICE:
                return caption_choice
            if caption_choice:
                self.logger.info(f"✅ Selected caption: {caption_choice}")
                return caption_choice

        if caption_files:
            # Non-interactive mode or empty selection: use first file
            self.logger.info(f"✅ Using first caption: {caption_files[0]}")
            return str(caption_files[0])

        self.logger.warning("No caption files available after download")
        return None

    async def list_available_captions(self, url: str) -> List[Dict[str, Any]]:
        """
        List all available (non-automatic) caption tracks for a YouTube video

        Args:
            url: YouTube URL

        Returns:
            List of dicts with "language", "name" and "formats" keys

        Raises:
            RuntimeError: If yt-dlp fails to list the captions.
        """
        self.logger.info(f"📋 Listing available captions for: {url}")

        cmd = ["yt-dlp", "--list-subs", "--no-download", url]

        try:
            result = await self._run_ytdlp(cmd)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Failed to list captions: {e.stderr}")
            raise RuntimeError(f"Failed to list captions: {e.stderr}") from e

        caption_info = []
        in_caption_section = False
        for line in result.stdout.strip().split("\n"):
            # Section headers toggle whether rows belong to the manual caption table.
            # Both spellings of the manual-track header are accepted.
            if "Available subtitles for" in line or "Available captions for" in line:
                in_caption_section = True
                continue
            if "Available automatic captions for" in line:
                in_caption_section = False
                continue
            if not in_caption_section or not line.strip():
                continue
            # Skip header lines
            if "Language" in line and "Name" in line and "Formats" in line:
                continue
            # Skip bracket-tagged log lines such as "[info] ..."
            if line.startswith("["):
                continue

            # Example row: "en-uYU-mmqFLq8 English - CC1 vtt, srt, ttml, srv3, srv2, srv1, json3"
            row = self._parse_caption_row(line)
            if row is not None:
                caption_info.append(row)

        self.logger.info(f"✅ Found {len(caption_info)} caption tracks")
        return caption_info