lattifai 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +350 -0
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +91 -220
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1143
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/formats/gemini.py +722 -0
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +4 -9
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +48 -84
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +9 -2
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +36 -18
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +81 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_reader.py +0 -371
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.0.dist-info/METADATA +0 -1133
- lattifai-1.2.0.dist-info/RECORD +0 -57
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
lattifai/workflow/youtube.py
DELETED
|
@@ -1,577 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
YouTube downloader module using yt-dlp and Agent
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
import asyncio
|
|
6
|
-
import os
|
|
7
|
-
import re
|
|
8
|
-
import subprocess
|
|
9
|
-
import tempfile
|
|
10
|
-
from pathlib import Path
|
|
11
|
-
from typing import Any, Dict, List, Optional
|
|
12
|
-
|
|
13
|
-
from ..config.caption import CAPTION_FORMATS
|
|
14
|
-
from .base import setup_workflow_logger
|
|
15
|
-
from .file_manager import TRANSCRIBE_CHOICE, FileExistenceManager
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class YouTubeDownloader:
    """YouTube video/audio/caption downloader that shells out to ``yt-dlp``.

    Configuration (in __init__):
        - None. The constructor only creates a logger and verifies that the
          ``yt-dlp`` executable can be run (raises ``RuntimeError`` otherwise).

    Class attributes:
        - media_format: fallback format used by ``download_media`` and
          ``download_audio`` when the caller passes ``media_format=None``.
          It can be overridden per instance.

    Runtime parameters (in methods):
        - url: YouTube URL to download
        - output_dir: Where to save files
        - media_format: Format to download (mp3, mp4, etc.)
        - force_overwrite: Whether to overwrite existing files
    """

    # The original code read ``self.media_format`` without ever defining it, so
    # omitting ``media_format`` raised AttributeError. "mp3" is the fallback
    # chosen here. NOTE(review): confirm this matches the project-wide default.
    media_format: str = "mp3"

    # Extensions that are routed to the audio-extraction code path.
    _AUDIO_FORMATS = ("mp3", "wav", "m4a", "aac", "opus", "ogg", "flac", "aiff")

    def __init__(self):
        self.logger = setup_workflow_logger("youtube")
        # Fail fast if yt-dlp cannot be executed.
        self._check_ytdlp()

    @staticmethod
    def extract_video_id(url: str) -> str:
        """
        Extract video ID from YouTube URL

        Supports various YouTube URL formats:
        - https://www.youtube.com/watch?v=VIDEO_ID
        - https://www.youtube.com/watch?feature=share&v=VIDEO_ID
        - https://youtu.be/VIDEO_ID
        - https://www.youtube.com/shorts/VIDEO_ID
        - https://m.youtube.com/watch?v=VIDEO_ID
        - https://www.youtube.com/embed/VIDEO_ID
        - https://www.youtube.com/v/VIDEO_ID

        Args:
            url: URL (or any string) to search for an 11-character video ID

        Returns:
            Video ID (e.g., 'cprOj8PWepY'), or the literal 'youtube_media'
            when no known pattern matches.
        """
        patterns = (
            # ``(?:[^#\s]*?&)?`` lets ``v=`` appear after other query parameters.
            r"(?:youtube\.com/watch\?(?:[^#\s]*?&)?v=|youtu\.be/|youtube\.com/shorts/)([a-zA-Z0-9_-]{11})",
            r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
            r"youtube\.com/v/([a-zA-Z0-9_-]{11})",
        )

        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return "youtube_media"

    def _check_ytdlp(self):
        """Check if yt-dlp is installed

        Raises:
            RuntimeError: if ``yt-dlp --version`` cannot be run or exits non-zero.
        """
        try:
            result = subprocess.run(["yt-dlp", "--version"], capture_output=True, text=True, check=True)
            self.logger.info(f"yt-dlp version: {result.stdout.strip()}")
        except (subprocess.CalledProcessError, FileNotFoundError) as exc:
            raise RuntimeError(
                "yt-dlp is not installed or not found in PATH. Please install it with: pip install yt-dlp"
            ) from exc

    async def _run_ytdlp(self, cmd: List[str]) -> "subprocess.CompletedProcess":
        """Run a yt-dlp command in the default executor so the event loop is not blocked.

        Raises:
            subprocess.CalledProcessError: if the command exits with a non-zero status.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, lambda: subprocess.run(cmd, capture_output=True, text=True, check=True)
        )

    @staticmethod
    def _locate_reported_file(output: str, target_dir: Path) -> Optional[Path]:
        """Return the last existing file that yt-dlp reported in ``output``, if any.

        Recognises ``... Destination: <path>`` and
        ``[download] <path> has already been downloaded`` lines. Lines are scanned
        from the end so the final (post-processed) file is preferred. Relative
        paths are resolved against ``target_dir``.
        """
        for line in reversed(output.splitlines()):
            if "Destination:" in line:
                candidate = line.partition("Destination:")[2].strip()
            elif "has already been downloaded" in line:
                candidate = line.partition("has already been downloaded")[0]
                candidate = candidate.replace("[download]", "", 1).strip()
            else:
                continue

            if not candidate:
                continue
            # Joining with an absolute candidate yields the candidate itself.
            file_path = target_dir / candidate
            if file_path.exists():
                return file_path
        return None

    @staticmethod
    def _parse_caption_listing(stdout: str) -> List[Dict[str, Any]]:
        """Parse the text printed by ``yt-dlp --list-subs`` into track dictionaries.

        Only the manual-subtitle section is parsed; the automatic-captions
        section is skipped. Columns are expected to be separated by two or more
        whitespace characters.
        """
        caption_info: List[Dict[str, Any]] = []
        in_caption_section = False

        for line in stdout.strip().split("\n"):
            # yt-dlp prints "Available subtitles for <id>:"; the older marker
            # checked by this code ("Available captions for") is still accepted.
            if "Available subtitles for" in line or "Available captions for" in line:
                in_caption_section = True
                continue
            if "Available automatic captions for" in line:
                in_caption_section = False
                continue
            if not in_caption_section or not line.strip():
                continue

            # Skip the table header row.
            if "Language" in line and "Name" in line and "Formats" in line:
                continue
            # Skip yt-dlp status lines such as "[info] ...".
            if line.startswith("["):
                continue

            parts = re.split(r"\s{2,}", line.strip())
            if len(parts) < 2:
                continue

            # First column starts with the language code; last column lists formats.
            lang_name_parts = parts[0].split(" ", 1)
            language = lang_name_parts[0]
            name = lang_name_parts[1] if len(lang_name_parts) > 1 else ""

            # Any middle columns belong to the track name.
            if len(parts) > 2:
                name = " ".join([name] + parts[1:-1]).strip()

            formats = [f.strip() for f in parts[-1].split(",") if f.strip()]
            caption_info.append({"language": language, "name": name, "formats": formats})

        return caption_info

    async def get_video_info(self, url: str) -> Dict[str, Any]:
        """Get video metadata without downloading

        Args:
            url: YouTube URL

        Returns:
            Dict with keys title, duration, uploader, upload_date, view_count,
            description, thumbnail and webpage_url.

        Raises:
            RuntimeError: if yt-dlp fails or its output is not valid JSON.
        """
        # Imported before the try block: the ``except json.JSONDecodeError``
        # clause below must be able to resolve ``json`` whatever failed first.
        import json

        self.logger.info(f"🔍 Extracting video info for: {url}")

        cmd = ["yt-dlp", "--dump-json", "--no-download", url]

        try:
            result = await self._run_ytdlp(cmd)
            metadata = json.loads(result.stdout)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Failed to extract video info: {e.stderr}")
            raise RuntimeError(f"Failed to extract video info: {e.stderr}") from e
        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse video metadata: {e}")
            raise RuntimeError(f"Failed to parse video metadata: {e}") from e

        # Extract relevant info
        info = {
            "title": metadata.get("title", "Unknown"),
            "duration": metadata.get("duration", 0),
            "uploader": metadata.get("uploader", "Unknown"),
            "upload_date": metadata.get("upload_date", "Unknown"),
            "view_count": metadata.get("view_count", 0),
            "description": metadata.get("description", ""),
            "thumbnail": metadata.get("thumbnail", ""),
            "webpage_url": metadata.get("webpage_url", url),
        }

        self.logger.info(f'✅ Video info extracted: {info["title"]}')
        return info

    async def download_media(
        self,
        url: str,
        output_dir: Optional[str] = None,
        media_format: Optional[str] = None,
        force_overwrite: bool = False,
    ) -> str:
        """
        Download media (audio or video) from YouTube URL based on format

        This is a unified method that automatically selects between audio and video
        download based on the media format extension.

        Args:
            url: YouTube URL
            output_dir: Output directory (default: temp directory)
            media_format: Media format - audio (mp3, wav, m4a, aac, opus, ogg, flac, aiff)
                         or video (mp4, webm, mkv, avi, mov, etc.) (default: ``self.media_format``)
            force_overwrite: Skip user confirmation and overwrite existing files

        Returns:
            Path to downloaded media file
        """
        media_format = media_format or self.media_format

        # Anything that is not a known audio extension is treated as video.
        is_audio = media_format.lower() in self._AUDIO_FORMATS

        if is_audio:
            self.logger.info(f"🎵 Detected audio format: {media_format}")
            return await self.download_audio(
                url=url, output_dir=output_dir, media_format=media_format, force_overwrite=force_overwrite
            )

        self.logger.info(f"🎬 Detected video format: {media_format}")
        return await self.download_video(
            url=url, output_dir=output_dir, video_format=media_format, force_overwrite=force_overwrite
        )

    async def _download_media_internal(
        self,
        url: str,
        output_dir: str,
        media_format: str,
        is_audio: bool,
        force_overwrite: bool = False,
    ) -> str:
        """
        Internal unified method for downloading audio or video from YouTube

        Args:
            url: YouTube URL
            output_dir: Output directory
            media_format: Media format (audio or video extension)
            is_audio: True for audio download, False for video download
            force_overwrite: Skip user confirmation and overwrite existing files

        Returns:
            Path to downloaded media file (or to an existing file that was reused)

        Raises:
            RuntimeError: if the user cancels, yt-dlp fails, or no output file is found.
        """
        target_dir = Path(output_dir).expanduser()
        media_type = "audio" if is_audio else "video"
        emoji = "🎵" if is_audio else "🎬"

        self.logger.info(f"{emoji} Downloading {media_type} from: {url}")
        self.logger.info(f"📁 Output directory: {target_dir}")
        self.logger.info(f'{"🎶" if is_audio else "🎥"} Media format: {media_format}')

        # Create output directory if it doesn't exist
        target_dir.mkdir(parents=True, exist_ok=True)

        # Extract video ID and check for existing files
        video_id = self.extract_video_id(url)
        existing_files = FileExistenceManager.check_existing_files(video_id, str(target_dir), [media_format])

        # Handle existing files
        if existing_files["media"] and not force_overwrite:
            if FileExistenceManager.is_interactive_mode():
                user_choice = FileExistenceManager.prompt_user_confirmation(
                    {"media": existing_files["media"]}, "media download"
                )

                if user_choice == "cancel":
                    raise RuntimeError("Media download cancelled by user")
                elif user_choice == "overwrite":
                    # Continue with download
                    pass
                elif user_choice in existing_files["media"]:
                    # User selected a specific file
                    return user_choice
                else:
                    # Fallback: use first file
                    self.logger.info(f'✅ Using existing media file: {existing_files["media"][0]}')
                    return existing_files["media"][0]
            else:
                # Non-interactive mode: use existing file
                self.logger.info(f'✅ Using existing media file: {existing_files["media"][0]}')
                return existing_files["media"][0]

        # ``%(ext)s`` is expanded by yt-dlp itself.
        output_template = str(target_dir / f"{video_id}.%(ext)s")

        # Build yt-dlp command based on media type
        if is_audio:
            cmd = [
                "yt-dlp",
                "--extract-audio",
                "--audio-format",
                media_format,
                "--audio-quality",
                "0",  # Best quality
                "--output",
                output_template,
                "--no-playlist",
                url,
            ]
        else:
            cmd = [
                "yt-dlp",
                "--format",
                "bestvideo*+bestaudio/best",
                "--merge-output-format",
                media_format,
                "--output",
                output_template,
                "--no-playlist",
                url,
            ]

        try:
            result = await self._run_ytdlp(cmd)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Failed to download {media_type}: {e.stderr}")
            raise RuntimeError(f"Failed to download {media_type}: {e.stderr}") from e

        self.logger.info(f"✅ {media_type.capitalize()} download completed")

        # 1) Try the path yt-dlp reported. Both streams are searched because
        #    progress lines are not guaranteed to be on stderr.
        if is_audio:
            combined_output = f"{result.stdout or ''}\n{result.stderr or ''}"
            reported = self._locate_reported_file(combined_output, target_dir)
            if reported is not None:
                self.logger.info(f"{emoji} Downloaded {media_type} file: {reported}")
                return str(reported)

        # 2) Check for the expected "<video_id>.<format>" file.
        expected_file = target_dir / f"{video_id}.{media_format}"
        if expected_file.exists():
            self.logger.info(f"{emoji} Downloaded {media_type}: {expected_file}")
            return str(expected_file)

        # 3) Fallback: search for media files with this video_id.
        if is_audio:
            fallback_extensions = [media_format, "mp3", "wav", "m4a", "aac"]
        else:
            fallback_extensions = [media_format, "mp4", "webm", "mkv"]

        for ext in fallback_extensions:
            files = list(target_dir.glob(f"{video_id}*.{ext}"))
            if files:
                latest_file = max(files, key=os.path.getctime)
                self.logger.info(f"{emoji} Found {media_type} file: {latest_file}")
                return str(latest_file)

        raise RuntimeError(f"Downloaded {media_type} file not found")

    async def download_audio(
        self,
        url: str,
        output_dir: Optional[str] = None,
        media_format: Optional[str] = None,
        force_overwrite: bool = False,
    ) -> str:
        """
        Download audio from YouTube URL

        Args:
            url: YouTube URL
            output_dir: Output directory (default: temp directory)
            media_format: Audio format (default: ``self.media_format``)
            force_overwrite: Skip user confirmation and overwrite existing files

        Returns:
            Path to downloaded audio file
        """
        target_dir = output_dir or tempfile.gettempdir()
        media_format = media_format or self.media_format
        return await self._download_media_internal(
            url, target_dir, media_format, is_audio=True, force_overwrite=force_overwrite
        )

    async def download_video(
        self, url: str, output_dir: Optional[str] = None, video_format: str = "mp4", force_overwrite: bool = False
    ) -> str:
        """
        Download video from YouTube URL

        Args:
            url: YouTube URL
            output_dir: Output directory (default: temp directory)
            video_format: Video format
            force_overwrite: Skip user confirmation and overwrite existing files

        Returns:
            Path to downloaded video file
        """
        target_dir = output_dir or tempfile.gettempdir()
        return await self._download_media_internal(
            url, target_dir, video_format, is_audio=False, force_overwrite=force_overwrite
        )

    async def download_captions(
        self,
        url: str,
        output_dir: str,
        force_overwrite: bool = False,
        source_lang: Optional[str] = None,
        transcriber_name: Optional[str] = None,
    ) -> Optional[str]:
        """
        Download video captions using yt-dlp

        Args:
            url: YouTube URL
            output_dir: Output directory
            force_overwrite: Skip user confirmation and overwrite existing files
            source_lang: Specific caption language/track to download (e.g., 'en').
                        If None, no ``--sub-langs`` is passed and yt-dlp's own
                        default language selection applies.
            transcriber_name: Name of the transcriber (for user prompts)

        Returns:
            Path to a caption file, ``TRANSCRIBE_CHOICE`` if the user asked to
            transcribe instead, or None if no caption file is available.

        Raises:
            RuntimeError: if the user cancels at a prompt.
        """
        target_dir = Path(output_dir).expanduser()

        # Create output directory if it doesn't exist
        target_dir.mkdir(parents=True, exist_ok=True)

        video_id = self.extract_video_id(url)

        # Existing caption files are only looked up (and only matter) when we
        # are not forcing an overwrite.
        if not force_overwrite:
            existing_files = FileExistenceManager.check_existing_files(
                video_id, str(target_dir), caption_formats=CAPTION_FORMATS
            )

            if existing_files["caption"]:
                if FileExistenceManager.is_interactive_mode():
                    user_choice = FileExistenceManager.prompt_user_confirmation(
                        {"caption": existing_files["caption"]}, "caption download", transcriber_name=transcriber_name
                    )

                    if user_choice == "cancel":
                        raise RuntimeError("Caption download cancelled by user")
                    elif user_choice == "overwrite":
                        # Continue with download
                        pass
                    elif user_choice == TRANSCRIBE_CHOICE:
                        return TRANSCRIBE_CHOICE
                    elif user_choice in existing_files["caption"]:
                        # User selected a specific file
                        caption_file = Path(user_choice)
                        self.logger.info(f"✅ Using selected caption file: {caption_file}")
                        return str(caption_file)
                    else:
                        # Fallback: use first file
                        caption_file = Path(existing_files["caption"][0])
                        self.logger.info(f"✅ Using existing caption file: {caption_file}")
                        return str(caption_file)
                else:
                    caption_file = Path(existing_files["caption"][0])
                    self.logger.info(f"🔍 Found existing caption: {caption_file}")
                    return str(caption_file)

        self.logger.info(f"📥 Downloading caption for: {url}")
        if source_lang:
            self.logger.info(f"🎯 Targeting specific caption track: {source_lang}")

        output_template = str(target_dir / f"{video_id}.%(ext)s")

        # Configure yt-dlp options for caption download
        ytdlp_options = [
            "yt-dlp",
            "--skip-download",  # Don't download video/audio
            "--output",
            output_template,
            "--sub-format",
            "best",  # Prefer best available format
            "--no-warnings",  # Suppress warnings for cleaner output
            "--extractor-retries",
            "3",  # Retry on errors
            "--sleep-requests",
            "1",  # Sleep between requests to avoid rate limiting
        ]

        # Manual and auto-generated tracks are both requested in either case.
        if source_lang:
            # The trailing "*" also matches regional/variant tags of the language.
            ytdlp_options.extend(["--write-sub", "--write-auto-sub", "--sub-langs", f"{source_lang}*"])
        else:
            ytdlp_options.extend(["--write-sub", "--write-auto-sub"])

        ytdlp_options.append(url)

        try:
            result = await self._run_ytdlp(ytdlp_options)
            # Only log at debug level, not full yt-dlp output at info
            self.logger.debug(f"yt-dlp output: {result.stdout.strip()}")
        except subprocess.CalledProcessError as e:
            # Best effort: a failed run may still have written some caption
            # files, so log the problem and fall through to the file search.
            error_msg = e.stderr.strip() if e.stderr else str(e)

            if "No automatic or manual captions found" in error_msg:
                self.logger.warning("No captions available for this video")
            elif "HTTP Error 429" in error_msg or "Too Many Requests" in error_msg:
                self.logger.error(
                    "YouTube rate limit exceeded (HTTP 429). "
                    "Try again later or use --cookies option with authenticated cookies. "
                    "See: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"
                )
            else:
                self.logger.error(f"Failed to download transcript: {error_msg}")

        # Find the downloaded transcript file(s)
        caption_patterns = [
            f"{video_id}.*vtt",
            f"{video_id}.*srt",
            f"{video_id}.*sub",
            f"{video_id}.*sbv",
            f"{video_id}.*ssa",
            f"{video_id}.*ass",
        ]

        caption_files = []
        for pattern in caption_patterns:
            matched = list(target_dir.glob(pattern))
            for caption_file in matched:
                self.logger.info(f"📥 Downloaded caption: {caption_file}")
            caption_files.extend(matched)

        # If only one caption file, return it directly
        if len(caption_files) == 1:
            self.logger.info(f"✅ Using caption: {caption_files[0]}")
            return str(caption_files[0])

        # Zero or several files: in interactive mode the user is prompted
        # (the prompt may also yield TRANSCRIBE_CHOICE).
        if FileExistenceManager.is_interactive_mode():
            self.logger.info(f"📋 Found {len(caption_files)} caption files")
            caption_choice = FileExistenceManager.prompt_file_selection(
                file_type="caption",
                files=[str(f) for f in caption_files],
                operation="use",
                transcriber_name=transcriber_name,
            )

            if caption_choice == "cancel":
                raise RuntimeError("Caption selection cancelled by user")
            elif caption_choice == TRANSCRIBE_CHOICE:
                return caption_choice
            elif caption_choice:
                self.logger.info(f"✅ Selected caption: {caption_choice}")
                return caption_choice
            elif caption_files:
                # Fallback to first file
                self.logger.info(f"✅ Using first caption: {caption_files[0]}")
                return str(caption_files[0])
            else:
                self.logger.warning("No caption files available after download")
                return None
        elif caption_files:
            # Non-interactive mode: use first file
            self.logger.info(f"✅ Using first caption: {caption_files[0]}")
            return str(caption_files[0])
        else:
            self.logger.warning("No caption files available after download")
            return None

    async def list_available_captions(self, url: str) -> List[Dict[str, Any]]:
        """
        List all available (manual) caption tracks for a YouTube video

        Args:
            url: YouTube URL

        Returns:
            List of dictionaries with keys ``language``, ``name`` and ``formats``

        Raises:
            RuntimeError: if yt-dlp exits with an error.
        """
        self.logger.info(f"📋 Listing available captions for: {url}")

        cmd = ["yt-dlp", "--list-subs", "--no-download", url]

        try:
            result = await self._run_ytdlp(cmd)
        except subprocess.CalledProcessError as e:
            self.logger.error(f"Failed to list captions: {e.stderr}")
            raise RuntimeError(f"Failed to list captions: {e.stderr}") from e

        caption_info = self._parse_caption_listing(result.stdout)
        self.logger.info(f"✅ Found {len(caption_info)} caption tracks")
        return caption_info
|