spatelier-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analytics/__init__.py +1 -0
- analytics/reporter.py +497 -0
- cli/__init__.py +1 -0
- cli/app.py +147 -0
- cli/audio.py +129 -0
- cli/cli_analytics.py +320 -0
- cli/cli_utils.py +282 -0
- cli/error_handlers.py +122 -0
- cli/files.py +299 -0
- cli/update.py +325 -0
- cli/video.py +823 -0
- cli/worker.py +615 -0
- core/__init__.py +1 -0
- core/analytics_dashboard.py +368 -0
- core/base.py +303 -0
- core/base_service.py +69 -0
- core/config.py +345 -0
- core/database_service.py +116 -0
- core/decorators.py +263 -0
- core/error_handler.py +210 -0
- core/file_tracker.py +254 -0
- core/interactive_cli.py +366 -0
- core/interfaces.py +166 -0
- core/job_queue.py +437 -0
- core/logger.py +79 -0
- core/package_updater.py +469 -0
- core/progress.py +228 -0
- core/service_factory.py +295 -0
- core/streaming.py +299 -0
- core/worker.py +765 -0
- database/__init__.py +1 -0
- database/connection.py +265 -0
- database/metadata.py +516 -0
- database/models.py +288 -0
- database/repository.py +592 -0
- database/transcription_storage.py +219 -0
- modules/__init__.py +1 -0
- modules/audio/__init__.py +5 -0
- modules/audio/converter.py +197 -0
- modules/video/__init__.py +16 -0
- modules/video/converter.py +191 -0
- modules/video/fallback_extractor.py +334 -0
- modules/video/services/__init__.py +18 -0
- modules/video/services/audio_extraction_service.py +274 -0
- modules/video/services/download_service.py +852 -0
- modules/video/services/metadata_service.py +190 -0
- modules/video/services/playlist_service.py +445 -0
- modules/video/services/transcription_service.py +491 -0
- modules/video/transcription_service.py +385 -0
- modules/video/youtube_api.py +397 -0
- spatelier/__init__.py +33 -0
- spatelier-0.3.0.dist-info/METADATA +260 -0
- spatelier-0.3.0.dist-info/RECORD +59 -0
- spatelier-0.3.0.dist-info/WHEEL +5 -0
- spatelier-0.3.0.dist-info/entry_points.txt +2 -0
- spatelier-0.3.0.dist-info/licenses/LICENSE +21 -0
- spatelier-0.3.0.dist-info/top_level.txt +7 -0
- utils/__init__.py +1 -0
- utils/helpers.py +250 -0
modules/video/services/download_service.py
@@ -0,0 +1,852 @@
"""
Video download service.

This module provides focused video downloading functionality,
separated from transcription and metadata concerns.
"""

import subprocess
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from core.base import BaseDownloader, ProcessingResult
from core.base_service import BaseService
from core.config import Config
from database.metadata import MetadataExtractor, MetadataManager
from database.models import MediaType, ProcessingStatus
from modules.video.fallback_extractor import FallbackExtractor
from utils.helpers import get_file_hash, get_file_type, safe_filename


class VideoDownloadService(BaseService):
    """
    Focused video download service.

    Handles only video downloading, without transcription or complex metadata processing.
    """

    def __init__(self, config: Config, verbose: bool = False, db_service=None):
        """Initialize the video download service."""
        # Initialize base service
        super().__init__(config, verbose, db_service)

        # Service-specific initialization
        self.supported_sites = [
            "youtube.com",
            "youtu.be",
            "vimeo.com",
            "dailymotion.com",
            "twitch.tv",
            "twitter.com",
            "instagram.com",
            "tiktok.com",
        ]

        # Initialize metadata management
        self.metadata_extractor = MetadataExtractor(config, verbose=verbose)
        self.metadata_manager = MetadataManager(config, verbose=verbose)

        # Initialize fallback extractor
        try:
            self.fallback_extractor = FallbackExtractor(config)
        except RuntimeError as exc:
            self.fallback_extractor = None
            self.logger.info(f"Fallback extractor disabled: {exc}")

    def download_video(
        self, url: str, output_path: Optional[Union[str, Path]] = None, **kwargs
    ) -> ProcessingResult:
        """
        Download a single video from URL.

        Args:
            url: URL to download from
            output_path: Optional output path
            **kwargs: Additional download options

        Returns:
            ProcessingResult with download details
        """
        # Track download start
        self.repos.analytics.track_event("download_start", event_data={"url": url})

        # Extract metadata before download
        source_metadata = {}
        if "youtube.com" in url or "youtu.be" in url:
            source_metadata = self.metadata_extractor.extract_youtube_metadata(url)
            self.logger.info(
                f"Extracted YouTube metadata: {source_metadata.get('title', 'Unknown')}"
            )

        try:
            # Determine output path
            output_file = None
            if output_path is None:
                from core.config import get_default_data_dir

                repo_root = get_default_data_dir().parent
                output_dir = self.config.video.output_dir or (repo_root / "downloads")
            else:
                output_path = Path(output_path)
                if output_path.suffix:
                    output_file = output_path
                    output_dir = output_path.parent
                else:
                    output_dir = output_path

            output_dir.mkdir(parents=True, exist_ok=True)

            # Create processing job
            job = self.repos.jobs.create(
                media_file_id=None,  # Will be updated after processing
                job_type="download_video",
                input_path=url,
                output_path=str(output_file or output_dir),
                parameters=str(kwargs),
            )
            self.logger.info(f"Created video processing job: {job.id}")

            # Check if output is on NAS and set up temp processing if needed
            is_nas = self._is_nas_path(output_dir)

            temp_dir = None
            processing_path = output_dir

            if is_nas:
                # Create job-specific temp processing directory
                temp_dir = self._get_temp_processing_dir(job.id)
                processing_path = temp_dir
                self.logger.info(f"NAS detected, using temp processing: {temp_dir}")
            self.logger.info(f"Video will be processed in: {processing_path}")

            # Mark job as processing (sets started_at for duration tracking)
            self.repos.jobs.update_status(job.id, ProcessingStatus.PROCESSING)

            # Download using yt-dlp
            downloaded_file = self._download_with_ytdlp(url, processing_path, **kwargs)

            if downloaded_file and downloaded_file.exists():
                # Extract video metadata
                video_id = self._extract_video_id_from_url(url)

                # Create media file record
                media_file = self.repos.media.create(
                    file_path=str(downloaded_file),
                    file_name=downloaded_file.name,
                    file_size=downloaded_file.stat().st_size,
                    file_hash=get_file_hash(downloaded_file),
                    media_type=MediaType.VIDEO,
                    mime_type=get_file_type(downloaded_file),
                    source_url=url,
                    source_platform=(
                        "youtube"
                        if "youtube.com" in url or "youtu.be" in url
                        else "unknown"
                    ),
                    source_id=video_id,
                    title=source_metadata.get("title", downloaded_file.stem),
                    description=source_metadata.get("description"),
                    uploader=source_metadata.get("uploader"),
                    uploader_id=source_metadata.get("uploader_id"),
                    upload_date=source_metadata.get("upload_date"),
                    view_count=source_metadata.get("view_count"),
                    like_count=source_metadata.get("like_count"),
                    duration=source_metadata.get("duration"),
                    language=source_metadata.get("language"),
                )

                # Enrich with additional metadata
                self.metadata_manager.enrich_media_file(
                    media_file, self.repos.media, extract_source_metadata=True
                )

                # Update job with media file ID
                self.repos.jobs.update(
                    job.id,
                    media_file_id=media_file.id,
                    output_path=str(downloaded_file),
                )

                # If we used temp processing, move file to final destination
                if is_nas and temp_dir:
                    self.logger.info("Moving video to NAS destination...")
                    final_file_path = output_file or (output_dir / downloaded_file.name)

                    if self._move_file_to_nas(downloaded_file, final_file_path):
                        self.logger.info(
                            f"Successfully moved video to NAS: {final_file_path}"
                        )

                        # Check if a media file with this path already exists
                        existing_media = self.repos.media.get_by_file_path(
                            str(final_file_path)
                        )
                        if existing_media:
                            # Delete the old record and update the current one
                            self.logger.info(
                                f"Found existing media file {existing_media.id} with same path, updating it"
                            )
                            self.repos.media.delete(existing_media.id)

                        # Update media file record with final path
                        self.repos.media.update(
                            media_file.id,
                            file_path=str(final_file_path),
                            file_name=final_file_path.name,
                        )

                        # Update job status
                        self.repos.jobs.update_status(
                            job.id, ProcessingStatus.COMPLETED
                        )

                        # Clean up temp directory
                        self._cleanup_temp_directory(temp_dir)
                        self.logger.info(f"Cleaned up temp directory: {temp_dir}")

                        return ProcessingResult(
                            success=True,
                            message="Video downloaded and moved to NAS successfully",
                            output_path=str(final_file_path),
                            metadata={
                                "media_file_id": media_file.id,
                                "job_id": job.id,
                                "nas_processing": True,
                            },
                        )
                    else:
                        self.logger.error("Failed to move video to NAS")
                        self.repos.jobs.update_status(
                            job.id,
                            ProcessingStatus.FAILED,
                            error_message="Failed to move to NAS",
                        )
                        return ProcessingResult(
                            success=False,
                            message="Video downloaded but failed to move to NAS",
                            errors=["Failed to move file to final destination"],
                        )
                else:
                    # For local downloads, update job status
                    self.repos.jobs.update_status(job.id, ProcessingStatus.COMPLETED)

                    final_file_path = downloaded_file
                    if output_file and final_file_path.exists():
                        output_file.parent.mkdir(parents=True, exist_ok=True)
                        if final_file_path.resolve() != output_file.resolve():
                            final_file_path.replace(output_file)
                        self.repos.media.update(
                            media_file.id,
                            file_path=str(output_file),
                            file_name=output_file.name,
                        )
                        self.repos.jobs.update(job.id, output_path=str(output_file))
                        final_file_path = output_file

                    return ProcessingResult(
                        success=True,
                        message="Video downloaded successfully",
                        output_path=str(final_file_path),
                        metadata={
                            "media_file_id": media_file.id,
                            "job_id": job.id,
                            "nas_processing": False,
                        },
                    )
            else:
                self.repos.jobs.update_status(
                    job.id, ProcessingStatus.FAILED, error_message="Download failed"
                )
                return ProcessingResult(
                    success=False,
                    message="Video download failed",
                    errors=["No video file found after download"],
                )

        except Exception as e:
            self.logger.error(f"Video download failed: {e}")
            return ProcessingResult(
                success=False, message=f"Video download failed: {e}", errors=[str(e)]
            )

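A minimal call-site sketch for the method above (an editor's illustration, not part of the package source: the bare Config() construction and the example URL are assumptions; repository wiring normally comes from the service factory):

# Hypothetical usage sketch; Config() defaults are an assumption.
from core.config import Config
from modules.video.services.download_service import VideoDownloadService

service = VideoDownloadService(Config(), verbose=True)
result = service.download_video(
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # example URL
    output_path="downloads/",
    quality="1080p",
)
if result.success:
    print(f"saved to {result.output_path}, job {result.metadata['job_id']}")
else:
    print(f"download failed: {result.errors}")
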
    def _download_with_ytdlp(
        self, url: str, output_path: Path, **kwargs
    ) -> Optional[Path]:
        """Download video using yt-dlp.

        Automatically refreshes cookies and retries if download fails due to
        authentication issues with age-restricted content.
        """
        try:
            # Build yt-dlp options
            ydl_opts = self._build_ydl_opts(output_path, **kwargs)

            output_path.mkdir(parents=True, exist_ok=True)

            # Execute download
            import yt_dlp

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=True)
                downloaded_file = self._resolve_downloaded_path(ydl, info)
                if (
                    downloaded_file
                    and downloaded_file.exists()
                    and downloaded_file.stat().st_size > 0
                ):
                    return downloaded_file

            # Only fall back to finding the latest file if we can't resolve the path,
            # and validate it matches the expected video ID to avoid picking up old files.
            return self._validate_fallback_file(output_path, url)

        except Exception as e:
            error_msg = str(e)
            # Check if this is a cookie/authentication error
            if any(
                keyword in error_msg.lower()
                for keyword in ["sign in", "age", "cookies", "authentication"]
            ):
                self.logger.warning(
                    "Download failed due to authentication - attempting to refresh cookies..."
                )
                # Try to refresh cookies and get a cookie file
                cookie_file = self._refresh_youtube_cookies()
                if cookie_file:
                    self.logger.info("Retrying download with refreshed cookies...")
                    # Retry the download with the cookie file
                    try:
                        ydl_opts = self._build_ydl_opts(output_path, **kwargs)
                        # Use the cookie file instead of browser cookies;
                        # yt-dlp's option key for a cookie file is "cookiefile"
                        ydl_opts["cookiefile"] = cookie_file
                        ydl_opts.pop("cookiesfrombrowser", None)

                        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                            info = ydl.extract_info(url, download=True)
                            downloaded_file = self._resolve_downloaded_path(ydl, info)
                            if (
                                downloaded_file
                                and downloaded_file.exists()
                                and downloaded_file.stat().st_size > 0
                            ):
                                return downloaded_file

                        # Clean up cookie file
                        import os

                        try:
                            os.unlink(cookie_file)
                        except OSError:
                            pass

                        # Validate fallback file matches video ID
                        return self._validate_fallback_file(output_path, url)
                    except Exception as retry_error:
                        # Clean up cookie file
                        import os

                        try:
                            os.unlink(cookie_file)
                        except OSError:
                            pass
                        self.logger.error(
                            f"Download failed after cookie refresh: {retry_error}"
                        )
                        return None
                else:
                    self.logger.error(f"yt-dlp download failed: {e}")
                    return None
            else:
                self.logger.error(f"yt-dlp download failed: {e}")
                return self._validate_fallback_file(output_path, url)

    def _refresh_youtube_cookies(self) -> Optional[str]:
        """Refresh YouTube cookies by visiting YouTube and extracting fresh cookies.

        Uses Playwright to launch Chrome with the user's profile, visit YouTube,
        extract the cookies, and save them to a temporary file for yt-dlp to use.

        Returns:
            Path to cookie file if successful, None otherwise
        """
        try:
            import os
            import platform
            import tempfile

            from playwright.sync_api import sync_playwright

            system = platform.system().lower()
            if system != "darwin":
                # Only implemented for macOS for now
                return None

            # Get Chrome user data directory
            chrome_user_data = os.path.expanduser(
                "~/Library/Application Support/Google/Chrome"
            )

            if not os.path.exists(chrome_user_data):
                return None

            self.logger.info(
                "Refreshing YouTube cookies by visiting YouTube in Chrome..."
            )

            with sync_playwright() as p:
                # Launch Chrome with the user's profile
                browser = p.chromium.launch_persistent_context(
                    user_data_dir=chrome_user_data,
                    headless=True,
                    args=["--disable-blink-features=AutomationControlled"],
                )

                # Visit YouTube to refresh the session
                page = browser.new_page()
                page.goto(
                    "https://www.youtube.com", wait_until="networkidle", timeout=15000
                )
                # Wait a moment for cookies to be set
                page.wait_for_timeout(3000)

                # Extract cookies from the browser context
                cookies = browser.cookies()
                browser.close()

                # Filter for YouTube cookies only
                youtube_cookies = [
                    c
                    for c in cookies
                    if "youtube.com" in c.get("domain", "")
                    or ".youtube.com" in c.get("domain", "")
                ]

                if not youtube_cookies:
                    self.logger.warning("No YouTube cookies found after refresh")
                    return None

                # Save cookies to a Netscape-format file for yt-dlp
                cookie_file = tempfile.NamedTemporaryFile(
                    mode="w", suffix=".txt", delete=False
                )
                cookie_file.write("# Netscape HTTP Cookie File\n")
                cookie_file.write("# This file was generated by spatelier\n\n")

                for cookie in youtube_cookies:
                    domain = cookie.get("domain", "")
                    domain_flag = "TRUE" if domain.startswith(".") else "FALSE"
                    path = cookie.get("path", "/")
                    secure = "TRUE" if cookie.get("secure", False) else "FALSE"
                    expires = str(int(cookie.get("expires", 0)))
                    name = cookie.get("name", "")
                    value = cookie.get("value", "")

                    cookie_file.write(
                        f"{domain}\t{domain_flag}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n"
                    )

                cookie_file.close()
                self.logger.info(
                    f"YouTube cookies refreshed and saved to: {cookie_file.name}"
                )
                return cookie_file.name

        except Exception as e:
            self.logger.warning(f"Failed to refresh cookies automatically: {e}")
            return None

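Each data line written by the loop above follows the seven tab-separated fields of the Netscape cookie format that yt-dlp's cookie loader understands. An illustrative line (the cookie name and value are placeholders, not taken from the package):

# One data line of the Netscape cookie file, fields joined by literal tabs.
line = "\t".join([
    ".youtube.com",        # domain
    "TRUE",                # include subdomains (domain starts with ".")
    "/",                   # path
    "TRUE",                # secure-only
    "1767225600",          # expiry as a unix timestamp
    "VISITOR_INFO1_LIVE",  # cookie name (placeholder)
    "abc123XYZ",           # cookie value (placeholder)
])
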
    def _resolve_downloaded_path(
        self, ydl, info: Optional[Dict[str, Any]]
    ) -> Optional[Path]:
        """Resolve downloaded file path from yt-dlp info."""
        if not info:
            return None

        if isinstance(info, dict) and info.get("_type") == "playlist":
            entries = [entry for entry in info.get("entries") or [] if entry]
            if not entries:
                return None
            info = entries[0]

        if not isinstance(info, dict):
            return None

        return Path(ydl.prepare_filename(info))

    def _find_latest_download(self, output_path: Path) -> Optional[Path]:
        """Find the most recently modified downloaded video file."""
        candidates: List[Path] = []
        for ext in self.config.video_extensions:
            candidates.extend(output_path.glob(f"*{ext}"))

        candidates = [path for path in candidates if path.is_file()]
        if not candidates:
            return None

        return max(candidates, key=lambda path: path.stat().st_mtime)

    def _validate_fallback_file(self, output_path: Path, url: str) -> Optional[Path]:
        """Find the latest download and validate it matches the expected video ID."""
        fallback_file = self._find_latest_download(output_path)
        if not fallback_file:
            return None

        # Extract video ID from URL to validate
        import re

        # Match YouTube URLs including /shorts/, /watch?v=, /v/, /embed/, youtu.be
        video_id_match = re.search(
            r'(?:youtube\.com/(?:shorts/|watch\?v=|v/|embed/|[^/]+/.+/|.*[?&]v=)|youtu\.be/)([^"&?/\s]{11})',
            url,
        )
        if video_id_match:
            expected_id = video_id_match.group(1)
            # Check if the filename contains the expected video ID
            if expected_id in fallback_file.name:
                return fallback_file
            else:
                self.logger.warning(
                    f"Found file {fallback_file.name} but it doesn't match expected video ID {expected_id}. "
                    "Download may have failed."
                )
                return None
        # If we can't extract a video ID, return the file anyway (for non-YouTube URLs)
        return fallback_file

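The validation above hinges on the 11-character YouTube video ID captured by group 1. A quick standalone check of the URL shapes that pattern accepts (a sketch using the same regex; the ID is illustrative):

import re

YT_ID = re.compile(
    r'(?:youtube\.com/(?:shorts/|watch\?v=|v/|embed/|[^/]+/.+/|.*[?&]v=)|youtu\.be/)([^"&?/\s]{11})'
)
for u in (
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://youtu.be/dQw4w9WgXcQ",
    "https://www.youtube.com/shorts/dQw4w9WgXcQ",
):
    m = YT_ID.search(u)
    print(u, "->", m.group(1) if m else None)  # all three print dQw4w9WgXcQ
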
    def _get_cookies_from_browser(self) -> Optional[tuple]:
        """Return browser names to try for cookie extraction, in preference order.

        yt-dlp's "cookiesfrombrowser" option takes a single browser spec, so the
        caller uses the first entry for the current platform.

        Note: On macOS, Chrome is more reliable than Safari for cookie extraction.
        """
        # Order browsers by preference.
        # On macOS, Chrome is more reliable than Safari (Safari cookies are harder to access).
        import platform

        system = platform.system().lower()

        if system == "darwin":  # macOS - prioritize Chrome over Safari
            browsers = ("chrome", "safari", "firefox", "edge")
        else:  # Linux, Windows, etc.
            browsers = ("chrome", "firefox", "safari", "edge")

        return browsers

    def _build_ydl_opts(self, output_path: Path, **kwargs) -> Dict:
        """Build yt-dlp options."""
        # Output template
        output_template = str(output_path / "%(title)s [%(id)s].%(ext)s")

        ydl_opts = {
            "outtmpl": output_template,
            "format": self._get_format_selector(
                kwargs.get("quality", self.config.video.quality),
                kwargs.get("format", self.config.video.default_format),
            ),
            "writeinfojson": False,
            "writesubtitles": False,
            "writeautomaticsub": False,
            "no_warnings": not self.verbose,
            "quiet": not self.verbose,
            # Format sorting helps with YouTube SABR streaming issues
            "format_sort": ["res", "ext", "codec", "br", "asr"],
            # Fail loudly rather than silently skipping videos that error
            "ignoreerrors": False,
        }

        # Automatically try to use browser cookies for age-restricted content.
        # yt-dlp's "cookiesfrombrowser" option takes a single
        # (browser, profile, keyring, container) tuple, so use the first
        # preferred browser for this platform.
        cookies_browser = self._get_cookies_from_browser()
        if cookies_browser:
            ydl_opts["cookiesfrombrowser"] = (cookies_browser[0],)
            if self.verbose:
                self.logger.info(
                    f"Attempting to use cookies from browser: {cookies_browser[0]}"
                )

        if self.verbose:
            ydl_opts["verbose"] = True

        return ydl_opts

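For reference, the two cookie-related yt-dlp parameters used by this service take these shapes (per yt-dlp's documented YoutubeDL params; the profile, keyring, and path values below are illustrative):

# A single (browser, profile, keyring, container) tuple; trailing fields are optional.
ydl_opts = {"cookiesfrombrowser": ("chrome",)}
# ydl_opts = {"cookiesfrombrowser": ("firefox", "default", None, "Meta")}  # illustrative

# Or a Netscape-format cookie file, as produced by _refresh_youtube_cookies():
ydl_opts = {"cookiefile": "/tmp/youtube_cookies.txt"}  # path is illustrative
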
    def _get_format_selector(self, quality: str, format: str) -> str:
        """Get format selector for yt-dlp with fallbacks for YouTube issues."""
        if quality == "best":
            # Fallback chain: preferred container -> merged bestvideo+bestaudio -> best available
            return f"best[ext={format}]/bestvideo[ext={format}]+bestaudio/best[ext={format}]/best"
        elif quality == "worst":
            return f"worst[ext={format}]/worst"
        else:
            # Extract numeric part from quality (e.g., "1080p" -> "1080")
            try:
                height = quality.replace("p", "")
                # Fallback chain with height constraint
                return f"best[height<={height}][ext={format}]/bestvideo[height<={height}]+bestaudio/best[height<={height}]/best"
            except Exception:
                # Fall back to a simpler selector if parsing fails
                return f"best[ext={format}]/bestvideo+bestaudio/best"

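Concretely, a 1080p/mp4 request expands to the fallback chain below, which yt-dlp evaluates left to right until one alternative can be satisfied (a sketch; `service` is an assumed instance):

selector = service._get_format_selector("1080p", "mp4")
assert selector == (
    "best[height<=1080][ext=mp4]"
    "/bestvideo[height<=1080]+bestaudio"
    "/best[height<=1080]/best"
)
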
    def _is_nas_path(self, path: Union[str, Path]) -> bool:
        """Check if path is on NAS."""
        path_str = str(path)
        return any(
            nas_indicator in path_str.lower()
            for nas_indicator in [
                "/volumes/",
                "/mnt/",
                "nas",
                "network",
                "smb://",
                "nfs://",
            ]
        )

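Since this is a plain substring heuristic, it has edges worth knowing: any path containing "nas" or "network" anywhere triggers temp-dir processing. Illustrative inputs (`svc` is an assumed instance):

svc._is_nas_path("/Volumes/media/videos")   # True, matches "/volumes/"
svc._is_nas_path("/mnt/share/movies")       # True, matches "/mnt/"
svc._is_nas_path("/home/user/gymnastics")   # True, "nas" occurs inside "gymnastics"
svc._is_nas_path("/home/user/videos")       # False
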
    def _get_temp_processing_dir(self, job_id: int) -> Path:
        """Get temporary processing directory for a job."""
        temp_dir = self.config.video.temp_dir / str(job_id)
        temp_dir.mkdir(parents=True, exist_ok=True)
        return temp_dir

    def _move_file_to_nas(self, source_file: Path, dest_file: Path) -> bool:
        """Move file to NAS destination."""
        try:
            import shutil

            dest_file.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(source_file), str(dest_file))
            return True
        except Exception as e:
            self.logger.error(f"Failed to move file to NAS: {e}")
            return False

    def _cleanup_temp_directory(self, temp_dir: Path):
        """Clean up temporary directory."""
        try:
            import shutil

            shutil.rmtree(temp_dir)
        except Exception as e:
            self.logger.warning(f"Failed to clean up temp directory {temp_dir}: {e}")

    def _extract_video_id_from_url(self, url: str) -> str:
        """Extract video ID from URL."""
        if "youtube.com" in url or "youtu.be" in url:
            if "v=" in url:
                return url.split("v=")[1].split("&")[0]
            elif "youtu.be/" in url:
                return url.split("youtu.be/")[1].split("?")[0]
        return "unknown"

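Note that this splitter is deliberately simpler than the regex in `_validate_fallback_file`: it handles `watch?v=` and `youtu.be/` links but returns "unknown" for `/shorts/` and `/embed/` URLs. Illustrative results (`svc` is an assumed instance):

svc._extract_video_id_from_url("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  # "dQw4w9WgXcQ"
svc._extract_video_id_from_url("https://youtu.be/dQw4w9WgXcQ?t=42")            # "dQw4w9WgXcQ"
svc._extract_video_id_from_url("https://www.youtube.com/shorts/dQw4w9WgXcQ")   # "unknown"
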
    def _get_playlist_progress(self, playlist_id: str) -> Dict[str, int]:
        """Get playlist download progress."""
        try:
            # Get playlist from database
            playlist = self.repos.playlists.get_by_playlist_id(playlist_id)
            if not playlist:
                return {"total": 0, "completed": 0, "failed": 0, "remaining": 0}

            # Get playlist videos
            playlist_videos = self.repos.playlist_videos.get_by_playlist_id(playlist.id)
            total = len(playlist_videos)

            completed = 0
            failed = 0

            for pv in playlist_videos:
                media_file = self.repos.media.get_by_id(pv.media_file_id)
                if media_file and media_file.file_path:
                    file_path = Path(media_file.file_path)
                    if file_path.exists():
                        # Check if it has a transcription
                        if self._check_video_has_transcription(media_file):
                            completed += 1
                        else:
                            failed += 1
                    else:
                        failed += 1
                else:
                    failed += 1

            remaining = total - completed - failed

            return {
                "total": total,
                "completed": completed,
                "failed": failed,
                "remaining": remaining,
            }

        except Exception as e:
            self.logger.error(f"Failed to get playlist progress: {e}")
            return {"total": 0, "completed": 0, "failed": 0, "remaining": 0}

    def _get_failed_videos(self, playlist_id: str) -> List[Dict[str, Any]]:
        """Get failed videos from playlist."""
        try:
            # Get playlist from database
            playlist = self.repos.playlists.get_by_playlist_id(playlist_id)
            if not playlist:
                return []

            # Get playlist videos
            playlist_videos = self.repos.playlist_videos.get_by_playlist_id(playlist.id)
            failed_videos = []

            for pv in playlist_videos:
                media_file = self.repos.media.get_by_id(pv.media_file_id)
                if media_file and media_file.file_path:
                    file_path = Path(media_file.file_path)
                    if not file_path.exists():
                        failed_videos.append(
                            {
                                "position": pv.position,
                                "video_title": pv.video_title or "Unknown",
                                "reason": "File missing",
                            }
                        )
                    elif not self._check_video_has_transcription(media_file):
                        failed_videos.append(
                            {
                                "position": pv.position,
                                "video_title": pv.video_title or "Unknown",
                                "reason": "No transcription",
                            }
                        )
                else:
                    failed_videos.append(
                        {
                            "position": pv.position,
                            "video_title": pv.video_title or "Unknown",
                            "reason": "Media file not found",
                        }
                    )

            return failed_videos

        except Exception as e:
            self.logger.error(f"Failed to get failed videos: {e}")
            return []

    def _check_video_has_transcription(self, media_file) -> bool:
        """Check if video has transcription."""
        try:
            if not media_file or not media_file.file_path:
                return False

            file_path = Path(media_file.file_path)
            if not file_path.exists():
                return False

            # Check for transcription sidecar files
            base_name = file_path.stem
            transcription_files = [
                file_path.parent / f"{base_name}.srt",
                file_path.parent / f"{base_name}.vtt",
                file_path.parent / f"{base_name}.json",
            ]

            return any(f.exists() for f in transcription_files)

        except Exception as e:
            self.logger.error(f"Failed to check transcription: {e}")
            return False

    def download_playlist_with_transcription(
        self,
        url: str,
        output_path: Optional[Union[str, Path]] = None,
        continue_download: bool = True,
        **kwargs,
    ) -> Dict[str, Any]:
        """Download playlist with transcription support."""
        try:
            # This method would integrate with PlaylistService
            # For now, return a placeholder implementation
            from modules.video.services.playlist_service import PlaylistService

            playlist_service = PlaylistService(
                self.config, verbose=self.verbose, db_service=self.db_factory
            )
            result = playlist_service.download_playlist(url, output_path, **kwargs)

            # Add transcription logic here if needed
            return result

        except Exception as e:
            self.logger.error(f"Playlist download with transcription failed: {e}")
            return {
                "success": False,
                "message": f"Playlist download failed: {e}",
                "errors": [str(e)],
            }

    def _check_existing_video(self, file_path: Path, url: str) -> Dict[str, Any]:
        """Check if video file exists and has subtitles."""
        result = {
            "exists": False,
            "has_subtitles": False,
            "should_overwrite": True,
            "reason": "",
        }

        if not file_path.exists():
            result["reason"] = f"File {file_path} does not exist"
            return result

        result["exists"] = True

        # Check for subtitles
        has_subtitles = self._has_whisper_subtitles(file_path)
        result["has_subtitles"] = has_subtitles

        if has_subtitles:
            result["should_overwrite"] = False
            result["reason"] = f"File {file_path} exists with WhisperAI subtitles"
        else:
            result["should_overwrite"] = True
            result["reason"] = f"File {file_path} exists without subtitles"

        return result

    def _has_whisper_subtitles(self, file_path: Path) -> bool:
        """Check if video file has Whisper subtitles."""
        try:
            # Use ffprobe to check for subtitle tracks
            cmd = [
                "ffprobe",
                "-v",
                "quiet",
                "-print_format",
                "json",
                "-show_streams",
                "-show_format",
                str(file_path),
            ]

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)

            if result.returncode != 0:
                return False

            import json

            data = json.loads(result.stdout)

            # Check for subtitle streams
            for stream in data.get("streams", []):
                if stream.get("codec_type") == "subtitle":
                    # Check if it's a Whisper subtitle
                    title = stream.get("tags", {}).get("title", "")
                    if "whisper" in title.lower() or "whisperai" in title.lower():
                        return True

            return False

        except Exception as e:
            self.logger.warning(f"Error checking subtitles for {file_path}: {e}")
            return False
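
For reference, the ffprobe JSON being parsed has roughly the shape below (a trimmed sketch; stream fields vary by container, and the subtitle title tag is illustrative, since the check above only requires "whisper" to appear in it):

# Trimmed shape of `ffprobe -print_format json -show_streams -show_format`
# output that the loop above inspects; values are illustrative.
data = {
    "streams": [
        {"index": 0, "codec_type": "video", "codec_name": "h264"},
        {"index": 1, "codec_type": "audio", "codec_name": "aac"},
        {
            "index": 2,
            "codec_type": "subtitle",
            "codec_name": "mov_text",
            "tags": {"title": "WhisperAI en"},
        },
    ],
    "format": {"filename": "example.mp4"},
}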