spatelier 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analytics/__init__.py +1 -0
- analytics/reporter.py +497 -0
- cli/__init__.py +1 -0
- cli/app.py +147 -0
- cli/audio.py +129 -0
- cli/cli_analytics.py +320 -0
- cli/cli_utils.py +282 -0
- cli/error_handlers.py +122 -0
- cli/files.py +299 -0
- cli/update.py +325 -0
- cli/video.py +823 -0
- cli/worker.py +615 -0
- core/__init__.py +1 -0
- core/analytics_dashboard.py +368 -0
- core/base.py +303 -0
- core/base_service.py +69 -0
- core/config.py +345 -0
- core/database_service.py +116 -0
- core/decorators.py +263 -0
- core/error_handler.py +210 -0
- core/file_tracker.py +254 -0
- core/interactive_cli.py +366 -0
- core/interfaces.py +166 -0
- core/job_queue.py +437 -0
- core/logger.py +79 -0
- core/package_updater.py +469 -0
- core/progress.py +228 -0
- core/service_factory.py +295 -0
- core/streaming.py +299 -0
- core/worker.py +765 -0
- database/__init__.py +1 -0
- database/connection.py +265 -0
- database/metadata.py +516 -0
- database/models.py +288 -0
- database/repository.py +592 -0
- database/transcription_storage.py +219 -0
- modules/__init__.py +1 -0
- modules/audio/__init__.py +5 -0
- modules/audio/converter.py +197 -0
- modules/video/__init__.py +16 -0
- modules/video/converter.py +191 -0
- modules/video/fallback_extractor.py +334 -0
- modules/video/services/__init__.py +18 -0
- modules/video/services/audio_extraction_service.py +274 -0
- modules/video/services/download_service.py +852 -0
- modules/video/services/metadata_service.py +190 -0
- modules/video/services/playlist_service.py +445 -0
- modules/video/services/transcription_service.py +491 -0
- modules/video/transcription_service.py +385 -0
- modules/video/youtube_api.py +397 -0
- spatelier/__init__.py +33 -0
- spatelier-0.3.0.dist-info/METADATA +260 -0
- spatelier-0.3.0.dist-info/RECORD +59 -0
- spatelier-0.3.0.dist-info/WHEEL +5 -0
- spatelier-0.3.0.dist-info/entry_points.txt +2 -0
- spatelier-0.3.0.dist-info/licenses/LICENSE +21 -0
- spatelier-0.3.0.dist-info/top_level.txt +7 -0
- utils/__init__.py +1 -0
- utils/helpers.py +250 -0
database/metadata.py
ADDED
@@ -0,0 +1,516 @@
"""
Metadata extraction and management.

This module provides functionality for extracting and managing video metadata,
especially from YouTube and other platforms.
"""

import json
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import ffmpeg

from core.config import Config
from core.logger import get_logger
from database.models import DownloadSource, MediaFile
from database.repository import MediaFileRepository


class MetadataExtractor:
    """
    Metadata extractor for various media types and platforms.

    Supports YouTube, Vimeo, and other platforms using yt-dlp for metadata extraction.
    """

    def __init__(self, config: Config, verbose: bool = False):
        """
        Initialize metadata extractor.

        Args:
            config: Configuration instance
            verbose: Enable verbose logging
        """
        self.config = config
        self.verbose = verbose
        self.logger = get_logger("MetadataExtractor", verbose=verbose)

    def extract_youtube_metadata(self, url: str) -> Dict[str, Any]:
        """
        Extract metadata from YouTube URL using yt-dlp.

        Args:
            url: YouTube URL

        Returns:
            Dictionary with extracted metadata
        """
        try:
            self.logger.info(f"Extracting YouTube metadata from: {url}")

            # Use the yt-dlp Python package to get metadata without downloading
            import yt_dlp

            ydl_opts = {
                "quiet": True,
                "noplaylist": True,  # yt-dlp's option key is "noplaylist", not "no_playlist"
            }

            # Automatically try to use browser cookies for age-restricted content.
            # yt-dlp's "cookiesfrombrowser" option takes a single
            # (browser, profile, keyring, container) specification rather than a
            # list of fallbacks, so only the preferred browser is passed below.
            # On macOS, Chrome is more reliable than Safari for cookie extraction.
            import platform

            system = platform.system().lower()
            if system == "darwin":  # macOS - prioritize Chrome over Safari
                browsers = ("chrome", "safari", "firefox", "edge")
            else:
                browsers = ("chrome", "firefox", "safari", "edge")
            ydl_opts["cookiesfrombrowser"] = (browsers[0],)
            if self.verbose:
                self.logger.info(f"Attempting to use cookies from browser: {browsers[0]}")

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                metadata = ydl.extract_info(url, download=False)
                return self._parse_youtube_metadata(metadata)
        except Exception as e:
            error_msg = str(e)
            # Check whether this is a cookie/authentication error
            if any(
                keyword in error_msg.lower()
                for keyword in ["sign in", "age", "cookies", "authentication"]
            ):
                self.logger.warning(
                    "Metadata extraction failed due to authentication - attempting to refresh cookies..."
                )
                # Try to refresh cookies and get a cookie file
                cookie_file = self._refresh_youtube_cookies()
                if cookie_file:
                    self.logger.info(
                        "Retrying metadata extraction with refreshed cookies..."
                    )
                    # Retry the extraction with the cookie file
                    try:
                        # Use the cookie file instead of browser cookies;
                        # yt-dlp's key for a cookie file is "cookiefile"
                        ydl_opts["cookiefile"] = cookie_file
                        ydl_opts.pop("cookiesfrombrowser", None)

                        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                            metadata = ydl.extract_info(url, download=False)

                        # Clean up the temporary cookie file
                        import os

                        try:
                            os.unlink(cookie_file)
                        except OSError:
                            pass

                        return self._parse_youtube_metadata(metadata)
                    except Exception as retry_error:
                        # Clean up the temporary cookie file
                        import os

                        try:
                            os.unlink(cookie_file)
                        except OSError:
                            pass
                        self.logger.error(
                            f"Metadata extraction failed after cookie refresh: {retry_error}"
                        )
                        return {}
                else:
                    self.logger.error(f"Metadata extraction failed: {e}")
                    return {}
            else:
                self.logger.error(f"Metadata extraction failed: {e}")
                return {}

    def _refresh_youtube_cookies(self) -> Optional[str]:
        """Refresh YouTube cookies by visiting YouTube and extracting fresh cookies.

        Uses Playwright to launch Chrome with the user's profile, visit YouTube,
        extract the cookies, and save them to a temporary file for yt-dlp to use.

        Returns:
            Path to cookie file if successful, None otherwise
        """
        try:
            import os
            import platform
            import tempfile

            from playwright.sync_api import sync_playwright

            system = platform.system().lower()
            if system != "darwin":
                # Only implemented for macOS for now
                return None

            # Get Chrome user data directory
            chrome_user_data = os.path.expanduser(
                "~/Library/Application Support/Google/Chrome"
            )

            if not os.path.exists(chrome_user_data):
                return None

            self.logger.info(
                "Refreshing YouTube cookies by visiting YouTube in Chrome..."
            )

            with sync_playwright() as p:
                # Launch Chrome with the user's profile
                browser = p.chromium.launch_persistent_context(
                    user_data_dir=chrome_user_data,
                    headless=True,
                    args=["--disable-blink-features=AutomationControlled"],
                )

                # Visit YouTube to refresh the session
                page = browser.new_page()
                page.goto(
                    "https://www.youtube.com", wait_until="networkidle", timeout=15000
                )
                # Wait a moment for cookies to be set
                page.wait_for_timeout(3000)

                # Extract cookies from the browser context
                cookies = browser.cookies()
                browser.close()

                # Filter for YouTube cookies only
                youtube_cookies = [
                    c
                    for c in cookies
                    if "youtube.com" in c.get("domain", "")
                    or ".youtube.com" in c.get("domain", "")
                ]

                if not youtube_cookies:
                    self.logger.warning("No YouTube cookies found after refresh")
                    return None

                # Save cookies to a Netscape-format file for yt-dlp
                cookie_file = tempfile.NamedTemporaryFile(
                    mode="w", suffix=".txt", delete=False
                )
                cookie_file.write("# Netscape HTTP Cookie File\n")
                cookie_file.write("# This file was generated by spatelier\n\n")

                for cookie in youtube_cookies:
                    domain = cookie.get("domain", "")
                    domain_flag = "TRUE" if domain.startswith(".") else "FALSE"
                    path = cookie.get("path", "/")
                    secure = "TRUE" if cookie.get("secure", False) else "FALSE"
                    expires = str(int(cookie.get("expires", 0)))
                    name = cookie.get("name", "")
                    value = cookie.get("value", "")

                    cookie_file.write(
                        f"{domain}\t{domain_flag}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n"
                    )

                cookie_file.close()
                self.logger.info(
                    f"YouTube cookies refreshed and saved to: {cookie_file.name}"
                )
                return cookie_file.name

        except Exception as e:
            self.logger.warning(f"Failed to refresh cookies automatically: {e}")
            return None

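    # For reference, each Netscape-format line written above has seven
    # tab-separated fields: domain, include-subdomains flag, path, secure
    # flag, expiry (epoch seconds), name, value - e.g. (tabs shown as spaces):
    # .youtube.com  TRUE  /  TRUE  1735689600  PREF  f6=40000000
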
    def extract_file_metadata(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Extract metadata from local media file using ffprobe.

        Args:
            file_path: Path to media file

        Returns:
            Dictionary with extracted metadata
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return {}

            self.logger.info(f"Extracting file metadata from: {file_path}")

            # Use ffmpeg-python to get technical metadata
            probe_data = ffmpeg.probe(str(file_path))
            return self._parse_ffprobe_metadata(probe_data)

        except ffmpeg.Error as e:
            self.logger.error(f"ffmpeg probe failed: {e}")
            return {}
        except Exception as e:
            self.logger.error(f"File metadata extraction failed: {e}")
            return {}

    def _parse_youtube_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse YouTube metadata from yt-dlp output.

        Args:
            metadata: Raw metadata from yt-dlp

        Returns:
            Parsed metadata dictionary
        """
        parsed = {}

        # Basic information
        parsed["title"] = metadata.get("title", "")
        parsed["description"] = metadata.get("description", "")
        parsed["uploader"] = metadata.get("uploader", "")
        parsed["uploader_id"] = metadata.get("uploader_id", "")
        parsed["source_url"] = metadata.get("webpage_url", "")
        parsed["source_platform"] = "youtube"
        parsed["source_id"] = metadata.get("id", "")

        # Dates
        if metadata.get("upload_date"):
            try:
                upload_date = datetime.strptime(metadata["upload_date"], "%Y%m%d")
                parsed["upload_date"] = upload_date
            except ValueError:
                pass

        # Statistics
        parsed["view_count"] = metadata.get("view_count")
        parsed["like_count"] = metadata.get("like_count")
        parsed["dislike_count"] = metadata.get("dislike_count")
        parsed["comment_count"] = metadata.get("comment_count")

        # Technical information
        parsed["duration"] = metadata.get("duration")
        parsed["age_limit"] = metadata.get("age_limit")
        parsed["language"] = metadata.get("language")

        # Tags and categories
        if metadata.get("tags"):
            parsed["tags"] = json.dumps(metadata["tags"])

        if metadata.get("categories"):
            parsed["categories"] = json.dumps(metadata["categories"])

        # Thumbnails
        if metadata.get("thumbnail"):
            parsed["thumbnail_url"] = metadata["thumbnail"]

        # Video streams information
        if metadata.get("formats"):
            video_streams = [
                f for f in metadata["formats"] if f.get("vcodec") != "none"
            ]
            if video_streams:
                # Get best quality stream info
                best_stream = max(video_streams, key=lambda x: x.get("height", 0) or 0)
                parsed["width"] = best_stream.get("width")
                parsed["height"] = best_stream.get("height")
                parsed["fps"] = best_stream.get("fps")
                parsed["video_codec"] = best_stream.get("vcodec")
                parsed["audio_codec"] = best_stream.get("acodec")
                parsed["bitrate"] = best_stream.get("tbr")

        return parsed

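    # Note: yt-dlp's info dict for a single video exposes keys such as
    # "title", "upload_date" (a "YYYYMMDD" string), "duration" (seconds),
    # "webpage_url" and "formats" (a list of per-format dicts with "vcodec",
    # "acodec", "width", "height", "fps" and "tbr"), which is what the
    # parser above relies on.
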
    def _parse_ffprobe_metadata(self, probe_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse ffprobe metadata from media file.

        Args:
            probe_data: Raw ffprobe output

        Returns:
            Parsed metadata dictionary
        """
        parsed = {}

        # Format information
        format_info = probe_data.get("format", {})
        parsed["duration"] = float(format_info.get("duration", 0))
        parsed["bitrate"] = int(format_info.get("bit_rate", 0))

        # Stream information
        streams = probe_data.get("streams", [])
        video_streams = [s for s in streams if s.get("codec_type") == "video"]
        audio_streams = [s for s in streams if s.get("codec_type") == "audio"]

        # Video stream info
        if video_streams:
            video_stream = video_streams[0]  # Primary video stream
            parsed["width"] = video_stream.get("width")
            parsed["height"] = video_stream.get("height")
            parsed["fps"] = self._parse_fps(video_stream.get("r_frame_rate", ""))
            parsed["video_codec"] = video_stream.get("codec_name")
            parsed["aspect_ratio"] = video_stream.get("display_aspect_ratio")
            parsed["color_space"] = video_stream.get("color_space")

        # Audio stream info
        if audio_streams:
            audio_stream = audio_streams[0]  # Primary audio stream
            parsed["audio_codec"] = audio_stream.get("codec_name")
            parsed["sample_rate"] = audio_stream.get("sample_rate")
            parsed["channels"] = audio_stream.get("channels")

        return parsed

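    # Note: ffmpeg.probe() returns ffprobe's JSON output as a dict, roughly:
    # {"format": {"duration": "12.34", "bit_rate": "128000"},
    #  "streams": [{"codec_type": "video", "width": 1920, "height": 1080,
    #               "r_frame_rate": "30000/1001", ...}]}
    # Numeric fields under "format" arrive as strings, hence the
    # float()/int() conversions above.
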
    def _parse_fps(self, fps_string: str) -> Optional[float]:
        """Parse FPS from fraction string like '30/1'."""
        try:
            if "/" in fps_string:
                numerator, denominator = fps_string.split("/")
                return float(numerator) / float(denominator)
            else:
                return float(fps_string)
        except (ValueError, ZeroDivisionError):
            return None

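    # e.g. _parse_fps("30000/1001") -> about 29.97, _parse_fps("30") -> 30.0,
    # and _parse_fps("0/0") -> None (the ZeroDivisionError is caught).
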
    def update_media_file_metadata(
        self,
        media_file: MediaFile,
        metadata: Dict[str, Any],
        repository: MediaFileRepository,
    ) -> MediaFile:
        """
        Update media file with extracted metadata.

        Args:
            media_file: MediaFile instance to update
            metadata: Extracted metadata dictionary
            repository: MediaFileRepository instance

        Returns:
            Updated MediaFile instance
        """
        try:
            # Update fields that exist in metadata
            for field, value in metadata.items():
                if hasattr(media_file, field) and value is not None:
                    setattr(media_file, field, value)

            # Commit changes
            repository.session.commit()
            repository.session.refresh(media_file)

            self.logger.info(f"Updated metadata for media file: {media_file.id}")
            return media_file

        except Exception as e:
            self.logger.error(f"Failed to update media file metadata: {e}")
            repository.session.rollback()
            raise


class MetadataManager:
    """
    High-level metadata management.

    Provides convenient methods for metadata extraction and storage.
    """

    def __init__(self, config: Config, verbose: bool = False):
        """
        Initialize metadata manager.

        Args:
            config: Configuration instance
            verbose: Enable verbose logging
        """
        self.config = config
        self.verbose = verbose
        self.logger = get_logger("MetadataManager", verbose=verbose)

        self.extractor = MetadataExtractor(config, verbose=verbose)

    def enrich_media_file(
        self,
        media_file: MediaFile,
        repository: MediaFileRepository,
        extract_source_metadata: bool = True,
    ) -> MediaFile:
        """
        Enrich media file with metadata from various sources.

        Args:
            media_file: MediaFile to enrich
            repository: MediaFileRepository instance
            extract_source_metadata: Whether to extract source metadata (YouTube, etc.)

        Returns:
            Enriched MediaFile instance
        """
        try:
            # Extract file metadata
            file_metadata = self.extractor.extract_file_metadata(media_file.file_path)
            self.extractor.update_media_file_metadata(
                media_file, file_metadata, repository
            )

            # Extract source metadata if available
            if extract_source_metadata and media_file.source_url:
                if (
                    "youtube.com" in media_file.source_url
                    or "youtu.be" in media_file.source_url
                ):
                    youtube_metadata = self.extractor.extract_youtube_metadata(
                        media_file.source_url
                    )
                    self.extractor.update_media_file_metadata(
                        media_file, youtube_metadata, repository
                    )

            self.logger.info(f"Enriched media file {media_file.id} with metadata")
            return media_file

        except Exception as e:
            self.logger.error(f"Failed to enrich media file: {e}")
            return media_file

    def batch_enrich_media_files(
        self,
        repository: MediaFileRepository,
        limit: int = 100,
        media_type: Optional[str] = None,
    ) -> List[MediaFile]:
        """
        Batch enrich multiple media files with metadata.

        Args:
            repository: MediaFileRepository instance
            limit: Maximum number of files to process
            media_type: Filter by media type

        Returns:
            List of enriched MediaFile instances
        """
        try:
            # Get media files to enrich
            query = repository.session.query(MediaFile)
            if media_type:
                query = query.filter(MediaFile.media_type == media_type)

            media_files = query.limit(limit).all()

            enriched_files = []
            for media_file in media_files:
                try:
                    enriched_file = self.enrich_media_file(media_file, repository)
                    enriched_files.append(enriched_file)
                except Exception as e:
                    self.logger.error(f"Failed to enrich file {media_file.id}: {e}")
                    continue

            self.logger.info(f"Batch enriched {len(enriched_files)} media files")
            return enriched_files

        except Exception as e:
            self.logger.error(f"Batch enrichment failed: {e}")
            return []
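
For orientation, here is a minimal usage sketch of the classes above. It is not part of the package: it assumes Config() can be constructed with no arguments, and the URL and file path are placeholders.

    # Hypothetical usage sketch - Config() construction is an assumption,
    # and the URL/path below are placeholders, not real inputs.
    from core.config import Config
    from database.metadata import MetadataExtractor

    extractor = MetadataExtractor(Config(), verbose=True)

    # Fetch YouTube metadata without downloading the video
    info = extractor.extract_youtube_metadata("https://www.youtube.com/watch?v=VIDEO_ID")
    print(info.get("title"), info.get("duration"))

    # Probe a local file with ffprobe
    file_info = extractor.extract_file_metadata("/path/to/video.mp4")
    print(file_info.get("width"), file_info.get("height"), file_info.get("fps"))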