spatelier-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. analytics/__init__.py +1 -0
  2. analytics/reporter.py +497 -0
  3. cli/__init__.py +1 -0
  4. cli/app.py +147 -0
  5. cli/audio.py +129 -0
  6. cli/cli_analytics.py +320 -0
  7. cli/cli_utils.py +282 -0
  8. cli/error_handlers.py +122 -0
  9. cli/files.py +299 -0
  10. cli/update.py +325 -0
  11. cli/video.py +823 -0
  12. cli/worker.py +615 -0
  13. core/__init__.py +1 -0
  14. core/analytics_dashboard.py +368 -0
  15. core/base.py +303 -0
  16. core/base_service.py +69 -0
  17. core/config.py +345 -0
  18. core/database_service.py +116 -0
  19. core/decorators.py +263 -0
  20. core/error_handler.py +210 -0
  21. core/file_tracker.py +254 -0
  22. core/interactive_cli.py +366 -0
  23. core/interfaces.py +166 -0
  24. core/job_queue.py +437 -0
  25. core/logger.py +79 -0
  26. core/package_updater.py +469 -0
  27. core/progress.py +228 -0
  28. core/service_factory.py +295 -0
  29. core/streaming.py +299 -0
  30. core/worker.py +765 -0
  31. database/__init__.py +1 -0
  32. database/connection.py +265 -0
  33. database/metadata.py +516 -0
  34. database/models.py +288 -0
  35. database/repository.py +592 -0
  36. database/transcription_storage.py +219 -0
  37. modules/__init__.py +1 -0
  38. modules/audio/__init__.py +5 -0
  39. modules/audio/converter.py +197 -0
  40. modules/video/__init__.py +16 -0
  41. modules/video/converter.py +191 -0
  42. modules/video/fallback_extractor.py +334 -0
  43. modules/video/services/__init__.py +18 -0
  44. modules/video/services/audio_extraction_service.py +274 -0
  45. modules/video/services/download_service.py +852 -0
  46. modules/video/services/metadata_service.py +190 -0
  47. modules/video/services/playlist_service.py +445 -0
  48. modules/video/services/transcription_service.py +491 -0
  49. modules/video/transcription_service.py +385 -0
  50. modules/video/youtube_api.py +397 -0
  51. spatelier/__init__.py +33 -0
  52. spatelier-0.3.0.dist-info/METADATA +260 -0
  53. spatelier-0.3.0.dist-info/RECORD +59 -0
  54. spatelier-0.3.0.dist-info/WHEEL +5 -0
  55. spatelier-0.3.0.dist-info/entry_points.txt +2 -0
  56. spatelier-0.3.0.dist-info/licenses/LICENSE +21 -0
  57. spatelier-0.3.0.dist-info/top_level.txt +7 -0
  58. utils/__init__.py +1 -0
  59. utils/helpers.py +250 -0
database/metadata.py ADDED
@@ -0,0 +1,516 @@
1
+ """
2
+ Metadata extraction and management.
3
+
4
+ This module provides functionality for extracting and managing video metadata,
5
+ especially from YouTube and other platforms.
6
+ """
7
+
8
+ import json
9
+ import subprocess
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Any, Dict, List, Optional, Union
13
+
14
+ import ffmpeg
15
+
16
+ from core.config import Config
17
+ from core.logger import get_logger
18
+ from database.models import DownloadSource, MediaFile
19
+ from database.repository import MediaFileRepository
20
+
21
+
22
class MetadataExtractor:
    """
    Metadata extractor for various media types and platforms.

    Uses yt-dlp for remote (YouTube) metadata extraction and ffprobe
    (via ffmpeg-python) for local media files.
    """

    # Substrings that identify an authentication / age-restriction failure
    # in a yt-dlp error message.
    _AUTH_ERROR_KEYWORDS = ("sign in", "age", "cookies", "authentication")

    def __init__(self, config: Config, verbose: bool = False):
        """
        Initialize metadata extractor.

        Args:
            config: Configuration instance
            verbose: Enable verbose logging
        """
        self.config = config
        self.verbose = verbose
        self.logger = get_logger("MetadataExtractor", verbose=verbose)

    def extract_youtube_metadata(self, url: str) -> Dict[str, Any]:
        """
        Extract metadata from a YouTube URL using yt-dlp, without downloading.

        On an authentication/age-gate failure, one retry is attempted after
        refreshing browser cookies via ``_refresh_youtube_cookies``.

        Args:
            url: YouTube URL

        Returns:
            Dictionary with extracted metadata; empty dict on failure.
        """
        import yt_dlp

        self.logger.info(f"Extracting YouTube metadata from: {url}")

        ydl_opts: Dict[str, Any] = {
            "quiet": True,
            # BUGFIX: the yt-dlp Python API spells this option "noplaylist";
            # the previous "no_playlist" key was silently ignored, so playlist
            # URLs were fully expanded during metadata extraction.
            "noplaylist": True,
        }

        # BUGFIX: yt-dlp's "cookiesfrombrowser" option takes a single
        # (browser, profile, keyring, container) tuple — NOT a tuple of
        # candidate browsers. Passing four browser names would have been
        # misinterpreted as browser/profile/keyring/container. Chrome is
        # the most reliable cookie source on every platform we target
        # (notably more reliable than Safari on macOS), so use it directly.
        ydl_opts["cookiesfrombrowser"] = ("chrome",)
        if self.verbose:
            self.logger.info("Attempting to use cookies from browser: chrome")

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                metadata = ydl.extract_info(url, download=False)
            return self._parse_youtube_metadata(metadata)
        except Exception as e:
            if self._is_auth_error(str(e)):
                self.logger.warning(
                    "Metadata extraction failed due to authentication - attempting to refresh cookies..."
                )
                return self._retry_with_refreshed_cookies(url, ydl_opts, e)
            self.logger.error(f"Metadata extraction failed: {e}")
            return {}

    def _is_auth_error(self, error_msg: str) -> bool:
        """Return True if *error_msg* looks like an authentication/age-gate failure."""
        lowered = error_msg.lower()
        return any(keyword in lowered for keyword in self._AUTH_ERROR_KEYWORDS)

    def _retry_with_refreshed_cookies(
        self, url: str, ydl_opts: Dict[str, Any], original_error: Exception
    ) -> Dict[str, Any]:
        """
        Retry YouTube metadata extraction once with freshly exported cookies.

        Args:
            url: YouTube URL
            ydl_opts: Base yt-dlp options (copied, not mutated)
            original_error: Error that triggered the retry, used for logging
                when no cookie file could be produced

        Returns:
            Parsed metadata dictionary; empty dict on failure.
        """
        import os

        import yt_dlp

        cookie_file = self._refresh_youtube_cookies()
        if not cookie_file:
            self.logger.error(f"Metadata extraction failed: {original_error}")
            return {}

        self.logger.info("Retrying metadata extraction with refreshed cookies...")

        # Switch from live browser extraction to the exported cookie file.
        # BUGFIX: the yt-dlp Python API option is "cookiefile", not "cookies".
        retry_opts = dict(ydl_opts)
        retry_opts.pop("cookiesfrombrowser", None)
        retry_opts["cookiefile"] = cookie_file

        try:
            with yt_dlp.YoutubeDL(retry_opts) as ydl:
                metadata = ydl.extract_info(url, download=False)
            return self._parse_youtube_metadata(metadata)
        except Exception as retry_error:
            self.logger.error(
                f"Metadata extraction failed after cookie refresh: {retry_error}"
            )
            return {}
        finally:
            # Always remove the temporary cookie file. Only OS-level errors
            # are ignored (the previous bare `except:` could hide real bugs),
            # and `finally` replaces the duplicated cleanup blocks.
            try:
                os.unlink(cookie_file)
            except OSError:
                pass

    def _refresh_youtube_cookies(self) -> Optional[str]:
        """Refresh YouTube cookies by visiting YouTube and extracting fresh cookies.

        Uses Playwright to launch Chrome with the user's profile, visit YouTube,
        extract the cookies, and save them to a temporary Netscape-format file
        that yt-dlp can consume via its "cookiefile" option.

        Returns:
            Path to cookie file if successful, None otherwise
        """
        try:
            import os
            import platform
            import tempfile

            from playwright.sync_api import sync_playwright

            if platform.system().lower() != "darwin":
                # Only implemented for macOS for now
                return None

            # Chrome profile directory holds the logged-in YouTube session.
            chrome_user_data = os.path.expanduser(
                "~/Library/Application Support/Google/Chrome"
            )
            if not os.path.exists(chrome_user_data):
                return None

            self.logger.info(
                "Refreshing YouTube cookies by visiting YouTube in Chrome..."
            )

            with sync_playwright() as p:
                # Launch Chrome with the user's profile so their existing
                # session (and auth cookies) are available to the page.
                browser = p.chromium.launch_persistent_context(
                    user_data_dir=chrome_user_data,
                    headless=True,
                    args=["--disable-blink-features=AutomationControlled"],
                )

                # Visit YouTube to refresh the session.
                page = browser.new_page()
                page.goto(
                    "https://www.youtube.com", wait_until="networkidle", timeout=15000
                )
                # Give the page a moment to finish setting cookies.
                page.wait_for_timeout(3000)

                cookies = browser.cookies()
                browser.close()

            # Keep only cookies scoped to youtube.com (a ".youtube.com"
            # domain already contains "youtube.com", so one test suffices).
            youtube_cookies = [
                c for c in cookies if "youtube.com" in c.get("domain", "")
            ]

            if not youtube_cookies:
                self.logger.warning("No YouTube cookies found after refresh")
                return None

            # Save cookies to a Netscape-format file for yt-dlp; the file is
            # closed via the context manager and deleted by the caller.
            cookie_file = tempfile.NamedTemporaryFile(
                mode="w", suffix=".txt", delete=False
            )
            with cookie_file:
                cookie_file.write("# Netscape HTTP Cookie File\n")
                cookie_file.write("# This file was generated by spatelier\n\n")

                for cookie in youtube_cookies:
                    domain = cookie.get("domain", "")
                    # Leading-dot domains apply to subdomains (Netscape flag).
                    domain_flag = "TRUE" if domain.startswith(".") else "FALSE"
                    path = cookie.get("path", "/")
                    secure = "TRUE" if cookie.get("secure", False) else "FALSE"
                    expires = str(int(cookie.get("expires", 0)))
                    name = cookie.get("name", "")
                    value = cookie.get("value", "")

                    cookie_file.write(
                        f"{domain}\t{domain_flag}\t{path}\t{secure}\t{expires}\t{name}\t{value}\n"
                    )

            self.logger.info(
                f"YouTube cookies refreshed and saved to: {cookie_file.name}"
            )
            return cookie_file.name

        except Exception as e:
            # Best-effort: cookie refresh failure degrades to "no cookies".
            self.logger.warning(f"Failed to refresh cookies automatically: {e}")
            return None

    def extract_file_metadata(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Extract metadata from a local media file using ffprobe.

        Args:
            file_path: Path to media file

        Returns:
            Dictionary with extracted metadata; empty dict if the file is
            missing or probing fails.
        """
        try:
            file_path = Path(file_path)
            if not file_path.exists():
                return {}

            self.logger.info(f"Extracting file metadata from: {file_path}")

            # ffmpeg-python shells out to ffprobe and returns parsed JSON.
            probe_data = ffmpeg.probe(str(file_path))
            return self._parse_ffprobe_metadata(probe_data)

        except ffmpeg.Error as e:
            self.logger.error(f"ffmpeg probe failed: {e}")
            return {}
        except Exception as e:
            self.logger.error(f"File metadata extraction failed: {e}")
            return {}

    def _parse_youtube_metadata(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse YouTube metadata from yt-dlp output.

        Args:
            metadata: Raw metadata from yt-dlp

        Returns:
            Parsed metadata dictionary keyed by MediaFile-style field names
        """
        parsed: Dict[str, Any] = {}

        # Basic information
        parsed["title"] = metadata.get("title", "")
        parsed["description"] = metadata.get("description", "")
        parsed["uploader"] = metadata.get("uploader", "")
        parsed["uploader_id"] = metadata.get("uploader_id", "")
        parsed["source_url"] = metadata.get("webpage_url", "")
        parsed["source_platform"] = "youtube"
        parsed["source_id"] = metadata.get("id", "")

        # Dates — yt-dlp reports upload_date as a YYYYMMDD string.
        if metadata.get("upload_date"):
            try:
                upload_date = datetime.strptime(metadata["upload_date"], "%Y%m%d")
                parsed["upload_date"] = upload_date
            except ValueError:
                # Malformed date: leave the field unset rather than fail.
                pass

        # Statistics (may be None when hidden/unavailable)
        parsed["view_count"] = metadata.get("view_count")
        parsed["like_count"] = metadata.get("like_count")
        parsed["dislike_count"] = metadata.get("dislike_count")
        parsed["comment_count"] = metadata.get("comment_count")

        # Technical information
        parsed["duration"] = metadata.get("duration")
        parsed["age_limit"] = metadata.get("age_limit")
        parsed["language"] = metadata.get("language")

        # Tags and categories stored as JSON strings
        if metadata.get("tags"):
            parsed["tags"] = json.dumps(metadata["tags"])

        if metadata.get("categories"):
            parsed["categories"] = json.dumps(metadata["categories"])

        # Thumbnails
        if metadata.get("thumbnail"):
            parsed["thumbnail_url"] = metadata["thumbnail"]

        # Video stream information: pick the highest-resolution video format.
        if metadata.get("formats"):
            video_streams = [
                f for f in metadata["formats"] if f.get("vcodec") != "none"
            ]
            if video_streams:
                # `height` may be None, so coerce to 0 for comparison.
                best_stream = max(video_streams, key=lambda x: x.get("height", 0) or 0)
                parsed["width"] = best_stream.get("width")
                parsed["height"] = best_stream.get("height")
                parsed["fps"] = best_stream.get("fps")
                parsed["video_codec"] = best_stream.get("vcodec")
                parsed["audio_codec"] = best_stream.get("acodec")
                parsed["bitrate"] = best_stream.get("tbr")

        return parsed

    def _parse_ffprobe_metadata(self, probe_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse ffprobe metadata from a media file probe.

        Args:
            probe_data: Raw ffprobe output (parsed JSON)

        Returns:
            Parsed metadata dictionary
        """
        parsed: Dict[str, Any] = {}

        # Container-level information
        format_info = probe_data.get("format", {})
        parsed["duration"] = float(format_info.get("duration", 0))
        parsed["bitrate"] = int(format_info.get("bit_rate", 0))

        # Split streams by type; only the first of each kind is used.
        streams = probe_data.get("streams", [])
        video_streams = [s for s in streams if s.get("codec_type") == "video"]
        audio_streams = [s for s in streams if s.get("codec_type") == "audio"]

        # Primary video stream info
        if video_streams:
            video_stream = video_streams[0]
            parsed["width"] = video_stream.get("width")
            parsed["height"] = video_stream.get("height")
            parsed["fps"] = self._parse_fps(video_stream.get("r_frame_rate", ""))
            parsed["video_codec"] = video_stream.get("codec_name")
            parsed["aspect_ratio"] = video_stream.get("display_aspect_ratio")
            parsed["color_space"] = video_stream.get("color_space")

        # Primary audio stream info
        if audio_streams:
            audio_stream = audio_streams[0]
            parsed["audio_codec"] = audio_stream.get("codec_name")
            parsed["sample_rate"] = audio_stream.get("sample_rate")
            parsed["channels"] = audio_stream.get("channels")

        return parsed

    def _parse_fps(self, fps_string: str) -> Optional[float]:
        """Parse FPS from an ffprobe fraction string like '30/1'.

        Returns None for malformed input or a zero denominator.
        """
        try:
            if "/" in fps_string:
                numerator, denominator = fps_string.split("/")
                return float(numerator) / float(denominator)
            else:
                return float(fps_string)
        except (ValueError, ZeroDivisionError):
            return None

    def update_media_file_metadata(
        self,
        media_file: MediaFile,
        metadata: Dict[str, Any],
        repository: MediaFileRepository,
    ) -> MediaFile:
        """
        Update media file with extracted metadata and persist the changes.

        Args:
            media_file: MediaFile instance to update
            metadata: Extracted metadata dictionary
            repository: MediaFileRepository instance

        Returns:
            Updated MediaFile instance

        Raises:
            Exception: Re-raises any persistence error after rolling back.
        """
        try:
            # Only set attributes the model actually has, and never
            # overwrite an existing value with None.
            for field, value in metadata.items():
                if hasattr(media_file, field) and value is not None:
                    setattr(media_file, field, value)

            # Commit and refresh so the instance reflects DB state.
            repository.session.commit()
            repository.session.refresh(media_file)

            self.logger.info(f"Updated metadata for media file: {media_file.id}")
            return media_file

        except Exception as e:
            self.logger.error(f"Failed to update media file metadata: {e}")
            repository.session.rollback()
            raise
410
+
411
+
412
class MetadataManager:
    """
    High-level metadata management.

    Thin orchestration layer over :class:`MetadataExtractor` that enriches
    MediaFile records, singly or in batches, and persists the results.
    """

    def __init__(self, config: Config, verbose: bool = False):
        """
        Initialize metadata manager.

        Args:
            config: Configuration instance
            verbose: Enable verbose logging
        """
        self.config = config
        self.verbose = verbose
        self.logger = get_logger("MetadataManager", verbose=verbose)
        self.extractor = MetadataExtractor(config, verbose=verbose)

    def enrich_media_file(
        self,
        media_file: MediaFile,
        repository: MediaFileRepository,
        extract_source_metadata: bool = True,
    ) -> MediaFile:
        """
        Enrich media file with metadata from various sources.

        Local ffprobe metadata is applied first; when the file came from
        YouTube and source extraction is enabled, YouTube metadata is
        layered on top. Failures are logged and the (possibly partially
        updated) instance is returned unchanged otherwise.

        Args:
            media_file: MediaFile to enrich
            repository: MediaFileRepository instance
            extract_source_metadata: Whether to extract source metadata (YouTube, etc.)

        Returns:
            Enriched MediaFile instance
        """
        try:
            # Technical metadata from the local file comes first.
            local_meta = self.extractor.extract_file_metadata(media_file.file_path)
            self.extractor.update_media_file_metadata(
                media_file, local_meta, repository
            )

            # Then platform metadata, if the source URL is a YouTube link.
            if extract_source_metadata and media_file.source_url:
                url = media_file.source_url
                if any(marker in url for marker in ("youtube.com", "youtu.be")):
                    yt_meta = self.extractor.extract_youtube_metadata(url)
                    self.extractor.update_media_file_metadata(
                        media_file, yt_meta, repository
                    )

            self.logger.info(f"Enriched media file {media_file.id} with metadata")
        except Exception as e:
            self.logger.error(f"Failed to enrich media file: {e}")
        return media_file

    def batch_enrich_media_files(
        self,
        repository: MediaFileRepository,
        limit: int = 100,
        media_type: Optional[str] = None,
    ) -> List[MediaFile]:
        """
        Batch enrich multiple media files with metadata.

        Per-file failures are logged and skipped; a query-level failure
        yields an empty list.

        Args:
            repository: MediaFileRepository instance
            limit: Maximum number of files to process
            media_type: Filter by media type

        Returns:
            List of enriched MediaFile instances
        """
        try:
            # Build the candidate query, optionally filtered by type.
            query = repository.session.query(MediaFile)
            if media_type:
                query = query.filter(MediaFile.media_type == media_type)

            processed: List[MediaFile] = []
            for candidate in query.limit(limit).all():
                try:
                    processed.append(self.enrich_media_file(candidate, repository))
                except Exception as e:
                    self.logger.error(f"Failed to enrich file {candidate.id}: {e}")

            self.logger.info(f"Batch enriched {len(processed)} media files")
            return processed

        except Exception as e:
            self.logger.error(f"Batch enrichment failed: {e}")
            return []