@jakende/media-info-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
+ from media_information_download.audio import convert_to_mp3
8
+ from media_information_download.config import (
9
+ get_model_name,
10
+ get_output_dir,
11
+ get_whisper_language,
12
+ require_dependencies,
13
+ )
14
+ from media_information_download.downloaders.http import HTTPDownloader
15
+ from media_information_download.downloaders.youtube import YouTubeDownloader
16
+ from media_information_download.models import MediaItem, ProcessedMedia, ProgressCallback
17
+ from media_information_download.output import write_transcript
18
+ from media_information_download.sources.rss import RSSSource
19
+ from media_information_download.sources.youtube import YouTubeSource
20
+ from media_information_download.transcription import WhisperTranscriber
21
+
22
+
23
@dataclass(frozen=True)
class ProcessOptions:
    """Immutable settings for a single ``MediaPipeline.process`` run."""

    # Selects the source collector; the pipeline accepts "youtube" or "rss".
    source_type: str
    # Handed unchanged to the selected source's ``collect`` method.
    raw_input: str
    # Destination directory; ``None`` falls back to ``get_output_dir()``.
    output_dir: Path | None = None
    # When False no transcriber is created and no transcript is written.
    transcribe: bool = True
    # Whisper model name; ``None`` falls back to ``get_model_name()``.
    model_name: str | None = None
    # Transcription language; ``None`` falls back to ``get_whisper_language()``.
    language: str | None = None
    # Frame rate forwarded to ``write_transcript``.
    fps: int = 25
    # Pause between consecutive items, in seconds; values <= 0 disable it.
    delay_seconds: float = 2.0
33
+
34
+
35
class MediaPipeline:
    """Collect media items, download them, convert to MP3 and optionally transcribe.

    Progress is reported as plain strings through the ``progress`` callback.
    A failure while handling one item is recorded on that item's
    ``ProcessedMedia`` (``error`` / ``notes``) and does not abort the run.
    """

    def __init__(self, progress: ProgressCallback | None = None) -> None:
        """Store ``progress``; a no-op callback is used when none is given."""
        self.progress = progress or (lambda message: None)

    def _source(self, source_type: str):
        """Return the collector for ``source_type``.

        Raises:
            ValueError: If ``source_type`` is neither ``"youtube"`` nor ``"rss"``.
        """
        if source_type == "youtube":
            return YouTubeSource()
        if source_type == "rss":
            return RSSSource()
        raise ValueError(f"Unsupported source type: {source_type}")

    def _downloader(self, item: MediaItem):
        """Return the downloader matching ``item.source_type``.

        Raises:
            ValueError: If the item's source type has no downloader.
        """
        if item.source_type == "youtube":
            return YouTubeDownloader()
        if item.source_type == "rss":
            return HTTPDownloader()
        raise ValueError(f"Unsupported source type: {item.source_type}")

    @staticmethod
    def _failure_notes(item: MediaItem, exc: Exception) -> list[str]:
        """Return user-facing hints for a failed item (possibly empty).

        The cookie hint is YouTube specific, so it is only produced for
        YouTube items; an HTTP 403 from an RSS download gets no such advice.
        """
        if item.source_type == "youtube" and "HTTP Error 403" in str(exc):
            return [
                "YouTube rejected the media request. Try setting "
                'YTDL_COOKIES_FROM_BROWSER="safari" or "chrome".'
            ]
        return []

    def _write_transcript(
        self,
        transcriber: WhisperTranscriber,
        item: MediaItem,
        audio_path: Path,
        output_dir: Path,
        fps: int,
    ):
        """Transcribe ``audio_path`` and return the result of ``write_transcript``."""
        transcript, language, device = transcriber.transcribe(audio_path)
        return write_transcript(
            output_dir=output_dir,
            item=item,
            audio_path=audio_path,
            model_name=transcriber.model_name,
            device=device,
            language=language,
            fps=fps,
            segments=transcript.get("segments", []),
            full_text=(transcript.get("text") or "").strip(),
        )

    def collect_items(self, options: ProcessOptions) -> list[MediaItem]:
        """Ask the source selected by ``options.source_type`` for its media items."""
        return self._source(options.source_type).collect(options.raw_input)

    def process(self, options: ProcessOptions) -> list[ProcessedMedia]:
        """Download, convert and optionally transcribe every collected item.

        Args:
            options: Run settings; unset fields fall back to the config getters.

        Returns:
            One ``ProcessedMedia`` per collected item, in collection order.
            Items that failed carry ``error`` (and possibly ``notes``).

        Raises:
            Whatever ``require_dependencies`` or the source's ``collect``
            raise; errors for individual items are captured, not raised.
        """
        require_dependencies(include_transcription=options.transcribe)
        output_dir = (options.output_dir or get_output_dir()).resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        self.progress(f"Collecting media from {options.source_type} input...")
        items = self.collect_items(options)
        total = len(items)
        self.progress(f"Found {total} media item(s).")

        transcriber: WhisperTranscriber | None = None
        if options.transcribe:
            model_name = options.model_name or get_model_name()
            transcriber = WhisperTranscriber(
                model_name=model_name,
                language=options.language if options.language is not None else get_whisper_language(),
            )

        results: list[ProcessedMedia] = []
        for index, item in enumerate(items, start=1):
            # Register the result first so a failure still shows up in the output.
            result = ProcessedMedia(item=item)
            results.append(result)
            self.progress(f"[{index}/{total}] Downloading: {item.title}")
            try:
                downloaded_path = self._downloader(item).download(item, output_dir)
                result.downloaded_path = downloaded_path
                self.progress(f"Downloaded: {downloaded_path.name}")

                self.progress(f"Converting to MP3: {downloaded_path.name}")
                mp3_path = convert_to_mp3(downloaded_path, output_dir)
                result.mp3_path = mp3_path
                self.progress(f"MP3 ready: {mp3_path.name}")

                # For RSS items the download is only an intermediate file,
                # unless the conversion returned that very same file.
                if item.source_type == "rss" and downloaded_path.resolve() != mp3_path.resolve():
                    downloaded_path.unlink(missing_ok=True)
                    result.notes.append(f"Removed intermediate media file: {downloaded_path.name}")
                    result.downloaded_path = None

                if transcriber is not None:
                    self.progress(f"Transcribing: {mp3_path.name}")
                    result.transcript_path = self._write_transcript(
                        transcriber, item, mp3_path, output_dir, options.fps
                    )
                    self.progress(f"Transcript written: {result.transcript_path.name}")
            except Exception as exc:
                # Per-item boundary: record the failure and continue with the next item.
                result.error = str(exc)
                self.progress(f"ERROR: {exc}")
                result.notes.extend(self._failure_notes(item, exc))

            # Pause between items, but not after the last one.
            if index < total and options.delay_seconds > 0:
                time.sleep(options.delay_seconds)

        return results

    def transcribe_existing(
        self,
        audio_paths: list[Path],
        output_dir: Path | None = None,
        model_name: str | None = None,
        language: str | None = None,
        fps: int = 25,
    ) -> list[ProcessedMedia]:
        """Transcribe audio files that already exist on disk.

        Args:
            audio_paths: Files to transcribe; each becomes a ``"local"`` item.
            output_dir: Transcript directory; ``None`` uses ``get_output_dir()``.
            model_name: Whisper model; ``None`` uses ``get_model_name()``.
            language: Language; ``None`` uses ``get_whisper_language()``.
            fps: Frame rate forwarded to ``write_transcript``.

        Returns:
            One ``ProcessedMedia`` per input path, in input order; failed
            items carry ``error``.
        """
        require_dependencies(include_transcription=True)
        target_output_dir = (output_dir or get_output_dir()).resolve()
        # Same guarantee as process(): the directory exists before writing.
        target_output_dir.mkdir(parents=True, exist_ok=True)
        transcriber = WhisperTranscriber(
            model_name=model_name or get_model_name(),
            language=language if language is not None else get_whisper_language(),
        )
        total = len(audio_paths)
        results: list[ProcessedMedia] = []
        for index, audio_path in enumerate(audio_paths, start=1):
            item = MediaItem(
                source_type="local",
                source_url=str(audio_path),
                media_url=str(audio_path),
                title=audio_path.stem,
            )
            result = ProcessedMedia(item=item, mp3_path=audio_path)
            results.append(result)
            try:
                self.progress(f"[{index}/{total}] Transcribing: {audio_path.name}")
                result.transcript_path = self._write_transcript(
                    transcriber, item, audio_path, target_output_dir, fps
                )
                self.progress(f"Transcript written: {result.transcript_path.name}")
            except Exception as exc:
                # Per-item boundary: record the failure and continue.
                result.error = str(exc)
                self.progress(f"ERROR: {exc}")
        return results
File without changes
@@ -0,0 +1,132 @@
1
+ from __future__ import annotations
2
+
3
+ import mimetypes
4
+ import urllib.parse
5
+ import urllib.request
6
+ import xml.etree.ElementTree as ET
7
+
8
+ from media_information_download.config import (
9
+ SUPPORTED_MEDIA_EXTENSIONS,
10
+ SUPPORTED_MEDIA_MIME_PREFIXES,
11
+ )
12
+ from media_information_download.models import MediaItem
13
+
14
+
15
+ def _tag_name(element: ET.Element) -> str:
16
+ return element.tag.rsplit("}", maxsplit=1)[-1].lower()
17
+
18
+
19
def _child_text(element: ET.Element, names: set[str]) -> str:
    """Return the stripped text of the first direct child that has real text.

    Only direct children whose namespace-free, lower-cased tag is in ``names``
    are considered. A matching child whose text is empty or whitespace-only is
    skipped, so it cannot hide a later matching child that carries a value.

    Returns:
        The stripped text, or ``""`` when no matching child has any.
    """
    for child in element:
        if _tag_name(child) not in names:
            continue
        text = (child.text or "").strip()
        if text:
            return text
    return ""
24
+
25
+
26
def _is_supported_media(url: str, mime_type: str = "") -> bool:
    """Tell whether ``url`` / ``mime_type`` describe a supported media file.

    The item is accepted when any of these holds:

    * the extension that ``mimetypes`` associates with ``mime_type`` is in
      ``SUPPORTED_MEDIA_EXTENSIONS``;
    * the extension of the URL path's last segment is in that collection
      (checked even when a MIME type is present, so a generic type such as
      ``application/octet-stream`` does not mask an ``.mp3`` URL);
    * ``mime_type`` starts with one of ``SUPPORTED_MEDIA_MIME_PREFIXES``.

    Args:
        url: Absolute or relative media URL; may be arbitrary element text.
        mime_type: Declared type, optionally with ``;`` parameters.
    """
    mime = mime_type.lower().strip()
    essence = mime.split(";", maxsplit=1)[0].strip()
    mime_suffix = (mimetypes.guess_extension(essence) or "").lower()

    try:
        path = urllib.parse.urlparse(url).path
    except ValueError:
        # urlparse rejects some malformed hosts; such text is not a media URL.
        path = ""
    filename = path.rsplit("/", maxsplit=1)[-1]
    _, dot, extension = filename.rpartition(".")
    url_suffix = f".{extension.lower()}" if dot else ""

    if any(suffix and suffix in SUPPORTED_MEDIA_EXTENSIONS for suffix in (mime_suffix, url_suffix)):
        return True

    return any(mime.startswith(prefix) for prefix in SUPPORTED_MEDIA_MIME_PREFIXES)
38
+
39
+
40
+ def _first_attr(element: ET.Element, names: tuple[str, ...]) -> str:
41
+ for name in names:
42
+ value = element.attrib.get(name)
43
+ if value:
44
+ return value.strip()
45
+ return ""
46
+
47
+
48
def parse_feed_xml(feed_xml: bytes | str, feed_url: str) -> list[MediaItem]:
    """Extract at most one media item per ``<item>`` / ``<entry>`` of a feed.

    For every entry the candidate media URLs are gathered in document order
    from ``enclosure`` elements, media-namespaced or ``url``-carrying
    ``content`` / ``player`` elements, ``link`` elements and, failing those,
    any other element whose text looks like a supported media URL. The first
    candidate that resolves (relative to ``feed_url``) to supported media
    becomes the entry's ``MediaItem``.

    NOTE(review): the feed is parsed with ``xml.etree.ElementTree``, which the
    Python documentation describes as not secure against maliciously
    constructed data; feeds come from arbitrary URLs.

    Args:
        feed_xml: Raw feed document.
        feed_url: URL the feed was fetched from; used as ``source_url`` and as
            the base for relative media URLs.

    Returns:
        The media items found, in document order (possibly empty).
    """
    entry_tags = {"item", "entry"}
    url_attrs = ("url", "href")
    type_attrs = ("type", "medium")

    collected: list[MediaItem] = []
    for entry in ET.fromstring(feed_xml).iter():
        if _tag_name(entry) not in entry_tags:
            continue

        title = _child_text(entry, {"title"}) or "RSS media item"
        published = _child_text(entry, {"pubdate", "published", "updated"})
        description = _child_text(entry, {"description", "summary", "subtitle"})

        # (url, declared type) pairs in document order.
        candidates: list[tuple[str, str]] = []
        for node in entry.iter():
            kind = _tag_name(node)
            carries_media_attrs = kind == "enclosure" or (
                kind in {"content", "player"}
                and ("media" in node.tag.lower() or node.attrib.get("url"))
            )
            if carries_media_attrs:
                candidates.append(
                    (_first_attr(node, url_attrs), _first_attr(node, type_attrs))
                )
            elif kind == "link":
                href = _first_attr(node, ("href", "url"))
                link_type = _first_attr(node, ("type",))
                # A typed link is vetted later; an untyped one must look like media.
                if href and (link_type or _is_supported_media(href)):
                    candidates.append((href, link_type))
            elif node.text and _is_supported_media(node.text.strip()):
                candidates.append((node.text.strip(), ""))

        for raw_url, mime_type in candidates:
            if not raw_url:
                continue
            resolved_url = urllib.parse.urljoin(feed_url, raw_url)
            if _is_supported_media(resolved_url, mime_type):
                collected.append(
                    MediaItem(
                        source_type="rss",
                        source_url=feed_url,
                        media_url=resolved_url,
                        title=title,
                        mime_type=mime_type,
                        published=published,
                        description=description,
                    )
                )
                # Only the first usable candidate of an entry is kept.
                break

    return collected
109
+
110
+
111
class RSSSource:
    """Media source that reads items from an RSS/Atom feed fetched over HTTP(S)."""

    name = "rss"

    def collect(self, feed_url: str) -> list[MediaItem]:
        """Download ``feed_url`` and return the media items it lists.

        Surrounding whitespace (for example from a pasted URL) is ignored.

        Raises:
            ValueError: If the URL scheme is not ``http``/``https`` or the feed
                contains no supported media.
        """
        feed_url = feed_url.strip()
        scheme = urllib.parse.urlparse(feed_url).scheme
        if scheme not in {"http", "https"}:
            raise ValueError("RSS feed URL must start with http:// or https://")

        request = urllib.request.Request(
            feed_url,
            headers={
                "User-Agent": "media-information-download/1.0",
                "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml",
            },
        )
        with urllib.request.urlopen(request, timeout=30) as response:
            feed_xml = response.read()

        items = parse_feed_xml(feed_xml, feed_url)
        if not items:
            raise ValueError("No supported audio or video enclosures were found in the RSS feed.")
        return items
@@ -0,0 +1,41 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from media_information_download.models import MediaItem
6
+
7
+
8
# Accepts youtube.com watch URLs (``v=`` anywhere in the query string, on the
# bare, ``www.`` or ``m.`` host) and youtu.be short links, with an optional
# scheme. Group positions are unchanged: scheme, host prefix, path prefix.
YOUTUBE_URL_RE = re.compile(
    r"^(https?://)?(www\.|m\.)?"
    r"(youtube\.com/watch\?(?:[^#\s]*&)?v=|youtu\.be/)[\w\-]{6,}.*$",
    re.IGNORECASE,
)


def validate_url(url: str) -> bool:
    """Return True when ``url`` (ignoring surrounding whitespace) is a YouTube video URL."""
    return bool(YOUTUBE_URL_RE.match(url.strip()))
16
+
17
+
18
def parse_urls(raw_value: str) -> list[str]:
    """Split ``raw_value`` on whitespace and commas, dropping empty pieces."""
    pieces = (piece.strip() for piece in re.split(r"[\s,]+", raw_value))
    return [piece for piece in pieces if piece]
20
+
21
+
22
class YouTubeSource:
    """Media source that turns a list of YouTube URLs into media items."""

    name = "youtube"

    def collect(self, raw_value: str) -> list[MediaItem]:
        """Parse ``raw_value`` into URLs and wrap each one in a ``MediaItem``.

        Raises:
            ValueError: If any parsed URL fails ``validate_url``; the message
                lists every offending URL.
        """
        candidates = parse_urls(raw_value)

        rejected = [candidate for candidate in candidates if not validate_url(candidate)]
        if rejected:
            raise ValueError(
                "Invalid YouTube URL(s): " + ", ".join(rejected)
            )

        items: list[MediaItem] = []
        for url in candidates:
            # The same URL serves as both source and media URL at this stage.
            items.append(
                MediaItem(
                    source_type=self.name,
                    source_url=url,
                    media_url=url,
                    title="YouTube media",
                )
            )
        return items
@@ -0,0 +1,109 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
# Runs at import time, i.e. before any of the function-local ``import torch``
# statements in this module execute. An existing value is left untouched.
# NOTE(review): presumably lets PyTorch fall back to the CPU for operators
# that the MPS backend does not implement - confirm against the PyTorch docs.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
9
+
10
+
11
def seconds_to_timecode(seconds: float, fps: int = 25) -> str:
    """Format ``seconds`` as an ``HH:MM:SS:FF`` timecode at ``fps`` frames per second.

    Negative times are clamped to zero, and the time is rounded to the nearest
    whole frame before being split into fields.

    Args:
        seconds: Position in seconds.
        fps: Frames per second; must be positive.

    Returns:
        The timecode with each field zero-padded to at least two digits.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError(f"fps must be a positive integer, got {fps!r}")

    total_frames = int(round(max(0.0, seconds) * fps))
    hours, remainder = divmod(total_frames, 3600 * fps)
    minutes, remainder = divmod(remainder, 60 * fps)
    secs, frames = divmod(remainder, fps)

    return f"{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}"
26
+
27
+
28
def densify_sparse_buffers(module: Any) -> None:
    """Replace every sparse buffer of ``module`` and its descendants, in place.

    Each tensor in ``module._buffers`` that reports ``is_sparse`` is swapped
    for its ``to_dense()`` counterpart; the same is then done recursively for
    every child module.
    """
    import torch

    sparse_names = [
        name
        for name, tensor in module._buffers.items()
        if torch.is_tensor(tensor) and getattr(tensor, "is_sparse", False)
    ]
    for name in sparse_names:
        module._buffers[name] = module._buffers[name].to_dense()

    for submodule in module.children():
        densify_sparse_buffers(submodule)
36
+
37
+
38
def pick_preferred_device() -> str:
    """Return ``"mps"``, ``"cuda"`` or ``"cpu"``, in that order of preference.

    The first backend whose ``is_available()`` check succeeds wins; ``"cpu"``
    is the fallback when neither accelerator is available.
    """
    import torch

    probes = (
        ("mps", torch.backends.mps.is_available),
        ("cuda", torch.cuda.is_available),
    )
    for device_name, is_available in probes:
        if is_available():
            return device_name
    return "cpu"
46
+
47
+
48
def load_whisper_model_robust(model_name: str, preferred_device: str) -> tuple[Any, str]:
    """Load a Whisper model on the CPU, then try to move it to ``preferred_device``.

    The model is always loaded with ``device="cpu"`` and its sparse buffers are
    densified before any move is attempted. ``"mps"`` and ``"cuda"`` are the
    only accelerators tried; any other value keeps the model on the CPU.

    Args:
        model_name: Name passed to ``whisper.load_model``.
        preferred_device: ``"mps"``, ``"cuda"`` or anything else for CPU.

    Returns:
        ``(model, device)`` where ``device`` is the device the model actually
        ended up on.
    """
    import whisper

    model = whisper.load_model(model_name, device="cpu")
    densify_sparse_buffers(model)

    if preferred_device not in ("mps", "cuda"):
        return model, "cpu"

    try:
        return model.to(preferred_device), preferred_device
    except Exception:
        # Best-effort fallback for both accelerators: whatever the backend
        # raised, transcription can still run on the CPU. A failed ``to()``
        # can leave some tensors already moved, so bring everything back
        # before reporting "cpu".
        return model.to("cpu"), "cpu"
69
+
70
+
71
class WhisperTranscriber:
    """Whisper wrapper that loads its model lazily on the first transcription."""

    def __init__(self, model_name: str, language: str | None = None) -> None:
        """Remember the settings; the model itself is not loaded yet."""
        self.model_name = model_name
        self.language = language
        self._model: Any | None = None
        # Replaced by the real device name once the model has been loaded.
        self.device = "unknown"

    def _load(self) -> None:
        """Load the model once; later calls return immediately."""
        if self._model is not None:
            return

        import torch

        preferred = pick_preferred_device()
        loaded_model, actual_device = load_whisper_model_robust(self.model_name, preferred)
        self._model, self.device = loaded_model, actual_device

        if self.device != "cpu":
            return
        # Best effort: a failure to set the thread count is deliberately ignored.
        with contextlib.suppress(Exception):
            torch.set_num_threads(max(1, os.cpu_count() or 1))

    def transcribe(self, audio_path: Path) -> tuple[dict[str, Any], str, str]:
        """Transcribe ``audio_path`` and report the language and device used.

        Output written to stdout/stderr during the model call is discarded.

        Returns:
            ``(result, language, device)``: the raw result mapping from the
            model, the stripped language (the result's ``"language"``, else
            the configured one, else ``"unknown"``) and the device name.
        """
        self._load()

        decode_options: dict[str, Any] = {
            "verbose": False,
            "fp16": False,
            "beam_size": 1,
            "best_of": 1,
            "temperature": 0,
            "condition_on_previous_text": False,
        }
        # An unset or empty language is simply not passed to the model.
        if self.language:
            decode_options["language"] = self.language

        with open(os.devnull, "w", encoding="utf-8") as sink, \
                contextlib.redirect_stdout(sink), \
                contextlib.redirect_stderr(sink):
            result = self._model.transcribe(str(audio_path), **decode_options)

        language = result.get("language") or self.language or "unknown"
        return result, language.strip(), self.device