@jakende/media-info-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +178 -0
- package/bin/media-information-download.js +102 -0
- package/media_information_download/__init__.py +0 -0
- package/media_information_download/audio.py +33 -0
- package/media_information_download/config.py +93 -0
- package/media_information_download/downloaders/__init__.py +0 -0
- package/media_information_download/downloaders/http.py +56 -0
- package/media_information_download/downloaders/youtube.py +89 -0
- package/media_information_download/models.py +29 -0
- package/media_information_download/output.py +86 -0
- package/media_information_download/pipeline.py +164 -0
- package/media_information_download/sources/__init__.py +0 -0
- package/media_information_download/sources/rss.py +132 -0
- package/media_information_download/sources/youtube.py +41 -0
- package/media_information_download/transcription.py +109 -0
- package/media_information_download/tui.py +942 -0
- package/media_tui.py +8 -0
- package/package.json +36 -0
- package/pyproject.toml +26 -0
- package/requirements-transcribe.txt +3 -0
- package/requirements.txt +1 -0
- package/youtube_download.py +63 -0
- package/youtube_download_transcribe.py +67 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from media_information_download.audio import convert_to_mp3
|
|
8
|
+
from media_information_download.config import (
|
|
9
|
+
get_model_name,
|
|
10
|
+
get_output_dir,
|
|
11
|
+
get_whisper_language,
|
|
12
|
+
require_dependencies,
|
|
13
|
+
)
|
|
14
|
+
from media_information_download.downloaders.http import HTTPDownloader
|
|
15
|
+
from media_information_download.downloaders.youtube import YouTubeDownloader
|
|
16
|
+
from media_information_download.models import MediaItem, ProcessedMedia, ProgressCallback
|
|
17
|
+
from media_information_download.output import write_transcript
|
|
18
|
+
from media_information_download.sources.rss import RSSSource
|
|
19
|
+
from media_information_download.sources.youtube import YouTubeSource
|
|
20
|
+
from media_information_download.transcription import WhisperTranscriber
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True)
class ProcessOptions:
    """Immutable settings for a single ``MediaPipeline.process`` run.

    Optional fields left as ``None`` are filled in by the pipeline from the
    package configuration helpers when the run starts.
    """

    # Selects the collector and downloader; the pipeline accepts "youtube" and "rss".
    source_type: str
    # Unparsed user input, handed as-is to the selected source's ``collect``.
    raw_input: str
    # Destination directory; ``None`` makes the pipeline use ``get_output_dir()``.
    output_dir: Path | None = None
    # When False, items are only downloaded and converted to MP3.
    transcribe: bool = True
    # Whisper model name; ``None`` makes the pipeline use ``get_model_name()``.
    model_name: str | None = None
    # Transcription language; ``None`` makes the pipeline use ``get_whisper_language()``.
    language: str | None = None
    # Frame rate forwarded to ``write_transcript``.
    fps: int = 25
    # Pause between consecutive items, in seconds; values <= 0 disable the pause.
    delay_seconds: float = 2.0
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class MediaPipeline:
    """Collect, download, convert and optionally transcribe media items.

    Status messages are reported through an optional callback. A failure while
    handling one item is recorded on that item's result and the batch continues
    with the next item.
    """

    def __init__(self, progress: ProgressCallback | None = None) -> None:
        """Store *progress*; when it is omitted, messages are discarded."""
        self.progress = progress or (lambda message: None)

    def _source(self, source_type: str):
        """Return a new collector for *source_type* ("youtube" or "rss").

        Raises:
            ValueError: If *source_type* is anything else.
        """
        if source_type == "youtube":
            return YouTubeSource()
        if source_type == "rss":
            return RSSSource()
        raise ValueError(f"Unsupported source type: {source_type}")

    def _downloader(self, item: MediaItem):
        """Return a new downloader matching ``item.source_type``.

        Raises:
            ValueError: If the item's source type is neither "youtube" nor "rss".
        """
        if item.source_type == "youtube":
            return YouTubeDownloader()
        if item.source_type == "rss":
            return HTTPDownloader()
        raise ValueError(f"Unsupported source type: {item.source_type}")

    @staticmethod
    def _failure_notes(source_type: str, message: str) -> list[str]:
        """Return user-facing hints for a failed item.

        The browser-cookie hint is specific to YouTube downloads, so an
        HTTP 403 raised while fetching an RSS enclosure does not produce it.
        """
        if source_type == "youtube" and "HTTP Error 403" in message:
            return [
                "YouTube rejected the media request. Try setting "
                'YTDL_COOKIES_FROM_BROWSER="safari" or "chrome".'
            ]
        return []

    def _transcribe_to_file(self, transcriber, item: MediaItem, audio_path: Path, output_dir: Path, fps: int):
        """Transcribe *audio_path* and return the result of ``write_transcript``."""
        transcript, language, device = transcriber.transcribe(audio_path)
        return write_transcript(
            output_dir=output_dir,
            item=item,
            audio_path=audio_path,
            model_name=transcriber.model_name,
            device=device,
            language=language,
            fps=fps,
            segments=transcript.get("segments", []),
            # "text" may be missing or None; normalise to a stripped string.
            full_text=(transcript.get("text") or "").strip(),
        )

    def collect_items(self, options: ProcessOptions) -> list[MediaItem]:
        """Resolve ``options.raw_input`` into media items using the matching source."""
        return self._source(options.source_type).collect(options.raw_input)

    def process(self, options: ProcessOptions) -> list[ProcessedMedia]:
        """Download, convert and (optionally) transcribe every collected item.

        Dependency checks and item collection happen before the per-item loop,
        so their exceptions propagate to the caller. Inside the loop, any
        exception is stored on the item's result (``error`` / ``notes``).

        Returns:
            One ``ProcessedMedia`` per collected item, in collection order.
        """
        require_dependencies(include_transcription=options.transcribe)
        output_dir = (options.output_dir or get_output_dir()).resolve()
        output_dir.mkdir(parents=True, exist_ok=True)

        self.progress(f"Collecting media from {options.source_type} input...")
        items = self.collect_items(options)
        self.progress(f"Found {len(items)} media item(s).")

        transcriber: WhisperTranscriber | None = None
        if options.transcribe:
            transcriber = WhisperTranscriber(
                model_name=options.model_name or get_model_name(),
                # An explicit empty string is respected; only None falls back to config.
                language=options.language if options.language is not None else get_whisper_language(),
            )

        results: list[ProcessedMedia] = []
        for index, item in enumerate(items, start=1):
            result = ProcessedMedia(item=item)
            # Appended up front so a failing item still appears in the output.
            results.append(result)
            self.progress(f"[{index}/{len(items)}] Downloading: {item.title}")
            try:
                downloaded_path = self._downloader(item).download(item, output_dir)
                result.downloaded_path = downloaded_path
                self.progress(f"Downloaded: {downloaded_path.name}")

                self.progress(f"Converting to MP3: {downloaded_path.name}")
                mp3_path = convert_to_mp3(downloaded_path, output_dir)
                result.mp3_path = mp3_path
                self.progress(f"MP3 ready: {mp3_path.name}")

                # For RSS items the original download is dropped once a
                # distinct MP3 file exists.
                if item.source_type == "rss" and downloaded_path.resolve() != mp3_path.resolve():
                    downloaded_path.unlink(missing_ok=True)
                    result.notes.append(f"Removed intermediate media file: {downloaded_path.name}")
                    result.downloaded_path = None

                if transcriber is not None:
                    self.progress(f"Transcribing: {mp3_path.name}")
                    result.transcript_path = self._transcribe_to_file(
                        transcriber, item, mp3_path, output_dir, options.fps
                    )
                    self.progress(f"Transcript written: {result.transcript_path.name}")
            except Exception as exc:
                # Per-item boundary: record the failure and keep going.
                result.error = str(exc)
                self.progress(f"ERROR: {exc}")
                result.notes.extend(self._failure_notes(item.source_type, str(exc)))

            # Pause between items, but not after the last one.
            if index < len(items) and options.delay_seconds > 0:
                time.sleep(options.delay_seconds)

        return results

    def transcribe_existing(
        self,
        audio_paths: list[Path],
        output_dir: Path | None = None,
        model_name: str | None = None,
        language: str | None = None,
        fps: int = 25,
    ) -> list[ProcessedMedia]:
        """Transcribe audio files that already exist on disk.

        Each path is wrapped in a ``MediaItem`` with ``source_type="local"``.
        Per-file failures are stored on the corresponding result.

        Returns:
            One ``ProcessedMedia`` per input path, in input order.
        """
        require_dependencies(include_transcription=True)
        target_output_dir = (output_dir or get_output_dir()).resolve()
        # Mirror process(): make sure the transcript destination exists.
        target_output_dir.mkdir(parents=True, exist_ok=True)
        transcriber = WhisperTranscriber(
            model_name=model_name or get_model_name(),
            language=language if language is not None else get_whisper_language(),
        )
        results: list[ProcessedMedia] = []
        for index, audio_path in enumerate(audio_paths, start=1):
            item = MediaItem(
                source_type="local",
                source_url=str(audio_path),
                media_url=str(audio_path),
                title=audio_path.stem,
            )
            result = ProcessedMedia(item=item, mp3_path=audio_path)
            results.append(result)
            try:
                self.progress(f"[{index}/{len(audio_paths)}] Transcribing: {audio_path.name}")
                result.transcript_path = self._transcribe_to_file(
                    transcriber, item, audio_path, target_output_dir, fps
                )
                self.progress(f"Transcript written: {result.transcript_path.name}")
            except Exception as exc:
                # Per-file boundary: record the failure and keep going.
                result.error = str(exc)
                self.progress(f"ERROR: {exc}")
        return results
|
|
File without changes
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import mimetypes
|
|
4
|
+
import urllib.parse
|
|
5
|
+
import urllib.request
|
|
6
|
+
import xml.etree.ElementTree as ET
|
|
7
|
+
|
|
8
|
+
from media_information_download.config import (
|
|
9
|
+
SUPPORTED_MEDIA_EXTENSIONS,
|
|
10
|
+
SUPPORTED_MEDIA_MIME_PREFIXES,
|
|
11
|
+
)
|
|
12
|
+
from media_information_download.models import MediaItem
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _tag_name(element: ET.Element) -> str:
    """Return the element's tag in lower case, without any ``{namespace}`` prefix."""
    # Everything after the last "}" is the local name; with no "}" the
    # whole tag is returned unchanged.
    _, _, local_name = element.tag.rpartition("}")
    return local_name.lower()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _child_text(element: ET.Element, names: set[str]) -> str:
    """Return the stripped text of the first direct child whose tag is in *names*.

    Children whose text is missing or empty are skipped. An empty string is
    returned when no child qualifies.
    """
    texts = (
        node.text.strip()
        for node in element
        if _tag_name(node) in names and node.text
    )
    return next(texts, "")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _is_supported_media(url: str, mime_type: str = "") -> bool:
    """Return True when the URL or the declared MIME type denotes supported media.

    Three independent signals are checked and any one is sufficient:

    * the extension ``mimetypes`` associates with *mime_type*,
    * the file extension of the last segment of the URL path,
    * a match of *mime_type* against ``SUPPORTED_MEDIA_MIME_PREFIXES``.

    A generic MIME type such as ``application/octet-stream`` therefore no
    longer hides a recognisable URL extension, and a path segment without a
    dot is never mistaken for an extension.
    """
    # Drop parameters such as "; charset=utf-8" before looking the type up.
    mime = mime_type.split(";", maxsplit=1)[0].strip().lower()

    suffixes = set()
    mime_extension = mimetypes.guess_extension(mime) if mime else None
    if mime_extension:
        suffixes.add(mime_extension.lower())

    # Only the final path segment can carry a file extension.
    file_name = urllib.parse.urlparse(url).path.rsplit("/", maxsplit=1)[-1]
    _, dot, url_extension = file_name.rpartition(".")
    if dot and url_extension:
        suffixes.add(f".{url_extension.lower()}")

    if any(suffix in SUPPORTED_MEDIA_EXTENSIONS for suffix in suffixes):
        return True
    return any(mime.startswith(prefix) for prefix in SUPPORTED_MEDIA_MIME_PREFIXES)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _first_attr(element: ET.Element, names: tuple[str, ...]) -> str:
    """Return the first non-empty attribute among *names*, stripped.

    Attributes are tried in the order given. An empty string is returned when
    none of them is present with a non-empty value.
    """
    raw_values = (element.attrib.get(key) for key in names)
    return next((raw.strip() for raw in raw_values if raw), "")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_feed_xml(feed_xml: bytes | str, feed_url: str) -> list[MediaItem]:
    """Extract at most one media item per RSS ``item`` / Atom ``entry`` element.

    For each entry, candidate ``(url, mime_type)`` pairs are gathered in
    document order, resolved against *feed_url*, and the first one accepted by
    ``_is_supported_media`` becomes the entry's ``MediaItem``. Entries without
    an accepted candidate contribute nothing.
    """

    def gather_candidates(entry: ET.Element) -> list[tuple[str, str]]:
        """Collect ``(url, mime_type)`` pairs from *entry* and its descendants."""
        found: list[tuple[str, str]] = []
        for node in entry.iter():
            kind = _tag_name(node)
            if kind == "enclosure" or (
                kind in {"content", "player"}
                and ("media" in node.tag.lower() or node.attrib.get("url"))
            ):
                found.append(
                    (
                        _first_attr(node, ("url", "href")),
                        _first_attr(node, ("type", "medium")),
                    )
                )
            elif kind == "link":
                href = _first_attr(node, ("href", "url"))
                link_type = _first_attr(node, ("type",))
                # Typed links are kept here and filtered later together
                # with their MIME type; untyped ones must look like media.
                if href and (link_type or _is_supported_media(href)):
                    found.append((href, link_type))
            elif node.text and _is_supported_media(node.text.strip()):
                # Any other element whose text itself is a media URL.
                found.append((node.text.strip(), ""))
        return found

    # NOTE(review): feeds are remote input; the standard-library parser does
    # not harden against malicious XML — confirm this is acceptable.
    root = ET.fromstring(feed_xml)

    items: list[MediaItem] = []
    for entry in root.iter():
        if _tag_name(entry) not in {"item", "entry"}:
            continue

        resolved = (
            (urllib.parse.urljoin(feed_url, raw_url), mime_type)
            for raw_url, mime_type in gather_candidates(entry)
            if raw_url
        )
        chosen = next(
            (pair for pair in resolved if _is_supported_media(pair[0], pair[1])),
            None,
        )
        if chosen is None:
            continue

        media_url, mime_type = chosen
        items.append(
            MediaItem(
                source_type="rss",
                source_url=feed_url,
                media_url=media_url,
                title=_child_text(entry, {"title"}) or "RSS media item",
                mime_type=mime_type,
                published=_child_text(entry, {"pubdate", "published", "updated"}),
                description=_child_text(entry, {"description", "summary", "subtitle"}),
            )
        )

    return items
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class RSSSource:
    """Media source that fetches a feed over HTTP(S) and lists its media items."""

    name = "rss"

    def collect(self, feed_url: str) -> list[MediaItem]:
        """Download *feed_url* and return the media items parsed from it.

        Raises:
            ValueError: If the URL scheme is not http/https, or if the feed
                yields no supported media item.
        """
        scheme = urllib.parse.urlparse(feed_url).scheme
        if scheme != "http" and scheme != "https":
            raise ValueError("RSS feed URL must start with http:// or https://")

        headers = {
            "User-Agent": "media-information-download/1.0",
            "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml",
        }
        request = urllib.request.Request(feed_url, headers=headers)
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = response.read()

        items = parse_feed_xml(payload, feed_url)
        if items:
            return items
        raise ValueError("No supported audio or video enclosures were found in the RSS feed.")
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from media_information_download.models import MediaItem
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Accepted shapes (scheme optional, case-insensitive):
#   [www.|m.|music.]youtube.com/watch?...v=<id>   (v= may follow other parameters)
#   [www.|m.|music.]youtube.com/shorts/<id>
#   [www.|m.|music.]youtu.be/<id>
# where <id> is at least six word characters or hyphens. The three capturing
# groups (scheme, host prefix, site/path prefix) are kept; new alternatives
# use non-capturing groups.
YOUTUBE_URL_RE = re.compile(
    r"^(https?://)?((?:www|m|music)\.)?"
    r"(youtube\.com/(?:watch\?(?:[^#\s]*&)?v=|shorts/)|youtu\.be/)"
    r"[\w\-]{6,}.*$",
    re.IGNORECASE,
)


def validate_url(url: str) -> bool:
    """Return True when *url*, ignoring surrounding whitespace, looks like a YouTube video URL."""
    return YOUTUBE_URL_RE.match(url.strip()) is not None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def parse_urls(raw_value: str) -> list[str]:
    """Split *raw_value* on runs of whitespace and/or commas, dropping empty pieces."""
    pieces = (piece.strip() for piece in re.split(r"[\s,]+", raw_value))
    return [piece for piece in pieces if piece]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class YouTubeSource:
    """Media source that turns user-supplied YouTube URLs into media items."""

    name = "youtube"

    def collect(self, raw_value: str) -> list[MediaItem]:
        """Parse *raw_value* into URLs and wrap each one in a ``MediaItem``.

        The whole input is rejected if any single URL fails validation.

        Raises:
            ValueError: Listing every URL that did not validate.
        """
        accepted: list[str] = []
        rejected: list[str] = []
        for url in parse_urls(raw_value):
            (accepted if validate_url(url) else rejected).append(url)

        if rejected:
            raise ValueError(
                "Invalid YouTube URL(s): " + ", ".join(rejected)
            )

        # The title is a fixed placeholder; source and media URL are the same.
        return [
            MediaItem(
                source_type=self.name,
                source_url=url,
                media_url=url,
                title="YouTube media",
            )
            for url in accepted
        ]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# Module-level side effect: default PYTORCH_ENABLE_MPS_FALLBACK to "1" unless
# the user already set it. torch is only imported inside functions below, so
# this runs before any torch import made by this module.
# NOTE(review): per the PyTorch MPS notes this lets unsupported MPS ops fall
# back to CPU — confirm against the PyTorch version in use.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def seconds_to_timecode(seconds: float, fps: int = 25) -> str:
    """Format *seconds* as an ``HH:MM:SS:FF`` timecode at *fps* frames per second.

    Negative input is clamped to zero and the value is rounded to the nearest
    whole frame. Hours are not wrapped, so long durations may use more than
    two hour digits.
    """
    frame_count = int(round(max(0.0, seconds) * fps))

    # Peel off frames, then seconds, then minutes; what remains is hours.
    whole_seconds, frames = divmod(frame_count, fps)
    whole_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)

    return f"{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def densify_sparse_buffers(module: Any) -> None:
    """Replace every sparse tensor buffer of *module* and its descendants with a dense copy.

    Buffers are rewritten in place in ``module._buffers``; non-tensor and
    already-dense entries are left untouched.
    """
    import torch

    buffers = module._buffers
    # Snapshot the keys so entries can be reassigned while looping.
    for name in list(buffers):
        tensor = buffers[name]
        if torch.is_tensor(tensor) and getattr(tensor, "is_sparse", False):
            buffers[name] = tensor.to_dense()

    for submodule in module.children():
        densify_sparse_buffers(submodule)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def pick_preferred_device() -> str:
    """Return the preferred torch device name: "mps", then "cuda", else "cpu"."""
    import torch

    if torch.backends.mps.is_available():
        return "mps"
    return "cuda" if torch.cuda.is_available() else "cpu"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def load_whisper_model_robust(model_name: str, preferred_device: str) -> tuple[Any, str]:
    """Load a Whisper model on CPU, then try to move it to *preferred_device*.

    Args:
        model_name: Name passed to ``whisper.load_model``.
        preferred_device: "mps" or "cuda" to attempt an accelerator; any other
            value keeps the model on CPU.

    Returns:
        ``(model, device)`` where *device* is the device the model actually
        ended up on ("mps", "cuda" or "cpu").
    """
    import whisper

    # The model is loaded on CPU and its sparse buffers are densified before
    # any device move (presumably because sparse buffers do not transfer to
    # every backend — TODO confirm).
    model = whisper.load_model(model_name, device="cpu")
    densify_sparse_buffers(model)

    if preferred_device not in ("mps", "cuda"):
        return model, "cpu"

    try:
        return model.to(preferred_device), preferred_device
    except Exception:
        # Deliberate best-effort: any failure to use the accelerator falls
        # back to CPU, for MPS and CUDA alike. Module.to() moves tensors one
        # by one, so a failed move can leave the model split across devices;
        # bring everything back before reporting "cpu".
        return model.to("cpu"), "cpu"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class WhisperTranscriber:
    """Whisper wrapper that loads its model on first use.

    ``device`` is "unknown" until the model has been loaded.
    """

    def __init__(self, model_name: str, language: str | None = None) -> None:
        """Remember the model name and optional language; nothing is loaded yet."""
        self.model_name = model_name
        self.language = language
        self._model: Any | None = None
        self.device = "unknown"

    def _load(self) -> None:
        """Load the model once; later calls return immediately."""
        if self._model is not None:
            return

        import torch

        self._model, self.device = load_whisper_model_robust(
            self.model_name, pick_preferred_device()
        )
        if self.device != "cpu":
            return
        # Best-effort tuning on CPU: a failure to set the thread count is ignored.
        with contextlib.suppress(Exception):
            torch.set_num_threads(max(1, os.cpu_count() or 1))

    def transcribe(self, audio_path: Path) -> tuple[dict[str, Any], str, str]:
        """Transcribe *audio_path*.

        Returns:
            ``(result, language, device)``: the model's result mapping, the
            stripped language taken from the result (falling back to the
            configured language, then "unknown"), and the device in use.
        """
        self._load()

        decode_options: dict[str, Any] = {
            "verbose": False,
            "fp16": False,
            "beam_size": 1,
            "best_of": 1,
            "temperature": 0,
            "condition_on_previous_text": False,
        }
        # The language is only passed when one is configured.
        if self.language:
            decode_options["language"] = self.language

        # Anything the model prints to stdout/stderr is discarded.
        with open(os.devnull, "w", encoding="utf-8") as sink, \
                contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink):
            result = self._model.transcribe(str(audio_path), **decode_options)

        language = result.get("language") or self.language or "unknown"
        return result, language.strip(), self.device
|