@jakende/media-info-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +178 -0
- package/bin/media-information-download.js +102 -0
- package/media_information_download/__init__.py +0 -0
- package/media_information_download/audio.py +33 -0
- package/media_information_download/config.py +93 -0
- package/media_information_download/downloaders/__init__.py +0 -0
- package/media_information_download/downloaders/http.py +56 -0
- package/media_information_download/downloaders/youtube.py +89 -0
- package/media_information_download/models.py +29 -0
- package/media_information_download/output.py +86 -0
- package/media_information_download/pipeline.py +164 -0
- package/media_information_download/sources/__init__.py +0 -0
- package/media_information_download/sources/rss.py +132 -0
- package/media_information_download/sources/youtube.py +41 -0
- package/media_information_download/transcription.py +109 -0
- package/media_information_download/tui.py +942 -0
- package/media_tui.py +8 -0
- package/package.json +36 -0
- package/pyproject.toml +26 -0
- package/requirements-transcribe.txt +3 -0
- package/requirements.txt +1 -0
- package/youtube_download.py +63 -0
- package/youtube_download_transcribe.py +67 -0
package/README.md
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# Media Information Download
|
|
2
|
+
|
|
3
|
+
Terminal application for downloading media from YouTube URLs or RSS feeds, converting audio to MP3, and generating Whisper Markdown transcripts.
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
|
|
7
|
+
- Python 3.10+
|
|
8
|
+
- ffmpeg on PATH
|
|
9
|
+
- A terminal with ANSI escape support for the framed TUI. macOS Terminal, iTerm2, Windows Terminal, and current PowerShell terminals are supported.
|
|
10
|
+
|
|
11
|
+
Python dependencies are installed into the project-local `.venv` folder. Run commands from this folder so the local environment is used.
|
|
12
|
+
|
|
13
|
+
## macOS Setup
|
|
14
|
+
|
|
15
|
+
Option 1, installer script:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
zsh scripts/install_macos.sh
|
|
19
|
+
./run.sh
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Option 2, manual setup:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
brew install ffmpeg
|
|
26
|
+
python3 -m venv .venv
|
|
27
|
+
source .venv/bin/activate
|
|
28
|
+
pip install -r requirements-transcribe.txt
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Optional editable install:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
source .venv/bin/activate
|
|
35
|
+
pip install -e ".[transcribe]"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Windows Setup
|
|
39
|
+
|
|
40
|
+
Use Windows Terminal or PowerShell. Install Python 3.10+ and ffmpeg first, then use the local project environment.
|
|
41
|
+
|
|
42
|
+
Option 1, installer script:
|
|
43
|
+
|
|
44
|
+
```powershell
|
|
45
|
+
winget install Python.Python.3.12
|
|
46
|
+
winget install Gyan.FFmpeg
|
|
47
|
+
powershell -ExecutionPolicy Bypass -File .\scripts\install_windows.ps1
|
|
48
|
+
.\run.ps1
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Option 2, manual setup:
|
|
52
|
+
|
|
53
|
+
```powershell
|
|
54
|
+
py -3 -m venv .venv
|
|
55
|
+
.\.venv\Scripts\python.exe -m pip install -r requirements-transcribe.txt
|
|
56
|
+
.\.venv\Scripts\python.exe -m pip install -e ".[transcribe]"
|
|
57
|
+
.\.venv\Scripts\python.exe media_tui.py
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
After editable install, these console commands are available inside the active environment on both platforms:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
media-information-download
|
|
64
|
+
media-info-download
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## npm Install
|
|
68
|
+
|
|
69
|
+
The package is also published as an npm CLI wrapper. It still requires Python 3.10+ and ffmpeg on `PATH`.
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
npm install -g @jakende/media-info-cli
|
|
73
|
+
media-information-download
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
On first run, the npm wrapper creates a Python virtual environment in `~/.media-information-download/venv` and installs the Python dependencies there. To use a different venv location:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
MEDIA_INFORMATION_DOWNLOAD_VENV=/path/to/venv media-information-download
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## TUI
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
./run.sh
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
or:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
source .venv/bin/activate
|
|
92
|
+
python3 media_tui.py
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
On Windows:
|
|
96
|
+
|
|
97
|
+
```powershell
|
|
98
|
+
.\run.ps1
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
If the project is installed into the active environment, you can also run:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
media-information-download
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
The TUI lets you choose YouTube or RSS input, start downloads, watch progress messages, trigger transcription, and list or open generated files.
|
|
108
|
+
Long-running download, MP3 conversion, and transcription steps show a visible `WORKING` activity bar while active.
|
|
109
|
+
Interactive screens render inside a left-aligned framed viewport that resizes with the current terminal window.
|
|
110
|
+
|
|
111
|
+
Keyboard controls:
|
|
112
|
+
|
|
113
|
+
- Navigation controls are shown at the bottom of the active menu or submenu
|
|
114
|
+
- Up/Down: move through selectable menu items
|
|
115
|
+
- Enter: select
|
|
116
|
+
- Backspace: go back from a submenu or choice screen
|
|
117
|
+
- Escape: cancel/back from submenus; quit from the main menu
|
|
118
|
+
- URL entry screens show their own controls: type or paste text, Enter continues, Backspace/Escape goes back
|
|
119
|
+
- Paste is supported in URL entry screens, including terminal bracketed paste, macOS clipboard paste, and Windows clipboard paste with Ctrl+V in PowerShell/Windows Terminal
|
|
120
|
+
|
|
121
|
+
For multiple YouTube URLs, separate entries with commas, spaces, or line breaks:
|
|
122
|
+
|
|
123
|
+
```text
|
|
124
|
+
https://youtu.be/VIDEO_ONE
|
|
125
|
+
https://www.youtube.com/watch?v=VIDEO_TWO, https://youtu.be/VIDEO_THREE
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Non-Interactive Usage
|
|
129
|
+
|
|
130
|
+
Download and transcribe a YouTube URL:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
python3 media_tui.py --source youtube --url "https://www.youtube.com/watch?v=VIDEO_ID"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Download and convert a YouTube URL to MP3 without transcription:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
python3 media_tui.py --source youtube --url "https://www.youtube.com/watch?v=VIDEO_ID" --no-transcribe
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Download supported media from an RSS feed and transcribe it:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
python3 media_tui.py --source rss --url "https://example.com/feed.xml"
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Compatibility commands still work:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
python3 youtube_download.py --url "https://www.youtube.com/watch?v=VIDEO_ID"
|
|
152
|
+
python3 youtube_download_transcribe.py --url "https://www.youtube.com/watch?v=VIDEO_ID"
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Configuration
|
|
156
|
+
|
|
157
|
+
- `MEDIA_OUTPUT_DIR`: output folder. Defaults to the `output` folder inside the project (package) directory, regardless of the current working directory
|
|
158
|
+
- `YTDL_OUTPUT_DIR`: legacy output folder setting, used only when `MEDIA_OUTPUT_DIR` is unset or empty
|
|
159
|
+
- `WHISPER_MODEL`: Whisper model. Defaults to `large`
|
|
160
|
+
- `WHISPER_LANGUAGE`: optional language code. If unset, Whisper auto-detects language
|
|
161
|
+
- `YTDL_COOKIES_FROM_BROWSER`: optional browser cookies for YouTube, for example `safari` or `chrome`
|
|
162
|
+
|
|
163
|
+
All generated audio is saved as `.mp3`. Non-MP3 RSS downloads are converted and removed as intermediates, so new RSS audio output does not remain as `.wav`. Transcripts are saved as `.md` next to the MP3 files.
|
|
164
|
+
|
|
165
|
+
## Architecture
|
|
166
|
+
|
|
167
|
+
- `media_tui.py`: direct script entry point
|
|
168
|
+
- `youtube_download.py`: compatibility entry point for YouTube download and MP3 conversion
|
|
169
|
+
- `youtube_download_transcribe.py`: compatibility entry point for YouTube download, MP3 conversion, and transcription
|
|
170
|
+
- `media_information_download/sources/`: input source handling for YouTube and RSS
|
|
171
|
+
- `media_information_download/downloaders/`: YouTube and HTTP media downloaders
|
|
172
|
+
- `media_information_download/audio.py`: audio extraction and MP3 conversion
|
|
173
|
+
- `media_information_download/transcription.py`: Whisper model loading and transcription, adapted from the macOS transcription workflow
|
|
174
|
+
- `media_information_download/output.py`: Markdown transcript and output file handling
|
|
175
|
+
- `media_information_download/pipeline.py`: orchestration across source, download, conversion, transcription, and output
|
|
176
|
+
- `media_information_download/tui.py`: terminal UI and non-interactive CLI entry point
|
|
177
|
+
|
|
178
|
+
This structure keeps new media sources or formats isolated to source handlers, downloaders, and supported format lists.
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
|
|
4
|
+
const fs = require("node:fs");
|
|
5
|
+
const os = require("node:os");
|
|
6
|
+
const path = require("node:path");
|
|
7
|
+
const { spawnSync } = require("node:child_process");
|
|
8
|
+
|
|
9
|
+
const packageRoot = path.resolve(__dirname, "..");
|
|
10
|
+
const venvRoot = process.env.MEDIA_INFORMATION_DOWNLOAD_VENV
|
|
11
|
+
? path.resolve(process.env.MEDIA_INFORMATION_DOWNLOAD_VENV)
|
|
12
|
+
: path.join(os.homedir(), ".media-information-download", "venv");
|
|
13
|
+
const isWindows = process.platform === "win32";
|
|
14
|
+
const venvPython = isWindows
|
|
15
|
+
? path.join(venvRoot, "Scripts", "python.exe")
|
|
16
|
+
: path.join(venvRoot, "bin", "python");
|
|
17
|
+
|
|
18
|
+
/**
 * Run a command synchronously with the package root as working directory.
 *
 * The child inherits the current environment, except that PYTHONPATH is
 * set to the package root so the bundled Python package can be imported.
 *
 * @param {string} command Executable to start.
 * @param {string[]} args Arguments passed to the executable.
 * @param {{stdio?: string}} [options] Optional stdio mode; defaults to "inherit".
 * @returns {object} The raw result object returned by spawnSync.
 */
function run(command, args, options = {}) {
  const childEnv = { ...process.env, PYTHONPATH: packageRoot };
  const stdioMode = options.stdio || "inherit";
  return spawnSync(command, args, {
    cwd: packageRoot,
    stdio: stdioMode,
    env: childEnv,
  });
}
|
|
29
|
+
|
|
30
|
+
/**
 * Report whether a command can be started and exits successfully.
 *
 * @param {string} command Executable name or path to probe.
 * @param {string[]} [args] Probe arguments; defaults to ["--version"].
 * @returns {boolean} True only when the process exits with status 0.
 */
function commandExists(command, args = ["--version"]) {
  // Output is discarded; only the exit status matters.
  const { status } = spawnSync(command, args, { stdio: "ignore" });
  return status === 0;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Pick the Python launcher used to create the virtual environment.
 *
 * Order of preference: the PYTHON environment variable (used as-is, not
 * probed), the Windows "py -3" launcher, then "python3", then "python".
 *
 * @returns {{command: string, args: string[]} | null} Launcher description,
 *   or null when no candidate responded to "--version".
 */
function pythonCommand() {
  const override = process.env.PYTHON;
  if (override) {
    return { command: override, args: [] };
  }

  if (isWindows && commandExists("py", ["-3", "--version"])) {
    return { command: "py", args: ["-3"] };
  }

  for (const candidate of ["python3", "python"]) {
    if (commandExists(candidate)) {
      return { command: candidate, args: [] };
    }
  }
  return null;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Create the Python virtual environment if its interpreter is missing.
 *
 * Exits the process with a non-zero code when no Python launcher is found
 * or when "python -m venv" fails.
 */
function ensureVenv() {
  const alreadyCreated = fs.existsSync(venvPython);
  if (alreadyCreated) {
    return;
  }

  // Make sure the parent folder of the venv exists before creating it.
  fs.mkdirSync(path.dirname(venvRoot), { recursive: true });

  const launcher = pythonCommand();
  if (launcher === null) {
    console.error("Python 3.10+ is required. Install Python and rerun this command.");
    process.exit(1);
  }

  const venvArgs = launcher.args.concat(["-m", "venv", venvRoot]);
  const created = run(launcher.command, venvArgs);
  if (created.status !== 0) {
    process.exit(created.status || 1);
  }
}
|
|
68
|
+
|
|
69
|
+
/**
 * Install the Python requirements into the venv when they are not importable.
 *
 * Probes the venv interpreter with "import yt_dlp, whisper, torch"; if that
 * fails, runs "pip install -r requirements-transcribe.txt" and exits the
 * process with pip's status when the installation fails.
 */
function ensureDependencies() {
  const probe = run(venvPython, ["-c", "import yt_dlp, whisper, torch"], { stdio: "ignore" });
  if (probe.status === 0) {
    return;
  }

  const requirementsFile = path.join(packageRoot, "requirements-transcribe.txt");
  const install = run(venvPython, ["-m", "pip", "install", "-r", requirementsFile]);
  if (install.status !== 0) {
    process.exit(install.status || 1);
  }
}
|
|
86
|
+
|
|
87
|
+
/**
 * Print a warning when ffmpeg cannot be located on PATH.
 *
 * Uses "where" on Windows and "which" elsewhere; never aborts the launcher.
 */
function warnIfFfmpegMissing() {
  const locator = isWindows ? "where" : "which";
  const found = commandExists(locator, ["ffmpeg"]);
  if (found) {
    return;
  }
  console.warn("Warning: ffmpeg is not available on PATH. Downloads may fail during MP3 conversion.");
}
|
|
93
|
+
|
|
94
|
+
// Prepare the environment, then hand control to the Python entry point.
ensureVenv();
ensureDependencies();
warnIfFfmpegMissing();

const child = run(venvPython, [path.join(packageRoot, "media_tui.py"), ...process.argv.slice(2)]);

// spawnSync reports launch failures (for example a missing interpreter) via
// `error` with a null status; that must not be reported as success.
if (child.error) {
  console.error(`Failed to start Python: ${child.error.message}`);
  process.exit(1);
}

if (child.signal) {
  // Re-raise the signal that terminated the child so the parent shell sees it.
  process.kill(process.pid, child.signal);
  // If the signal did not terminate this process, fall back to the
  // conventional 128 + signal number exit code instead of exiting with 0.
  const signalNumber = os.constants.signals[child.signal] || 0;
  process.exit(128 + signalNumber);
}

// A null status without error or signal is unexpected; treat it as failure.
process.exit(child.status === null ? 1 : child.status);
|
|
File without changes
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def convert_to_mp3(media_path: Path, output_dir: Path) -> Path:
    """Convert a media file to MP3 with ffmpeg and return the MP3 path.

    The target is ``output_dir/<stem of media_path>.mp3``. When the input
    already is that exact file and it exists, no conversion is run.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status; the message
            carries the last lines of ffmpeg's stderr output.
    """
    target = output_dir / f"{media_path.stem}.mp3"
    input_is_target = media_path.resolve() == target.resolve() and target.exists()
    if input_is_target:
        return target

    command = [
        "ffmpeg",
        "-y",  # overwrite an existing target without prompting
        "-i",
        str(media_path),
        "-vn",  # drop any video stream
        "-codec:a",
        "libmp3lame",
        "-q:a",
        "2",
        str(target),
    ]
    completed = subprocess.run(
        command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )
    if completed.returncode == 0:
        return target

    stderr_text = completed.stderr or ""
    last_lines = stderr_text.strip().splitlines()[-5:]
    raise RuntimeError("ffmpeg failed to create MP3. " + " ".join(last_lines))
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
|
10
|
+
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "output"
|
|
11
|
+
SUPPORTED_MEDIA_EXTENSIONS = {
|
|
12
|
+
".aac",
|
|
13
|
+
".avi",
|
|
14
|
+
".flac",
|
|
15
|
+
".m4a",
|
|
16
|
+
".m4v",
|
|
17
|
+
".mkv",
|
|
18
|
+
".mov",
|
|
19
|
+
".mp3",
|
|
20
|
+
".mp4",
|
|
21
|
+
".ogg",
|
|
22
|
+
".opus",
|
|
23
|
+
".wav",
|
|
24
|
+
".webm",
|
|
25
|
+
".wma",
|
|
26
|
+
}
|
|
27
|
+
SUPPORTED_MEDIA_MIME_PREFIXES = ("audio/", "video/")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_output_dir() -> Path:
    """Return the resolved output directory, creating it when necessary.

    ``MEDIA_OUTPUT_DIR`` takes precedence over ``YTDL_OUTPUT_DIR``; when
    neither is set (or both are empty) ``DEFAULT_OUTPUT_DIR`` is used.
    """
    configured = os.environ.get("MEDIA_OUTPUT_DIR") or os.environ.get("YTDL_OUTPUT_DIR")
    if configured:
        directory = Path(configured).expanduser()
    else:
        directory = DEFAULT_OUTPUT_DIR

    directory = directory.resolve()
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get_model_name() -> str:
    """Return the Whisper model name from ``WHISPER_MODEL``.

    Falls back to ``"large"`` when the variable is unset, empty, or only
    whitespace.
    """
    configured = os.environ.get("WHISPER_MODEL")
    name = configured.strip() if configured else ""
    return name if name else "large"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_whisper_language() -> str | None:
    """Return the trimmed ``WHISPER_LANGUAGE`` value, or ``None`` if blank."""
    configured = os.environ.get("WHISPER_LANGUAGE")
    if not configured:
        return None
    code = configured.strip()
    return code if code else None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_cookies_from_browser() -> tuple[str, ...] | None:
    """Parse ``YTDL_COOKIES_FROM_BROWSER`` into a tuple of non-empty parts.

    The value is split once on ``":"``; each part is trimmed and empty
    parts are dropped. Returns ``None`` when nothing usable remains.
    """
    spec = os.environ.get("YTDL_COOKIES_FROM_BROWSER", "").strip()
    if not spec:
        return None

    trimmed = (piece.strip() for piece in spec.split(":", maxsplit=1))
    pieces = tuple(piece for piece in trimmed if piece)
    return pieces or None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def is_project_venv_active() -> bool:
    """Return True when Python runs from a virtual environment of this project.

    Recognized environments:

    * the venv named by ``MEDIA_INFORMATION_DOWNLOAD_VENV`` (npm launcher
      override),
    * the project-local ``.venv`` folder below ``PROJECT_ROOT``,
    * ``~/.media-information-download/venv``, the default location the npm
      launcher creates.

    Without the npm locations every run started through the npm wrapper
    would be reported as "not running from the project venv".
    """
    active_prefix = Path(sys.prefix).resolve()

    def _is_active(venv_dir: Path) -> bool:
        # Match the venv root itself or any directory nested inside it.
        resolved = venv_dir.resolve()
        return active_prefix == resolved or resolved in active_prefix.parents

    override = os.environ.get("MEDIA_INFORMATION_DOWNLOAD_VENV", "").strip()
    if override and _is_active(Path(override).expanduser()):
        return True

    if _is_active(PROJECT_ROOT / ".venv"):
        return True

    try:
        npm_default = Path.home() / ".media-information-download" / "venv"
    except RuntimeError:
        # Home directory cannot be determined; no npm default to compare with.
        return False
    return _is_active(npm_default)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def dependency_status(include_transcription: bool = True) -> list[str]:
    """Collect human-readable messages about missing runtime requirements.

    Checks, in order: the active virtual environment, ffmpeg on PATH, the
    ``yt_dlp`` package and, when ``include_transcription`` is true, the
    ``torch`` and ``whisper`` packages. An empty list means nothing was
    found to be missing.
    """
    problems: list[str] = []

    if not is_project_venv_active():
        expected_venv = PROJECT_ROOT / ".venv"
        problems.append(f"Python is not running from the project venv: expected {expected_venv}")

    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path is None:
        problems.append("ffmpeg is not available on PATH.")

    # Imports are attempted only to learn whether they succeed; any failure
    # (not just ImportError) counts as "not installed".
    try:
        import yt_dlp  # noqa: F401
    except Exception:
        problems.append("Python package 'yt-dlp' is not installed in the active environment.")

    if not include_transcription:
        return problems

    try:
        import torch  # noqa: F401
        import whisper  # noqa: F401
    except Exception:
        problems.append(
            "Whisper dependencies are missing. Install local deps with: "
            ". .venv/bin/activate && pip install -r requirements-transcribe.txt"
        )

    return problems
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def require_dependencies(include_transcription: bool = True) -> None:
    """Raise if ``dependency_status`` reports any problem.

    Raises:
        RuntimeError: with all reported messages joined by newlines.
    """
    problems = dependency_status(include_transcription=include_transcription)
    if not problems:
        return
    raise RuntimeError("\n".join(problems))
|
|
File without changes
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import mimetypes
|
|
4
|
+
import re
|
|
5
|
+
import urllib.parse
|
|
6
|
+
import urllib.request
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from media_information_download.models import MediaItem
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Matches every run of characters that is NOT a letter, digit, dot,
# underscore, parenthesis, space or hyphen.
SAFE_FILENAME_RE = re.compile(r"[^A-Za-z0-9._() -]+")


def safe_filename(value: str, fallback: str = "media") -> str:
    """Turn arbitrary text into a conservative file name.

    Each run of disallowed characters becomes a single underscore, then
    leading/trailing spaces, dots and underscores are stripped. When nothing
    is left, ``fallback`` is returned.
    """
    replaced = SAFE_FILENAME_RE.sub("_", value)
    trimmed = replaced.strip(" ._")
    if trimmed:
        return trimmed
    return fallback
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _extension_from_item(item: MediaItem, response_url: str) -> str:
|
|
21
|
+
path_suffix = Path(urllib.parse.urlparse(response_url).path).suffix
|
|
22
|
+
if path_suffix:
|
|
23
|
+
return path_suffix
|
|
24
|
+
|
|
25
|
+
guessed = mimetypes.guess_extension(item.mime_type.split(";", maxsplit=1)[0].strip())
|
|
26
|
+
return guessed or ".media"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _dedupe_path(path: Path) -> Path:
|
|
30
|
+
if not path.exists():
|
|
31
|
+
return path
|
|
32
|
+
|
|
33
|
+
for index in range(1, 1000):
|
|
34
|
+
candidate = path.with_name(f"{path.stem}-{index}{path.suffix}")
|
|
35
|
+
if not candidate.exists():
|
|
36
|
+
return candidate
|
|
37
|
+
raise RuntimeError(f"Could not create unique output path for {path}")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class HTTPDownloader:
    """Download a media item over HTTP(S) into an output directory."""

    def download(self, item: MediaItem, output_dir: Path) -> Path:
        """Fetch ``item.media_url`` and store it below ``output_dir``.

        The file name is derived from ``item.title``; the extension comes
        from the final (post-redirect) URL or the item's MIME type. An
        existing file is never overwritten; a numbered name is used instead.

        Raises:
            RuntimeError: if the URL scheme is not ``http`` or ``https``.
            Exception: network and file errors propagate unchanged; a
                partially written file is removed first.
        """
        # Media URLs come from external input (e.g. feed entries), and
        # urlopen would also honor schemes such as file://, which could copy
        # arbitrary local files into the output folder.
        scheme = urllib.parse.urlparse(item.media_url).scheme.lower()
        if scheme not in {"http", "https"}:
            raise RuntimeError(
                f"Unsupported media URL scheme for HTTP download: {item.media_url}"
            )

        request = urllib.request.Request(
            item.media_url,
            headers={"User-Agent": "media-information-download/1.0"},
        )
        with urllib.request.urlopen(request, timeout=60) as response:
            extension = _extension_from_item(item, response.geturl())
            target = _dedupe_path(output_dir / f"{safe_filename(item.title)}{extension}")
            try:
                with target.open("wb") as file:
                    # Stream in 1 MiB chunks to keep memory use bounded.
                    while chunk := response.read(1024 * 1024):
                        file.write(chunk)
            except BaseException:
                # Do not leave a truncated download behind.
                target.unlink(missing_ok=True)
                raise

        return target
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from media_information_download.config import get_cookies_from_browser
|
|
7
|
+
from media_information_download.models import MediaItem
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import yt_dlp # type: ignore
|
|
11
|
+
except Exception: # pragma: no cover - checked at runtime
|
|
12
|
+
yt_dlp = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _build_ydl_opts(output_dir: Path, format_selector: str) -> dict[str, Any]:
    """Assemble the option dictionary handed to ``yt_dlp.YoutubeDL``.

    Files are written to ``output_dir`` as ``<title>.<ext>``. Browser
    cookies are added only when ``get_cookies_from_browser`` returns a value.
    """
    user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/136.0.0.0 Safari/537.36"
    )
    youtube_args = {"player_client": ["ios", "android", "web"]}
    output_template = str(output_dir / "%(title)s.%(ext)s")

    options: dict[str, Any] = {
        "format": format_selector,
        "merge_output_format": "mp4",
        "outtmpl": output_template,
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "retries": 10,
        "fragment_retries": 10,
        "extractor_args": {"youtube": youtube_args},
        "http_headers": {"User-Agent": user_agent},
    }

    browser_cookies = get_cookies_from_browser()
    if browser_cookies:
        options["cookiesfrombrowser"] = browser_cookies

    return options
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _resolve_downloaded_path(output_dir: Path, prepared_filename: Path) -> Path:
|
|
47
|
+
if prepared_filename.exists():
|
|
48
|
+
return prepared_filename
|
|
49
|
+
|
|
50
|
+
stem = prepared_filename.stem
|
|
51
|
+
matches = sorted(
|
|
52
|
+
path
|
|
53
|
+
for path in output_dir.glob(f"{stem}.*")
|
|
54
|
+
if path.suffix.lower() not in {".part", ".ytdl"}
|
|
55
|
+
)
|
|
56
|
+
if matches:
|
|
57
|
+
return matches[0]
|
|
58
|
+
|
|
59
|
+
return prepared_filename
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class YouTubeDownloader:
    """Download a media item through yt-dlp."""

    def download(self, item: MediaItem, output_dir: Path) -> Path:
        """Download ``item.media_url`` into ``output_dir`` and return the file path.

        Format selectors are tried in order; the next one is attempted only
        when the failure message contains ``"HTTP Error 403"``. Any other
        error is re-raised immediately.

        Raises:
            RuntimeError: if yt-dlp could not be imported.
            Exception: the last 403-related error once all selectors failed.
        """
        if yt_dlp is None:
            raise RuntimeError("Python package 'yt-dlp' is not installed.")

        attempts = [
            ("bv*+ba/b", "best split video/audio streams"),
            ("best[ext=mp4]/best", "progressive MP4 fallback"),
        ]
        failure: Exception | None = None

        for selector, _description in attempts:
            try:
                options = _build_ydl_opts(output_dir, selector)
                with yt_dlp.YoutubeDL(options) as ydl:
                    info = ydl.extract_info(item.media_url, download=True)
                    prepared = Path(ydl.prepare_filename(info))
                return _resolve_downloaded_path(output_dir, prepared)
            except Exception as exc:
                # Only a 403 justifies retrying with the next selector.
                if "HTTP Error 403" not in str(exc):
                    raise
                failure = exc

        if failure is not None:
            raise failure
        raise RuntimeError("YouTube download failed without a captured exception.")
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Callable
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True)
class MediaItem:
    """Immutable description of one piece of media to process."""

    # Required fields (positional order matters to callers).
    source_type: str
    source_url: str
    media_url: str
    title: str

    # Optional metadata; all default to an empty string.
    mime_type: str = ""
    published: str = ""
    description: str = ""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class ProcessedMedia:
    """Mutable record of what has been produced for one ``MediaItem``."""

    item: MediaItem

    # Paths default to None until a value is assigned.
    downloaded_path: Path | None = None
    mp3_path: Path | None = None
    transcript_path: Path | None = None

    # Error text, if any; None by default.
    error: str | None = None

    # Each instance gets its own list (default_factory), never a shared one.
    notes: list[str] = field(default_factory=list)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
ProgressCallback = Callable[[str], None]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from media_information_download.models import MediaItem
|
|
7
|
+
from media_information_download.transcription import seconds_to_timecode
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def yaml_quote(value: str) -> str:
    """Wrap ``value`` in double quotes, escaping backslashes and double quotes."""
    escapes = {ord("\\"): "\\\\", ord('"'): '\\"'}
    return f'"{value.translate(escapes)}"'
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def build_markdown(
    item: MediaItem,
    audio_path: Path,
    model_name: str,
    device: str,
    language: str,
    fps: int,
    segments: list[dict],
    full_text: str,
    include_timecodes: bool = True,
) -> str:
    """Render a transcript as Markdown with a YAML front matter block.

    The body lists every non-empty segment as ``start - end`` timecode
    followed by its text. ``full_text`` is used instead when
    ``include_timecodes`` is false or no segment has text.
    """
    created = datetime.now().strftime("%Y-%m-%d %H:%M")

    blocks = []
    for segment in segments:
        start_tc = seconds_to_timecode(float(segment.get("start", 0.0)), fps=fps)
        end_tc = seconds_to_timecode(float(segment.get("end", 0.0)), fps=fps)
        text = (segment.get("text") or "").strip()
        if not text:
            continue
        blocks.append(f"{start_tc} - {end_tc}\n{text}")

    if include_timecodes and blocks:
        body = "\n\n".join(blocks).strip()
    else:
        body = full_text

    lines = [
        "---",
        f"created: {yaml_quote(created)}",
        f"model: {yaml_quote(model_name)}",
        f"device: {yaml_quote(device)}",
        f"language: {yaml_quote(language)}",
        f"source_type: {yaml_quote(item.source_type)}",
        f"source_url: {yaml_quote(item.source_url)}",
        f"media_url: {yaml_quote(item.media_url)}",
        f"source_file: {yaml_quote(audio_path.name)}",
        f"fps_timecode: {fps}",
        f"timecodes: {str(include_timecodes).lower()}",
        "---",
        "",
        f"# Transkript: {item.title}",
        "",
        "---",
        "",
        f"{body}",
        "",  # keeps the trailing newline of the document
    ]
    return "\n".join(lines)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def write_transcript(
    output_dir: Path,
    item: MediaItem,
    audio_path: Path,
    model_name: str,
    device: str,
    language: str,
    fps: int,
    segments: list[dict],
    full_text: str,
) -> Path:
    """Write the Markdown transcript for ``audio_path`` and return its path.

    The file is named ``<stem of audio_path>.md`` inside ``output_dir`` and
    is written as UTF-8, replacing any existing file of that name.
    """
    content = build_markdown(
        item=item,
        audio_path=audio_path,
        model_name=model_name,
        device=device,
        language=language,
        fps=fps,
        segments=segments,
        full_text=full_text,
    )
    destination = output_dir / f"{audio_path.stem}.md"
    destination.write_text(content, encoding="utf-8")
    return destination
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def list_output_files(output_dir: Path) -> list[Path]:
    """Return the regular files directly inside ``output_dir``, sorted by path.

    Subdirectories are skipped; a missing directory yields an empty list.
    """
    if not output_dir.exists():
        return []

    files = [entry for entry in output_dir.iterdir() if entry.is_file()]
    files.sort()
    return files
|