getscript 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- getscript/__init__.py +6 -0
- getscript/apple.py +255 -0
- getscript/cli.py +344 -0
- getscript/completions.py +81 -0
- getscript/config.py +64 -0
- getscript/detect.py +56 -0
- getscript/output.py +97 -0
- getscript/picker.py +69 -0
- getscript/progress.py +34 -0
- getscript/search.py +83 -0
- getscript/upload.py +131 -0
- getscript/youtube.py +58 -0
- getscript-0.12.0.dist-info/METADATA +125 -0
- getscript-0.12.0.dist-info/RECORD +18 -0
- getscript-0.12.0.dist-info/WHEEL +5 -0
- getscript-0.12.0.dist-info/entry_points.txt +2 -0
- getscript-0.12.0.dist-info/licenses/LICENSE +21 -0
- getscript-0.12.0.dist-info/top_level.txt +1 -0
getscript/config.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Configuration loading with XDG base directory compliance."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_config_dir() -> str:
    """Return the getscript config directory, honoring $XDG_CONFIG_HOME."""
    base = os.environ.get("XDG_CONFIG_HOME", os.path.expanduser("~/.config"))
    return os.path.join(base, "getscript")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_cache_dir() -> str:
    """Return the getscript cache directory, honoring $XDG_CACHE_HOME."""
    base = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
    return os.path.join(base, "getscript")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_config() -> dict:
    """Load config from the XDG config file.

    Returns:
        The parsed config dict, or an empty dict when the file is missing,
        unreadable, or invalid. Read/parse problems print a warning to
        stderr instead of raising.
    """
    import sys

    config_path = os.path.join(get_config_dir(), "config.json")
    # EAFP: attempt the open directly instead of exists()+open(), which had
    # a TOCTOU window and let OSError (e.g. permission denied) escape.
    try:
        with open(config_path) as f:
            return json.load(f)
    except FileNotFoundError:
        pass  # no config file is the normal, silent case
    except json.JSONDecodeError as e:
        print(f"Warning: invalid config at {config_path}: {e}", file=sys.stderr)
    except OSError as e:
        # Previously unhandled — an unreadable file crashed the CLI.
        print(f"Warning: could not read config at {config_path}: {e}", file=sys.stderr)
    return {}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def merge_config(file_config: dict, cli_args: dict) -> dict:
    """Merge config sources: file < env vars < CLI flags.

    CLI flags override env vars override file config.
    Only non-None CLI values override.
    """
    merged = dict(file_config)
    env = os.environ

    # Environment overrides, applied on top of the file config.
    if env.get("NO_COLOR"):
        merged["no_color"] = True
    for var, key in (
        ("GETSCRIPT_YOUTUBE_API_KEY", "youtube_api_key"),
        ("GETSCRIPT_PROXY", "proxy"),
        ("GETSCRIPT_COOKIE_FILE", "cookie_file"),
    ):
        value = env.get(var)
        if value:
            merged[key] = value
    # GETSCRIPT_UPLOAD acts as an opt-out switch.
    if env.get("GETSCRIPT_UPLOAD", "").lower() in ("0", "false", "no"):
        merged["no_upload"] = True
    for var, key in (
        ("GETSCRIPT_SUPABASE_URL", "supabase_url"),
        ("GETSCRIPT_SUPABASE_ANON_KEY", "supabase_anon_key"),
    ):
        value = env.get(var)
        if value:
            merged[key] = value

    # CLI flags win, but only when explicitly provided (non-None).
    merged.update({k: v for k, v in cli_args.items() if v is not None})
    return merged
|
getscript/detect.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Auto-detect transcript source from URL or ID."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from urllib.parse import urlparse, parse_qs
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def detect_source(input_str: str) -> tuple[str, str]:
    """Detect source and extract ID from a URL or bare ID.

    Returns:
        ("youtube", video_id) or ("apple", episode_id)

    Raises:
        ValueError with a helpful message if input can't be identified.
    """
    input_str = input_str.strip()

    # YouTube URL patterns: watch (?v=), short link, shorts, embed, live.
    # embed/live are anchored to the youtube.com host so they cannot
    # accidentally match a path segment of some other site's URL.
    yt_patterns = [
        r"v=([A-Za-z0-9_-]{11})",
        r"youtu\.be/([A-Za-z0-9_-]{11})",
        r"shorts/([A-Za-z0-9_-]{11})",
        r"youtube\.com/embed/([A-Za-z0-9_-]{11})",
        r"youtube\.com/live/([A-Za-z0-9_-]{11})",
    ]
    for pattern in yt_patterns:
        m = re.search(pattern, input_str)
        if m:
            return ("youtube", m.group(1))

    # Bare YouTube video ID (exactly 11 chars, alphanumeric + _ -)
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", input_str):
        return ("youtube", input_str)

    # Pure numeric string → Apple Podcasts episode ID
    if input_str.isdigit():
        return ("apple", input_str)

    # Apple Podcasts URL: the episode ID lives in the ?i= query parameter.
    if "podcasts.apple.com" in input_str:
        parsed = urlparse(input_str)
        qs = parse_qs(parsed.query)
        if "i" in qs:
            return ("apple", qs["i"][0])
        raise ValueError(
            "Apple Podcasts URL missing episode ID (?i=...). "
            "Open the episode in Apple Podcasts and copy the share link."
        )

    raise ValueError(
        f"Could not detect source from: {input_str}\n"
        "Supported inputs:\n"
        "  YouTube: https://youtube.com/watch?v=VIDEO_ID\n"
        "  YouTube: https://youtu.be/VIDEO_ID\n"
        "  Apple: https://podcasts.apple.com/...?i=EPISODE_ID\n"
        "  Apple: EPISODE_ID (numeric)"
    )
|
getscript/output.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Output formatting: plain text, JSON, TTML, Markdown."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from datetime import date
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def is_tty() -> bool:
    """Report whether stdout is attached to a terminal."""
    return bool(sys.stdout.isatty())
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def format_timestamp(seconds: float) -> str:
    """Format seconds as HH:MM:SS or MM:SS."""
    # Truncate to whole seconds, then peel off minutes and hours.
    mins, secs = divmod(int(seconds), 60)
    hours, mins = divmod(mins, 60)
    if hours > 0:
        return f"{hours:02d}:{mins:02d}:{secs:02d}"
    return f"{mins:02d}:{secs:02d}"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def format_text(segments: list[dict], timestamps: bool = False) -> str:
    """Format segments as plain text.

    Without timestamps: one space-joined paragraph. With timestamps:
    one "[MM:SS] text" line per segment.
    """
    if not timestamps:
        return " ".join(seg["text"] for seg in segments)
    stamped = [
        f"[{format_timestamp(seg.get('start', 0))}] {seg['text']}"
        for seg in segments
    ]
    return "\n".join(stamped)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def format_json(
    segments: list[dict],
    source: str,
    source_id: str,
    timestamps: bool = False,
) -> str:
    """Format as structured JSON.

    Segments keep their timing fields only when *timestamps* is set;
    otherwise they are reduced to text-only dicts.
    """
    if timestamps:
        out_segments = segments
    else:
        out_segments = [{"text": seg["text"]} for seg in segments]
    payload = {
        "source": source,
        "id": source_id,
        "text": " ".join(seg["text"] for seg in segments),
        "segments": out_segments,
    }
    return json.dumps(payload, indent=2, ensure_ascii=False)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def format_markdown(
    segments: list[dict],
    source: str,
    source_id: str,
    timestamps: bool = False,
) -> str:
    """Format as Markdown with YAML frontmatter."""
    frontmatter = [
        "---",
        f"source: {source}",
        f'id: "{source_id}"',
        f'date: "{date.today().isoformat()}"',
        "---",
        "",
        "# Transcript",
        "",
    ]
    body: list[str] = []
    if timestamps:
        # One bold-stamped paragraph per segment, blank-line separated.
        for seg in segments:
            stamp = format_timestamp(seg.get("start", 0))
            body.append(f"**[{stamp}]** {seg['text']}")
            body.append("")
    else:
        body.append(" ".join(seg["text"] for seg in segments))
        body.append("")
    return "\n".join(frontmatter + body)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def format_output(
|
|
81
|
+
segments: list[dict],
|
|
82
|
+
fmt: str = "text",
|
|
83
|
+
source: str = "",
|
|
84
|
+
source_id: str = "",
|
|
85
|
+
timestamps: bool = False,
|
|
86
|
+
ttml_raw: str | None = None,
|
|
87
|
+
) -> str:
|
|
88
|
+
"""Route to the appropriate formatter."""
|
|
89
|
+
if fmt == "ttml":
|
|
90
|
+
if ttml_raw is None:
|
|
91
|
+
raise ValueError("--ttml is only supported for Apple Podcasts transcripts")
|
|
92
|
+
return ttml_raw
|
|
93
|
+
if fmt == "json":
|
|
94
|
+
return format_json(segments, source, source_id, timestamps)
|
|
95
|
+
if fmt == "markdown":
|
|
96
|
+
return format_markdown(segments, source, source_id, timestamps)
|
|
97
|
+
return format_text(segments, timestamps)
|
getscript/picker.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Interactive selection via fzf."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def pick_result(results: list[dict]) -> dict | None:
    """Format results and pipe to fzf for interactive selection.

    Returns the selected result dict, or None if user cancelled.
    Raises RuntimeError if fzf is not installed.
    """
    if shutil.which("fzf") is None:
        raise RuntimeError(
            "fzf required for --search. "
            "Install: https://github.com/junegunn/fzf#installation"
        )

    def row(entry: dict) -> str:
        # Tab-separated columns: id, title, channel[, duration].
        fields = [entry["id"], entry["title"], entry["channel"]]
        if entry.get("duration"):
            fields.append(entry["duration"])
        return "\t".join(fields)

    fzf_input = "\n".join(row(entry) for entry in results)

    try:
        proc = subprocess.run(
            ["fzf", "--delimiter=\t", "--with-nth=2..", "--header=Select a result:"],
            input=fzf_input,
            capture_output=True,
            text=True,
        )
    except KeyboardInterrupt:
        return None

    # Any non-zero exit means no selection (130 = Esc/Ctrl-C in fzf).
    if proc.returncode != 0:
        return None

    selection = proc.stdout.strip()
    if not selection:
        return None

    # The hidden first column carries the ID; map it back to the result.
    chosen_id = selection.split("\t")[0]
    return next((entry for entry in results if entry["id"] == chosen_id), None)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def format_list(results: list[dict]) -> str:
    """Format results as a printable numbered list for --list mode."""
    rows = []
    for index, entry in enumerate(results, start=1):
        fields = [entry["id"], entry["title"], entry["channel"]]
        if entry.get("duration"):
            fields.append(entry["duration"])
        rows.append(f"{index:3d}. " + "\t".join(fields))
    return "\n".join(rows)
|
getscript/progress.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Lightweight stderr progress spinner (TTY-aware)."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
SPINNER_CHARS = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Progress:
|
|
10
|
+
"""Simple stderr status indicator. Auto-disabled when stderr is not a TTY."""
|
|
11
|
+
|
|
12
|
+
def __init__(self, quiet: bool = False):
|
|
13
|
+
self._enabled = sys.stderr.isatty() and not quiet
|
|
14
|
+
self._idx = 0
|
|
15
|
+
self._last_msg = ""
|
|
16
|
+
|
|
17
|
+
def update(self, message: str) -> None:
|
|
18
|
+
if not self._enabled:
|
|
19
|
+
return
|
|
20
|
+
char = SPINNER_CHARS[self._idx % len(SPINNER_CHARS)]
|
|
21
|
+
self._idx += 1
|
|
22
|
+
# Clear line and write status
|
|
23
|
+
sys.stderr.write(f"\r\033[K{char} {message}")
|
|
24
|
+
sys.stderr.flush()
|
|
25
|
+
self._last_msg = message
|
|
26
|
+
|
|
27
|
+
def done(self, message: str | None = None) -> None:
|
|
28
|
+
if not self._enabled:
|
|
29
|
+
return
|
|
30
|
+
if message:
|
|
31
|
+
sys.stderr.write(f"\r\033[K{message}\n")
|
|
32
|
+
else:
|
|
33
|
+
sys.stderr.write("\r\033[K")
|
|
34
|
+
sys.stderr.flush()
|
getscript/search.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Search backends for YouTube and Apple Podcasts."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import urllib.request
|
|
5
|
+
import urllib.parse
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def search_youtube(query: str, api_key: str, limit: int = 10) -> list[dict]:
    """Search YouTube via Data API v3.

    Returns list of {"id", "title", "channel", "duration"} dicts.
    Duration is not available from search endpoint, so set to "".
    """
    query_string = urllib.parse.urlencode({
        "q": query,
        "type": "video",
        "part": "snippet",
        "maxResults": min(limit, 50),  # API caps maxResults at 50
        "key": api_key,
    })
    request = urllib.request.Request(
        f"https://www.googleapis.com/youtube/v3/search?{query_string}"
    )
    request.add_header("Accept", "application/json")

    with urllib.request.urlopen(request, timeout=15) as resp:
        payload = json.loads(resp.read())

    matches = []
    for item in payload.get("items", []):
        video_id = item.get("id", {}).get("videoId")
        if not video_id:
            continue  # skip non-video items lacking a videoId
        snippet = item.get("snippet", {})
        matches.append({
            "id": video_id,
            "title": snippet.get("title", ""),
            "channel": snippet.get("channelTitle", ""),
            "duration": "",  # search endpoint does not return duration
        })
    return matches
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def search_apple(query: str, limit: int = 10) -> list[dict]:
    """Search Apple Podcasts via iTunes Search API (free, no auth).

    Returns list of {"id", "title", "channel", "duration"} dicts.
    """
    query_string = urllib.parse.urlencode({
        "term": query,
        "media": "podcast",
        "entity": "podcastEpisode",
        "limit": min(limit, 200),  # iTunes caps limit at 200
    })
    request = urllib.request.Request(f"https://itunes.apple.com/search?{query_string}")
    with urllib.request.urlopen(request, timeout=15) as resp:
        payload = json.loads(resp.read())

    episodes = []
    for item in payload.get("results", []):
        track_id = item.get("trackId")
        if not track_id:
            continue
        # trackTimeMillis is in milliseconds; render as H:MM:SS or M:SS.
        duration = ""
        millis = item.get("trackTimeMillis", 0)
        if millis:
            mins, secs = divmod(millis // 1000, 60)
            hours, mins = divmod(mins, 60)
            duration = f"{hours}:{mins:02d}:{secs:02d}" if hours else f"{mins}:{secs:02d}"
        episodes.append({
            "id": str(track_id),
            "title": item.get("trackName", ""),
            "channel": item.get("collectionName", ""),
            "duration": duration,
        })
    return episodes
|
getscript/upload.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Upload transcripts to the shared Voxly transcript pool."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import urllib.error
|
|
7
|
+
import urllib.request
|
|
8
|
+
import uuid
|
|
9
|
+
|
|
10
|
+
from getscript import __version__
|
|
11
|
+
from getscript.config import get_config_dir
|
|
12
|
+
|
|
13
|
+
# Default Supabase endpoint for the shared transcript pool. Both values
# can be overridden via config keys "supabase_url" / "supabase_anon_key"
# (see upload_transcript below).
SUPABASE_URL = "https://ohxuifdseybxckmprcry.supabase.co"
# Anonymous-role JWT; presumably intended to be public client-side, since
# it ships in the package — confirm it carries no privileged grants.
SUPABASE_ANON_KEY = (
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9."
    "eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6Im9oeHVpZmRzZXlieGNrbXByY3J5Iiwicm9sZSI6ImFub24iLCJpYXQiOjE3NzA2NDE5NDgsImV4cCI6MjA4NjIxNzk0OH0."
    "_4NFs2SY98gIL6Z0tgiTxIVSX7FBJ8b_46oF7Vi7p6M"
)

# Maps getscript's internal source names to the pool's source_type values;
# unknown sources pass through unchanged (see upload_transcript).
SOURCE_TYPE_MAP = {
    "youtube": "youtube_transcript",
    "apple": "podcast",
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_device_id() -> str:
    """Get or create a persistent anonymous device ID."""
    config_dir = get_config_dir()
    id_file = os.path.join(config_dir, "device.json")

    # Reuse an existing ID when the file is present and well-formed.
    if os.path.exists(id_file):
        try:
            with open(id_file) as handle:
                return json.load(handle)["device_id"]
        except (json.JSONDecodeError, KeyError):
            pass  # corrupt file — fall through and regenerate

    new_id = str(uuid.uuid4())
    os.makedirs(config_dir, exist_ok=True)
    with open(id_file, "w") as handle:
        json.dump({"device_id": new_id}, handle)
    os.chmod(id_file, 0o600)  # owner-only read/write
    return new_id
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def fetch_title(source: str, source_id: str) -> str | None:
|
|
46
|
+
"""Fetch video/episode title via oembed. Returns None on failure."""
|
|
47
|
+
if source == "youtube":
|
|
48
|
+
url = f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={source_id}&format=json"
|
|
49
|
+
else:
|
|
50
|
+
return None
|
|
51
|
+
try:
|
|
52
|
+
req = urllib.request.Request(url)
|
|
53
|
+
with urllib.request.urlopen(req, timeout=5) as resp:
|
|
54
|
+
data = json.loads(resp.read().decode("utf-8"))
|
|
55
|
+
return data.get("title")
|
|
56
|
+
except Exception:
|
|
57
|
+
return None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _build_source_url(source: str, source_id: str) -> str:
|
|
61
|
+
if source == "youtube":
|
|
62
|
+
return f"https://www.youtube.com/watch?v={source_id}"
|
|
63
|
+
elif source == "apple":
|
|
64
|
+
return f"https://podcasts.apple.com/podcast/ep?i={source_id}"
|
|
65
|
+
return source_id
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def upload_transcript(
    source: str,
    source_id: str,
    segments: list[dict],
    title: str | None,
    config: dict,
) -> dict | None:
    """Upload a transcript to the shared pool. Returns response dict or None on failure.

    Never raises — all errors are printed to stderr.
    """
    try:
        endpoint_base = config.get("supabase_url", SUPABASE_URL).rstrip("/")
        bearer_token = config.get("supabase_anon_key", SUPABASE_ANON_KEY)

        text = " ".join(seg.get("text", "") for seg in segments)
        body = {
            "device_id": get_device_id(),
            "source_type": SOURCE_TYPE_MAP.get(source, source),
            "source_id": source_id,
            "source_url": _build_source_url(source, source_id),
            "title": title,
            "segments": segments,
            "full_text": text,
            "word_count": len(text.split()),
            "cli_version": __version__,
        }

        request = urllib.request.Request(
            f"{endpoint_base}/functions/v1/ingest-transcript",
            data=json.dumps(body).encode("utf-8"),
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {bearer_token}",
            },
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=10) as resp:
            return json.loads(resp.read().decode("utf-8"))

    except urllib.error.HTTPError as e:
        # Include the response body when we can read it — server-side
        # validation errors land here.
        detail = ""
        try:
            detail = e.read().decode("utf-8", errors="replace")
        except Exception:
            pass
        print(f"Warning: upload failed (HTTP {e.code}): {detail}", file=sys.stderr)
        return None
    except urllib.error.URLError as e:
        print(f"Warning: upload failed (network): {e.reason}", file=sys.stderr)
        return None
    except Exception as e:
        # Deliberate catch-all: upload is best-effort and must not crash
        # the CLI after a successful transcript fetch.
        print(f"Warning: upload failed: {e}", file=sys.stderr)
        return None
|
getscript/youtube.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""YouTube transcript fetching."""
|
|
2
|
+
|
|
3
|
+
import http.cookiejar
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from requests import Session
|
|
7
|
+
from youtube_transcript_api import YouTubeTranscriptApi
|
|
8
|
+
from youtube_transcript_api.proxies import GenericProxyConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _load_cookies(cookie_path: str) -> http.cookiejar.MozillaCookieJar:
|
|
12
|
+
"""Load Netscape/Mozilla format cookies from file."""
|
|
13
|
+
jar = http.cookiejar.MozillaCookieJar(cookie_path)
|
|
14
|
+
jar.load(ignore_discard=True, ignore_expires=True)
|
|
15
|
+
return jar
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _build_api(config: dict) -> YouTubeTranscriptApi:
    """Build YouTubeTranscriptApi with optional proxy and cookie config.

    Raises:
        FileNotFoundError: if config names a cookie file that doesn't exist.
    """
    proxy_url = config.get("proxy")
    cookie_file = config.get("cookie_file")

    proxy_config = GenericProxyConfig(https_url=proxy_url) if proxy_url else None

    http_client = None
    if cookie_file:
        cookie_path = os.path.expanduser(cookie_file)
        if not os.path.exists(cookie_path):
            raise FileNotFoundError(f"Cookie file not found: {cookie_path}")
        # Attach the cookie jar to a requests session the API will use.
        http_client = Session()
        http_client.cookies = _load_cookies(cookie_path)

    return YouTubeTranscriptApi(proxy_config=proxy_config, http_client=http_client)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def fetch_transcript(video_id: str, config: dict | None = None) -> list[dict]:
    """Fetch transcript segments for a YouTube video.

    Args:
        video_id: YouTube video ID.
        config: Optional config dict with proxy/cookie_file keys.

    Returns:
        List of {"text": str, "start": float, "duration": float}
    """
    api = _build_api({} if config is None else config)
    fetched = api.fetch(video_id)
    segments = []
    for piece in fetched:
        segments.append({
            "text": piece.text,
            "start": piece.start,
            "duration": piece.duration,
        })
    return segments
|