@elisym/cli 0.18.1 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@elisym/cli",
|
|
3
|
-
"version": "0.18.1",
|
|
3
|
+
"version": "0.20.0",
|
|
4
4
|
"description": "CLI agent runner for elisym - provider mode, skills, crash recovery",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agents",
|
|
@@ -44,7 +44,7 @@
|
|
|
44
44
|
"prepublishOnly": "bun run build && node scripts/preflight-publish.mjs"
|
|
45
45
|
},
|
|
46
46
|
"dependencies": {
|
|
47
|
-
"@elisym/sdk": "~0.
|
|
47
|
+
"@elisym/sdk": "~0.23.0",
|
|
48
48
|
"@solana-program/memo": "~0.11.0",
|
|
49
49
|
"@solana-program/system": "~0.12.0",
|
|
50
50
|
"@solana-program/token": "~0.5.0",
|
|
@@ -12,18 +12,96 @@ Long transcripts are split into chunks for LLM processing.
|
|
|
12
12
|
|
|
13
13
|
import argparse
|
|
14
14
|
import hashlib
|
|
15
|
+
import ipaddress
|
|
15
16
|
import json
|
|
16
17
|
import os
|
|
17
18
|
import re
|
|
18
19
|
import subprocess
|
|
19
20
|
import sys
|
|
20
21
|
import tempfile
|
|
22
|
+
from urllib.parse import parse_qs, urlparse
|
|
21
23
|
|
|
22
24
|
# Upper bound on characters per transcript chunk (~7500 tokens, safe for rate limits).
CHUNK_SIZE = 30_000
# Absolute directory holding this script; the paths below are derived from it.
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
# Cache directory, kept next to the script.
CACHE_DIR = os.path.join(SCRIPT_DIR, ".cache")
# Path to cookies.txt in the parent of the script directory.
COOKIES_FILE = os.path.join(os.path.split(SCRIPT_DIR)[0], "cookies.txt")
|
|
26
28
|
|
|
29
|
+
# Exact-match allowlist of YouTube hosts. _host_is_allowed() additionally
# accepts other subdomains of youtube.com; IP literals and every other host
# are rejected there.
ALLOWED_YOUTUBE_HOSTS = frozenset({
    "www.youtube.com",
    "youtube.com",
    "m.youtube.com",
    "music.youtube.com",
    "youtu.be",
    "youtube-nocookie.com",
    "www.youtube-nocookie.com",
})
# YouTube video ids are exactly 11 chars from this charset. The pattern is
# anchored with \A/\Z rather than ^/$ because "$" also matches just before a
# trailing newline, which would let "<id>\n" through as a valid id.
VIDEO_ID_RE = re.compile(r"\A[A-Za-z0-9_-]{11}\Z")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _is_blocked_ip_host(hostname: str) -> bool:
|
|
44
|
+
"""True if the hostname is an IP literal in a private/loopback/link-local/metadata range."""
|
|
45
|
+
try:
|
|
46
|
+
ip = ipaddress.ip_address(hostname)
|
|
47
|
+
except ValueError:
|
|
48
|
+
return False
|
|
49
|
+
return (
|
|
50
|
+
ip.is_private
|
|
51
|
+
or ip.is_loopback
|
|
52
|
+
or ip.is_link_local
|
|
53
|
+
or ip.is_reserved
|
|
54
|
+
or ip.is_unspecified
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _host_is_allowed(hostname: str | None) -> bool:
    """Return True if *hostname* is an accepted YouTube host.

    A host is accepted when it is an exact member of ALLOWED_YOUTUBE_HOSTS,
    or when it is a well-formed subdomain of youtube.com (never a substring
    match). IP literals in blocked ranges and empty/None hosts are rejected.

    The subdomain part must consist of ordinary DNS labels (ASCII letters,
    digits and inner hyphens). A bare ``endswith(".youtube.com")`` would also
    accept hosts such as ".youtube.com", "a..youtube.com" or
    "evil.com\\.youtube.com"; the last one is dangerous because some HTTP
    clients treat a backslash as the end of the authority and would connect
    to "evil.com" instead.
    """
    if not hostname:
        return False
    host = hostname.lower()
    if _is_blocked_ip_host(host):
        return False
    if host in ALLOWED_YOUTUBE_HOSTS:
        return True

    suffix = ".youtube.com"
    if not host.endswith(suffix):
        return False

    # Validate every label to the left of ".youtube.com".
    label_chars = frozenset("abcdefghijklmnopqrstuvwxyz0123456789-")
    for label in host[: -len(suffix)].split("."):
        if not label or len(label) > 63:
            return False
        if label.startswith("-") or label.endswith("-"):
            return False
        if not label_chars.issuperset(label):
            return False
    return True
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def extract_video_id(parsed) -> str | None:
    """Extract the 11-char YouTube video id from a parsed URL, or None.

    Args:
        parsed: Result of ``urllib.parse.urlparse`` (only ``hostname``,
            ``path`` and ``query`` are read).

    Returns:
        The video id when one is found in a recognised position and matches
        VIDEO_ID_RE in full; otherwise None.

    Recognised forms:
        * youtu.be/<id>
        * ...?v=<id>
        * /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    """
    # fullmatch is used throughout: with a "$"-anchored pattern, match() would
    # also accept an id followed by a newline (reachable via "v=<id>%0A").
    host = (parsed.hostname or "").lower()
    if host == "youtu.be" or host.endswith(".youtu.be"):
        # Short links carry the id as the first path segment.
        candidate = parsed.path.lstrip("/").split("/", 1)[0]
        return candidate if VIDEO_ID_RE.fullmatch(candidate) else None

    query_id = parse_qs(parsed.query).get("v", [None])[0]
    if query_id and VIDEO_ID_RE.fullmatch(query_id):
        return query_id

    for prefix in ("/embed/", "/shorts/", "/live/", "/v/"):
        if parsed.path.startswith(prefix):
            candidate = parsed.path[len(prefix):].split("/", 1)[0]
            return candidate if VIDEO_ID_RE.fullmatch(candidate) else None
    return None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def canonicalize_youtube_url(url: str) -> str | None:
    """Validate a YouTube URL and return a host-controlled canonical URL.

    Prefers extracting the 11-char video id and rebuilding a canonical
    https://www.youtube.com/watch?v=<id> URL so the attacker never controls
    the host yt-dlp connects to. Falls back to the validated original URL when
    no id can be extracted but the host passed strict validation.

    Args:
        url: The user-supplied URL.

    Returns:
        The canonical watch URL, the original URL (fallback), or None if the
        URL is malformed or fails scheme/host validation.
    """
    try:
        parsed = urlparse(url)
        hostname = parsed.hostname
    except ValueError:
        # urlparse raises on malformed authorities (e.g. an unterminated
        # "[" IPv6 literal); treat that as an invalid URL, not a crash.
        return None
    if parsed.scheme not in ("http", "https"):
        return None
    if not _host_is_allowed(hostname):
        return None
    video_id = extract_video_id(parsed)
    if video_id:
        return f"https://www.youtube.com/watch?v={video_id}"
    return url
|
|
104
|
+
|
|
27
105
|
|
|
28
106
|
def _cookies_args() -> list[str]:
|
|
29
107
|
"""Return yt-dlp --cookies args if cookies.txt exists."""
|
|
@@ -234,13 +312,14 @@ def main():
|
|
|
234
312
|
parser.add_argument("--chunk", type=int, default=None, help="Return specific chunk (0-indexed)")
|
|
235
313
|
args = parser.parse_args()
|
|
236
314
|
|
|
237
|
-
|
|
315
|
+
safe_url = canonicalize_youtube_url(args.url)
|
|
316
|
+
if safe_url is None:
|
|
238
317
|
print("Error: not a valid YouTube URL", file=sys.stderr)
|
|
239
318
|
sys.exit(1)
|
|
240
319
|
|
|
241
320
|
# If requesting a specific chunk, read from cache
|
|
242
321
|
if args.chunk is not None:
|
|
243
|
-
cached = load_cache(
|
|
322
|
+
cached = load_cache(safe_url)
|
|
244
323
|
if not cached:
|
|
245
324
|
print("Error: no cached transcript. Call without --chunk first.", file=sys.stderr)
|
|
246
325
|
sys.exit(1)
|
|
@@ -259,7 +338,7 @@ def main():
|
|
|
259
338
|
|
|
260
339
|
# Fetch transcript
|
|
261
340
|
print("Fetching video info...", file=sys.stderr)
|
|
262
|
-
video_info = get_video_info(
|
|
341
|
+
video_info = get_video_info(safe_url)
|
|
263
342
|
print(f"Video: {video_info['title']} ({video_info['duration'] // 60} min)", file=sys.stderr)
|
|
264
343
|
|
|
265
344
|
if args.lang == "auto":
|
|
@@ -269,12 +348,12 @@ def main():
|
|
|
269
348
|
lang = args.lang
|
|
270
349
|
|
|
271
350
|
print(f"Fetching subtitles (lang={lang})...", file=sys.stderr)
|
|
272
|
-
transcript = fetch_subtitles(
|
|
351
|
+
transcript = fetch_subtitles(safe_url, lang)
|
|
273
352
|
|
|
274
353
|
if not transcript:
|
|
275
354
|
print("No subtitles found, trying Whisper transcription...", file=sys.stderr)
|
|
276
355
|
try:
|
|
277
|
-
transcript = transcribe_audio(
|
|
356
|
+
transcript = transcribe_audio(safe_url)
|
|
278
357
|
except RuntimeError as e:
|
|
279
358
|
print(f"Error: {e}", file=sys.stderr)
|
|
280
359
|
sys.exit(1)
|
|
@@ -287,7 +366,7 @@ def main():
|
|
|
287
366
|
print(f"Transcript: {len(transcript)} chars, {len(chunks)} chunk(s)", file=sys.stderr)
|
|
288
367
|
|
|
289
368
|
# Cache for chunk retrieval
|
|
290
|
-
save_cache(
|
|
369
|
+
save_cache(safe_url, {
|
|
291
370
|
"title": video_info["title"],
|
|
292
371
|
"channel": video_info["channel"],
|
|
293
372
|
"duration_min": video_info["duration"] // 60,
|