@elisym/cli 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@elisym/cli",
3
- "version": "0.19.0",
3
+ "version": "0.20.0",
4
4
  "description": "CLI agent runner for elisym - provider mode, skills, crash recovery",
5
5
  "keywords": [
6
6
  "ai-agents",
@@ -44,7 +44,7 @@
44
44
  "prepublishOnly": "bun run build && node scripts/preflight-publish.mjs"
45
45
  },
46
46
  "dependencies": {
47
- "@elisym/sdk": "~0.22.0",
47
+ "@elisym/sdk": "~0.23.0",
48
48
  "@solana-program/memo": "~0.11.0",
49
49
  "@solana-program/system": "~0.12.0",
50
50
  "@solana-program/token": "~0.5.0",
@@ -12,18 +12,96 @@ Long transcripts are split into chunks for LLM processing.
12
12
 
13
13
  import argparse
14
14
  import hashlib
15
+ import ipaddress
15
16
  import json
16
17
  import os
17
18
  import re
18
19
  import subprocess
19
20
  import sys
20
21
  import tempfile
22
+ from urllib.parse import parse_qs, urlparse
21
23
 
22
24
  CHUNK_SIZE = 30_000 # ~7500 tokens, safe for rate limits
23
25
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
24
26
  CACHE_DIR = os.path.join(SCRIPT_DIR, ".cache")
25
27
  COOKIES_FILE = os.path.join(os.path.dirname(SCRIPT_DIR), "cookies.txt")
26
28
 
29
# Strict allowlist of YouTube hosts. Anything else (or any IP/metadata host) is rejected.
ALLOWED_YOUTUBE_HOSTS = frozenset({
    "www.youtube.com",
    "youtube.com",
    "m.youtube.com",
    "music.youtube.com",
    "youtu.be",
    "youtube-nocookie.com",
    "www.youtube-nocookie.com",
})
# YouTube video ids are exactly 11 chars from this charset.
# \A and \Z anchor the whole string: "$" would also match just before a
# trailing newline, so "<id>\n" would slip through a ^...$ pattern.
VIDEO_ID_RE = re.compile(r"\A[A-Za-z0-9_-]{11}\Z")
41
+
42
+
43
+ def _is_blocked_ip_host(hostname: str) -> bool:
44
+ """True if the hostname is an IP literal in a private/loopback/link-local/metadata range."""
45
+ try:
46
+ ip = ipaddress.ip_address(hostname)
47
+ except ValueError:
48
+ return False
49
+ return (
50
+ ip.is_private
51
+ or ip.is_loopback
52
+ or ip.is_link_local
53
+ or ip.is_reserved
54
+ or ip.is_unspecified
55
+ )
56
+
57
+
58
def _host_is_allowed(hostname: str | None) -> bool:
    """Decide whether *hostname* is an acceptable YouTube host.

    A host passes when, after lower-casing, it is not a blocked IP literal and
    either appears verbatim in ALLOWED_YOUTUBE_HOSTS or ends with the exact
    suffix ".youtube.com". Missing or empty hostnames are rejected.
    """
    normalized = (hostname or "").lower()
    if not normalized or _is_blocked_ip_host(normalized):
        return False
    # Suffix test includes the leading dot, so "evilyoutube.com" cannot match.
    return normalized in ALLOWED_YOUTUBE_HOSTS or normalized.endswith(".youtube.com")
68
+
69
+
70
def extract_video_id(parsed) -> str | None:
    """Extract the 11-char YouTube video id from a parsed URL, or None.

    Args:
        parsed: A parsed URL exposing ``hostname``, ``path`` and ``query``
            (as returned by ``urllib.parse.urlparse``).

    Returns:
        The video id taken from, in order: the first path segment of a
        youtu.be URL, the ``v`` query parameter, or the segment following
        ``/embed/``, ``/shorts/`` or ``/v/``. None when no candidate is a
        well-formed id.
    """
    host = (parsed.hostname or "").lower()
    if host == "youtu.be" or host.endswith(".youtu.be"):
        candidate = parsed.path.lstrip("/").split("/", 1)[0]
        return candidate if VIDEO_ID_RE.fullmatch(candidate) else None
    # parse_qs percent-decodes values, so "v=<id>%0A" arrives as "<id>\n";
    # fullmatch rejects that trailing newline, which match() with "$" would not.
    query_id = parse_qs(parsed.query).get("v", [None])[0]
    if query_id and VIDEO_ID_RE.fullmatch(query_id):
        return query_id
    for prefix in ("/embed/", "/shorts/", "/v/"):
        if parsed.path.startswith(prefix):
            candidate = parsed.path[len(prefix):].split("/", 1)[0]
            return candidate if VIDEO_ID_RE.fullmatch(candidate) else None
    return None
84
+
85
+
86
+ def canonicalize_youtube_url(url: str) -> str | None:
87
+ """Validate a YouTube URL and return a host-controlled canonical URL.
88
+
89
+ Prefers extracting the 11-char video id and rebuilding a canonical
90
+ https://www.youtube.com/watch?v=<id> URL so the attacker never controls
91
+ the host yt-dlp connects to. Falls back to the validated original URL when
92
+ no id can be extracted but the host passed strict validation. Returns None
93
+ if the URL fails scheme/host validation.
94
+ """
95
+ parsed = urlparse(url)
96
+ if parsed.scheme not in ("http", "https"):
97
+ return None
98
+ if not _host_is_allowed(parsed.hostname):
99
+ return None
100
+ video_id = extract_video_id(parsed)
101
+ if video_id:
102
+ return f"https://www.youtube.com/watch?v={video_id}"
103
+ return url
104
+
27
105
 
28
106
  def _cookies_args() -> list[str]:
29
107
  """Return yt-dlp --cookies args if cookies.txt exists."""
@@ -234,13 +312,14 @@ def main():
234
312
  parser.add_argument("--chunk", type=int, default=None, help="Return specific chunk (0-indexed)")
235
313
  args = parser.parse_args()
236
314
 
237
- if not re.search(r"(youtube\.com|youtu\.be)", args.url):
315
+ safe_url = canonicalize_youtube_url(args.url)
316
+ if safe_url is None:
238
317
  print("Error: not a valid YouTube URL", file=sys.stderr)
239
318
  sys.exit(1)
240
319
 
241
320
  # If requesting a specific chunk, read from cache
242
321
  if args.chunk is not None:
243
- cached = load_cache(args.url)
322
+ cached = load_cache(safe_url)
244
323
  if not cached:
245
324
  print("Error: no cached transcript. Call without --chunk first.", file=sys.stderr)
246
325
  sys.exit(1)
@@ -259,7 +338,7 @@ def main():
259
338
 
260
339
  # Fetch transcript
261
340
  print("Fetching video info...", file=sys.stderr)
262
- video_info = get_video_info(args.url)
341
+ video_info = get_video_info(safe_url)
263
342
  print(f"Video: {video_info['title']} ({video_info['duration'] // 60} min)", file=sys.stderr)
264
343
 
265
344
  if args.lang == "auto":
@@ -269,12 +348,12 @@ def main():
269
348
  lang = args.lang
270
349
 
271
350
  print(f"Fetching subtitles (lang={lang})...", file=sys.stderr)
272
- transcript = fetch_subtitles(args.url, lang)
351
+ transcript = fetch_subtitles(safe_url, lang)
273
352
 
274
353
  if not transcript:
275
354
  print("No subtitles found, trying Whisper transcription...", file=sys.stderr)
276
355
  try:
277
- transcript = transcribe_audio(args.url)
356
+ transcript = transcribe_audio(safe_url)
278
357
  except RuntimeError as e:
279
358
  print(f"Error: {e}", file=sys.stderr)
280
359
  sys.exit(1)
@@ -287,7 +366,7 @@ def main():
287
366
  print(f"Transcript: {len(transcript)} chars, {len(chunks)} chunk(s)", file=sys.stderr)
288
367
 
289
368
  # Cache for chunk retrieval
290
- save_cache(args.url, {
369
+ save_cache(safe_url, {
291
370
  "title": video_info["title"],
292
371
  "channel": video_info["channel"],
293
372
  "duration_min": video_info["duration"] // 60,