open-research-protocol 0.4.8 → 0.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -33,6 +33,7 @@ verification remains independent of framing. See `modules/instruments/README.md`
33
33
  - `docs/ORP_REASONING_KERNEL_CANONICAL_CONTINUATION_PILOT.md` — harder live downstream benchmark where the agent must produce the next canonical task artifact
34
34
  - `docs/ORP_REASONING_KERNEL_EVIDENCE_MATRIX.md` — honest map of what the kernel proves, only suggests, or still leaves unproven
35
35
  - `docs/ORP_REASONING_KERNEL_EVALUATION_PLAN.md` — comparative experiment plan for upgrading kernel evidence beyond implementation validity
36
+ - `docs/ORP_YOUTUBE_INSPECT.md` — first-class YouTube metadata/transcript ingestion surface for agent-readable external source context
36
37
  - `docs/EXTERNAL_CONTRIBUTION_GOVERNANCE.md` — canonical local-first workflow for external OSS PR work
37
38
  - `docs/OSS_CONTRIBUTION_AGENT_LOOP.md` — agent operating rhythm for external contribution workflows
38
39
  - `templates/` — claim, verification, failure, and issue templates
@@ -51,6 +52,7 @@ verification remains independent of framing. See `modules/instruments/README.md`
51
52
  ORP should feel like one CLI with built-in abilities:
52
53
 
53
54
  - `workspace` for hosted auth, idea, feature, world, checkpoint, and worker operations
55
+ - `youtube` for public video metadata and transcript ingestion
54
56
  - `governance` for local-first repo initialization, branch safety, checkpoint commits, backup refs, readiness, repair, and cleanup
55
57
  - `discover` for profile-based GitHub scanning and opportunity selection
56
58
  - `collaborate` for repository collaboration setup and workflow execution
@@ -118,6 +120,8 @@ orp home --json
118
120
  orp about --json
119
121
  orp auth login
120
122
  orp whoami --json
123
+ orp youtube inspect https://www.youtube.com/watch?v=<video_id> --json
124
+ orp youtube inspect https://www.youtube.com/watch?v=<video_id> --save --json
121
125
  orp ideas list --json
122
126
  orp world bind --idea-id <idea-id> --project-root /abs/path --codex-session-id <session-id> --json
123
127
  orp checkpoint queue --idea-id <idea-id> --json
@@ -149,6 +153,7 @@ These surfaces are meant to help automated systems discover ORP quickly:
149
153
  - bare `orp` opens a home screen with repo/runtime status, available packs, and next commands
150
154
  - `orp home --json` returns the same landing context in machine-readable form
151
155
  - `orp auth ...`, `orp ideas ...`, `orp world ...`, `orp checkpoint ...`, `orp runner ...`, and `orp agent ...` expose the hosted workspace surface directly through ORP
156
+ - `orp youtube inspect ...` exposes public YouTube metadata and transcript retrieval through a stable ORP artifact shape for agent use
152
157
  - `orp init`, `orp status`, `orp branch start`, `orp checkpoint create`, `orp backup`, `orp ready`, `orp doctor`, and `orp cleanup` expose the local-first repo governance surface directly through ORP
153
158
  - `orp discover ...` exposes profile-based GitHub scanning as a built-in ORP ability
154
159
  - `orp collaborate ...` exposes built-in collaboration setup and workflow execution without asking users to think in terms of separate governance packs
@@ -212,6 +217,7 @@ Minimal CLI skeleton:
212
217
 
213
218
  ```bash
214
219
  orp auth login
220
+ orp youtube inspect https://www.youtube.com/watch?v=<video_id> --json
215
221
  orp ideas list --json
216
222
  orp world bind --idea-id <idea-id> --project-root /abs/path --codex-session-id <session-id> --json
217
223
  orp checkpoint queue --idea-id <idea-id> --json
package/cli/orp.py CHANGED
@@ -30,6 +30,7 @@ import argparse
30
30
  import datetime as dt
31
31
  import getpass
32
32
  import hashlib
33
+ import html
33
34
  import json
34
35
  import os
35
36
  import platform
@@ -45,6 +46,7 @@ import uuid
45
46
  from urllib import error as urlerror
46
47
  from urllib import parse as urlparse
47
48
  from urllib import request as urlrequest
49
+ import xml.etree.ElementTree as ET
48
50
 
49
51
  RUNNER_LEASE_STALE_SECONDS = 120
50
52
 
@@ -112,6 +114,7 @@ DEFAULT_DISCOVER_PROFILE = "orp.profile.default.json"
112
114
  DEFAULT_DISCOVER_SCAN_ROOT = "orp/discovery/github"
113
115
  DEFAULT_HOSTED_BASE_URL = "https://orp.earth"
114
116
  KERNEL_SCHEMA_VERSION = "1.0.0"
117
+ YOUTUBE_SOURCE_SCHEMA_VERSION = "1.0.0"
115
118
 
116
119
 
117
120
  class HostedApiError(RuntimeError):
@@ -336,6 +339,442 @@ def _request_hosted_sse_event(
336
339
  ) from exc
337
340
 
338
341
 
342
+ def _http_get_text(url: str, *, headers: dict[str, str] | None = None, timeout_sec: int = 20) -> str:
343
+ request = urlrequest.Request(url, headers=headers or {}, method="GET")
344
+ try:
345
+ with urlrequest.urlopen(request, timeout=timeout_sec) as response:
346
+ return response.read().decode("utf-8", errors="replace")
347
+ except urlerror.HTTPError as exc:
348
+ body = exc.read().decode("utf-8", errors="replace").strip()
349
+ raise RuntimeError(f"HTTP {exc.code} while fetching {url}: {body or exc.reason}") from exc
350
+ except urlerror.URLError as exc:
351
+ raise RuntimeError(f"Could not reach {url}: {exc.reason}") from exc
352
+
353
+
354
def _http_get_json(url: str, *, headers: dict[str, str] | None = None, timeout_sec: int = 20) -> dict[str, Any]:
    """GET *url* and parse the body as a JSON object.

    Raises RuntimeError when the body is not valid JSON or when the top-level
    value is not a JSON object (dict).
    """
    body = _http_get_text(url, headers=headers, timeout_sec=timeout_sec)
    try:
        decoded = json.loads(body)
    except Exception as exc:
        raise RuntimeError(f"Response from {url} was not valid JSON.") from exc
    if not isinstance(decoded, dict):
        raise RuntimeError(f"Response from {url} was not a JSON object.")
    return decoded
363
+
364
+
365
+ def _youtube_request_headers() -> dict[str, str]:
366
+ return {
367
+ "User-Agent": (
368
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
369
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36"
370
+ ),
371
+ "Accept-Language": "en-US,en;q=0.9",
372
+ }
373
+
374
+
375
+ def _youtube_source_schema_path() -> Path:
376
+ return Path(__file__).resolve().parent.parent / "spec" / "v1" / "youtube-source.schema.json"
377
+
378
+
379
+ def _youtube_video_id_from_url(raw_url: str) -> str:
380
+ text = str(raw_url or "").strip()
381
+ if not text:
382
+ raise RuntimeError("YouTube URL is required.")
383
+ if re.fullmatch(r"[\w-]{11}", text):
384
+ return text
385
+
386
+ parsed = urlparse.urlparse(text)
387
+ host = parsed.netloc.lower()
388
+ path_parts = [part for part in parsed.path.split("/") if part]
389
+ if host.endswith("youtu.be"):
390
+ if path_parts:
391
+ return path_parts[0]
392
+ if any(host.endswith(suffix) for suffix in ("youtube.com", "youtube-nocookie.com", "music.youtube.com")):
393
+ if parsed.path == "/watch":
394
+ video_id = urlparse.parse_qs(parsed.query).get("v", [""])[0].strip()
395
+ if video_id:
396
+ return video_id
397
+ if len(path_parts) >= 2 and path_parts[0] in {"embed", "shorts", "live", "v"}:
398
+ return path_parts[1]
399
+ raise RuntimeError(f"Could not extract a YouTube video id from: {text}")
400
+
401
+
402
+ def _youtube_canonical_url(video_id: str) -> str:
403
+ return f"https://www.youtube.com/watch?v={video_id}"
404
+
405
+
406
+ def _extract_json_object_after_marker(text: str, marker: str) -> dict[str, Any] | None:
407
+ index = text.find(marker)
408
+ if index < 0:
409
+ return None
410
+ start = text.find("{", index)
411
+ if start < 0:
412
+ return None
413
+ depth = 0
414
+ in_string = False
415
+ escaped = False
416
+ for pos in range(start, len(text)):
417
+ ch = text[pos]
418
+ if in_string:
419
+ if escaped:
420
+ escaped = False
421
+ elif ch == "\\":
422
+ escaped = True
423
+ elif ch == '"':
424
+ in_string = False
425
+ continue
426
+ if ch == '"':
427
+ in_string = True
428
+ continue
429
+ if ch == "{":
430
+ depth += 1
431
+ continue
432
+ if ch == "}":
433
+ depth -= 1
434
+ if depth == 0:
435
+ candidate = text[start : pos + 1]
436
+ try:
437
+ payload = json.loads(candidate)
438
+ except Exception:
439
+ return None
440
+ return payload if isinstance(payload, dict) else None
441
+ return None
442
+
443
+
444
+ def _youtube_track_label(track: dict[str, Any]) -> str:
445
+ name = track.get("name")
446
+ if isinstance(name, dict):
447
+ simple = str(name.get("simpleText", "")).strip()
448
+ if simple:
449
+ return simple
450
+ runs = name.get("runs")
451
+ if isinstance(runs, list):
452
+ pieces = [
453
+ str(row.get("text", "")).strip()
454
+ for row in runs
455
+ if isinstance(row, dict) and str(row.get("text", "")).strip()
456
+ ]
457
+ if pieces:
458
+ return "".join(pieces)
459
+ return str(track.get("languageCode", "")).strip()
460
+
461
+
462
+ def _pick_youtube_caption_track(tracks: list[dict[str, Any]], preferred_lang: str = "") -> dict[str, Any] | None:
463
+ if not tracks:
464
+ return None
465
+ preferred = str(preferred_lang or "").strip().lower()
466
+
467
+ def score(track: dict[str, Any]) -> tuple[int, int]:
468
+ code = str(track.get("languageCode", "")).strip().lower()
469
+ kind = str(track.get("kind", "")).strip().lower()
470
+ auto = 1 if kind == "asr" else 0
471
+ exact = 1 if preferred and code == preferred else 0
472
+ prefix = 1 if preferred and code.startswith(preferred + "-") else 0
473
+ english = 1 if code.startswith("en") else 0
474
+ return (exact * 100 + prefix * 80 + english * 20 - auto * 5, -auto)
475
+
476
+ ranked = sorted(tracks, key=score, reverse=True)
477
+ return ranked[0] if ranked else None
478
+
479
+
480
+ def _youtube_add_query_param(url: str, key: str, value: str) -> str:
481
+ parsed = urlparse.urlsplit(url)
482
+ query = dict(urlparse.parse_qsl(parsed.query, keep_blank_values=True))
483
+ query[key] = value
484
+ return urlparse.urlunsplit(
485
+ (
486
+ parsed.scheme,
487
+ parsed.netloc,
488
+ parsed.path,
489
+ urlparse.urlencode(query),
490
+ parsed.fragment,
491
+ )
492
+ )
493
+
494
+
495
+ def _parse_youtube_transcript_json3(payload: dict[str, Any]) -> tuple[str, list[dict[str, Any]]]:
496
+ events = payload.get("events")
497
+ if not isinstance(events, list):
498
+ return ("", [])
499
+ segments: list[dict[str, Any]] = []
500
+ for event in events:
501
+ if not isinstance(event, dict):
502
+ continue
503
+ segs = event.get("segs")
504
+ if not isinstance(segs, list):
505
+ continue
506
+ pieces: list[str] = []
507
+ for seg in segs:
508
+ if not isinstance(seg, dict):
509
+ continue
510
+ text = html.unescape(str(seg.get("utf8", "")))
511
+ if text:
512
+ pieces.append(text)
513
+ merged = re.sub(r"\s+", " ", "".join(pieces)).strip()
514
+ if not merged:
515
+ continue
516
+ segments.append(
517
+ {
518
+ "start_ms": int(event.get("tStartMs", 0) or 0),
519
+ "duration_ms": int(event.get("dDurationMs", 0) or 0),
520
+ "text": merged,
521
+ }
522
+ )
523
+ transcript_text = "\n".join(str(row["text"]) for row in segments)
524
+ return transcript_text, segments
525
+
526
+
527
+ def _parse_youtube_transcript_xml(text: str) -> tuple[str, list[dict[str, Any]]]:
528
+ try:
529
+ root = ET.fromstring(text)
530
+ except Exception:
531
+ return ("", [])
532
+ segments: list[dict[str, Any]] = []
533
+ for node in root.findall(".//text"):
534
+ body = html.unescape("".join(node.itertext() or []))
535
+ body = re.sub(r"\s+", " ", body).strip()
536
+ if not body:
537
+ continue
538
+ start = float(node.attrib.get("start", "0") or "0")
539
+ duration = float(node.attrib.get("dur", "0") or "0")
540
+ segments.append(
541
+ {
542
+ "start_ms": int(start * 1000),
543
+ "duration_ms": int(duration * 1000),
544
+ "text": body,
545
+ }
546
+ )
547
+ transcript_text = "\n".join(str(row["text"]) for row in segments)
548
+ return transcript_text, segments
549
+
550
+
551
+ def _youtube_fetch_oembed(canonical_url: str) -> dict[str, Any]:
552
+ endpoint = "https://www.youtube.com/oembed?" + urlparse.urlencode({"url": canonical_url, "format": "json"})
553
+ try:
554
+ return _http_get_json(endpoint, headers=_youtube_request_headers(), timeout_sec=20)
555
+ except Exception:
556
+ return {}
557
+
558
+
559
def _youtube_fetch_watch_state(video_id: str) -> dict[str, Any]:
    """Fetch the public watch page and extract the embedded player response.

    Returns a dict containing the raw player response plus convenience views:
    "video_details", "microformat" (the playerMicroformatRenderer),
    "playability_status", and the "caption_tracks" list. Raises RuntimeError
    when the player response JSON cannot be located in the page HTML.
    """
    # hl=en keeps returned field values in a predictable language.
    url = _youtube_canonical_url(video_id) + "&hl=en&persist_hl=1"
    html_text = _http_get_text(url, headers=_youtube_request_headers(), timeout_sec=25)
    # The page embeds ytInitialPlayerResponse under several spellings; try each.
    markers = [
        "var ytInitialPlayerResponse = ",
        "ytInitialPlayerResponse = ",
        "window['ytInitialPlayerResponse'] = ",
        'window["ytInitialPlayerResponse"] = ',
    ]
    player_response: dict[str, Any] | None = None
    for marker in markers:
        player_response = _extract_json_object_after_marker(html_text, marker)
        if player_response:
            break
    if not player_response:
        raise RuntimeError("Could not parse YouTube player response from the watch page.")
    captions = (
        player_response.get("captions", {})
        .get("playerCaptionsTracklistRenderer", {})
        .get("captionTracks", [])
    )
    # Defensive isinstance checks: the upstream payload shape is not guaranteed.
    return {
        "player_response": player_response,
        "video_details": player_response.get("videoDetails", {}) if isinstance(player_response.get("videoDetails"), dict) else {},
        "microformat": (
            player_response.get("microformat", {}).get("playerMicroformatRenderer", {})
            if isinstance(player_response.get("microformat"), dict)
            else {}
        ),
        "playability_status": (
            player_response.get("playabilityStatus", {})
            if isinstance(player_response.get("playabilityStatus"), dict)
            else {}
        ),
        "caption_tracks": captions if isinstance(captions, list) else [],
    }
595
+
596
+
597
+ def _youtube_fetch_transcript_from_track(track: dict[str, Any]) -> tuple[str, list[dict[str, Any]], str]:
598
+ base_url = str(track.get("baseUrl", "")).strip()
599
+ if not base_url:
600
+ return ("", [], "missing_track_url")
601
+ json3_url = _youtube_add_query_param(base_url, "fmt", "json3")
602
+ try:
603
+ payload = _http_get_json(json3_url, headers=_youtube_request_headers(), timeout_sec=25)
604
+ transcript_text, segments = _parse_youtube_transcript_json3(payload)
605
+ if transcript_text:
606
+ return transcript_text, segments, "json3"
607
+ except Exception:
608
+ pass
609
+ try:
610
+ xml_text = _http_get_text(base_url, headers=_youtube_request_headers(), timeout_sec=25)
611
+ transcript_text, segments = _parse_youtube_transcript_xml(xml_text)
612
+ if transcript_text:
613
+ return transcript_text, segments, "xml"
614
+ except Exception:
615
+ pass
616
+ return ("", [], "unavailable")
617
+
618
+
619
+ def _youtube_text_bundle(payload: dict[str, Any]) -> str:
620
+ parts: list[str] = []
621
+ title = str(payload.get("title", "")).strip()
622
+ if title:
623
+ parts.append(f"Title: {title}")
624
+ author_name = str(payload.get("author_name", "")).strip()
625
+ if author_name:
626
+ parts.append(f"Author: {author_name}")
627
+ duration_seconds = payload.get("duration_seconds")
628
+ if isinstance(duration_seconds, int) and duration_seconds > 0:
629
+ parts.append(f"Duration seconds: {duration_seconds}")
630
+ description = str(payload.get("description", "")).strip()
631
+ if description:
632
+ parts.append("Description:\n" + description)
633
+ transcript_text = str(payload.get("transcript_text", "")).strip()
634
+ if transcript_text:
635
+ parts.append("Transcript:\n" + transcript_text)
636
+ return "\n\n".join(parts)
637
+
638
+
639
def _youtube_inspect_payload(raw_url: str, preferred_lang: str = "") -> dict[str, Any]:
    """Build the full youtube_source artifact payload for *raw_url*.

    Combines best-effort oEmbed metadata, the watch-page player state, and the
    best-matching caption transcript into the shape declared by
    spec/v1/youtube-source.schema.json. Fetch/parse failures are accumulated
    in "warnings" instead of raised; only an unparseable URL raises
    (RuntimeError from _youtube_video_id_from_url).
    """
    video_id = _youtube_video_id_from_url(raw_url)
    canonical_url = _youtube_canonical_url(video_id)
    warnings: list[str] = []
    oembed = _youtube_fetch_oembed(canonical_url)

    # The watch page is the primary metadata source; failure downgrades to
    # oEmbed-only metadata with a warning rather than aborting.
    watch_state: dict[str, Any] = {}
    try:
        watch_state = _youtube_fetch_watch_state(video_id)
    except Exception as exc:
        warnings.append(str(exc))

    video_details = watch_state.get("video_details", {}) if isinstance(watch_state.get("video_details"), dict) else {}
    microformat = watch_state.get("microformat", {}) if isinstance(watch_state.get("microformat"), dict) else {}
    playability = watch_state.get("playability_status", {}) if isinstance(watch_state.get("playability_status"), dict) else {}
    tracks = [row for row in watch_state.get("caption_tracks", []) if isinstance(row, dict)]
    chosen_track = _pick_youtube_caption_track(tracks, preferred_lang)
    transcript_text = ""
    transcript_segments: list[dict[str, Any]] = []
    transcript_fetch_mode = "none"
    transcript_available = False
    transcript_language = ""
    transcript_track_name = ""
    transcript_kind = "none"
    if chosen_track is not None:
        transcript_text, transcript_segments, transcript_fetch_mode = _youtube_fetch_transcript_from_track(chosen_track)
        transcript_available = bool(transcript_text.strip())
        transcript_language = str(chosen_track.get("languageCode", "")).strip()
        transcript_track_name = _youtube_track_label(chosen_track)
        # "asr" marks YouTube's auto-generated (speech-recognition) track.
        transcript_kind = "auto" if str(chosen_track.get("kind", "")).strip().lower() == "asr" else "manual"
        if not transcript_available:
            warnings.append("A caption track was found, but transcript text could not be fetched.")
    elif watch_state:
        # Only warn about missing captions when the watch page itself loaded.
        warnings.append("No caption tracks were available for this video.")

    # Prefer watch-page fields; fall back to oEmbed where available.
    title = str(video_details.get("title") or oembed.get("title") or "").strip()
    author_name = str(video_details.get("author") or oembed.get("author_name") or "").strip()
    author_url = str(oembed.get("author_url") or "").strip()
    thumbnail_url = str(oembed.get("thumbnail_url") or "").strip()
    description = str(video_details.get("shortDescription") or microformat.get("description", {}).get("simpleText", "") or "").strip()
    channel_id = str(video_details.get("channelId") or "").strip()
    duration_seconds = 0
    raw_duration = video_details.get("lengthSeconds")
    # lengthSeconds arrives as a decimal string; ignore anything else.
    if isinstance(raw_duration, str) and raw_duration.isdigit():
        duration_seconds = int(raw_duration)
    published_at = str(microformat.get("publishDate") or "").strip()
    payload = {
        "schema_version": YOUTUBE_SOURCE_SCHEMA_VERSION,
        "kind": "youtube_source",
        "retrieved_at_utc": _now_utc(),
        "source_url": str(raw_url).strip(),
        "canonical_url": canonical_url,
        "video_id": video_id,
        "title": title,
        "author_name": author_name,
        "author_url": author_url,
        "thumbnail_url": thumbnail_url,
        "channel_id": channel_id,
        "description": description,
        # Schema allows integer-or-null; 0 is normalized to null here.
        "duration_seconds": duration_seconds or None,
        "published_at": published_at,
        "playability_status": str(playability.get("status", "")).strip(),
        "transcript_available": transcript_available,
        "transcript_language": transcript_language,
        "transcript_track_name": transcript_track_name,
        "transcript_kind": transcript_kind,
        "transcript_fetch_mode": transcript_fetch_mode,
        "transcript_text": transcript_text,
        "transcript_segments": transcript_segments,
        "warnings": _unique_strings(warnings),
    }
    # text_bundle is derived last so it reflects the final field values.
    payload["text_bundle"] = _youtube_text_bundle(payload)
    return payload
712
+
713
+
714
+ def _default_youtube_artifact_path(repo_root: Path, video_id: str) -> Path:
715
+ return repo_root / "orp" / "external" / "youtube" / f"{video_id}.json"
716
+
717
+
718
def cmd_youtube_inspect(args: argparse.Namespace) -> int:
    """CLI entrypoint for `orp youtube inspect`.

    Builds the youtube_source payload for args.url, optionally persists it
    (--save writes to the default repo location; --out writes to an explicit
    path; existing files are refused unless --force), then prints either the
    full JSON result or a human-readable summary with the text bundle and
    warnings. Always returns 0; failures raise.
    """
    repo_root = Path(args.repo_root).resolve()
    preferred_lang = str(getattr(args, "lang", "") or "").strip()
    payload = _youtube_inspect_payload(args.url, preferred_lang=preferred_lang)

    # An explicit --out implies saving even without --save.
    out_raw = str(getattr(args, "out", "") or "").strip()
    should_save = bool(getattr(args, "save", False) or out_raw)
    out_path: Path | None = None
    emitted_format = ""
    if should_save:
        if out_raw:
            out_path = _resolve_cli_path(out_raw, repo_root)
        else:
            _ensure_dirs(repo_root)
            out_path = _default_youtube_artifact_path(repo_root, str(payload.get("video_id", "")).strip())
        if out_path.exists() and not bool(getattr(args, "force", False)):
            raise RuntimeError(
                f"output path already exists: {_path_for_state(out_path, repo_root)}. Use --force to overwrite."
            )
        emitted_format = _write_structured_payload(out_path, payload, format_hint=str(getattr(args, "format", "") or ""))

    result = {
        "ok": True,
        "saved": out_path is not None,
        "path": _path_for_state(out_path, repo_root) if out_path is not None else "",
        "format": emitted_format,
        "schema_path": "spec/v1/youtube-source.schema.json",
        "source": payload,
    }
    if args.json_output:
        _print_json(result)
    else:
        _print_pairs(
            [
                ("ok", "true"),
                ("video.id", str(payload.get("video_id", "")).strip()),
                ("video.title", str(payload.get("title", "")).strip()),
                ("video.author", str(payload.get("author_name", "")).strip()),
                ("video.duration_seconds", payload.get("duration_seconds") or ""),
                ("transcript.available", str(bool(payload.get("transcript_available", False))).lower()),
                ("transcript.language", str(payload.get("transcript_language", "")).strip()),
                ("transcript.kind", str(payload.get("transcript_kind", "")).strip()),
                ("saved", str(bool(out_path is not None)).lower()),
                ("path", _path_for_state(out_path, repo_root) if out_path is not None else ""),
            ]
        )
        # NOTE(review): bundle/warnings are printed only in the human-readable
        # branch here — printing them in --json mode would corrupt the JSON
        # stream; confirm this placement matches the released indentation.
        bundle = str(payload.get("text_bundle", "")).strip()
        warnings = payload.get("warnings", []) if isinstance(payload.get("warnings"), list) else []
        if bundle:
            print("")
            print(bundle)
        if warnings:
            print("")
            for warning in warnings:
                text = str(warning).strip()
                if text:
                    print(f"warning={text}")
    return 0
776
+
777
+
339
778
  def _runner_transport_mode(args: argparse.Namespace) -> str:
340
779
  mode = str(getattr(args, "transport", "auto") or "auto").strip().lower()
341
780
  if mode in {"poll", "sse"}:
@@ -5146,6 +5585,7 @@ def _about_payload() -> dict[str, Any]:
5146
5585
  "kernel": "spec/v1/kernel.schema.json",
5147
5586
  "kernel_proposal": "spec/v1/kernel-proposal.schema.json",
5148
5587
  "kernel_extension": "spec/v1/kernel-extension.schema.json",
5588
+ "youtube_source": "spec/v1/youtube-source.schema.json",
5149
5589
  "profile_pack": "spec/v1/profile-pack.schema.json",
5150
5590
  "link_project": "spec/v1/link-project.schema.json",
5151
5591
  "link_session": "spec/v1/link-session.schema.json",
@@ -5164,6 +5604,13 @@ def _about_payload() -> dict[str, Any]:
5164
5604
  ["kernel", "migrate"],
5165
5605
  ],
5166
5606
  },
5607
+ {
5608
+ "id": "youtube",
5609
+ "description": "Public YouTube metadata and transcript ingestion for agent-readable external source context.",
5610
+ "entrypoints": [
5611
+ ["youtube", "inspect"],
5612
+ ],
5613
+ },
5167
5614
  {
5168
5615
  "id": "workspace",
5169
5616
  "description": "Hosted workspace auth, ideas, features, worlds, checkpoints, and worker operations.",
@@ -5257,6 +5704,7 @@ def _about_payload() -> dict[str, Any]:
5257
5704
  {"name": "kernel_stats", "path": ["kernel", "stats"], "json_output": True},
5258
5705
  {"name": "kernel_propose", "path": ["kernel", "propose"], "json_output": True},
5259
5706
  {"name": "kernel_migrate", "path": ["kernel", "migrate"], "json_output": True},
5707
+ {"name": "youtube_inspect", "path": ["youtube", "inspect"], "json_output": True},
5260
5708
  {"name": "auth_login", "path": ["auth", "login"], "json_output": True},
5261
5709
  {"name": "auth_verify", "path": ["auth", "verify"], "json_output": True},
5262
5710
  {"name": "auth_logout", "path": ["auth", "logout"], "json_output": True},
@@ -5326,6 +5774,7 @@ def _about_payload() -> dict[str, Any]:
5326
5774
  "Default CLI output is human-readable; listed commands with json_output=true also support --json.",
5327
5775
  "Reasoning-kernel artifacts shape promotable repository truth for tasks, decisions, hypotheses, experiments, checkpoints, policies, and results.",
5328
5776
  "Kernel evolution in ORP should stay explicit: observe real usage, propose changes, and migrate artifacts through versioned CLI surfaces rather than silent agent mutation.",
5777
+ "YouTube inspection is a built-in ORP ability exposed through `orp youtube inspect`, returning public metadata and caption transcript text when available.",
5329
5778
  "Discovery profiles in ORP are portable search-intent files managed directly by ORP.",
5330
5779
  "Collaboration is a built-in ORP ability exposed through `orp collaborate ...`.",
5331
5780
  "Project/session linking is a built-in ORP ability exposed through `orp link ...` and stored machine-locally under `.git/orp/link/`.",
@@ -5435,6 +5884,10 @@ def _home_payload(repo_root: Path, config_arg: str) -> dict[str, Any]:
5435
5884
  "label": "Inspect the current hosted workspace identity",
5436
5885
  "command": "orp whoami --json",
5437
5886
  },
5887
+ {
5888
+ "label": "Inspect a YouTube video and public transcript for agent context",
5889
+ "command": "orp youtube inspect https://www.youtube.com/watch?v=<video_id> --json",
5890
+ },
5438
5891
  {
5439
5892
  "label": "List hosted ideas in the current workspace",
5440
5893
  "command": "orp ideas list --json",
@@ -12257,6 +12710,43 @@ def build_parser() -> argparse.ArgumentParser:
12257
12710
  add_json_flag(s_world_bind)
12258
12711
  s_world_bind.set_defaults(func=cmd_world_bind, json_output=False)
12259
12712
 
12713
+ s_youtube = sub.add_parser("youtube", help="Public YouTube metadata and transcript inspection")
12714
+ youtube_sub = s_youtube.add_subparsers(dest="youtube_cmd", required=True)
12715
+
12716
+ s_youtube_inspect = youtube_sub.add_parser(
12717
+ "inspect",
12718
+ help="Inspect a YouTube video and fetch public metadata plus transcript text when captions are available",
12719
+ )
12720
+ s_youtube_inspect.add_argument("url", help="YouTube watch/share URL or 11-character video id")
12721
+ s_youtube_inspect.add_argument(
12722
+ "--lang",
12723
+ default="",
12724
+ help="Preferred caption language code, for example en or es",
12725
+ )
12726
+ s_youtube_inspect.add_argument(
12727
+ "--save",
12728
+ action="store_true",
12729
+ help="Save the inspected source artifact under orp/external/youtube/<video_id>.json",
12730
+ )
12731
+ s_youtube_inspect.add_argument(
12732
+ "--out",
12733
+ default="",
12734
+ help="Optional output path for the source artifact (.json, .yml, or .yaml)",
12735
+ )
12736
+ s_youtube_inspect.add_argument(
12737
+ "--format",
12738
+ default="",
12739
+ choices=["", "json", "yaml"],
12740
+ help="Optional explicit output format when saving",
12741
+ )
12742
+ s_youtube_inspect.add_argument(
12743
+ "--force",
12744
+ action="store_true",
12745
+ help="Overwrite an existing saved artifact",
12746
+ )
12747
+ add_json_flag(s_youtube_inspect)
12748
+ s_youtube_inspect.set_defaults(func=cmd_youtube_inspect, json_output=False)
12749
+
12260
12750
  s_secrets = sub.add_parser("secrets", help="Hosted secret store and project binding operations")
12261
12751
  secrets_sub = s_secrets.add_subparsers(dest="secrets_cmd", required=True)
12262
12752
 
@@ -21,6 +21,9 @@ Use this loop when an AI agent is the primary operator of an ORP-enabled repo.
21
21
  - or `orp pack fetch --source <git-url> --pack-id <pack-id> --install-target . --json`
22
22
  - If the workflow depends on public Erdos data, sync it first:
23
23
  - `orp erdos sync --problem-id <id> --out-problem-dir <dir> --json`
24
+ - If the task begins from a public YouTube link, normalize it first:
25
+ - `orp youtube inspect <youtube-url> --json`
26
+ - or `orp youtube inspect <youtube-url> --save --json` when the source artifact should stay with the repo
24
27
 
25
28
  ## 3. Run
26
29
 
@@ -0,0 +1,97 @@
1
+ # ORP YouTube Inspect
2
+
3
+ `orp youtube inspect` is ORP's first-class public-source ingestion surface for
4
+ YouTube videos.
5
+
6
+ It gives agents and users a stable way to turn a YouTube link into:
7
+
8
+ - normalized video metadata,
9
+ - public caption transcript text when available,
10
+ - segment-level timing rows,
11
+ - and one agent-friendly `text_bundle` field that can be handed directly into
12
+ summarization, extraction, comparison, or kernel-shaped artifact creation.
13
+
14
+ ## Why this exists
15
+
16
+ Agents often receive a raw YouTube URL and are asked:
17
+
18
+ - what is this video about?
19
+ - summarize it,
20
+ - extract claims,
21
+ - capture action items,
22
+ - compare it against repo work,
23
+ - or turn it into a canonical ORP artifact.
24
+
25
+ Without a built-in surface, each agent has to improvise scraping, transcript
26
+ discovery, and output shape. ORP now treats this as a real protocol ability.
27
+
28
+ ## Command
29
+
30
+ ```bash
31
+ orp youtube inspect https://www.youtube.com/watch?v=<video_id> --json
32
+ ```
33
+
34
+ Optional persistence:
35
+
36
+ ```bash
37
+ orp youtube inspect https://www.youtube.com/watch?v=<video_id> --save --json
38
+ orp youtube inspect https://www.youtube.com/watch?v=<video_id> --out analysis/source.youtube.json --json
39
+ ```
40
+
41
+ ## Output shape
42
+
43
+ The canonical artifact schema is:
44
+
45
+ - `spec/v1/youtube-source.schema.json`
46
+
47
+ The command returns:
48
+
49
+ - source identity:
50
+ - `source_url`
51
+ - `canonical_url`
52
+ - `video_id`
53
+ - metadata:
54
+ - `title`
55
+ - `author_name`
56
+ - `author_url`
57
+ - `thumbnail_url`
58
+ - `channel_id`
59
+ - `description`
60
+ - `duration_seconds`
61
+ - `published_at`
62
+ - `playability_status`
63
+ - transcript fields:
64
+ - `transcript_available`
65
+ - `transcript_language`
66
+ - `transcript_track_name`
67
+ - `transcript_kind`
68
+ - `transcript_fetch_mode`
69
+ - `transcript_text`
70
+ - `transcript_segments`
71
+ - agent-ready bundle:
72
+ - `text_bundle`
73
+ - capture notes:
74
+ - `warnings`
75
+
76
+ ## Save behavior
77
+
78
+ `--save` writes the artifact to:
79
+
80
+ ```text
81
+ orp/external/youtube/<video_id>.json
82
+ ```
83
+
84
+ This keeps YouTube ingestion consistent with ORP's larger local-first artifact
85
+ discipline while staying outside the evidence boundary by default.
86
+
87
+ ## Important boundary
88
+
89
+ `orp youtube inspect` returns public source context. It does **not** make the
90
+ result canonical evidence by itself.
91
+
92
+ If a video matters for repo truth, the agent should still:
93
+
94
+ 1. inspect the video,
95
+ 2. summarize or structure the relevant claims,
96
+ 3. promote that into a typed ORP artifact when appropriate,
97
+ 4. and cite the saved source artifact path alongside any downstream result.
package/llms.txt CHANGED
@@ -13,6 +13,7 @@ ORP (Open Research Protocol) is a docs-first, local-first, agent-friendly protoc
13
13
  ## Fast Machine Discovery
14
14
 
15
15
  - Run `orp about --json` for machine-readable tool metadata, artifact paths, schemas, supported commands, and bundled packs.
16
+ - Run `orp youtube inspect <youtube-url> --json` to normalize a public YouTube video into ORP's source artifact shape, including transcript text when public captions are fetchable.
16
17
  - Run `orp erdos sync --json` for machine-readable Erdos catalog sync results.
17
18
  - Run `orp pack list --json` for machine-readable bundled pack inventory.
18
19
  - Core runtime commands also support `--json`:
@@ -37,10 +38,12 @@ ORP (Open Research Protocol) is a docs-first, local-first, agent-friendly protoc
37
38
  - `spec/v1/orp.config.schema.json`
38
39
  - `spec/v1/packet.schema.json`
39
40
  - `spec/v1/profile-pack.schema.json`
41
+ - `spec/v1/youtube-source.schema.json`
40
42
 
41
43
  ## Key Commands
42
44
 
43
45
  - `orp init`
46
+ - `orp youtube inspect <youtube-url> --json`
44
47
  - `orp gate run --profile <profile>`
45
48
  - `orp packet emit --profile <profile>`
46
49
  - `orp report summary`
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "open-research-protocol",
3
- "version": "0.4.8",
3
+ "version": "0.4.9",
4
4
  "description": "ORP CLI (Open Research Protocol): agent-friendly research workflows, runtime, reports, and pack tooling.",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -0,0 +1,151 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://openresearchprotocol.com/spec/v1/youtube-source.schema.json",
4
+ "title": "ORP YouTube Source Artifact",
5
+ "type": "object",
6
+ "additionalProperties": false,
7
+ "required": [
8
+ "schema_version",
9
+ "kind",
10
+ "retrieved_at_utc",
11
+ "source_url",
12
+ "canonical_url",
13
+ "video_id",
14
+ "title",
15
+ "author_name",
16
+ "author_url",
17
+ "thumbnail_url",
18
+ "channel_id",
19
+ "description",
20
+ "duration_seconds",
21
+ "published_at",
22
+ "playability_status",
23
+ "transcript_available",
24
+ "transcript_language",
25
+ "transcript_track_name",
26
+ "transcript_kind",
27
+ "transcript_fetch_mode",
28
+ "transcript_text",
29
+ "transcript_segments",
30
+ "warnings",
31
+ "text_bundle"
32
+ ],
33
+ "properties": {
34
+ "schema_version": {
35
+ "type": "string",
36
+ "const": "1.0.0"
37
+ },
38
+ "kind": {
39
+ "type": "string",
40
+ "const": "youtube_source"
41
+ },
42
+ "retrieved_at_utc": {
43
+ "type": "string"
44
+ },
45
+ "source_url": {
46
+ "type": "string"
47
+ },
48
+ "canonical_url": {
49
+ "type": "string"
50
+ },
51
+ "video_id": {
52
+ "type": "string",
53
+ "pattern": "^[A-Za-z0-9_-]{11}$"
54
+ },
55
+ "title": {
56
+ "type": "string"
57
+ },
58
+ "author_name": {
59
+ "type": "string"
60
+ },
61
+ "author_url": {
62
+ "type": "string"
63
+ },
64
+ "thumbnail_url": {
65
+ "type": "string"
66
+ },
67
+ "channel_id": {
68
+ "type": "string"
69
+ },
70
+ "description": {
71
+ "type": "string"
72
+ },
73
+ "duration_seconds": {
74
+ "type": [
75
+ "integer",
76
+ "null"
77
+ ],
78
+ "minimum": 0
79
+ },
80
+ "published_at": {
81
+ "type": "string"
82
+ },
83
+ "playability_status": {
84
+ "type": "string"
85
+ },
86
+ "transcript_available": {
87
+ "type": "boolean"
88
+ },
89
+ "transcript_language": {
90
+ "type": "string"
91
+ },
92
+ "transcript_track_name": {
93
+ "type": "string"
94
+ },
95
+ "transcript_kind": {
96
+ "type": "string",
97
+ "enum": [
98
+ "manual",
99
+ "auto",
100
+ "none"
101
+ ]
102
+ },
103
+ "transcript_fetch_mode": {
104
+ "type": "string",
105
+ "enum": [
106
+ "json3",
107
+ "xml",
108
+ "unavailable",
109
+ "none",
110
+ "missing_track_url"
111
+ ]
112
+ },
113
+ "transcript_text": {
114
+ "type": "string"
115
+ },
116
+ "transcript_segments": {
117
+ "type": "array",
118
+ "items": {
119
+ "type": "object",
120
+ "additionalProperties": false,
121
+ "required": [
122
+ "start_ms",
123
+ "duration_ms",
124
+ "text"
125
+ ],
126
+ "properties": {
127
+ "start_ms": {
128
+ "type": "integer",
129
+ "minimum": 0
130
+ },
131
+ "duration_ms": {
132
+ "type": "integer",
133
+ "minimum": 0
134
+ },
135
+ "text": {
136
+ "type": "string"
137
+ }
138
+ }
139
+ }
140
+ },
141
+ "warnings": {
142
+ "type": "array",
143
+ "items": {
144
+ "type": "string"
145
+ }
146
+ },
147
+ "text_bundle": {
148
+ "type": "string"
149
+ }
150
+ }
151
+ }