ultimate-pi 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/.pi/extensions/custom-header.ts +26 -2
  2. package/.pi/extensions/lib/harness-paths.ts +55 -0
  3. package/.pi/extensions/model-router-bootstrap.ts +174 -0
  4. package/.pi/extensions/sentrux-rules-sync.ts +28 -3
  5. package/.pi/harness/browser.json +5 -0
  6. package/.pi/harness/debates/README.md +9 -0
  7. package/.pi/harness/docs/adrs/0006-sentrux-dual-layer.md +1 -1
  8. package/.pi/harness/docs/adrs/0009-sentrux-rules-lifecycle.md +2 -2
  9. package/.pi/harness/incidents/README.md +6 -0
  10. package/.pi/harness/release-readiness-report.md +128 -0
  11. package/.pi/harness/router/proposals/canary-proposal.json +96 -0
  12. package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773891854/events.jsonl +2 -0
  13. package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773891854/trace.json +17 -0
  14. package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773912057/events.jsonl +2 -0
  15. package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773912057/trace.json +17 -0
  16. package/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096/events.jsonl +6 -0
  17. package/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096/trace.json +42 -0
  18. package/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774136101/events.jsonl +1 -0
  19. package/.pi/harness/runs/019e2758-b332-771b-ad6f-54d0d8478768-1778776600591/events.jsonl +2 -0
  20. package/.pi/harness/runs/019e2758-b332-771b-ad6f-54d0d8478768-1778776600591/trace.json +17 -0
  21. package/.pi/harness/runs/README.md +6 -0
  22. package/.pi/harness/runs/budget-events.jsonl +4 -0
  23. package/.pi/harness/runs/canary-candidate-router.json +72 -0
  24. package/.pi/harness/runs/canary-evidence.json +9 -0
  25. package/.pi/harness/runs/index.jsonl +4 -0
  26. package/.pi/harness/sentrux/architecture.manifest.json +3 -3
  27. package/.pi/model-router.example.json +27 -0
  28. package/.pi/prompts/graphify.md +4 -8
  29. package/.pi/prompts/harness-setup.md +142 -92
  30. package/.pi/prompts/release.md +225 -0
  31. package/.pi/scripts/README.md +17 -0
  32. package/.pi/scripts/harness-cli-verify.sh +294 -0
  33. package/.pi/scripts/harness-graphify-bootstrap.sh +151 -0
  34. package/{scripts → .pi/scripts}/harness-verify.mjs +3 -3
  35. package/{scripts → .pi/scripts}/sentrux-rules-sync.mjs +2 -2
  36. package/.pi/settings.json +0 -2
  37. package/.sentrux/.harness-rules-meta.json +2 -2
  38. package/.sentrux/rules.toml +3 -3
  39. package/AGENTS.md +12 -0
  40. package/CHANGELOG.md +21 -0
  41. package/README.md +39 -350
  42. package/firecrawl/.env +53 -0
  43. package/package.json +16 -4
  44. package/.ckignore +0 -41
  45. package/.env.example +0 -21
  46. package/.gitattributes +0 -1
  47. package/.github/banner-v2.png +0 -0
  48. package/.github/workflows/lint.yml +0 -33
  49. package/.github/workflows/publish-github-packages.yml +0 -35
  50. package/.github/workflows/publish-npm.yml +0 -32
  51. package/CONTRIBUTING.md +0 -166
  52. package/lefthook.yml +0 -9
  53. package/scripts/__pycache__/merge_graphify_corpora.cpython-314.pyc +0 -0
  54. package/scripts/index_youtube_urls.py +0 -376
  55. package/scripts/merge_graphify_corpora.py +0 -398
  56. package/scripts/regen_graphify_html.py +0 -46
  57. package/test/harness-verify.test.mjs +0 -33
@@ -1,376 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Index YouTube watch URLs: yt-dlp metadata + Firecrawl transcript scrape.
3
-
4
- Writes ``<data-dir>/<channel-handle>/<YYYY-MM-DD>/<video-id>_<title-slug>.txt`` and
5
- ``.meta.txt``, and merges ``_index.tsv`` per channel. No channel-specific filters.
6
- Default ``data-dir`` is ``<repo>/data/youtube-transcripts`` when this file lives in ``<repo>/scripts/``.
7
-
8
- Requirements: ``yt-dlp`` and ``firecrawl`` CLI on PATH (see ``firecrawl --status``).
9
-
10
- Examples:
11
- python3 scripts/index_youtube_urls.py 'https://www.youtube.com/watch?v=VIDEO_ID'
12
- python3 scripts/index_youtube_urls.py --urls-file urls.txt
13
- python3 scripts/index_youtube_urls.py --data-dir ./data/youtube-transcripts --firecrawl-cwd . URL
14
- """
15
-
16
- from __future__ import annotations
17
-
18
- import argparse
19
- import os
20
- import re
21
- import shutil
22
- import subprocess
23
- import tempfile
24
- import time
25
- from pathlib import Path
26
- from urllib.parse import parse_qs, urlparse
27
-
28
- SLEEP_SEC = 5.0
29
-
30
-
31
- def slug(s: str, max_len: int = 80) -> str:
32
- s = re.sub(r"[^\w\s-]", "", s, flags=re.UNICODE)
33
- s = re.sub(r"[-\s]+", "-", s).strip("-") or "untitled"
34
- return s[:max_len].rstrip("-")
35
-
36
-
37
- def ymd(upload_date: str) -> str:
38
- if len(upload_date) == 8 and upload_date.isdigit():
39
- return f"{upload_date[:4]}-{upload_date[4:6]}-{upload_date[6:8]}"
40
- return "unknown-date"
41
-
42
-
43
- def parse_firecrawl_youtube_transcript(md: str) -> str | None:
44
- marker = "## Transcript"
45
- i = md.find(marker)
46
- if i == -1:
47
- return None
48
- rest = md[i + len(marker) :].lstrip("\n")
49
- lines_out: list[str] = []
50
- for line in rest.splitlines():
51
- if line.startswith("## ") and lines_out:
52
- break
53
- lines_out.append(line)
54
- text = "\n".join(lines_out).strip()
55
- if len(text) < 30:
56
- return None
57
- return text
58
-
59
-
60
- def _firecrawl_transcript_sane(text: str) -> bool:
61
- """Reject full-page scrapes where ## Transcript captured sidebar/recommendations."""
62
- head = text[:1200]
63
- if "NaN / NaN" in head:
64
- return False
65
- if head.count("[![]") >= 2 or head.count("hqdefault.jpg") >= 2:
66
- return False
67
- if head.count("views •") >= 2:
68
- return False
69
- return True
70
-
71
-
72
- def fetch_transcript_firecrawl(
73
- video_id: str,
74
- *,
75
- firecrawl_bin: str,
76
- firecrawl_cwd: Path,
77
- wait_ms: int = 20000,
78
- attempts: int = 3,
79
- scrape_timeout: int = 300,
80
- ) -> str | None:
81
- url = f"https://www.youtube.com/watch?v={video_id}"
82
- for attempt in range(attempts):
83
- if attempt:
84
- time.sleep(4.0)
85
- fd, out = tempfile.mkstemp(suffix=".md", prefix="ytfc-")
86
- os.close(fd)
87
- out_path = Path(out)
88
- try:
89
- cmd = [
90
- firecrawl_bin,
91
- "scrape",
92
- url,
93
- "--wait-for",
94
- str(wait_ms),
95
- "--only-main-content",
96
- "-o",
97
- str(out_path),
98
- ]
99
- r = subprocess.run(
100
- cmd,
101
- capture_output=True,
102
- text=True,
103
- timeout=scrape_timeout,
104
- cwd=str(firecrawl_cwd),
105
- )
106
- if r.returncode != 0:
107
- continue
108
- md = out_path.read_text(encoding="utf-8", errors="replace")
109
- text = parse_firecrawl_youtube_transcript(md)
110
- if text and _firecrawl_transcript_sane(text):
111
- return text
112
- except (OSError, subprocess.TimeoutExpired, ValueError):
113
- pass
114
- finally:
115
- out_path.unlink(missing_ok=True)
116
- return None
117
-
118
-
119
- def needs_transcript(path: Path) -> bool:
120
- if not path.exists():
121
- return True
122
- try:
123
- text = path.read_text(encoding="utf-8", errors="replace")
124
- except OSError:
125
- return True
126
- return text.strip().startswith("(no transcript")
127
-
128
-
129
- def channel_dir_from_handle(uploader_id: str) -> str:
130
- h = (uploader_id or "unknown-channel").strip()
131
- if h.startswith("@"):
132
- h = h[1:]
133
- return h.lower() or "unknown-channel"
134
-
135
-
136
- def video_id_from_arg(s: str) -> str:
137
- s = s.strip()
138
- if re.fullmatch(r"[0-9A-Za-z_-]{11}", s):
139
- return s
140
- u = urlparse(s)
141
- host = (u.netloc or "").lower().removeprefix("www.")
142
- if host == "youtu.be":
143
- seg = u.path.strip("/").split("/")[0]
144
- if re.fullmatch(r"[0-9A-Za-z_-]{11}", seg):
145
- return seg
146
- qs = parse_qs(u.query)
147
- if "v" in qs and qs["v"]:
148
- vid = qs["v"][0]
149
- if re.fullmatch(r"[0-9A-Za-z_-]{11}", vid):
150
- return vid
151
- raise SystemExit(f"Could not parse YouTube video id from: {s!r}")
152
-
153
-
154
- def yt_dlp_row(watch_url: str, *, yt_dlp_bin: str) -> tuple[str, str, str, str]:
155
- """Returns (video_id, upload_date, title, uploader_id)."""
156
- cmd = [
157
- yt_dlp_bin,
158
- "--no-download",
159
- "--ignore-errors",
160
- "--print",
161
- "%(id)s|%(upload_date)s|%(title)s|%(uploader_id)s",
162
- watch_url,
163
- ]
164
- r = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
165
- if r.returncode != 0:
166
- raise SystemExit(f"yt-dlp failed ({r.returncode}): {watch_url}\n{r.stderr}")
167
- line = r.stdout.strip().splitlines()[-1] if r.stdout.strip() else ""
168
- parts = line.split("|", 3)
169
- if len(parts) < 4:
170
- raise SystemExit(f"Unexpected yt-dlp output for {watch_url!r}: {line!r}")
171
- vid, udate, title, handle = parts[0], parts[1], parts[2], parts[3]
172
- if not udate.isdigit() or len(udate) != 8:
173
- raise SystemExit(f"Bad upload_date from yt-dlp: {udate!r}")
174
- return vid, udate, title, handle or "@unknown"
175
-
176
-
177
- def merge_index(idx: Path, rows: dict[str, tuple[str, str]]) -> None:
178
- if idx.exists():
179
- for i, line in enumerate(idx.read_text(encoding="utf-8").splitlines()):
180
- line = line.strip()
181
- if not line:
182
- continue
183
- if i == 0 and line.startswith("video_id"):
184
- continue
185
- parts = line.split("\t")
186
- if len(parts) >= 3:
187
- vid, ud, tit = parts[0], parts[1], parts[2]
188
- rows.setdefault(vid, (ud, tit))
189
- lines = ["video_id\tupload_date\ttitle"]
190
- for vid in sorted(rows.keys()):
191
- ud, tit = rows[vid]
192
- lines.append(f"{vid}\t{ud}\t{tit.replace(chr(9), ' ')}")
193
- idx.parent.mkdir(parents=True, exist_ok=True)
194
- idx.write_text("\n".join(lines) + "\n", encoding="utf-8")
195
-
196
-
197
- def collect_urls(args: argparse.Namespace) -> list[str]:
198
- out: list[str] = []
199
- for a in args.url:
200
- out.append(a.strip())
201
- if args.urls_file:
202
- raw = Path(args.urls_file).read_text(encoding="utf-8")
203
- for line in raw.splitlines():
204
- line = line.strip()
205
- if line and not line.startswith("#"):
206
- out.append(line)
207
- seen: set[str] = set()
208
- uniq: list[str] = []
209
- for u in out:
210
- if u not in seen:
211
- seen.add(u)
212
- uniq.append(u)
213
- return uniq
214
-
215
-
216
- def default_paths() -> tuple[Path, Path]:
217
- """(data_dir, firecrawl_cwd) when script lives in <repo>/scripts/."""
218
- here = Path(__file__).resolve()
219
- repo = here.parent.parent
220
- return repo / "data" / "youtube-transcripts", repo
221
-
222
-
223
- def main() -> int:
224
- default_data, default_fc_cwd = default_paths()
225
- ap = argparse.ArgumentParser(description=__doc__)
226
- ap.add_argument(
227
- "url",
228
- nargs="*",
229
- help="YouTube watch URLs, youtu.be links, or 11-char video ids",
230
- )
231
- ap.add_argument(
232
- "--urls-file",
233
- metavar="PATH",
234
- help="Text file with one URL or id per line (# comments allowed)",
235
- )
236
- ap.add_argument(
237
- "--data-dir",
238
- type=Path,
239
- metavar="DIR",
240
- default=default_data,
241
- help=f"Root for channel folders (default: {default_data})",
242
- )
243
- ap.add_argument(
244
- "--firecrawl-cwd",
245
- type=Path,
246
- metavar="DIR",
247
- default=default_fc_cwd,
248
- help="Working directory for firecrawl subprocess (default: repo root next to scripts/)",
249
- )
250
- ap.add_argument(
251
- "--yt-dlp",
252
- metavar="BIN",
253
- default="yt-dlp",
254
- help="yt-dlp executable name or path (default: yt-dlp)",
255
- )
256
- ap.add_argument(
257
- "--firecrawl",
258
- metavar="BIN",
259
- default="",
260
- help="firecrawl executable (default: search PATH)",
261
- )
262
- ap.add_argument(
263
- "--wait-for",
264
- type=int,
265
- default=20000,
266
- metavar="MS",
267
- help="Firecrawl scrape --wait-for milliseconds (default 20000)",
268
- )
269
- ap.add_argument(
270
- "--sleep",
271
- type=float,
272
- default=SLEEP_SEC,
273
- metavar="SEC",
274
- help=f"Seconds between Firecrawl scrapes (default {SLEEP_SEC})",
275
- )
276
- ap.add_argument(
277
- "--dry-run",
278
- action="store_true",
279
- help="Print yt-dlp metadata only; do not scrape or write files",
280
- )
281
- ap.add_argument(
282
- "--force",
283
- action="store_true",
284
- help="Re-scrape even when a non-placeholder transcript already exists",
285
- )
286
- args = ap.parse_args()
287
- urls = collect_urls(args)
288
- if not urls:
289
- ap.error("Pass at least one url, or use --urls-file")
290
-
291
- fc_bin = args.firecrawl.strip() or shutil.which("firecrawl")
292
- if not fc_bin and not args.dry_run:
293
- raise SystemExit(
294
- "firecrawl CLI not found on PATH. Install it and run `firecrawl --status`, "
295
- "or pass --firecrawl /path/to/firecrawl."
296
- )
297
-
298
- data_dir: Path = args.data_dir
299
- fc_cwd: Path = args.firecrawl_cwd
300
-
301
- index_rows: dict[str, dict[str, tuple[str, str]]] = {}
302
- first_scrape = True
303
-
304
- for raw in urls:
305
- vid_guess = video_id_from_arg(raw)
306
- watch = f"https://www.youtube.com/watch?v={vid_guess}"
307
- vid, udate, title, uploader_id = yt_dlp_row(watch, yt_dlp_bin=args.yt_dlp)
308
- ch_slug = channel_dir_from_handle(uploader_id)
309
- day = ymd(udate)
310
- out_base = data_dir / ch_slug
311
- day_dir = out_base / day
312
- base = f"{vid}_{slug(title)}"
313
- path = day_dir / f"{base}.txt"
314
- meta_path = day_dir / f"{base}.meta.txt"
315
-
316
- if args.dry_run:
317
- print(f"{ch_slug}\t{day}\t{vid}\t{udate}\t{title}", flush=True)
318
- bucket = index_rows.setdefault(ch_slug, {})
319
- bucket[vid] = (udate, title)
320
- continue
321
-
322
- day_dir.mkdir(parents=True, exist_ok=True)
323
- need = args.force or needs_transcript(path)
324
- text: str | None
325
- if need:
326
- if not first_scrape:
327
- time.sleep(max(0.0, args.sleep))
328
- first_scrape = False
329
- print(f"scrape {ch_slug} {day} {vid} …", flush=True)
330
- assert fc_bin is not None
331
- text = fetch_transcript_firecrawl(
332
- vid,
333
- firecrawl_bin=fc_bin,
334
- firecrawl_cwd=fc_cwd,
335
- wait_ms=args.wait_for,
336
- )
337
- else:
338
- print(f"skip {ch_slug} {day} {vid} (existing transcript)", flush=True)
339
- text = None
340
-
341
- ch_meta = uploader_id if uploader_id.startswith("@") else f"@{uploader_id}"
342
- meta = (
343
- f"video_id: {vid}\n"
344
- f"upload_date: {udate}\n"
345
- f"title: {title}\n"
346
- f"url: https://www.youtube.com/watch?v={vid}\n"
347
- f"transcript_source: firecrawl\n"
348
- f"channel: {ch_meta}\n"
349
- )
350
- meta_path.write_text(meta, encoding="utf-8")
351
- if need:
352
- if text is None:
353
- path.write_text(
354
- "(no transcript yet: Firecrawl scrape had no ## Transcript section or empty body. "
355
- "Retry later or open the watch URL in a browser.)\n",
356
- encoding="utf-8",
357
- )
358
- print(" -> no transcript", flush=True)
359
- else:
360
- path.write_text(text, encoding="utf-8")
361
- print(f" -> ok ({len(text)} chars)", flush=True)
362
-
363
- bucket = index_rows.setdefault(ch_slug, {})
364
- bucket[vid] = (udate, title)
365
-
366
- if not args.dry_run:
367
- for ch_slug, rows in index_rows.items():
368
- idx = data_dir / ch_slug / "_index.tsv"
369
- merge_index(idx, dict(rows))
370
- print(f"wrote {idx}", flush=True)
371
-
372
- return 0
373
-
374
-
375
- if __name__ == "__main__":
376
- raise SystemExit(main())