ultimate-pi 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.pi/extensions/lib/harness-paths.ts +8 -0
- package/.pi/extensions/sentrux-rules-sync.ts +2 -8
- package/.pi/harness/browser.json +5 -1
- package/.pi/harness/debates/README.md +9 -0
- package/.pi/harness/docs/adrs/0006-sentrux-dual-layer.md +1 -1
- package/.pi/harness/docs/adrs/0009-sentrux-rules-lifecycle.md +2 -2
- package/.pi/harness/incidents/README.md +6 -0
- package/.pi/harness/release-readiness-report.md +128 -0
- package/.pi/harness/router/proposals/canary-proposal.json +96 -0
- package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773891854/events.jsonl +2 -0
- package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773891854/trace.json +17 -0
- package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773912057/events.jsonl +2 -0
- package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773912057/trace.json +17 -0
- package/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096/events.jsonl +6 -0
- package/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096/trace.json +42 -0
- package/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774136101/events.jsonl +1 -0
- package/.pi/harness/runs/019e2758-b332-771b-ad6f-54d0d8478768-1778776600591/events.jsonl +2 -0
- package/.pi/harness/runs/019e2758-b332-771b-ad6f-54d0d8478768-1778776600591/trace.json +17 -0
- package/.pi/harness/runs/README.md +6 -0
- package/.pi/harness/runs/budget-events.jsonl +4 -0
- package/.pi/harness/runs/canary-candidate-router.json +72 -0
- package/.pi/harness/runs/canary-evidence.json +9 -0
- package/.pi/harness/runs/index.jsonl +4 -0
- package/.pi/harness/sentrux/architecture.manifest.json +3 -3
- package/.pi/model-router.json +95 -0
- package/.pi/prompts/harness-setup.md +13 -14
- package/.pi/prompts/release.md +225 -0
- package/.pi/scripts/README.md +17 -0
- package/{scripts → .pi/scripts}/harness-verify.mjs +3 -3
- package/{scripts → .pi/scripts}/sentrux-rules-sync.mjs +2 -2
- package/.sentrux/.harness-rules-meta.json +2 -2
- package/.sentrux/rules.toml +3 -3
- package/CHANGELOG.md +8 -0
- package/firecrawl/.env +53 -0
- package/package.json +15 -5
- package/.ckignore +0 -41
- package/.codex/hooks.json +0 -15
- package/.env.example +0 -21
- package/.gitattributes +0 -1
- package/.github/banner-v2.png +0 -0
- package/.github/workflows/lint.yml +0 -33
- package/.github/workflows/publish-github-packages.yml +0 -35
- package/.github/workflows/publish-npm.yml +0 -32
- package/CONTRIBUTING.md +0 -166
- package/lefthook.yml +0 -9
- package/scripts/__pycache__/merge_graphify_corpora.cpython-314.pyc +0 -0
- package/scripts/index_youtube_urls.py +0 -376
- package/scripts/merge_graphify_corpora.py +0 -398
- package/scripts/regen_graphify_html.py +0 -46
- package/test/harness-verify.test.mjs +0 -33
- /package/{scripts → .pi/scripts}/harness-cli-verify.sh +0 -0
- /package/{scripts → .pi/scripts}/harness-graphify-bootstrap.sh +0 -0
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
# Workflow: publish the package to npm.
# Runs on any pushed tag matching 'v*' and can also be started manually.
name: Publish to npm
run-name: Publish npm from ${{ github.ref_name }}

on:
  push:
    tags:
      - 'v*'
  workflow_dispatch:

jobs:
  publish:
    runs-on: ubuntu-latest
    permissions:
      # Read-only access to repository contents.
      contents: read
      # Lets the job request an OIDC token; presumably needed by the
      # `--provenance` publish below (confirm against npm documentation).
      id-token: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22.14.0'

      # Upgrade the npm CLI to the 11.5.1+ range, then print the tool
      # versions into the job log.
      - name: Ensure npm trusted publishing minimum version
        run: |
          npm i -g npm@^11.5.1
          node -v
          npm -v

      # Public publish with provenance; package lifecycle scripts are skipped.
      - name: Publish package
        run: npm publish --provenance --access public --ignore-scripts
|
package/CONTRIBUTING.md
DELETED
|
@@ -1,166 +0,0 @@
|
|
|
1
|
-
# Contributing to ultimate-pi
|
|
2
|
-
|
|
3
|
-
## Local development setup
|
|
4
|
-
|
|
5
|
-
1. Clone and install dependencies:
|
|
6
|
-
|
|
7
|
-
```bash
|
|
8
|
-
git clone https://github.com/aryaniyaps/ultimate-pi.git
|
|
9
|
-
cd ultimate-pi
|
|
10
|
-
npm install
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
`npm install` automatically sets up pre-commit hooks via [Lefthook](https://github.com/evilmartians/lefthook).
|
|
14
|
-
|
|
15
|
-
2. Install the package locally into PI:
|
|
16
|
-
|
|
17
|
-
```bash
|
|
18
|
-
pi install . -l
|
|
19
|
-
```
|
|
20
|
-
|
|
21
|
-
Then restart PI or run `/reload`.
|
|
22
|
-
|
|
23
|
-
## Linting & formatting
|
|
24
|
-
|
|
25
|
-
Uses [Biome](https://biomejs.dev) for linting, formatting, and import sorting.
|
|
26
|
-
|
|
27
|
-
```bash
|
|
28
|
-
npm run lint # check lint + format errors
|
|
29
|
-
npm run lint:fix # auto-fix lint + format errors
|
|
30
|
-
npm run format # format all files
|
|
31
|
-
npm run format:check # check formatting without writing
|
|
32
|
-
npm run check:ts # typecheck extensions
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
Pre-commit hooks run `biome check` and `tsc` on staged files automatically.
|
|
36
|
-
|
|
37
|
-
## Sentrux (architectural quality gate)
|
|
38
|
-
|
|
39
|
-
[Sentrux](https://github.com/sentrux/sentrux) provides real-time structural quality metrics for AI-agent-written code. It acts as a feedback loop sensor — scanning codebase architecture, detecting degradation, and enforcing rules via MCP.
|
|
40
|
-
|
|
41
|
-
### Quick start
|
|
42
|
-
|
|
43
|
-
```bash
|
|
44
|
-
# Install (macOS / Linux / Windows)
|
|
45
|
-
curl -fsSL https://raw.githubusercontent.com/sentrux/sentrux/main/install.sh | sh
|
|
46
|
-
|
|
47
|
-
# Install all 52 language plugins
|
|
48
|
-
sentrux plugin add-standard
|
|
49
|
-
|
|
50
|
-
# Run a quality scan
|
|
51
|
-
sentrux check .
|
|
52
|
-
|
|
53
|
-
# Save baseline before agent session
|
|
54
|
-
sentrux gate --save .
|
|
55
|
-
|
|
56
|
-
# Compare after — catches degradation
|
|
57
|
-
sentrux gate .
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
### MCP Integration
|
|
61
|
-
|
|
62
|
-
The sentrux MCP server is configured in `.pi/mcp.json`. Agents can use tools like `scan`, `session_start`, `session_end`, `check_rules`, `health`, and `evolution` to monitor code quality during development.
|
|
63
|
-
|
|
64
|
-
### Rules Engine
|
|
65
|
-
|
|
66
|
-
Create `.sentrux/rules.toml` to define architectural constraints:
|
|
67
|
-
|
|
68
|
-
```toml
|
|
69
|
-
[constraints]
|
|
70
|
-
max_cycles = 0
|
|
71
|
-
max_coupling = "B"
|
|
72
|
-
max_cc = 25
|
|
73
|
-
no_god_files = true
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
## Firecrawl (self-hosted web scraping)
|
|
77
|
-
|
|
78
|
-
The Firecrawl skill depends on a Firecrawl instance. This repo includes a self-hosted setup powered by Docker.
|
|
79
|
-
|
|
80
|
-
### Quick start
|
|
81
|
-
|
|
82
|
-
```bash
|
|
83
|
-
cd firecrawl
|
|
84
|
-
cp .env.template .env # first time only — edit if needed
|
|
85
|
-
docker compose up -d # pulls pre-built GHCR images automatically
|
|
86
|
-
```
|
|
87
|
-
|
|
88
|
-
Firecrawl API is now at `http://localhost:3002`. Admin UI at `http://localhost:3002/admin/<BULL_AUTH_KEY>/queues`.
|
|
89
|
-
|
|
90
|
-
### Services
|
|
91
|
-
|
|
92
|
-
| Service | Image | Port |
|
|
93
|
-
|---------|-------|------|
|
|
94
|
-
| `api` | `ghcr.io/firecrawl/firecrawl` | 3002 |
|
|
95
|
-
| `playwright-service` | `ghcr.io/firecrawl/playwright-service:latest` | 3000 (internal) |
|
|
96
|
-
| `nuq-postgres` | `ghcr.io/firecrawl/nuq-postgres:latest` | 5432 (internal) |
|
|
97
|
-
| `redis` | `redis:alpine` | 6379 (internal) |
|
|
98
|
-
| `rabbitmq` | `rabbitmq:3-management` | 5672 (internal) |
|
|
99
|
-
| `searxng` | `searxng/searxng:latest` | 8080 |
|
|
100
|
-
|
|
101
|
-
### Configuration
|
|
102
|
-
|
|
103
|
-
All options live in `firecrawl/.env`. See `firecrawl/.env.template` for the full reference. Key env vars:
|
|
104
|
-
|
|
105
|
-
- `PORT` — API port (default: `3002`)
|
|
106
|
-
- `SEARXNG_ENDPOINT` — enables `/search` API (default: `http://searxng:8080`)
|
|
107
|
-
- `OPENAI_API_KEY` — enables AI features (JSON formatting, `/extract` API)
|
|
108
|
-
- `BULL_AUTH_KEY` — admin UI access key (default: `CHANGEME` — change in production)
|
|
109
|
-
|
|
110
|
-
See `firecrawl/README.md` for detailed docs and SDK usage examples.
|
|
111
|
-
|
|
112
|
-
## Extensions
|
|
113
|
-
|
|
114
|
-
### Dotenv loader
|
|
115
|
-
|
|
116
|
-
`.pi/extensions/dotenv-loader.ts` — loads `.env` files into `process.env` on session start.
|
|
117
|
-
|
|
118
|
-
Configurable via env vars (set before launching pi):
|
|
119
|
-
|
|
120
|
-
| Variable | Default | Description |
|
|
121
|
-
|---|---|---|
|
|
122
|
-
| `ENV_LOADER_FILES` | `.env` | Comma-separated list of `.env` file paths (relative to cwd). |
|
|
123
|
-
| `ENV_LOADER_OVERRIDE` | `false` | Set to `true` to overwrite existing env vars. |
|
|
124
|
-
| `ENV_LOADER_SILENT` | `false` | Set to `true` to suppress startup logs. |
|
|
125
|
-
| `ENV_LOADER_ENCODING` | `utf-8` | File encoding for `.env` files. |
|
|
126
|
-
|
|
127
|
-
- Supports variable expansion (`$VAR` and `${VAR}`).
|
|
128
|
-
- Reloads on `/reload`.
|
|
129
|
-
- Status command: `/env-loader-status`
|
|
130
|
-
|
|
131
|
-
### Harness governance extensions
|
|
132
|
-
|
|
133
|
-
These Pi extensions are loaded from `.pi/extensions/` via the root `package.json`
|
|
134
|
-
`pi.extensions` manifest (no extra registration needed):
|
|
135
|
-
|
|
136
|
-
- `.pi/extensions/policy-gate.ts` — plan-before-mutate + phase enforcement
|
|
137
|
-
- `.pi/extensions/budget-guard.ts` — budget hard-stop and `budget_exhausted` events
|
|
138
|
-
- `.pi/extensions/trace-recorder.ts` — run trace artifacts in `.pi/harness/runs/`
|
|
139
|
-
- `.pi/extensions/review-integrity.ts` — evaluator/adversary session isolation checks
|
|
140
|
-
- `.pi/extensions/test-diff-integrity.ts` — suspicious test diff detection/escalation
|
|
141
|
-
- `.pi/extensions/debate-orchestrator.ts` — headless debate bus + consensus packets
|
|
142
|
-
|
|
143
|
-
### PostHog analytics
|
|
144
|
-
|
|
145
|
-
`@posthog/pi` — wraps the upstream [posthog-pi](https://github.com/PostHog/posthog-pi) extension to capture AI generation spans, tool spans, and traces in [PostHog](https://posthog.com). Install via `pi install @posthog/pi`. See the upstream repo for configuration and env vars.
|
|
146
|
-
|
|
147
|
-
## Skill sources
|
|
148
|
-
|
|
149
|
-
| Skill | Upstream |
|
|
150
|
-
|---|---|
|
|
151
|
-
| caveman | [juliusbrussee/caveman](https://github.com/juliusbrussee/caveman) |
|
|
152
|
-
| context7-cli | [upstash/context7](https://github.com/upstash/context7) |
|
|
153
|
-
| find-skills | bundled (context7-compatible discovery) |
|
|
154
|
-
| firecrawl (13 skills) | [firecrawl](https://firecrawl.dev) |
|
|
155
|
-
| obsidian/wiki skills (11 skills) | [AgriciDaniel/claude-obsidian](https://github.com/AgriciDaniel/claude-obsidian) |
|
|
156
|
-
| posthog-analyst | bundled (PostHog MCP integration) |
|
|
157
|
-
|
|
158
|
-
### Firecrawl sub-skills
|
|
159
|
-
|
|
160
|
-
`firecrawl-search`, `firecrawl-scrape`, `firecrawl-crawl`, `firecrawl-map`, `firecrawl-download`, `firecrawl-parse`, `firecrawl-interact`, `firecrawl-agent`, `firecrawl-build-scrape`, `firecrawl-build-search`, `firecrawl-build-onboarding`, `firecrawl-build-interact`
|
|
161
|
-
|
|
162
|
-
### Wiki sub-skills
|
|
163
|
-
|
|
164
|
-
`wiki`, `wiki-save`, `wiki-query`, `wiki-ingest`, `wiki-lint`, `wiki-fold`, `autoresearch`, `canvas`, `obsidian-markdown`, `obsidian-bases`
|
|
165
|
-
|
|
166
|
-
> `context-mode` is installed as a separate pi package (`npm:context-mode`) — not bundled as a skill.
|
package/lefthook.yml
DELETED
|
Binary file
|
|
@@ -1,376 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Index YouTube watch URLs: yt-dlp metadata + Firecrawl transcript scrape.
|
|
3
|
-
|
|
4
|
-
Writes ``<data-dir>/<channel-handle>/<YYYY-MM-DD>/<video-id>_<title-slug>.txt`` and
|
|
5
|
-
``.meta.txt``, and merges ``_index.tsv`` per channel. No channel-specific filters.
|
|
6
|
-
Default ``data-dir`` is ``<repo>/data/youtube-transcripts`` when this file lives in ``<repo>/scripts/``.
|
|
7
|
-
|
|
8
|
-
Requirements: ``yt-dlp`` and ``firecrawl`` CLI on PATH (see ``firecrawl --status``).
|
|
9
|
-
|
|
10
|
-
Examples:
|
|
11
|
-
python3 scripts/index_youtube_urls.py 'https://www.youtube.com/watch?v=VIDEO_ID'
|
|
12
|
-
python3 scripts/index_youtube_urls.py --urls-file urls.txt
|
|
13
|
-
python3 scripts/index_youtube_urls.py --data-dir ./data/youtube-transcripts --firecrawl-cwd . URL
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
|
-
from __future__ import annotations
|
|
17
|
-
|
|
18
|
-
import argparse
|
|
19
|
-
import os
|
|
20
|
-
import re
|
|
21
|
-
import shutil
|
|
22
|
-
import subprocess
|
|
23
|
-
import tempfile
|
|
24
|
-
import time
|
|
25
|
-
from pathlib import Path
|
|
26
|
-
from urllib.parse import parse_qs, urlparse
|
|
27
|
-
|
|
28
|
-
SLEEP_SEC = 5.0
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def slug(s: str, max_len: int = 80) -> str:
    """Turn *s* into a hyphen-separated fragment suitable for a file name.

    Characters other than word characters, whitespace and hyphens are
    dropped; every run of whitespace/hyphens collapses to one hyphen; leading
    and trailing hyphens are trimmed. If nothing is left the result is
    ``"untitled"``. The text is then cut to *max_len* characters and any
    hyphen left dangling at the cut is removed.
    """
    kept = re.sub(r"[^\w\s-]", "", s, flags=re.UNICODE)
    hyphenated = re.sub(r"[-\s]+", "-", kept).strip("-")
    if not hyphenated:
        hyphenated = "untitled"
    truncated = hyphenated[:max_len]
    return truncated.rstrip("-")
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def ymd(upload_date: str) -> str:
    """Format an eight-digit ``YYYYMMDD`` string as ``YYYY-MM-DD``.

    Any value that is not exactly eight digits yields ``"unknown-date"``.
    """
    if len(upload_date) != 8 or not upload_date.isdigit():
        return "unknown-date"
    year, month, day = upload_date[:4], upload_date[4:6], upload_date[6:8]
    return f"{year}-{month}-{day}"
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def parse_firecrawl_youtube_transcript(md: str) -> str | None:
    """Extract the body of the ``## Transcript`` section from scraped markdown.

    The section starts right after the first occurrence of the text
    ``## Transcript`` and runs until the next line starting with ``"## "``
    (that check only applies once at least one line has been collected).

    Returns the stripped section text, or ``None`` when the marker is absent
    or the extracted text is shorter than 30 characters.
    """
    heading = "## Transcript"
    _, found, tail = md.partition(heading)
    if not found:
        return None

    collected: list[str] = []
    for row in tail.lstrip("\n").splitlines():
        # A later "## " heading closes the section, but never before the
        # first line has been taken.
        if collected and row.startswith("## "):
            break
        collected.append(row)

    body = "\n".join(collected).strip()
    return body if len(body) >= 30 else None
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def _firecrawl_transcript_sane(text: str) -> bool:
    """Return ``False`` when *text* looks like page chrome, not a transcript.

    Only the first 1200 characters are inspected. The text is rejected when
    that prefix contains ``"NaN / NaN"``, or at least two occurrences of
    ``"[![]"``, ``"hqdefault.jpg"`` or ``"views •"`` — signs that the
    ``## Transcript`` section captured sidebar/recommendation content.
    """
    prefix = text[:1200]
    looks_like_page_chrome = (
        "NaN / NaN" in prefix
        or prefix.count("[![]") >= 2
        or prefix.count("hqdefault.jpg") >= 2
        or prefix.count("views •") >= 2
    )
    return not looks_like_page_chrome
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def fetch_transcript_firecrawl(
    video_id: str,
    *,
    firecrawl_bin: str,
    firecrawl_cwd: Path,
    wait_ms: int = 20000,
    attempts: int = 3,
    scrape_timeout: int = 300,
) -> str | None:
    """Scrape a YouTube watch page with the firecrawl CLI and return its transcript.

    Runs ``<firecrawl_bin> scrape <watch url> --wait-for <wait_ms>
    --only-main-content -o <temp file>`` from *firecrawl_cwd*, up to
    *attempts* times, sleeping 4 seconds before every attempt after the
    first. Each attempt writes to its own temporary ``.md`` file, which is
    always removed afterwards.

    Returns the first transcript that both parses and passes the sanity
    check, or ``None`` if every attempt fails. A non-zero exit status,
    ``OSError``, ``subprocess.TimeoutExpired`` (after *scrape_timeout*
    seconds) and ``ValueError`` all count as a failed attempt rather than
    being raised.
    """
    watch_url = f"https://www.youtube.com/watch?v={video_id}"
    for attempt_no in range(attempts):
        if attempt_no > 0:
            time.sleep(4.0)

        # mkstemp only reserves a unique path here; firecrawl writes the file.
        handle, tmp_name = tempfile.mkstemp(suffix=".md", prefix="ytfc-")
        os.close(handle)
        scratch = Path(tmp_name)
        try:
            proc = subprocess.run(
                [
                    firecrawl_bin,
                    "scrape",
                    watch_url,
                    "--wait-for",
                    str(wait_ms),
                    "--only-main-content",
                    "-o",
                    str(scratch),
                ],
                capture_output=True,
                text=True,
                timeout=scrape_timeout,
                cwd=str(firecrawl_cwd),
            )
            if proc.returncode == 0:
                scraped = scratch.read_text(encoding="utf-8", errors="replace")
                transcript = parse_firecrawl_youtube_transcript(scraped)
                if transcript and _firecrawl_transcript_sane(transcript):
                    return transcript
        except (OSError, subprocess.TimeoutExpired, ValueError):
            # Best effort: a failed attempt simply falls through to the next.
            pass
        finally:
            scratch.unlink(missing_ok=True)
    return None
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
def needs_transcript(path: Path) -> bool:
    """Report whether the transcript file at *path* still has to be fetched.

    True when the file is missing, cannot be read, or its content (ignoring
    surrounding whitespace) starts with the ``"(no transcript"`` placeholder.
    """
    try:
        existing = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        # Missing or unreadable: treat it as not yet fetched.
        return True
    placeholder_prefix = "(no transcript"
    return existing.strip().startswith(placeholder_prefix)
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
def channel_dir_from_handle(uploader_id: str) -> str:
    """Derive a lowercase channel directory name from an uploader handle.

    Surrounding whitespace and a single leading ``@`` are removed. A falsy
    handle, or one that is empty after cleaning, maps to
    ``"unknown-channel"``.
    """
    fallback = "unknown-channel"
    cleaned = (uploader_id or fallback).strip().removeprefix("@")
    return cleaned.lower() or fallback
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
def video_id_from_arg(s: str) -> str:
    """Extract the 11-character YouTube video id from a URL or bare id.

    Accepted forms, tried in this order:

    * a bare 11-character id (``[0-9A-Za-z_-]``);
    * ``youtu.be/<id>`` short links;
    * any URL carrying a ``v=<id>`` query parameter (watch URLs);
    * ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>``
      paths on YouTube hosts.

    Args:
        s: URL or video id; surrounding whitespace is ignored.

    Returns:
        The video id.

    Raises:
        SystemExit: if no valid id can be found in *s*.
    """
    id_pattern = r"[0-9A-Za-z_-]{11}"
    s = s.strip()
    if re.fullmatch(id_pattern, s):
        return s

    u = urlparse(s)
    host = (u.netloc or "").lower().removeprefix("www.")
    segments = [seg for seg in u.path.split("/") if seg]

    if host == "youtu.be" and segments and re.fullmatch(id_pattern, segments[0]):
        return segments[0]

    qs = parse_qs(u.query)
    if qs.get("v"):
        vid = qs["v"][0]
        if re.fullmatch(id_pattern, vid):
            return vid

    # Path-style URLs carry the id as the segment after a fixed prefix.
    youtube_hosts = (
        "youtube.com",
        "m.youtube.com",
        "music.youtube.com",
        "youtube-nocookie.com",
    )
    path_prefixes = ("shorts", "embed", "live", "v")
    if (
        host in youtube_hosts
        and len(segments) >= 2
        and segments[0] in path_prefixes
        and re.fullmatch(id_pattern, segments[1])
    ):
        return segments[1]

    raise SystemExit(f"Could not parse YouTube video id from: {s!r}")
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def yt_dlp_row(watch_url: str, *, yt_dlp_bin: str) -> tuple[str, str, str, str]:
    """Fetch basic metadata for one video by running yt-dlp.

    yt-dlp is asked to print ``id|upload_date|title|uploader_id`` without
    downloading. Only the last line of its output is parsed.

    Args:
        watch_url: URL handed to yt-dlp.
        yt_dlp_bin: yt-dlp executable name or path.

    Returns:
        ``(video_id, upload_date, title, uploader_id)``; an empty uploader
        id is replaced by ``"@unknown"``.

    Raises:
        SystemExit: if yt-dlp exits non-zero, prints a line without the four
            expected fields, or reports an upload date that is not eight
            digits.
    """
    cmd = [
        yt_dlp_bin,
        "--no-download",
        "--ignore-errors",
        "--print",
        "%(id)s|%(upload_date)s|%(title)s|%(uploader_id)s",
        watch_url,
    ]
    r = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    if r.returncode != 0:
        raise SystemExit(f"yt-dlp failed ({r.returncode}): {watch_url}\n{r.stderr}")
    stdout = r.stdout.strip()
    line = stdout.splitlines()[-1] if stdout else ""

    # Titles may themselves contain "|", so take id and date from the left
    # and the uploader id from the right; whatever is between is the title.
    head = line.split("|", 2)
    if len(head) < 3 or "|" not in head[2]:
        raise SystemExit(f"Unexpected yt-dlp output for {watch_url!r}: {line!r}")
    vid, udate = head[0], head[1]
    title, handle = head[2].rsplit("|", 1)

    if not udate.isdigit() or len(udate) != 8:
        raise SystemExit(f"Bad upload_date from yt-dlp: {udate!r}")
    return vid, udate, title, handle or "@unknown"
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
def merge_index(idx: Path, rows: dict[str, tuple[str, str]]) -> None:
    """Merge *rows* with the TSV index at *idx* and rewrite the file.

    *rows* maps video id to ``(upload_date, title)``. Entries already on
    disk are added to *rows* only for ids it does not contain, so the
    caller's values take precedence; note that *rows* is modified in place.
    The file is rewritten with a header line and one row per id in sorted
    order; tabs inside titles become spaces. Parent directories are created
    as needed.
    """
    if idx.exists():
        on_disk = idx.read_text(encoding="utf-8").splitlines()
        for lineno, raw_line in enumerate(on_disk):
            record = raw_line.strip()
            if not record:
                continue
            # Skip the header, recognised only on the very first line.
            if lineno == 0 and record.startswith("video_id"):
                continue
            fields = record.split("\t")
            if len(fields) < 3:
                continue
            rows.setdefault(fields[0], (fields[1], fields[2]))

    rendered = ["video_id\tupload_date\ttitle"]
    for video_id in sorted(rows):
        upload_date, title = rows[video_id]
        safe_title = title.replace("\t", " ")
        rendered.append(f"{video_id}\t{upload_date}\t{safe_title}")

    idx.parent.mkdir(parents=True, exist_ok=True)
    idx.write_text("\n".join(rendered) + "\n", encoding="utf-8")
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
def collect_urls(args: argparse.Namespace) -> list[str]:
    """Gather URLs/ids from the command line and the optional URLs file.

    Positional ``args.url`` values come first (whitespace-stripped), then the
    non-empty, non-``#`` lines of ``args.urls_file`` when it is set.
    Duplicates are removed while keeping first-seen order.
    """
    candidates = [item.strip() for item in args.url]
    if args.urls_file:
        file_text = Path(args.urls_file).read_text(encoding="utf-8")
        stripped_lines = (entry.strip() for entry in file_text.splitlines())
        candidates.extend(
            entry for entry in stripped_lines if entry and not entry.startswith("#")
        )
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    return list(dict.fromkeys(candidates))
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
def default_paths() -> tuple[Path, Path]:
    """Return ``(data_dir, firecrawl_cwd)`` defaults derived from this file's location.

    The directory two levels above this file is treated as the repository
    root (the script is expected to live in ``<repo>/scripts/``);
    ``data_dir`` is ``<repo>/data/youtube-transcripts`` and
    ``firecrawl_cwd`` is the root itself.
    """
    script_dir = Path(__file__).resolve().parent
    repo_root = script_dir.parent
    transcripts_root = repo_root.joinpath("data", "youtube-transcripts")
    return transcripts_root, repo_root
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
def main() -> int:
    """Command-line entry point: index each requested video and update indexes.

    For every URL/id: resolve metadata with yt-dlp, derive the output paths
    ``<data-dir>/<channel>/<YYYY-MM-DD>/<id>_<slug>.txt`` (plus ``.meta.txt``),
    scrape the transcript with firecrawl when needed, write the files, and
    finally merge each channel's ``_index.tsv``. With ``--dry-run`` only the
    metadata is printed and nothing is written.

    Returns:
        ``0`` on completion. Fatal problems are raised as ``SystemExit`` by
        this function or the helpers it calls.
    """
    default_data, default_fc_cwd = default_paths()
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument(
        "url",
        nargs="*",
        help="YouTube watch URLs, youtu.be links, or 11-char video ids",
    )
    ap.add_argument(
        "--urls-file",
        metavar="PATH",
        help="Text file with one URL or id per line (# comments allowed)",
    )
    ap.add_argument(
        "--data-dir",
        type=Path,
        metavar="DIR",
        default=default_data,
        help=f"Root for channel folders (default: {default_data})",
    )
    ap.add_argument(
        "--firecrawl-cwd",
        type=Path,
        metavar="DIR",
        default=default_fc_cwd,
        help="Working directory for firecrawl subprocess (default: repo root next to scripts/)",
    )
    ap.add_argument(
        "--yt-dlp",
        metavar="BIN",
        default="yt-dlp",
        help="yt-dlp executable name or path (default: yt-dlp)",
    )
    ap.add_argument(
        "--firecrawl",
        metavar="BIN",
        default="",
        help="firecrawl executable (default: search PATH)",
    )
    ap.add_argument(
        "--wait-for",
        type=int,
        default=20000,
        metavar="MS",
        help="Firecrawl scrape --wait-for milliseconds (default 20000)",
    )
    ap.add_argument(
        "--sleep",
        type=float,
        default=SLEEP_SEC,
        metavar="SEC",
        help=f"Seconds between Firecrawl scrapes (default {SLEEP_SEC})",
    )
    ap.add_argument(
        "--dry-run",
        action="store_true",
        help="Print yt-dlp metadata only; do not scrape or write files",
    )
    ap.add_argument(
        "--force",
        action="store_true",
        help="Re-scrape even when a non-placeholder transcript already exists",
    )
    args = ap.parse_args()
    urls = collect_urls(args)
    if not urls:
        ap.error("Pass at least one url, or use --urls-file")

    # An explicit --firecrawl wins; otherwise look the CLI up on PATH.
    # A missing CLI is only fatal when we will actually scrape.
    fc_bin = args.firecrawl.strip() or shutil.which("firecrawl")
    if not fc_bin and not args.dry_run:
        raise SystemExit(
            "firecrawl CLI not found on PATH. Install it and run `firecrawl --status`, "
            "or pass --firecrawl /path/to/firecrawl."
        )

    data_dir: Path = args.data_dir
    fc_cwd: Path = args.firecrawl_cwd

    # channel slug -> {video id -> (upload_date, title)}
    index_rows: dict[str, dict[str, tuple[str, str]]] = {}
    # Used to skip the inter-scrape sleep before the very first scrape.
    first_scrape = True

    for raw in urls:
        vid_guess = video_id_from_arg(raw)
        watch = f"https://www.youtube.com/watch?v={vid_guess}"
        vid, udate, title, uploader_id = yt_dlp_row(watch, yt_dlp_bin=args.yt_dlp)
        ch_slug = channel_dir_from_handle(uploader_id)
        day = ymd(udate)
        out_base = data_dir / ch_slug
        day_dir = out_base / day
        base = f"{vid}_{slug(title)}"
        path = day_dir / f"{base}.txt"
        meta_path = day_dir / f"{base}.meta.txt"

        if args.dry_run:
            print(f"{ch_slug}\t{day}\t{vid}\t{udate}\t{title}", flush=True)
            bucket = index_rows.setdefault(ch_slug, {})
            bucket[vid] = (udate, title)
            continue

        day_dir.mkdir(parents=True, exist_ok=True)
        need = args.force or needs_transcript(path)
        text: str | None
        if need:
            # Pause between consecutive scrapes, but not before the first one.
            if not first_scrape:
                time.sleep(max(0.0, args.sleep))
            first_scrape = False
            print(f"scrape {ch_slug} {day} {vid} …", flush=True)
            # Guaranteed by the check above: we are not in dry-run here.
            assert fc_bin is not None
            text = fetch_transcript_firecrawl(
                vid,
                firecrawl_bin=fc_bin,
                firecrawl_cwd=fc_cwd,
                wait_ms=args.wait_for,
            )
        else:
            print(f"skip {ch_slug} {day} {vid} (existing transcript)", flush=True)
            text = None

        # The meta file is rewritten on every non-dry run, even when the
        # transcript itself is skipped.
        ch_meta = uploader_id if uploader_id.startswith("@") else f"@{uploader_id}"
        meta = (
            f"video_id: {vid}\n"
            f"upload_date: {udate}\n"
            f"title: {title}\n"
            f"url: https://www.youtube.com/watch?v={vid}\n"
            f"transcript_source: firecrawl\n"
            f"channel: {ch_meta}\n"
        )
        meta_path.write_text(meta, encoding="utf-8")
        if need:
            if text is None:
                # Placeholder text; needs_transcript() recognises its
                # "(no transcript" prefix so a later run retries the scrape.
                path.write_text(
                    "(no transcript yet: Firecrawl scrape had no ## Transcript section or empty body. "
                    "Retry later or open the watch URL in a browser.)\n",
                    encoding="utf-8",
                )
                print(" -> no transcript", flush=True)
            else:
                path.write_text(text, encoding="utf-8")
                print(f" -> ok ({len(text)} chars)", flush=True)

        bucket = index_rows.setdefault(ch_slug, {})
        bucket[vid] = (udate, title)

    if not args.dry_run:
        for ch_slug, rows in index_rows.items():
            idx = data_dir / ch_slug / "_index.tsv"
            # Pass a copy: merge_index adds on-disk entries to the dict it receives.
            merge_index(idx, dict(rows))
            print(f"wrote {idx}", flush=True)

    return 0
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
# Script entry point: exit with the integer returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|