yt-briefing 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Usage: yt-briefing transcribe VIDEO_ID_OR_URL [--lang pl|en|auto] (from source: bun src/yt-transcript.ts VIDEO_ID_OR_URL [--lang pl|en|auto])
4
+ * Accepts: bare 11-char video ID, youtube.com/watch?v=..., or youtu.be/... URL.
5
+ * Output: plain text transcript on stdout
6
+ * Exit 1 if the video genuinely has no subtitles — caller should skip this video.
7
+ * Exit 2 if YouTube rate-limited / blocked the IP (caller may retry later).
8
+ * Exit 3 on a tooling/integration failure (yt-dlp missing, spawn error, fetch
9
+ * error, unavailable/private video, empty/unparseable track, bad input). This
10
+ * is NOT a missing-captions case — stderr carries the real cause, so the caller
11
+ * must surface it verbatim and never report it as "no subtitles".
12
+ * Uses yt-dlp for subtitle extraction (handles all YouTube caption formats).
13
+ *
14
+ * yt-dlp lookup (so a project-local install needs no global PATH pollution):
15
+ * 1. $YT_DLP_PATH if set
16
+ * 2. <package>/bin/yt-dlp (yt-dlp.exe on Windows) if present
17
+ * 3. `yt-dlp` on PATH (yt-dlp.exe on Windows)
18
+ * See README.md → Requirements for per-OS install methods.
19
+ *
20
+ * YT_PROXY env (optional): HTTP proxy URL — required on datacenter/VPS IPs which
21
+ * YouTube blocks. Example: YT_PROXY=http://127.0.0.1:1080 (Cloudflare WARP via
22
+ * Docker). See docs/warp-proxy.md.
23
+ */
24
+ import { spawnSync } from 'node:child_process';
25
+ import { mkdtempSync, mkdirSync, readdirSync, readFileSync, rmSync, existsSync } from 'node:fs';
26
+ import { join, dirname } from 'node:path';
27
+ import { fileURLToPath } from 'node:url';
28
+ import dotenv from 'dotenv';
29
+ import { CACHE_DIR, ENV_PATH } from "./lib/paths.js";
30
+ // Load .env so YT_PROXY is set when run standalone under Node (Bun auto-loads it; Node doesn't).
31
+ // When spawned by yt-sweep, the parent already loaded it and the child inherits the env.
32
+ dotenv.config({ path: ENV_PATH });
33
/**
 * Pick the yt-dlp executable to spawn.
 *
 * Lookup order: `$YT_DLP_PATH` when set, then a `bin/` copy one directory above this
 * module, and finally the bare executable name so the OS resolves it via PATH.
 * On Windows the executable name carries the `.exe` suffix.
 *
 * @returns {string} An explicit path, or the bare command name as the PATH fallback.
 */
function resolveYtDlp() {
    const override = process.env.YT_DLP_PATH;
    if (override) {
        return override;
    }
    const binaryName = process.platform === 'win32' ? 'yt-dlp.exe' : 'yt-dlp';
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    const bundled = join(moduleDir, '..', 'bin', binaryName);
    if (existsSync(bundled)) {
        return bundled;
    }
    // Bare name: left for the OS to look up on PATH at spawn time.
    return binaryName;
}
41
const YT_DLP = resolveYtDlp();

// CLI arguments: one positional VIDEO_ID_OR_URL plus an optional `--lang <code>` pair.
// The pair may come before or after the positional, so the positional is the first
// argument that is neither the flag nor the flag's value.
const USAGE = 'Usage: yt-briefing transcribe VIDEO_ID_OR_URL [--lang pl|en|auto]';
const args = process.argv.slice(2);
const langIdx = args.indexOf('--lang');
const langValue = langIdx !== -1 ? args[langIdx + 1] : undefined;
if (langIdx !== -1 && !langValue) {
    // `--lang` given as the last argument: without this guard the language would be
    // `undefined` and yt-dlp would be asked for a track literally named "undefined".
    console.error(`--lang requires a value\n${USAGE}`);
    process.exit(3);
}
const preferredLang = langValue ?? 'auto';
const rawInput = args.find((_, i) => langIdx === -1 || (i !== langIdx && i !== langIdx + 1));
if (!rawInput) {
    console.error(USAGE);
    process.exit(3);
}
50
/**
 * Extract the 11-character YouTube video ID from user input.
 *
 * Accepted forms (checked in this order):
 *   1. a URL carrying a `v=` query parameter (`youtube.com/watch?v=ID`),
 *   2. a `youtu.be/ID`, `/shorts/ID`, `/embed/ID` or `/live/ID` URL,
 *   3. a bare 11-character ID.
 *
 * In URLs the ID must end after exactly 11 characters. A longer run of ID characters
 * is rejected instead of being silently truncated to a different video.
 *
 * On unrecognised input this prints an error to stderr and terminates the process
 * with exit code 3 (bad input), so it only ever returns a valid ID.
 *
 * @param {string} input Raw CLI argument; surrounding whitespace is ignored.
 * @returns {string} The 11-character video ID.
 */
function extractVideoId(input) {
    const candidate = input.trim();
    const patterns = [
        /[?&]v=([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])/,
        /(?:youtu\.be\/|\/(?:shorts|embed|live)\/)([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])/,
        /^([A-Za-z0-9_-]{11})$/,
    ];
    for (const pattern of patterns) {
        const match = candidate.match(pattern);
        if (match) {
            return match[1];
        }
    }
    console.error(`Invalid VIDEO_ID or YouTube URL: ${input}`);
    process.exit(3);
}
63
+ const videoId = extractVideoId(rawInput); // 11-char video ID; extractVideoId exits with code 3 on unrecognised input
64
+ const proxyUrl = process.env.YT_PROXY; // optional proxy URL; when set it is passed to yt-dlp via --proxy
65
/**
 * Flatten a WebVTT subtitle document into a single line of plain text.
 *
 * Drops blank lines, the `WEBVTT` / `Kind:` / `Language:` header lines and cue timing
 * lines (anything containing `-->`). Each remaining line has inline tags such as
 * `<c>` or `<00:00:01.000>` stripped and its WebVTT character escapes decoded.
 * A line identical to the previously kept one is skipped, which collapses repeated
 * caption text across consecutive cues.
 *
 * @param {string} vtt Raw WebVTT file contents.
 * @returns {string} Caption text joined with single spaces; `''` when nothing remains.
 */
function vttToText(vtt) {
    // The escapes WebVTT cue text defines. `&nbsp;` becomes a regular space because the
    // output is flattened plain text.
    const ENTITIES = { amp: '&', lt: '<', gt: '>', nbsp: ' ', lrm: '\u200E', rlm: '\u200F' };
    const parts = [];
    let prev = '';
    for (const rawLine of vtt.split('\n')) {
        const line = rawLine.trim(); // also removes the '\r' of CRLF files
        if (!line || line.startsWith('WEBVTT') || line.startsWith('Kind:') || line.startsWith('Language:') || line.includes('-->'))
            continue;
        const clean = line
            // Strip tags first, so a decoded "<" is never mistaken for markup.
            .replace(/<[^>]+>/g, '')
            // Single pass: "&amp;lt;" decodes to "&lt;", not to "<".
            .replace(/&(amp|lt|gt|nbsp|lrm|rlm);/g, (_, name) => ENTITIES[name])
            .trim();
        if (clean && clean !== prev) {
            parts.push(clean);
            prev = clean;
        }
    }
    return parts.join(' ');
}
80
// Keep yt-dlp's scratch subtitles inside the gitignored data cache — never OS /tmp, so all
// of the tool's transient files stay in-repo (DATA_DIR/.cache). Cleaned up below.
let tmpDir;
try {
    mkdirSync(CACHE_DIR, { recursive: true });
    tmpDir = mkdtempSync(join(CACHE_DIR, 'sub-'));
}
catch (err) {
    // An uncaught exception would end the process with exit code 1, which this tool
    // reserves for "video has no subtitles". Report the tooling failure explicitly.
    console.error(`could not create scratch directory under ${CACHE_DIR}: ${err.message}`);
    process.exit(3);
}
const outTemplate = join(tmpDir, 'sub');
const subLangs = preferredLang === 'auto' ? 'pl,en,en-orig,en.*' : `${preferredLang},en,pl`;
const ytdlpArgs = [
    '--no-playlist',
    '--skip-download',
    '--write-subs',
    '--write-auto-subs',
    '--sub-langs', subLangs,
    '--sub-format', 'vtt',
    '--ignore-errors', // continue if one language track fails (e.g. 429 on pl for en-only video)
    '--no-warnings',
    '--quiet',
    '-o', outTemplate,
];
if (proxyUrl)
    ytdlpArgs.push('--proxy', proxyUrl);
ytdlpArgs.push(`https://www.youtube.com/watch?v=${videoId}`);
const result = spawnSync(YT_DLP, ytdlpArgs, { encoding: 'utf8', timeout: 60_000 });
// Best-effort removal of the scratch directory; a failure here must never change the
// exit code, so the error is deliberately ignored.
const cleanup = () => {
    try {
        rmSync(tmpDir, { recursive: true });
    }
    catch { /* best-effort */ }
};
if (result.error) {
    cleanup();
    const msg = result.error.code === 'ENOENT'
        ? `yt-dlp not found (looked for "${YT_DLP}") — install it or set YT_DLP_PATH (see README.md → Requirements)`
        : `yt-dlp spawn error: ${result.error.message}`;
    console.error(msg);
    process.exit(3);
}
// Scan stderr for keywords with the video ID removed: IDs are arbitrary [A-Za-z0-9_-]
// strings, so an ID that happens to contain "429" or "error" must not be read as a
// rate limit or a failure when it is echoed in the output.
const lower = (result.stderr ?? '').toLowerCase().replaceAll(videoId.toLowerCase(), '');
const isRateLimited = lower.includes('sign in') || lower.includes('429') || lower.includes('too many') || lower.includes('captcha');
// Check VTT files first — with --ignore-errors some languages may succeed even if others 429
let vttNames;
try {
    vttNames = readdirSync(tmpDir).filter(name => name.endsWith('.vtt'));
}
catch (err) {
    // Not being able to list the scratch directory is a tooling failure, not evidence
    // that the video lacks captions.
    cleanup();
    console.error(`could not read scratch directory ${tmpDir}: ${err.message}`);
    process.exit(3);
}
if (vttNames.length === 0) {
    cleanup();
    if (isRateLimited) {
        console.error(`rate limited by YouTube (IP blocked/captcha): ${videoId}`);
        process.exit(2);
    }
    // Zero subtitle files is ambiguous: the video may genuinely have no captions,
    // OR yt-dlp/the fetch failed (unavailable/private video, extractor bug, network).
    // A clean run (exit 0, no ERROR line) means there really are no captions → exit 1.
    // Anything else is a tooling/integration failure → exit 3 with the real stderr,
    // so the caller never mislabels a broken fetch as "no subtitles".
    const ytdlpFailed = result.status !== 0 || lower.includes('error');
    if (ytdlpFailed) {
        console.error(`could not fetch subtitles for ${videoId} (fetch/tooling error, NOT a missing-captions case):\n` +
            `${(result.stderr ?? '').trim() || `yt-dlp exited with code ${result.status}`}`);
        process.exit(3);
    }
    console.error(`no subtitles published for video: ${videoId}`);
    process.exit(1);
}
// Default to the first listed track; an explicit --lang wins when its track exists.
// The match runs on the file name only, so a ".<lang>." fragment in the cache path
// cannot produce a false hit.
// NOTE(review): in 'auto' mode the pick depends on directory listing order — confirm
// whether a fixed language preference is intended.
let vttName = vttNames[0];
if (preferredLang !== 'auto') {
    const exact = vttNames.find(name => name.includes(`.${preferredLang}.`));
    if (exact)
        vttName = exact;
}
let text;
try {
    text = vttToText(readFileSync(join(tmpDir, vttName), 'utf8'));
}
catch (err) {
    // Same reasoning as above: exit 3, never the "no subtitles" code 1 of an uncaught throw.
    cleanup();
    console.error(`could not read subtitle file for ${videoId}: ${err.message}`);
    process.exit(3);
}
cleanup();
if (!text.trim()) {
    // A subtitle file existed but parsed to nothing — a format/integration issue,
    // not a genuinely captionless video. Don't report it as "no subtitles".
    console.error(`subtitle track for ${videoId} parsed empty (format/integration issue)`);
    process.exit(3);
}
console.log(text);
@@ -0,0 +1,127 @@
1
+ # Sync across machines (multi-device)
2
+
3
+ yt-briefing keeps **all your state as plain files** under the data dir — `channels.md`,
4
+ `state.md` (the per-video cursor), and one Markdown `channels/<slug>.md` profile per
5
+ channel (where your ratings accumulate). The engine itself **never touches git** — it just
6
+ reads and writes these files. So syncing across machines (laptop + a VPS / remote session,
7
+ say) is entirely up to you, and it's just *"version the data dir and commit after each
8
+ rating."*
9
+
10
+ This guide gives you a **sync-safe** wiring — one where a rating made on machine A can't be
11
+ silently lost when machine B pushed first.
12
+
13
+ > **Secrets never sync.** `.env` (your API keys) is git-ignored and stays per-machine. Only
14
+ > the data dir is versioned. The throwaway `.cache/` inside it is ignored too.
15
+
16
+ ---
17
+
18
+ ## 1. Version your state
19
+
20
+ Your state already lives in plain files at **`<your project>/.yt-briefing/data/`** — right
21
+ inside the project you ran `init` from. So the simplest sync is to **version that folder** in
22
+ your project's own git: push from machine A, pull on machine B. Keep `.yt-briefing/.env`
23
+ git-ignored — secrets stay per machine.
24
+
25
+ Want briefing state in **its own** repo instead (e.g. a laptop and a headless VPS that share
26
+ nothing else)? Point `YT_DATA_DIR` at a folder you control and version that:
27
+
28
+ ```bash
29
+ # .env (per machine — secrets never sync)
30
+ YT_DATA_DIR=/home/you/yt-briefing-data
31
+ ```
32
+
33
+ ```bash
34
+ cd /home/you/yt-briefing-data
35
+ git init -b main && git remote add origin git@github.com:you/yt-briefing-data.git # a PRIVATE repo; -b main (Git 2.28+) matches the push below
36
+ printf '.cache/\n' > .gitignore # never commit the throwaway session cache
37
+ npx yt-briefing init # onboard into this folder (or move existing data here)
38
+ git add -A && git commit -m "initial" && git push -u origin main
39
+ ```
40
+
41
+ On the second machine: clone that repo, set the same `YT_DATA_DIR`, drop in your `.env`.
42
+
43
+ ---
44
+
45
+ ## 2. Make profiles merge instead of conflict
46
+
47
+ Ratings **append** to a profile's `## Skip titles` / `## Notes`. If two machines rate
48
+ different videos before syncing, a normal merge would conflict on the same section. A
49
+ **union merge** keeps both sides' lines instead. Drop this in your data repo as
50
+ `.gitattributes`:
51
+
52
+ ```gitattributes
53
+ # Profiles are append-only logs — keep both machines' additions on merge.
54
+ channels/*.md merge=union
55
+ ```
56
+
57
+ `state.md` is deliberately **left out** (it's a table, one row per channel — a union merge
58
+ would duplicate or garble rows). A genuine same-channel collision should surface as a conflict
59
+ rather than merge incorrectly (the sync script in section 3 aborts the rebase and tells you).
60
+
61
+ ---
62
+
63
+ ## 3. Commit + push after every rating (sync-safe)
64
+
65
+ Wire this so each rating lands on `origin`. It commits the cursor + profiles, and — if the
66
+ other machine pushed first — **rebases and retries** instead of silently dropping your work.
67
+
68
+ Save as `yt-sync.sh` (anywhere), `chmod +x`:
69
+
70
+ ```bash
71
+ #!/usr/bin/env bash
72
+ # Persist yt-briefing state to git, sync-safe across machines. Best-effort, never blocks.
73
+ set -uo pipefail
74
+ DATA="${YT_DATA_DIR:-$PWD/.yt-briefing/data}" # the folder you version (default: in-project)
75
+ cd "$DATA" || exit 0
76
+
77
+ git add channels.md state.md channels/ config.json 2>/dev/null || exit 0
78
+ git diff --cached --quiet 2>/dev/null && exit 0 # nothing changed → no commit
79
+ git commit -q -m "yt-briefing: rating $(date '+%Y-%m-%d %H:%M %Z')" 2>/dev/null || exit 0
80
+
81
+ git push -q 2>/dev/null && exit 0 # landed → done
82
+ # origin moved (the other machine pushed) → integrate + retry, never swallow:
83
+ if git pull --rebase -q 2>/dev/null; then
84
+ git push -q 2>/dev/null && exit 0
85
+ else
86
+ git rebase --abort 2>/dev/null || true # same-channel collision → leave it for you
87
+ fi
88
+ echo "yt-briefing: state NOT pushed — resolve: cd \"$DATA\" && git pull --rebase && git push" >&2
89
+ exit 0
90
+ ```
91
+
92
+ Then trigger it after each rating. Two ways, depending on how you run yt-briefing:
93
+
94
+ **A) Via a coding agent (Claude Code / Cursor)** — add a `PostToolUse` hook that fires after
95
+ the engine runs. In Claude Code's `settings.json`:
96
+
97
+ ```json
98
+ {
99
+ "hooks": {
100
+ "PostToolUse": [
101
+ { "matcher": "Bash",
102
+ "hooks": [ { "type": "command", "command": "cmd=$(jq -r '.tool_input.command // \"\"'); case \"$cmd\" in *yt-rating*|*yt-sweep*) /path/to/yt-sync.sh ;; esac" } ] }
103
+ ]
104
+ }
105
+ }
106
+ ```
107
+
108
+ **B) Via the CLI** — just call it after `rate`:
109
+
110
+ ```bash
111
+ yt-briefing rate --rating 0 && /path/to/yt-sync.sh
112
+ ```
113
+
114
+ > Optional but recommended: also `git pull --rebase` **before** the first sweep of a session
115
+ > (so a machine starts on the latest cursor), e.g. a `PreToolUse` hook matching
116
+ > `*yt-sweep*--reset*`, or just `cd "$YT_DATA_DIR" && git pull --rebase` before you start.
117
+
118
+ ---
119
+
120
+ ## Why "sync-safe" (the footgun this avoids)
121
+
122
+ The naive version — `git push || true` — **silently swallows** a rejected push when the
123
+ other machine pushed first. The rating stays local-only while you believe it's saved; later
124
+ a careless conflict resolution drops it. The script above instead: per-rating commit →
125
+ push → on rejection `pull --rebase` + retry → on a real conflict, **abort and tell you**.
126
+ With profiles union-merging, the only thing that ever needs your hand is two machines rating
127
+ the *same channel* before syncing — and even then it surfaces loudly, never vanishes.
@@ -0,0 +1,81 @@
1
+ # Making YouTube transcripts work on a VPS (the datacenter-IP block)
2
+
3
+ If you run yt-briefing on your laptop, you can skip this — residential IPs fetch
4
+ transcripts fine, no proxy needed. This is for when you move it to a server.
5
+
6
+ ## The problem
7
+
8
+ YouTube **structurally blocks datacenter IPs** — OVH, AWS, GCP, Azure, Scaleway, the lot.
9
+ From the very first request you get `429` or *"Sign in to confirm you're not a bot."* It
10
+ is not a temporary rate-limit you can wait out; whole cloud CIDR ranges are blocked. On a
11
+ VPS without a proxy, every sweep returns `rate_limited` for every video → zero transcripts.
12
+
13
+ ## The fix: Cloudflare WARP as a proxy
14
+
15
+ Route the transcript fetch through Cloudflare WARP. Your traffic then leaves from a Cloudflare
16
+ IP — part of a global CDN network, not a cloud-hosting range — which YouTube does not block. It's free and
17
+ there are no credentials to babysit (the WARP registration lives in a Docker volume).
18
+
19
+ ### One-time setup
20
+
21
+ ```bash
22
+ docker run -d \
23
+ --name warp \
24
+ --restart unless-stopped \
25
+ --device-cgroup-rule 'c 10:200 rwm' \
26
+ --cap-add MKNOD --cap-add AUDIT_WRITE --cap-add NET_ADMIN \
27
+ --sysctl net.ipv6.conf.all.disable_ipv6=0 \
28
+ --sysctl net.ipv4.conf.all.src_valid_mark=1 \
29
+ -p 127.0.0.1:1080:1080 \
30
+ -v warp-data:/var/lib/cloudflare-warp \
31
+ caomingjun/warp
32
+ ```
33
+
34
+ The `caomingjun/warp` image exposes both HTTP and SOCKS5 on port 1080 — yt-dlp handles
35
+ either; we use HTTP.
36
+
37
+ ### Point yt-briefing at it
38
+
39
+ ```bash
40
+ # .env
41
+ YT_PROXY=http://127.0.0.1:1080
42
+ ```
43
+
44
+ yt-briefing reads `YT_PROXY` and routes all yt-dlp traffic through it. No env var → direct
45
+ fetch (the residential default).
46
+
47
+ ### Health check
48
+
49
+ ```bash
50
+ curl --proxy http://127.0.0.1:1080 https://www.cloudflare.com/cdn-cgi/trace | grep warp=
51
+ # expect: warp=on
52
+ ```
53
+
54
+ ## Recovery
55
+
56
+ If the container stops, transcripts return `rate_limited` again:
57
+
58
+ ```bash
59
+ docker start warp
60
+ ```
61
+
62
+ ### "It worked all day, then suddenly rate_limited"
63
+
64
+ YouTube blocked the specific Cloudflare egress IP WARP happened to use (the block is
65
+ per-IP, not per-account). Force a re-registration to get a fresh egress IP:
66
+
67
+ ```bash
68
+ # 1. See the current egress IP and whether YouTube blocks it:
69
+ curl -x http://127.0.0.1:1080 -s https://api.ipify.org
70
+ curl -x http://127.0.0.1:1080 -s -o /dev/null -w "%{http_code}\n" "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
71
+ # 200 = OK, 429 = this IP is blocked
72
+
73
+ # 2. Re-register → new egress IP:
74
+ docker exec warp warp-cli registration delete 2>/dev/null; true
75
+ docker exec warp warp-cli registration new
76
+ docker exec warp warp-cli connect
77
+ sleep 5
78
+ curl -x http://127.0.0.1:1080 -s https://api.ipify.org # should differ
79
+ ```
80
+
81
+ Then re-run the sweep.
package/package.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "name": "yt-briefing",
3
+ "version": "0.1.0",
4
+ "description": "A self-learning YouTube briefing engine: it sweeps the channels you follow, filters noise in two stages (title, then transcript), summarizes the rest in your language, and adapts to your ratings — one video at a time.",
5
+ "type": "module",
6
+ "bin": {
7
+ "yt-briefing": "dist/cli.js"
8
+ },
9
+ "files": [
10
+ "dist",
11
+ "data.example",
12
+ ".claude/skills",
13
+ "docs",
14
+ "README.md",
15
+ "LICENSE"
16
+ ],
17
+ "scripts": {
18
+ "build": "tsc -p tsconfig.build.json",
19
+ "prepare": "tsc -p tsconfig.build.json",
20
+ "init": "bun run src/bootstrap.ts",
21
+ "install-skill": "bun run src/install-skill.ts",
22
+ "sweep": "bun run src/yt-sweep.ts",
23
+ "rate": "bun run src/yt-rating.ts",
24
+ "transcribe": "bun run src/yt-transcript.ts",
25
+ "typecheck": "tsc --noEmit"
26
+ },
27
+ "engines": {
28
+ "node": ">=18",
29
+ "bun": ">=1.0.0"
30
+ },
31
+ "keywords": [
32
+ "youtube",
33
+ "transcript",
34
+ "summary",
35
+ "briefing",
36
+ "llm",
37
+ "openai-compatible",
38
+ "agent",
39
+ "claude"
40
+ ],
41
+ "license": "MIT",
42
+ "repository": {
43
+ "type": "git",
44
+ "url": "git+https://github.com/michal90r/yt-briefing.git"
45
+ },
46
+ "homepage": "https://github.com/michal90r/yt-briefing#readme",
47
+ "bugs": "https://github.com/michal90r/yt-briefing/issues",
48
+ "dependencies": {
49
+ "dotenv": "^16.6.1"
50
+ },
51
+ "devDependencies": {
52
+ "@types/node": "^22",
53
+ "typescript": "^5.7",
54
+ "@types/bun": "latest"
55
+ }
56
+ }