npm - yt-briefing - Versions diffs - 0.1.0 - Mend

yt-briefing 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/.claude/skills/yt/SKILL.md +54 -0
package/LICENSE +21 -0
package/README.md +106 -0
package/data.example/.gitattributes +7 -0
package/data.example/README.md +19 -0
package/data.example/channels/_template.md +45 -0
package/dist/bootstrap.js +243 -0
package/dist/cli.js +29 -0
package/dist/install-skill.js +51 -0
package/dist/lib/config.js +23 -0
package/dist/lib/llm.js +57 -0
package/dist/lib/paths.js +56 -0
package/dist/lib/prompt.js +39 -0
package/dist/lib/skill-install.js +66 -0
package/dist/lib/yt-api.js +122 -0
package/dist/lib/yt-lib.js +157 -0
package/dist/yt-channel-pending.js +85 -0
package/dist/yt-channel-videos.js +43 -0
package/dist/yt-rating.js +110 -0
package/dist/yt-sweep.js +546 -0
package/dist/yt-transcript.js +156 -0
package/docs/sync-across-machines.md +127 -0
package/docs/warp-proxy.md +81 -0
package/package.json +56 -0

package/dist/yt-transcript.js ADDED Viewed

@@ -0,0 +1,156 @@
+#!/usr/bin/env node
+/**
+ * Usage: bun src/yt-transcript.ts VIDEO_ID_OR_URL [--lang pl|en|auto]
+ * Accepts: bare 11-char video ID, youtube.com/watch?v=..., or youtu.be/... URL.
+ * Output: plain text transcript on stdout
+ * Exit 1 if the video genuinely has no subtitles — caller should skip this video.
+ * Exit 2 if YouTube rate-limited / blocked the IP (caller may retry later).
+ * Exit 3 on a tooling/integration failure (yt-dlp missing, spawn error, fetch
+ *   error, unavailable/private video, empty/unparseable track, bad input). This
+ *   is NOT a missing-captions case — stderr carries the real cause, so the caller
+ *   must surface it verbatim and never report it as "no subtitles".
+ * Uses yt-dlp for subtitle extraction (handles all YouTube caption formats).
+ *
+ * yt-dlp lookup (so a project-local install needs no global PATH pollution):
+ *   1. $YT_DLP_PATH if set
+ *   2. <package>/bin/yt-dlp  (yt-dlp.exe on Windows) if present
+ *   3. `yt-dlp` on PATH       (yt-dlp.exe on Windows)
+ * See README.md → Requirements for per-OS install methods.
+ *
+ * YT_PROXY env (optional): HTTP proxy URL — required on datacenter/VPS IPs which
+ *   YouTube blocks. Example: YT_PROXY=http://127.0.0.1:1080 (Cloudflare WARP via
+ *   Docker). See docs/warp-proxy.md.
+ */
+import { spawnSync } from 'node:child_process';
+import { mkdtempSync, mkdirSync, readdirSync, readFileSync, rmSync, existsSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import dotenv from 'dotenv';
+import { CACHE_DIR, ENV_PATH } from "./lib/paths.js";
+// Load .env so YT_PROXY is set when run standalone under Node (Bun auto-loads it; Node doesn't).
+// When spawned by yt-sweep, the parent already loaded it and the child inherits the env.
+dotenv.config({ path: ENV_PATH });
+/** Resolve the yt-dlp binary: explicit env → project-local ./bin → PATH (Windows-aware). */
+function resolveYtDlp() {
+    if (process.env.YT_DLP_PATH)
+        return process.env.YT_DLP_PATH;
+    const exe = process.platform === 'win32' ? 'yt-dlp.exe' : 'yt-dlp';
+    const local = join(dirname(fileURLToPath(import.meta.url)), '..', 'bin', exe);
+    return existsSync(local) ? local : exe; // bare name → looked up on PATH
+}
+const YT_DLP = resolveYtDlp();
+const args = process.argv.slice(2);
+const rawInput = args[0];
+const langIdx = args.indexOf('--lang');
+const preferredLang = langIdx !== -1 ? args[langIdx + 1] : 'auto';
+if (!rawInput) {
+    console.error('Usage: yt-briefing transcribe VIDEO_ID_OR_URL [--lang pl|en|auto]');
+    process.exit(3);
+}
+function extractVideoId(input) {
+    const t = input.trim();
+    const watch = t.match(/[?&]v=([A-Za-z0-9_-]{11})/);
+    if (watch)
+        return watch[1];
+    const short = t.match(/youtu\.be\/([A-Za-z0-9_-]{11})/);
+    if (short)
+        return short[1];
+    if (/^[A-Za-z0-9_-]{11}$/.test(t))
+        return t;
+    console.error(`Invalid VIDEO_ID or YouTube URL: ${input}`);
+    process.exit(3);
+}
+// Normalised 11-character video ID. extractVideoId() exits the process with code 3 on
+// invalid input, so past this line the ID is always usable.
+const videoId = extractVideoId(rawInput);
+// Optional proxy URL from the environment; when unset, yt-dlp is run without `--proxy`.
+const proxyUrl = process.env.YT_PROXY;
+function vttToText(vtt) {
+    let prev = '';
+    const parts = [];
+    for (const line of vtt.split('\n')) {
+        const l = line.trim();
+        if (!l || l.startsWith('WEBVTT') || l.startsWith('Kind:') || l.startsWith('Language:') || l.includes('-->'))
+            continue;
+        const clean = l.replace(/<[^>]+>/g, '').trim();
+        if (clean && clean !== prev) {
+            parts.push(clean);
+            prev = clean;
+        }
+    }
+    return parts.join(' ');
+}
+// Keep yt-dlp's scratch subtitles inside the gitignored data cache — never OS /tmp, so all
+// of the tool's transient files stay in-repo (DATA_DIR/.cache). Cleaned up below.
+mkdirSync(CACHE_DIR, { recursive: true });
+const tmpDir = mkdtempSync(join(CACHE_DIR, 'sub-'));
+const outTemplate = join(tmpDir, 'sub');
+const subLangs = preferredLang === 'auto' ? 'pl,en,en-orig,en.*' : `${preferredLang},en,pl`;
+const ytdlpArgs = [
+    '--no-playlist',
+    '--skip-download',
+    '--write-subs',
+    '--write-auto-subs',
+    '--sub-langs', subLangs,
+    '--sub-format', 'vtt',
+    '--ignore-errors', // continue if one language track fails (e.g. 429 on pl for en-only video)
+    '--no-warnings',
+    '--quiet',
+    '-o', outTemplate,
+];
+if (proxyUrl)
+    ytdlpArgs.push('--proxy', proxyUrl);
+ytdlpArgs.push(`https://www.youtube.com/watch?v=${videoId}`);
+const result = spawnSync(YT_DLP, ytdlpArgs, { encoding: 'utf8', timeout: 60_000 });
+const cleanup = () => { try {
+    rmSync(tmpDir, { recursive: true });
+}
+catch { } };
+if (result.error) {
+    cleanup();
+    const msg = result.error.code === 'ENOENT'
+        ? `yt-dlp not found (looked for "${YT_DLP}") — install it or set YT_DLP_PATH (see README.md → Requirements)`
+        : `yt-dlp spawn error: ${result.error.message}`;
+    console.error(msg);
+    process.exit(3);
+}
+const lower = (result.stderr ?? '').toLowerCase();
+const isRateLimited = lower.includes('sign in') || lower.includes('429') || lower.includes('too many') || lower.includes('captcha');
+// Check VTT files first — with --ignore-errors some languages may succeed even if others 429
+let vttFiles = [];
+try {
+    vttFiles = readdirSync(tmpDir).filter(f => f.endsWith('.vtt')).map(f => join(tmpDir, f));
+}
+catch { }
+if (vttFiles.length === 0) {
+    cleanup();
+    if (isRateLimited) {
+        console.error(`rate limited by YouTube (IP blocked/captcha): ${videoId}`);
+        process.exit(2);
+    }
+    // Zero subtitle files is ambiguous: the video may genuinely have no captions,
+    // OR yt-dlp/the fetch failed (unavailable/private video, extractor bug, network).
+    // A clean run (exit 0, no ERROR line) means there really are no captions → exit 1.
+    // Anything else is a tooling/integration failure → exit 3 with the real stderr,
+    // so the caller never mislabels a broken fetch as "no subtitles".
+    const ytdlpFailed = result.status !== 0 || lower.includes('error');
+    if (ytdlpFailed) {
+        console.error(`could not fetch subtitles for ${videoId} (fetch/tooling error, NOT a missing-captions case):\n` +
+            `${(result.stderr ?? '').trim() || `yt-dlp exited with code ${result.status}`}`);
+        process.exit(3);
+    }
+    console.error(`no subtitles published for video: ${videoId}`);
+    process.exit(1);
+}
+let vttFile = vttFiles[0];
+if (preferredLang !== 'auto') {
+    const exact = vttFiles.find(f => f.includes(`.${preferredLang}.`));
+    if (exact)
+        vttFile = exact;
+}
+const text = vttToText(readFileSync(vttFile, 'utf8'));
+cleanup();
+if (!text.trim()) {
+    // A subtitle file existed but parsed to nothing — a format/integration issue,
+    // not a genuinely captionless video. Don't report it as "no subtitles".
+    console.error(`subtitle track for ${videoId} parsed empty (format/integration issue)`);
+    process.exit(3);
+}
+console.log(text);

package/docs/sync-across-machines.md ADDED Viewed

@@ -0,0 +1,127 @@
+# Sync across machines (multi-device)
+yt-briefing keeps **all your state as plain files** under the data dir — `channels.md`,
+`state.md` (the per-video cursor), and one Markdown `channels/<slug>.md` profile per
+channel (where your ratings accumulate). The engine itself **never touches git** — it just
+reads and writes these files. So syncing across machines (laptop + a VPS / remote session,
+say) is entirely up to you, and it's just *"version the data dir and commit after each
+rating."*
+This guide gives you a **sync-safe** wiring — one where a rating made on machine A can't be
+silently lost when machine B pushed first.
+> **Secrets never sync.** `.env` (your API keys) is git-ignored and stays per-machine. Only
+> the data dir is versioned. The throwaway `.cache/` inside it is ignored too.
+---
+## 1. Version your state
+Your state already lives in plain files at **`<your project>/.yt-briefing/data/`** — right
+inside the project you ran `init` from. So the simplest sync is to **version that folder** in
+your project's own git: push from machine A, pull on machine B. Keep `.yt-briefing/.env`
+git-ignored — secrets stay per machine.
+Want briefing state in **its own** repo instead (e.g. a laptop and a headless VPS that share
+nothing else)? Point `YT_DATA_DIR` at a folder you control and version that:
+```bash
+# .env  (per machine — secrets never sync)
+YT_DATA_DIR=/home/you/yt-briefing-data
+```
+```bash
+cd /home/you/yt-briefing-data
+git init && git remote add origin git@github.com:you/yt-briefing-data.git   # a PRIVATE repo
+printf '.cache/\n' > .gitignore          # never commit the throwaway session cache
+npx yt-briefing init                     # onboard into this folder (or move existing data here)
+git add -A && git commit -m "initial" && git push -u origin main
+```
+On the second machine: clone that repo, set the same `YT_DATA_DIR`, drop in your `.env`.
+---
+## 2. Make profiles merge instead of conflict
+Ratings **append** to a profile's `## Skip titles` / `## Notes`. If two machines rate
+different videos before syncing, a normal merge would conflict on the same section. A
+**union merge** keeps both sides' lines instead. Drop this in your data repo as
+`.gitattributes`:
+```gitattributes
+# Profiles are append-only logs — keep both machines' additions on merge.
+channels/*.md merge=union
+```
+`state.md` is deliberately **left out** (it's a table, one row per channel — union would
+duplicate/garble rows). A genuine same-channel collision should surface as a conflict rather
+than merge incorrectly (see the sync script in section 3 — it aborts the rebase and tells you).
+---
+## 3. Commit + push after every rating (sync-safe)
+Wire this so each rating lands on `origin`. It commits the cursor + profiles, and — if the
+other machine pushed first — **rebases and retries** instead of silently dropping your work.
+Save as `yt-sync.sh` (anywhere), `chmod +x`:
+```bash
+#!/usr/bin/env bash
+# Persist yt-briefing state to git, sync-safe across machines. Best-effort, never blocks.
+set -uo pipefail
+DATA="${YT_DATA_DIR:-$PWD/.yt-briefing/data}"   # the folder you version (default: in-project)
+cd "$DATA" || exit 0
+# Stage each state path on its own. A single `git add a b c` aborts and stages NOTHING when
+# any one pathspec matches no file (e.g. no config.json yet), which would silently skip
+# every sync. Per-path failures are ignored; tracked files that were deleted still stage.
+for path in channels.md state.md channels/ config.json; do
+  git add -- "$path" 2>/dev/null || true
+done
+git diff --cached --quiet 2>/dev/null && exit 0        # nothing changed → no commit
+git commit -q -m "yt-briefing: rating $(date '+%Y-%m-%d %H:%M %Z')" 2>/dev/null || exit 0
+git push -q 2>/dev/null && exit 0                       # landed → done
+# origin moved (the other machine pushed) → integrate + retry, never swallow:
+if git pull --rebase -q 2>/dev/null; then
+  git push -q 2>/dev/null && exit 0
+else
+  git rebase --abort 2>/dev/null || true               # same-channel collision → leave it for you
+fi
+echo "yt-briefing: state NOT pushed — resolve: cd \"$DATA\" && git pull --rebase && git push" >&2
+exit 0
+```
+Then trigger it after each rating. Two ways, depending on how you run yt-briefing:
+**A) Via a coding agent (Claude Code / Cursor)** — add a `PostToolUse` hook that fires after
+the engine runs. In Claude Code's `settings.json`:
+```json
+{
+  "hooks": {
+    "PostToolUse": [
+      { "matcher": "Bash",
+        "hooks": [ { "type": "command", "command": "cmd=$(jq -r '.tool_input.command // \"\"'); case \"$cmd\" in *yt-rating*|*yt-sweep*) /path/to/yt-sync.sh ;; esac" } ] }
+    ]
+  }
+}
+```
+**B) Via the CLI** — just call it after `rate`:
+```bash
+yt-briefing rate --rating 0 && /path/to/yt-sync.sh
+```
+> Optional but recommended: also `git pull --rebase` **before** the first sweep of a session
+> (so a machine starts on the latest cursor), e.g. a `PreToolUse` hook matching
+> `*yt-sweep*--reset*`, or just `cd "$YT_DATA_DIR" && git pull --rebase` before you start.
+---
+## Why "sync-safe" (the footgun this avoids)
+The naive version — `git push || true` — **silently swallows** a rejected push when the
+other machine pushed first. The rating stays local-only while you believe it's saved; later
+a careless conflict resolution drops it. The script above instead: per-rating commit →
+push → on rejection `pull --rebase` + retry → on a real conflict, **abort and tell you**.
+With profiles union-merging, the only thing that ever needs your hand is two machines rating
+the *same channel* before syncing — and even then it surfaces loudly, never vanishes.

package/docs/warp-proxy.md ADDED Viewed

@@ -0,0 +1,81 @@
+# Making YouTube transcripts work on a VPS (the datacenter-IP block)
+If you run yt-briefing on your laptop, you can skip this — residential IPs fetch
+transcripts fine, no proxy needed. This is for when you move it to a server.
+## The problem
+YouTube **structurally blocks datacenter IPs** — OVH, AWS, GCP, Azure, Scaleway, the lot.
+From the very first request you get `429` or *"Sign in to confirm you're not a bot."* It
+is not a temporary rate-limit you can wait out; whole cloud CIDR ranges are blocked. On a
+VPS without a proxy, every sweep returns `rate_limited` for every video → zero transcripts.
+## The fix: Cloudflare WARP as a proxy
+Route the transcript fetch through Cloudflare WARP. The egress IP then belongs to
+Cloudflare's global network rather than to a datacenter range, and YouTube does not block
+it. It's free and there are no credentials to babysit (the WARP registration lives in a
+Docker volume).
+### One-time setup
+```bash
+docker run -d \
+  --name warp \
+  --restart unless-stopped \
+  --device-cgroup-rule 'c 10:200 rwm' \
+  --cap-add MKNOD --cap-add AUDIT_WRITE --cap-add NET_ADMIN \
+  --sysctl net.ipv6.conf.all.disable_ipv6=0 \
+  --sysctl net.ipv4.conf.all.src_valid_mark=1 \
+  -p 127.0.0.1:1080:1080 \
+  -v warp-data:/var/lib/cloudflare-warp \
+  caomingjun/warp
+```
+The `caomingjun/warp` image exposes both HTTP and SOCKS5 on port 1080 — yt-dlp handles
+either; we use HTTP.
+### Point yt-briefing at it
+```bash
+# .env
+YT_PROXY=http://127.0.0.1:1080
+```
+yt-briefing reads `YT_PROXY` and routes all yt-dlp traffic through it. No env var → direct
+fetch (the residential default).
+### Health check
+```bash
+curl --proxy http://127.0.0.1:1080 https://www.cloudflare.com/cdn-cgi/trace | grep warp=
+# expect: warp=on
+```
+## Recovery
+If the container stops, transcripts return `rate_limited` again:
+```bash
+docker start warp
+```
+### "It worked all day, then suddenly rate_limited"
+YouTube blocked the specific Cloudflare egress IP WARP happened to use (the block is
+per-IP, not per-account). Force a re-registration to get a fresh egress IP:
+```bash
+# 1. See the current egress IP and whether YouTube blocks it:
+curl -x http://127.0.0.1:1080 -s https://api.ipify.org
+curl -x http://127.0.0.1:1080 -s -o /dev/null -w "%{http_code}\n" "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+# 200 = OK, 429 = this IP is blocked
+# 2. Re-register → new egress IP:
+docker exec warp warp-cli registration delete 2>/dev/null; true
+docker exec warp warp-cli registration new
+docker exec warp warp-cli connect
+sleep 5
+curl -x http://127.0.0.1:1080 -s https://api.ipify.org   # should differ
+```
+Then re-run the sweep.

package/package.json ADDED Viewed

@@ -0,0 +1,56 @@
+{
+  "name": "yt-briefing",
+  "version": "0.1.0",
+  "description": "A self-learning YouTube briefing engine: it sweeps the channels you follow, filters noise in two stages (title, then transcript), summarizes the rest in your language, and adapts to your ratings — one video at a time.",
+  "type": "module",
+  "bin": {
+    "yt-briefing": "dist/cli.js"
+  },
+  "files": [
+    "dist",
+    "data.example",
+    ".claude/skills",
+    "docs",
+    "README.md",
+    "LICENSE"
+  ],
+  "scripts": {
+    "build": "tsc -p tsconfig.build.json",
+    "prepare": "tsc -p tsconfig.build.json",
+    "init": "bun run src/bootstrap.ts",
+    "install-skill": "bun run src/install-skill.ts",
+    "sweep": "bun run src/yt-sweep.ts",
+    "rate": "bun run src/yt-rating.ts",
+    "transcribe": "bun run src/yt-transcript.ts",
+    "typecheck": "tsc --noEmit"
+  },
+  "engines": {
+    "node": ">=18",
+    "bun": ">=1.0.0"
+  },
+  "keywords": [
+    "youtube",
+    "transcript",
+    "summary",
+    "briefing",
+    "llm",
+    "openai-compatible",
+    "agent",
+    "claude"
+  ],
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/michal90r/yt-briefing.git"
+  },
+  "homepage": "https://github.com/michal90r/yt-briefing#readme",
+  "bugs": "https://github.com/michal90r/yt-briefing/issues",
+  "dependencies": {
+    "dotenv": "^16.6.1"
+  },
+  "devDependencies": {
+    "@types/node": "^22",
+    "typescript": "^5.7",
+    "@types/bun": "latest"
+  }
+}