webpeel 0.13.4 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/README.md +120 -162
  2. package/dist/cli-auth.js +7 -7
  3. package/dist/cli-auth.js.map +1 -1
  4. package/dist/cli.js +197 -26
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/auto-extract.d.ts +83 -0
  7. package/dist/core/auto-extract.d.ts.map +1 -0
  8. package/dist/core/auto-extract.js +565 -0
  9. package/dist/core/auto-extract.js.map +1 -0
  10. package/dist/core/deep-fetch.d.ts +75 -0
  11. package/dist/core/deep-fetch.d.ts.map +1 -0
  12. package/dist/core/deep-fetch.js +406 -0
  13. package/dist/core/deep-fetch.js.map +1 -0
  14. package/dist/core/domain-extractors.d.ts +34 -0
  15. package/dist/core/domain-extractors.d.ts.map +1 -0
  16. package/dist/core/domain-extractors.js +654 -0
  17. package/dist/core/domain-extractors.js.map +1 -0
  18. package/dist/core/markdown.d.ts +8 -0
  19. package/dist/core/markdown.d.ts.map +1 -1
  20. package/dist/core/markdown.js +25 -0
  21. package/dist/core/markdown.js.map +1 -1
  22. package/dist/core/quick-answer.d.ts +28 -0
  23. package/dist/core/quick-answer.d.ts.map +1 -0
  24. package/dist/core/quick-answer.js +288 -0
  25. package/dist/core/quick-answer.js.map +1 -0
  26. package/dist/core/readability.d.ts +58 -0
  27. package/dist/core/readability.d.ts.map +1 -0
  28. package/dist/core/readability.js +496 -0
  29. package/dist/core/readability.js.map +1 -0
  30. package/dist/core/search-provider.d.ts.map +1 -1
  31. package/dist/core/search-provider.js +3 -6
  32. package/dist/core/search-provider.js.map +1 -1
  33. package/dist/core/strategies.d.ts.map +1 -1
  34. package/dist/core/strategies.js +70 -5
  35. package/dist/core/strategies.js.map +1 -1
  36. package/dist/core/watch-manager.d.ts +140 -0
  37. package/dist/core/watch-manager.d.ts.map +1 -0
  38. package/dist/core/watch-manager.js +348 -0
  39. package/dist/core/watch-manager.js.map +1 -0
  40. package/dist/core/youtube.d.ts +91 -0
  41. package/dist/core/youtube.d.ts.map +1 -0
  42. package/dist/core/youtube.js +380 -0
  43. package/dist/core/youtube.js.map +1 -0
  44. package/dist/index.d.ts +4 -0
  45. package/dist/index.d.ts.map +1 -1
  46. package/dist/index.js +103 -0
  47. package/dist/index.js.map +1 -1
  48. package/dist/mcp/server.js +58 -16
  49. package/dist/mcp/server.js.map +1 -1
  50. package/dist/server/app.d.ts.map +1 -1
  51. package/dist/server/app.js +19 -1
  52. package/dist/server/app.js.map +1 -1
  53. package/dist/server/routes/deep-fetch.d.ts +9 -0
  54. package/dist/server/routes/deep-fetch.d.ts.map +1 -0
  55. package/dist/server/routes/deep-fetch.js +38 -0
  56. package/dist/server/routes/deep-fetch.js.map +1 -0
  57. package/dist/server/routes/extract.d.ts.map +1 -1
  58. package/dist/server/routes/extract.js +11 -0
  59. package/dist/server/routes/extract.js.map +1 -1
  60. package/dist/server/routes/fetch.d.ts.map +1 -1
  61. package/dist/server/routes/fetch.js +45 -19
  62. package/dist/server/routes/fetch.js.map +1 -1
  63. package/dist/server/routes/mcp.d.ts +2 -1
  64. package/dist/server/routes/mcp.d.ts.map +1 -1
  65. package/dist/server/routes/mcp.js +307 -38
  66. package/dist/server/routes/mcp.js.map +1 -1
  67. package/dist/server/routes/quick-answer.d.ts +9 -0
  68. package/dist/server/routes/quick-answer.d.ts.map +1 -0
  69. package/dist/server/routes/quick-answer.js +84 -0
  70. package/dist/server/routes/quick-answer.js.map +1 -0
  71. package/dist/server/routes/watch.d.ts +16 -0
  72. package/dist/server/routes/watch.d.ts.map +1 -0
  73. package/dist/server/routes/watch.js +219 -0
  74. package/dist/server/routes/watch.js.map +1 -0
  75. package/dist/server/routes/youtube.d.ts +7 -0
  76. package/dist/server/routes/youtube.d.ts.map +1 -0
  77. package/dist/server/routes/youtube.js +87 -0
  78. package/dist/server/routes/youtube.js.map +1 -0
  79. package/dist/types.d.ts +18 -0
  80. package/dist/types.d.ts.map +1 -1
  81. package/dist/types.js.map +1 -1
  82. package/llms.txt +14 -5
  83. package/package.json +1 -1
@@ -0,0 +1,348 @@
1
+ /**
2
+ * WebPeel WatchManager — Database-backed persistent URL monitoring
3
+ *
4
+ * Stores watch entries in PostgreSQL, periodically fetches watched URLs,
5
+ * compares content fingerprints to detect changes, and fires webhook
6
+ * notifications when a page is updated.
7
+ *
8
+ * This module is complementary to the in-process `watch.ts` poller:
9
+ * - `watch.ts` → ephemeral, CLI/in-process, no DB
10
+ * - `watch-manager.ts` → persistent, server-side, PostgreSQL-backed
11
+ */
12
+ import { createHash } from 'crypto';
13
+ import { fetch as undiciFetch } from 'undici';
14
+ // ─── Internal helpers ──────────────────────────────────────────────────────────
15
/**
 * Derive a SHA-256 hex digest identifying a page's textual content.
 *
 * Before hashing, leading/trailing whitespace is removed and every internal
 * run of whitespace is collapsed to a single space, so content that differs
 * only in spacing or line wrapping yields the same digest.
 *
 * @param content - Page content to fingerprint.
 * @returns Lower-case hexadecimal SHA-256 digest of the normalised content.
 */
export function computeFingerprint(content) {
    const hasher = createHash('sha256');
    // Collapse whitespace first so purely cosmetic reflows hash identically.
    hasher.update(content.trim().replace(/\s+/g, ' '));
    return hasher.digest('hex');
}
23
/**
 * Compare two versions of a page at paragraph granularity.
 *
 * Each version is split on blank lines (two or more consecutive newlines),
 * every block is trimmed, and blocks of 10 characters or fewer are discarded.
 * The remaining blocks are de-duplicated per version; a block is reported as
 * added when it occurs only in the new version and as removed when it occurs
 * only in the old one. Reported blocks are truncated to 500 characters.
 *
 * @param oldContent - Previous page content.
 * @param newContent - Current page content.
 * @returns `{ addedText, removedText }`, each in order of first appearance.
 */
export function computeParagraphDiff(oldContent, newContent) {
    const paragraphsOf = (text) => {
        const blocks = new Set();
        for (const raw of text.split(/\n{2,}/)) {
            const block = raw.trim();
            if (block.length > 10)
                blocks.add(block);
        }
        return blocks;
    };
    // Blocks present in `source` but absent from `other`, capped at 500 chars each.
    const exclusiveTo = (source, other) => [...source]
        .filter(block => !other.has(block))
        .map(block => block.slice(0, 500));
    const before = paragraphsOf(oldContent);
    const after = paragraphsOf(newContent);
    return {
        addedText: exclusiveTo(after, before),
        removedText: exclusiveTo(before, after),
    };
}
48
/**
 * POST a JSON payload to a webhook URL on a best-effort basis.
 *
 * This function never throws: transport errors, timeouts (10 s) and non-2xx
 * responses are all reported on stderr and otherwise ignored, so a broken
 * webhook cannot fail the watch check that triggered it.
 *
 * @param webhookUrl - Destination URL for the notification.
 * @param payload - Value serialised with `JSON.stringify` as the request body.
 */
async function sendWatchWebhook(webhookUrl, payload) {
    try {
        const response = await undiciFetch(webhookUrl, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'User-Agent': 'WebPeel-Watch/1.0 (+https://webpeel.dev)',
            },
            body: JSON.stringify(payload),
            signal: AbortSignal.timeout(10_000),
        });
        // fetch() resolves for any HTTP status, so a 4xx/5xx reply would otherwise
        // look like a successful delivery.
        if (!response.ok) {
            process.stderr.write(`[watch-manager] Webhook delivery failed to ${webhookUrl}: HTTP ${response.status}\n`);
        }
        // The body is not needed; cancel it so the underlying connection is
        // released instead of waiting for garbage collection. A failure here is
        // cleanup-only and must not be reported as a delivery failure.
        await response.body?.cancel().catch(() => undefined);
    }
    catch (err) {
        process.stderr.write(`[watch-manager] Webhook delivery failed to ${webhookUrl}: ${err instanceof Error ? err.message : String(err)}\n`);
    }
}
65
/**
 * Convert a raw `watches` row (snake_case columns) into a camelCase
 * {@link WatchEntry}.
 *
 * - Nullable text columns become `undefined` when null/undefined.
 * - `check_interval_minutes` defaults to 60, `change_count` to 0 and `status`
 *   to `'active'` when null/undefined.
 * - `last_checked_at` / `last_changed_at` become `Date` objects when truthy,
 *   otherwise `undefined`; `created_at` / `updated_at` are always passed to
 *   `new Date(...)`.
 *
 * @param row - Row object as returned in `result.rows` by the database client.
 */
function rowToEntry(row) {
    const orUndefined = (value) => value ?? undefined;
    const toOptionalDate = (value) => (value ? new Date(value) : undefined);
    const { id, account_id: accountId, url } = row;
    return {
        id,
        accountId,
        url,
        webhookUrl: orUndefined(row.webhook_url),
        checkIntervalMinutes: row.check_interval_minutes ?? 60,
        selector: orUndefined(row.selector),
        lastFingerprint: orUndefined(row.last_fingerprint),
        lastCheckedAt: toOptionalDate(row.last_checked_at),
        lastChangedAt: toOptionalDate(row.last_changed_at),
        changeCount: row.change_count ?? 0,
        status: row.status ?? 'active',
        errorMessage: orUndefined(row.error_message),
        createdAt: new Date(row.created_at),
        updatedAt: new Date(row.updated_at),
    };
}
84
+ // ─── WatchManager ──────────────────────────────────────────────────────────────
85
/**
 * Database-backed URL watch manager.
 *
 * Stores watch entries in a PostgreSQL `watches` table (see
 * `migrations/007_watch.sql`) and handles periodic checks, change detection,
 * and webhook delivery.
 *
 * The `db` handle only needs a `query(sql, params)` method that resolves to
 * an object with a `rows` array.
 *
 * @example
 * ```typescript
 * const manager = new WatchManager(pool);
 * const watch = await manager.create('acct-uuid', 'https://example.com/pricing', {
 *   webhookUrl: 'https://hooks.example.com/alert',
 *   checkIntervalMinutes: 30,
 * });
 * const diff = await manager.check(watch.id);
 * console.log(diff.summary);
 * ```
 */
export class WatchManager {
    /** Database handle used for every query issued by this manager. */
    db;
    /**
     * @param db - Database handle exposing `query(sql, params)`.
     */
    constructor(db) {
        this.db = db;
    }
    // ─── CRUD ────────────────────────────────────────────────────────────────────
    /**
     * Create a new watch entry for the given URL.
     *
     * No fingerprint is stored at creation time; the first {@link check}
     * establishes the baseline.
     *
     * @param accountId - Owning account.
     * @param url - URL to monitor.
     * @param options - Optional `webhookUrl`, `checkIntervalMinutes` (default 60)
     *   and `selector`.
     * @returns The inserted watch entry.
     */
    async create(accountId, url, options = {}) {
        const { webhookUrl, checkIntervalMinutes = 60, selector } = options;
        const result = await this.db.query(`INSERT INTO watches (account_id, url, webhook_url, check_interval_minutes, selector)
       VALUES ($1, $2, $3, $4, $5)
       RETURNING *`, [accountId, url, webhookUrl ?? null, checkIntervalMinutes, selector ?? null]);
        return rowToEntry(result.rows[0]);
    }
    /** List all watches owned by the given account, most recent first. */
    async list(accountId) {
        const result = await this.db.query(`SELECT * FROM watches WHERE account_id = $1 ORDER BY created_at DESC`, [accountId]);
        return result.rows.map(rowToEntry);
    }
    /** Get a single watch by ID, or null if not found. */
    async get(watchId) {
        const result = await this.db.query(`SELECT * FROM watches WHERE id = $1`, [watchId]);
        if (result.rows.length === 0)
            return null;
        return rowToEntry(result.rows[0]);
    }
    /** Pause a watch — it will not be included in {@link checkDue} runs. */
    async pause(watchId) {
        await this.db.query(`UPDATE watches SET status = 'paused', updated_at = NOW() WHERE id = $1`, [watchId]);
    }
    /** Resume a previously paused (or errored) watch and clear its error message. */
    async resume(watchId) {
        await this.db.query(`UPDATE watches
       SET status = 'active', error_message = NULL, updated_at = NOW()
       WHERE id = $1`, [watchId]);
    }
    /** Permanently delete a watch. */
    async delete(watchId) {
        await this.db.query(`DELETE FROM watches WHERE id = $1`, [watchId]);
    }
    /**
     * Update mutable properties of a watch.
     *
     * `webhookUrl` and `selector` are nullable: when the key is present with a
     * null/undefined value the column is cleared. `checkIntervalMinutes` and
     * `status` are only written when a value is actually supplied, so a key
     * that is present but `undefined` (common when forwarding a partially
     * filled request body) no longer sends an undefined parameter for a
     * column that is expected to hold a value.
     *
     * @param watchId - ID of the watch to modify.
     * @param updates - Subset of `webhookUrl`, `checkIntervalMinutes`,
     *   `selector`, `status`.
     * @returns The updated entry, or null when no watch has that ID.
     */
    async update(watchId, updates) {
        const setClauses = ['updated_at = NOW()'];
        const values = [];
        let idx = 1;
        if ('webhookUrl' in updates) {
            setClauses.push(`webhook_url = $${idx++}`);
            values.push(updates.webhookUrl ?? null);
        }
        if (updates.checkIntervalMinutes !== undefined) {
            setClauses.push(`check_interval_minutes = $${idx++}`);
            values.push(updates.checkIntervalMinutes);
        }
        if ('selector' in updates) {
            setClauses.push(`selector = $${idx++}`);
            values.push(updates.selector ?? null);
        }
        if (updates.status !== undefined) {
            setClauses.push(`status = $${idx++}`);
            values.push(updates.status);
        }
        // The watch ID is always the last placeholder.
        values.push(watchId);
        const result = await this.db.query(`UPDATE watches SET ${setClauses.join(', ')} WHERE id = $${idx} RETURNING *`, values);
        if (result.rows.length === 0)
            return null;
        return rowToEntry(result.rows[0]);
    }
    // ─── Checking ────────────────────────────────────────────────────────────────
    /**
     * Perform an immediate content check for the given watch ID.
     *
     * Steps:
     *  1. Load the watch entry, then the previous content snapshot via
     *     `getSnapshot()` from `./change-tracking.js` (if any).
     *  2. Fetch the current page via `peel()` (markdown, 30 s timeout).
     *  3. Compute a SHA-256 fingerprint of the normalised content.
     *  4. If the fingerprint differs from the stored one, compute a
     *     paragraph-level diff and fire the webhook.
     *  5. Persist `last_fingerprint`, `last_checked_at`, and `change_count`.
     *  6. Return a {@link WatchDiff} describing what changed.
     *
     * A successful check clears an `error` status back to `active` but leaves
     * a `paused` watch paused, so a manual check never silently resumes
     * scheduled monitoring.
     *
     * @param watchId - ID of the watch to check.
     * @returns `{ changed, previousFingerprint, currentFingerprint, summary,
     *   addedText, removedText }`. The first check of a watch reports
     *   `changed: false` and only records the baseline.
     * @throws Error when the watch does not exist, or rethrows whatever the
     *   fetch/database step threw after marking the watch as `error`.
     */
    async check(watchId) {
        const watch = await this.get(watchId);
        if (!watch)
            throw new Error(`Watch not found: ${watchId}`);
        const now = new Date();
        try {
            // Load previous content snapshot for text-diff computation. This must
            // happen before peel(), which is asked to store a new snapshot.
            const { getSnapshot } = await import('./change-tracking.js');
            const prevSnapshot = await getSnapshot(watch.url);
            // Fetch current content.
            const { peel } = await import('../index.js');
            const peelResult = await peel(watch.url, {
                format: 'markdown',
                selector: watch.selector,
                timeout: 30_000,
                // Enable change tracking so the snapshot is persisted for future diffs.
                changeTracking: true,
            });
            const currentContent = peelResult.content;
            const currentFingerprint = computeFingerprint(currentContent);
            const previousFingerprint = watch.lastFingerprint ?? '';
            // Determine whether content actually changed relative to our DB record.
            const isFirstCheck = !previousFingerprint;
            const changed = !isFirstCheck && currentFingerprint !== previousFingerprint;
            // Compute text diff when changed and we have old content to compare against.
            let addedText = [];
            let removedText = [];
            let summary;
            if (isFirstCheck) {
                summary = 'Baseline fingerprint established — monitoring active.';
            }
            else if (changed) {
                const oldContent = prevSnapshot?.content ?? '';
                if (oldContent) {
                    const diff = computeParagraphDiff(oldContent, currentContent);
                    addedText = diff.addedText;
                    removedText = diff.removedText;
                    summary =
                        addedText.length > 0 || removedText.length > 0
                            ? `Page updated: ${addedText.length} block${addedText.length !== 1 ? 's' : ''} added, ` +
                                `${removedText.length} block${removedText.length !== 1 ? 's' : ''} removed.`
                            : 'Page content changed (fingerprint mismatch — no paragraph-level diff available).';
                }
                else {
                    summary = 'Page content changed (no previous snapshot available for text diff).';
                }
            }
            else {
                summary = 'No changes detected.';
            }
            // Update DB.
            if (changed) {
                // RETURNING yields the incremented counter from the same statement, so
                // the webhook payload cannot pick up a value written by a concurrent
                // check and no second round trip is needed.
                const result = await this.db.query(`UPDATE watches
           SET last_fingerprint = $1,
               last_checked_at  = $2,
               last_changed_at  = $2,
               change_count     = change_count + 1,
               status           = CASE WHEN status = 'paused' THEN status ELSE 'active' END,
               error_message    = NULL,
               updated_at       = $2
           WHERE id = $3
           RETURNING change_count`, [currentFingerprint, now, watchId]);
                const updatedRow = result.rows[0];
                // Fire webhook (skipped if the watch was deleted while being checked).
                if (watch.webhookUrl && updatedRow) {
                    const payload = {
                        event: 'watch.changed',
                        watchId: watch.id,
                        url: watch.url,
                        changedAt: now.toISOString(),
                        changeCount: updatedRow.change_count ?? 0,
                        diff: { addedText, removedText, summary },
                    };
                    await sendWatchWebhook(watch.webhookUrl, payload);
                }
            }
            else {
                await this.db.query(`UPDATE watches
           SET last_fingerprint = COALESCE($1, last_fingerprint),
               last_checked_at  = $2,
               status           = CASE WHEN status = 'paused' THEN status ELSE 'active' END,
               error_message    = NULL,
               updated_at       = $2
           WHERE id = $3`, [currentFingerprint || null, now, watchId]);
            }
            return {
                changed,
                previousFingerprint,
                currentFingerprint,
                summary,
                addedText,
                removedText,
            };
        }
        catch (error) {
            const errMsg = (error instanceof Error ? error.message : String(error)).slice(0, 500);
            // Mark the watch as errored so operators can investigate. If this
            // bookkeeping query itself fails, log it and still surface the ORIGINAL
            // error — otherwise the root cause would be replaced by a DB error.
            try {
                await this.db.query(`UPDATE watches
           SET status = 'error',
               error_message = $1,
               last_checked_at = $2,
               updated_at = $2
           WHERE id = $3`, [errMsg, now, watchId]);
            }
            catch (dbError) {
                process.stderr.write(`[watch-manager] Failed to record error state for watch ${watchId}: ` +
                    `${dbError instanceof Error ? dbError.message : String(dbError)}\n`);
            }
            throw error;
        }
    }
    /**
     * Scan the database for watches that are due for a check and run them.
     *
     * A watch is "due" when:
     *  - `status = 'active'`
     *  - `last_checked_at` is NULL (never checked) OR older than `check_interval_minutes`
     *
     * Processes up to 50 watches per invocation, one after another, oldest
     * check first. A failure of one watch is logged to stderr and does not stop
     * the remaining watches.
     */
    async checkDue() {
        const result = await this.db.query(`SELECT * FROM watches
       WHERE status = 'active'
         AND (
           last_checked_at IS NULL
           OR last_checked_at < NOW() - (check_interval_minutes * INTERVAL '1 minute')
         )
       ORDER BY last_checked_at ASC NULLS FIRST
       LIMIT 50`);
        for (const row of result.rows) {
            const watch = rowToEntry(row);
            try {
                await this.check(watch.id);
            }
            catch (err) {
                process.stderr.write(`[watch-manager] Error checking watch ${watch.id} (${watch.url}): ` +
                    `${err instanceof Error ? err.message : String(err)}\n`);
            }
        }
    }
}
324
+ // ─── Background checker ────────────────────────────────────────────────────────
325
/**
 * Start a background interval that calls {@link WatchManager.checkDue} every
 * minute. Wire this up in `app.ts` after the server starts.
 *
 * Cycles never overlap: if the previous `checkDue()` run is still in progress
 * when the timer fires, that tick is skipped. A single cycle can exceed one
 * minute (up to 50 sequential checks, each allowed a 30 s fetch timeout), and
 * a watch's `last_checked_at` is only written when its check finishes, so an
 * overlapping cycle would select the same watches again and produce duplicate
 * webhooks and double-counted changes.
 *
 * @param db - Database handle passed to the {@link WatchManager} constructor.
 * @returns The interval handle (pass to `clearInterval` for clean shutdown).
 *
 * @example
 * ```typescript
 * const handle = startWatchChecker(pool);
 * process.on('SIGTERM', () => clearInterval(handle));
 * ```
 */
export function startWatchChecker(db) {
    const manager = new WatchManager(db);
    let cycleInProgress = false;
    return setInterval(async () => {
        // Skip this tick while the previous cycle is still working through its batch.
        if (cycleInProgress)
            return;
        cycleInProgress = true;
        try {
            await manager.checkDue();
        }
        catch (err) {
            process.stderr.write(`[watch-manager] Background checker error: ${err instanceof Error ? err.message : String(err)}\n`);
        }
        finally {
            cycleInProgress = false;
        }
    }, 60_000); // Every 1 minute
}
348
+ //# sourceMappingURL=watch-manager.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"watch-manager.js","sourceRoot":"","sources":["../../src/core/watch-manager.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AACpC,OAAO,EAAE,KAAK,IAAI,WAAW,EAAE,MAAM,QAAQ,CAAC;AA2C9C,kFAAkF;AAElF;;;GAGG;AACH,MAAM,UAAU,kBAAkB,CAAC,OAAe;IAChD,MAAM,UAAU,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IACvD,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAC/D,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,oBAAoB,CAClC,UAAkB,EAClB,UAAkB;IAElB,MAAM,KAAK,GAAG,CAAC,IAAY,EAAe,EAAE,CAC1C,IAAI,GAAG,CACL,IAAI;SACD,KAAK,CAAC,QAAQ,CAAC;SACf,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,EAAE,CAAC,CAC9B,CAAC;IAEJ,MAAM,MAAM,GAAG,KAAK,CAAC,UAAU,CAAC,CAAC;IACjC,MAAM,MAAM,GAAG,KAAK,CAAC,UAAU,CAAC,CAAC;IAEjC,MAAM,SAAS,GAAa,EAAE,CAAC;IAC/B,MAAM,WAAW,GAAa,EAAE,CAAC;IAEjC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACvB,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;YAAE,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IACD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACvB,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;YAAE,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;IACxD,CAAC;IAED,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC;AACpC,CAAC;AAED,iFAAiF;AACjF,KAAK,UAAU,gBAAgB,CAAC,UAAkB,EAAE,OAAgB;IAClE,IAAI,CAAC;QACH,MAAM,WAAW,CAAC,UAAU,EAAE;YAC5B,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,YAAY,EAAE,0CAA0C;aACzD;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;YAC7B,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC;SACpC,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,8CAA8C,UAAU,KAAK,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAClH,CAAC;IACJ,CAAC;AACH,CAAC;AAED,4DAA4D;AAC5D,SAAS,UAAU,CAAC,GAA4B;IAC9C,OAAO;QACL,EAAE,EAAE,GAAG,CAAC,EAAY;QACpB,SAAS,EAAE,GAAG,CAAC,UAAoB;QACnC,GAAG,EAAE,GAAG,CAAC,GAAa;QACtB,UAAU,EAAG,GAAG,CAAC,WAA6B,IAAI,SAAS;Q
AC3D,oBAAoB,EAAG,GAAG,CAAC,sBAAiC,IAAI,EAAE;QAClE,QAAQ,EAAG,GAAG,CAAC,QAA0B,IAAI,SAAS;QACtD,eAAe,EAAG,GAAG,CAAC,gBAAkC,IAAI,SAAS;QACrE,aAAa,EAAE,GAAG,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,eAAyB,CAAC,CAAC,CAAC,CAAC,SAAS;QACxF,aAAa,EAAE,GAAG,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,eAAyB,CAAC,CAAC,CAAC,CAAC,SAAS;QACxF,WAAW,EAAG,GAAG,CAAC,YAAuB,IAAI,CAAC;QAC9C,MAAM,EAAG,GAAG,CAAC,MAAwC,IAAI,QAAQ;QACjE,YAAY,EAAG,GAAG,CAAC,aAA+B,IAAI,SAAS;QAC/D,SAAS,EAAE,IAAI,IAAI,CAAC,GAAG,CAAC,UAAoB,CAAC;QAC7C,SAAS,EAAE,IAAI,IAAI,CAAC,GAAG,CAAC,UAAoB,CAAC;KAC9C,CAAC;AACJ,CAAC;AAED,kFAAkF;AAElF;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,OAAO,YAAY;IACM;IAA7B,YAA6B,EAAW;QAAX,OAAE,GAAF,EAAE,CAAS;IAAG,CAAC;IAE5C,gFAAgF;IAEhF;;;OAGG;IACH,KAAK,CAAC,MAAM,CACV,SAAiB,EACjB,GAAW,EACX,UAA8B,EAAE;QAEhC,MAAM,EAAE,UAAU,EAAE,oBAAoB,GAAG,EAAE,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;QAEpE,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAChC;;mBAEa,EACb,CAAC,SAAS,EAAE,GAAG,EAAE,UAAU,IAAI,IAAI,EAAE,oBAAoB,EAAE,QAAQ,IAAI,IAAI,CAAC,CAC7E,CAAC;QAEF,OAAO,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACpC,CAAC;IAED,sEAAsE;IACtE,KAAK,CAAC,IAAI,CAAC,SAAiB;QAC1B,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAChC,sEAAsE,EACtE,CAAC,SAAS,CAAC,CACZ,CAAC;QACF,OAAO,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IACrC,CAAC;IAED,sDAAsD;IACtD,KAAK,CAAC,GAAG,CAAC,OAAe;QACvB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAChC,qCAAqC,EACrC,CAAC,OAAO,CAAC,CACV,CAAC;QACF,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAC;QAC1C,OAAO,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACpC,CAAC;IAED,wEAAwE;IACxE,KAAK,CAAC,KAAK,CAAC,OAAe;QACzB,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CACjB,wEAAwE,EACxE,CAAC,OAAO,CAAC,CACV,CAAC;IACJ,CAAC;IAED,qDAAqD;IACrD,KAAK,CAAC,MAAM,CAAC,OAAe;QAC1B,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CACjB;;qBAEe,EACf,CAAC,OAAO,CAAC,CACV,CAAC;IACJ,CAAC;IAED,kCAAkC;IAClC,KAAK,CAAC,MAAM,CAAC,OAAe;QAC1B,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,mCAAmC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IACtE,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,MAAM,CACV,
OAAe,EACf,OAAiG;QAEjG,MAAM,UAAU,GAAa,CAAC,oBAAoB,CAAC,CAAC;QACpD,MAAM,MAAM,GAAc,EAAE,CAAC;QAC7B,IAAI,GAAG,GAAG,CAAC,CAAC;QAEZ,IAAI,YAAY,IAAI,OAAO,EAAE,CAAC;YAC5B,UAAU,CAAC,IAAI,CAAC,kBAAkB,GAAG,EAAE,EAAE,CAAC,CAAC;YAC3C,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC;QAC1C,CAAC;QACD,IAAI,sBAAsB,IAAI,OAAO,EAAE,CAAC;YACtC,UAAU,CAAC,IAAI,CAAC,6BAA6B,GAAG,EAAE,EAAE,CAAC,CAAC;YACtD,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,oBAAoB,CAAC,CAAC;QAC5C,CAAC;QACD,IAAI,UAAU,IAAI,OAAO,EAAE,CAAC;YAC1B,UAAU,CAAC,IAAI,CAAC,eAAe,GAAG,EAAE,EAAE,CAAC,CAAC;YACxC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,IAAI,CAAC,CAAC;QACxC,CAAC;QACD,IAAI,QAAQ,IAAI,OAAO,EAAE,CAAC;YACxB,UAAU,CAAC,IAAI,CAAC,aAAa,GAAG,EAAE,EAAE,CAAC,CAAC;YACtC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QAC9B,CAAC;QAED,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAErB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAChC,sBAAsB,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,GAAG,cAAc,EAC5E,MAAM,CACP,CAAC;QAEF,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,IAAI,CAAC;QAC1C,OAAO,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACpC,CAAC;IAED,gFAAgF;IAEhF;;;;;;;;;;OAUG;IACH,KAAK,CAAC,KAAK,CAAC,OAAe;QACzB,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QACtC,IAAI,CAAC,KAAK;YAAE,MAAM,IAAI,KAAK,CAAC,oBAAoB,OAAO,EAAE,CAAC,CAAC;QAE3D,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC;QAEvB,IAAI,CAAC;YACH,4DAA4D;YAC5D,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,sBAAsB,CAAC,CAAC;YAC7D,MAAM,YAAY,GAAG,MAAM,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAElD,yBAAyB;YACzB,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;YAC7C,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE;gBACvC,MAAM,EAAE,UAAU;gBAClB,QAAQ,EAAE,KAAK,CAAC,QAAQ;gBACxB,OAAO,EAAE,MAAM;gBACf,wEAAwE;gBACxE,cAAc,EAAE,IAAI;aACrB,CAAC,CAAC;YAEH,MAAM,cAAc,GAAG,UAAU,CAAC,OAAO,CAAC;YAC1C,MAAM,kBAAkB,GAAG,kBAAkB,CAAC,cAAc,CAAC,CAAC;YAC9D,MAAM,mBAAmB,GAAG,KAAK,CAAC,eAAe,IAAI,EAAE,CAAC;YAExD,wEAAwE;YACxE,MAAM,YAAY,GAAG,CAAC,mBAAmB,CAAC;YAC1C,MAAM,OAAO,GAAG,CAAC,YAAY,IAAI,kBAAkB,KAAK,mBAAmB,CAAC;YAE5E,6EAA6E;YAC7E,IAAI,SAAS,G
AAa,EAAE,CAAC;YAC7B,IAAI,WAAW,GAAa,EAAE,CAAC;YAC/B,IAAI,OAAe,CAAC;YAEpB,IAAI,YAAY,EAAE,CAAC;gBACjB,OAAO,GAAG,uDAAuD,CAAC;YACpE,CAAC;iBAAM,IAAI,OAAO,EAAE,CAAC;gBACnB,MAAM,UAAU,GAAG,YAAY,EAAE,OAAO,IAAI,EAAE,CAAC;gBAC/C,IAAI,UAAU,EAAE,CAAC;oBACf,MAAM,IAAI,GAAG,oBAAoB,CAAC,UAAU,EAAE,cAAc,CAAC,CAAC;oBAC9D,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC;oBAC3B,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC;oBAC/B,OAAO;wBACL,SAAS,CAAC,MAAM,GAAG,CAAC,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC;4BAC5C,CAAC,CAAC,iBAAiB,SAAS,CAAC,MAAM,SAAS,SAAS,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,UAAU;gCACrF,GAAG,WAAW,CAAC,MAAM,SAAS,WAAW,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,WAAW;4BAC9E,CAAC,CAAC,kFAAkF,CAAC;gBAC3F,CAAC;qBAAM,CAAC;oBACN,OAAO,GAAG,sEAAsE,CAAC;gBACnF,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,OAAO,GAAG,sBAAsB,CAAC;YACnC,CAAC;YAED,aAAa;YACb,IAAI,OAAO,EAAE,CAAC;gBACZ,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CACjB;;;;;;;;yBAQe,EACf,CAAC,kBAAkB,EAAE,GAAG,EAAE,OAAO,CAAC,CACnC,CAAC;gBAEF,iEAAiE;gBACjE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;gBAExC,gBAAgB;gBAChB,IAAI,KAAK,CAAC,UAAU,IAAI,OAAO,EAAE,CAAC;oBAChC,MAAM,OAAO,GAAG;wBACd,KAAK,EAAE,eAAe;wBACtB,OAAO,EAAE,KAAK,CAAC,EAAE;wBACjB,GAAG,EAAE,KAAK,CAAC,GAAG;wBACd,SAAS,EAAE,GAAG,CAAC,WAAW,EAAE;wBAC5B,WAAW,EAAE,OAAO,CAAC,WAAW;wBAChC,IAAI,EAAE,EAAE,SAAS,EAAE,WAAW,EAAE,OAAO,EAAE;qBAC1C,CAAC;oBACF,MAAM,gBAAgB,CAAC,KAAK,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;gBACpD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CACjB;;;;;;yBAMe,EACf,CAAC,kBAAkB,IAAI,IAAI,EAAE,GAAG,EAAE,OAAO,CAAC,CAC3C,CAAC;YACJ,CAAC;YAED,OAAO;gBACL,OAAO;gBACP,mBAAmB;gBACnB,kBAAkB;gBAClB,OAAO;gBACP,SAAS;gBACT,WAAW;aACZ,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,MAAM,GAAG,CAAC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YAEtF,0DAA0D;YAC1D,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CACjB;;;;;uBAKe,EACf,CAAC,MAAM,EAAE,GAAG,EAAE,OAAO,CAAC,CACvB,CAAC;YAEF,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED;;;;;;;;OAQG;IACH,
KAAK,CAAC,QAAQ;QACZ,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,KAAK,CAChC;;;;;;;gBAOU,CACX,CAAC;QAEF,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;YAC9B,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;YAC9B,IAAI,CAAC;gBACH,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;YAC7B,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,wCAAwC,KAAK,CAAC,EAAE,KAAK,KAAK,CAAC,GAAG,KAAK;oBACjE,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAC1D,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;CACF;AAED,kFAAkF;AAElF;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,iBAAiB,CAAC,EAAW;IAC3C,MAAM,OAAO,GAAG,IAAI,YAAY,CAAC,EAAE,CAAC,CAAC;IAErC,OAAO,WAAW,CAAC,KAAK,IAAI,EAAE;QAC5B,IAAI,CAAC;YACH,MAAM,OAAO,CAAC,QAAQ,EAAE,CAAC;QAC3B,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO,CAAC,MAAM,CAAC,KAAK,CAClB,6CAA6C,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAClG,CAAC;QACJ,CAAC;IACH,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC,iBAAiB;AAC/B,CAAC"}
@@ -0,0 +1,91 @@
1
+ /**
2
+ * YouTube transcript extraction — no API key required.
3
+ *
4
+ * YouTube embeds caption/transcript data directly in the page HTML as JSON
5
+ * (inside ytInitialPlayerResponse). We parse that JSON, extract caption
6
+ * track URLs, fetch the timedtext XML, and return structured transcript data.
7
+ */
8
+ export interface TranscriptSegment { // One timed caption entry; element type of YouTubeTranscript.segments and of parseCaptionXml's result.
9
+ /** Caption text (HTML entities decoded) */
10
+ text: string;
11
+ /** Start time in seconds (presumably the `start` attribute of the caption `<text>` element — confirm in parseCaptionXml) */
12
+ start: number;
13
+ /** Duration in seconds (presumably the `dur` attribute of the caption `<text>` element — confirm in parseCaptionXml) */
14
+ duration: number;
15
+ }
16
+ export interface YouTubeTranscript { // Resolved value of getYouTubeTranscript().
17
+ videoId: string; // Video identifier (presumably the value parseYouTubeUrl returns — confirm).
18
+ title: string; // Video title. NOTE(review): value when metadata is missing is not visible from this declaration.
19
+ channel: string; // Channel name. NOTE(review): value when metadata is missing is not visible from this declaration.
20
+ /** Duration formatted as "MM:SS" or "HH:MM:SS" */
21
+ duration: string;
22
+ /** BCP-47 language code, e.g. "en" */
23
+ language: string;
24
+ /** Timestamped caption segments */
25
+ segments: TranscriptSegment[];
26
+ /** All segments joined as plain text */
27
+ fullText: string;
28
+ /** Language codes available for this video */
29
+ availableLanguages: string[];
30
+ }
31
+ export interface YouTubeVideoInfo { // Return type of extractVideoInfo(); every field is a string, including the counters.
32
+ videoId: string;
33
+ title: string;
34
+ channel: string;
35
+ description: string;
36
+ /** Duration formatted as "MM:SS" or "HH:MM:SS" */
37
+ duration: string;
38
+ publishDate: string; // NOTE(review): date format is not visible from this declaration — confirm before parsing.
39
+ viewCount: string; // Typed as string, not number. NOTE(review): confirm whether it is raw digits or a formatted value.
40
+ likeCount: string; // Typed as string, not number. NOTE(review): confirm whether it is raw digits or a formatted value.
41
+ thumbnail: string; // Presumably a thumbnail URL — confirm against the implementation.
42
+ }
43
+ /**
44
+ * Extract the video ID from any common YouTube URL format.
45
+ * Returns null if the URL is not a recognisable YouTube URL.
46
+ *
47
+ * Supported formats:
48
+ * https://www.youtube.com/watch?v=VIDEO_ID
49
+ * https://youtu.be/VIDEO_ID
50
+ * https://www.youtube.com/embed/VIDEO_ID
51
+ * https://m.youtube.com/watch?v=VIDEO_ID
52
+ * URLs with extra params (&t=120, &list=PLxxx, etc.)
+ *
+ * @param url - Candidate YouTube URL.
+ * @returns The video ID, or null when the URL is not recognised.
53
+ */
54
+ export declare function parseYouTubeUrl(url: string): string | null;
55
+ /**
56
+ * Extract video metadata from YouTube page HTML.
57
+ * Parses ytInitialPlayerResponse JSON embedded in the page.
+ *
+ * @param html - Full HTML of a YouTube watch page.
+ * @returns The extracted metadata. NOTE(review): the return type is not
+ * nullable; what is returned (or thrown) when the page contains no
+ * ytInitialPlayerResponse is not visible from this declaration.
58
+ */
59
+ export declare function extractVideoInfo(html: string): YouTubeVideoInfo;
60
+ /**
61
+ * Fetch and return the transcript for a YouTube video.
62
+ *
63
+ * @param url - Any YouTube URL format
64
+ * @param options.language - Preferred language code (default: "en")
+ * @returns A promise resolving to the transcript. NOTE(review): the
+ * rejection conditions (invalid URL, no captions, network failure) and the
+ * fallback when the preferred language is unavailable are not visible from
+ * this declaration — confirm in the implementation.
65
+ */
66
+ export declare function getYouTubeTranscript(url: string, options?: {
67
+ language?: string;
68
+ }): Promise<YouTubeTranscript>;
69
+ /**
70
+ * Extract the ytInitialPlayerResponse JSON object from page HTML.
+ *
+ * @param html - Full HTML of a YouTube watch page.
+ * @returns The parsed object (loosely typed as `Record<string, any>`), or
+ * null — presumably when the object is absent or cannot be parsed; confirm
+ * in the implementation.
71
+ */
72
+ export declare function extractPlayerResponse(html: string): Record<string, any> | null;
73
+ /**
74
+ * Parse YouTube caption XML into transcript segments.
75
+ *
76
+ * Format: <transcript><text start="0.5" dur="2.1">Hello &amp; world</text>...</transcript>
+ *
+ * @param xml - Caption XML document in the format shown above.
+ * @returns One TranscriptSegment per caption entry. NOTE(review): handling
+ * of malformed XML or entries with missing attributes is not visible from
+ * this declaration.
77
+ */
78
+ export declare function parseCaptionXml(xml: string): TranscriptSegment[];
79
+ /**
80
+ * Decode common HTML entities found in YouTube caption XML.
81
+ *
82
+ * Order of operations:
83
+ * 1. Strip real HTML tags (e.g. <font color="...">) — these appear literally in the XML
84
+ * 2. Decode all HTML entities (including &lt; → < which represents literal angle brackets)
+ *
+ * Tags are stripped before decoding so that angle brackets produced by
+ * decoding &lt; / &gt; are kept as literal text rather than removed as markup.
+ *
+ * @param text - Raw caption text.
+ * @returns The text with tags removed and entities decoded.
85
+ */
86
+ export declare function decodeHtmlEntities(text: string): string;
87
+ /**
88
+ * Format seconds into MM:SS or HH:MM:SS.
+ *
+ * @param seconds - Duration in seconds.
+ * @returns The formatted duration string. NOTE(review): the threshold for
+ * switching to HH:MM:SS and the handling of negative or fractional input are
+ * not visible from this declaration.
89
+ */
90
+ export declare function formatDuration(seconds: number): string;
91
+ //# sourceMappingURL=youtube.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"youtube.d.ts","sourceRoot":"","sources":["../../src/core/youtube.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAQH,MAAM,WAAW,iBAAiB;IAChC,2CAA2C;IAC3C,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,iBAAiB;IAChC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,kDAAkD;IAClD,QAAQ,EAAE,MAAM,CAAC;IACjB,sCAAsC;IACtC,QAAQ,EAAE,MAAM,CAAC;IACjB,mCAAmC;IACnC,QAAQ,EAAE,iBAAiB,EAAE,CAAC;IAC9B,wCAAwC;IACxC,QAAQ,EAAE,MAAM,CAAC;IACjB,8CAA8C;IAC9C,kBAAkB,EAAE,MAAM,EAAE,CAAC;CAC9B;AAED,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,EAAE,MAAM,CAAC;IACpB,kDAAkD;IAClD,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;CACnB;AAMD;;;;;;;;;;GAUG;AACH,wBAAgB,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CA6C1D;AAUD;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,gBAAgB,CAwC/D;AAMD;;;;;GAKG;AACH,wBAAsB,oBAAoB,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAO,GAClC,OAAO,CAAC,iBAAiB,CAAC,CAsD5B;AAaD;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,IAAI,CAgD9E;AAwGD;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,iBAAiB,EAAE,CAqBhE;AAWD;;;;;;GAMG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAgBvD;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAUtD"}