webpeel 0.13.4 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/README.md +120 -162
  2. package/dist/cli-auth.js +7 -7
  3. package/dist/cli-auth.js.map +1 -1
  4. package/dist/cli.js +197 -26
  5. package/dist/cli.js.map +1 -1
  6. package/dist/core/auto-extract.d.ts +83 -0
  7. package/dist/core/auto-extract.d.ts.map +1 -0
  8. package/dist/core/auto-extract.js +565 -0
  9. package/dist/core/auto-extract.js.map +1 -0
  10. package/dist/core/deep-fetch.d.ts +75 -0
  11. package/dist/core/deep-fetch.d.ts.map +1 -0
  12. package/dist/core/deep-fetch.js +406 -0
  13. package/dist/core/deep-fetch.js.map +1 -0
  14. package/dist/core/domain-extractors.d.ts +34 -0
  15. package/dist/core/domain-extractors.d.ts.map +1 -0
  16. package/dist/core/domain-extractors.js +654 -0
  17. package/dist/core/domain-extractors.js.map +1 -0
  18. package/dist/core/markdown.d.ts +8 -0
  19. package/dist/core/markdown.d.ts.map +1 -1
  20. package/dist/core/markdown.js +25 -0
  21. package/dist/core/markdown.js.map +1 -1
  22. package/dist/core/quick-answer.d.ts +28 -0
  23. package/dist/core/quick-answer.d.ts.map +1 -0
  24. package/dist/core/quick-answer.js +288 -0
  25. package/dist/core/quick-answer.js.map +1 -0
  26. package/dist/core/readability.d.ts +58 -0
  27. package/dist/core/readability.d.ts.map +1 -0
  28. package/dist/core/readability.js +496 -0
  29. package/dist/core/readability.js.map +1 -0
  30. package/dist/core/search-provider.d.ts.map +1 -1
  31. package/dist/core/search-provider.js +3 -6
  32. package/dist/core/search-provider.js.map +1 -1
  33. package/dist/core/strategies.d.ts.map +1 -1
  34. package/dist/core/strategies.js +70 -5
  35. package/dist/core/strategies.js.map +1 -1
  36. package/dist/core/watch-manager.d.ts +140 -0
  37. package/dist/core/watch-manager.d.ts.map +1 -0
  38. package/dist/core/watch-manager.js +348 -0
  39. package/dist/core/watch-manager.js.map +1 -0
  40. package/dist/core/youtube.d.ts +91 -0
  41. package/dist/core/youtube.d.ts.map +1 -0
  42. package/dist/core/youtube.js +380 -0
  43. package/dist/core/youtube.js.map +1 -0
  44. package/dist/index.d.ts +4 -0
  45. package/dist/index.d.ts.map +1 -1
  46. package/dist/index.js +103 -0
  47. package/dist/index.js.map +1 -1
  48. package/dist/mcp/server.js +58 -16
  49. package/dist/mcp/server.js.map +1 -1
  50. package/dist/server/app.d.ts.map +1 -1
  51. package/dist/server/app.js +19 -1
  52. package/dist/server/app.js.map +1 -1
  53. package/dist/server/routes/deep-fetch.d.ts +9 -0
  54. package/dist/server/routes/deep-fetch.d.ts.map +1 -0
  55. package/dist/server/routes/deep-fetch.js +38 -0
  56. package/dist/server/routes/deep-fetch.js.map +1 -0
  57. package/dist/server/routes/extract.d.ts.map +1 -1
  58. package/dist/server/routes/extract.js +11 -0
  59. package/dist/server/routes/extract.js.map +1 -1
  60. package/dist/server/routes/fetch.d.ts.map +1 -1
  61. package/dist/server/routes/fetch.js +45 -19
  62. package/dist/server/routes/fetch.js.map +1 -1
  63. package/dist/server/routes/mcp.d.ts +2 -1
  64. package/dist/server/routes/mcp.d.ts.map +1 -1
  65. package/dist/server/routes/mcp.js +307 -38
  66. package/dist/server/routes/mcp.js.map +1 -1
  67. package/dist/server/routes/quick-answer.d.ts +9 -0
  68. package/dist/server/routes/quick-answer.d.ts.map +1 -0
  69. package/dist/server/routes/quick-answer.js +84 -0
  70. package/dist/server/routes/quick-answer.js.map +1 -0
  71. package/dist/server/routes/watch.d.ts +16 -0
  72. package/dist/server/routes/watch.d.ts.map +1 -0
  73. package/dist/server/routes/watch.js +219 -0
  74. package/dist/server/routes/watch.js.map +1 -0
  75. package/dist/server/routes/youtube.d.ts +7 -0
  76. package/dist/server/routes/youtube.d.ts.map +1 -0
  77. package/dist/server/routes/youtube.js +87 -0
  78. package/dist/server/routes/youtube.js.map +1 -0
  79. package/dist/types.d.ts +18 -0
  80. package/dist/types.d.ts.map +1 -1
  81. package/dist/types.js.map +1 -1
  82. package/llms.txt +14 -5
  83. package/package.json +1 -1
@@ -0,0 +1,654 @@
1
+ /**
2
+ * Domain-aware structured extractors for WebPeel.
3
+ *
4
+ * When peel() fetches a URL that matches a known domain, the relevant
5
+ * extractor fires and returns clean structured data + a markdown summary.
6
+ *
7
+ * Supported domains:
8
+ * - twitter.com / x.com — tweets, threads, profiles
9
+ * - reddit.com — posts with comments (via JSON API)
10
+ * - github.com — repos, issues, PRs, users (via GitHub API)
11
+ * - news.ycombinator.com — stories with comments (via HN Firebase API)
12
+ */
13
+ import { simpleFetch } from './fetcher.js';
14
+ // ---------------------------------------------------------------------------
15
+ // Registry
16
+ // ---------------------------------------------------------------------------
17
/**
 * Ordered lookup table: each entry pairs a hostname predicate with the
 * extractor function responsible for that site. Entries are tested in order.
 */
const REGISTRY = [
    {
        match: (h) => ['twitter.com', 'x.com', 'www.twitter.com', 'www.x.com'].includes(h),
        extractor: twitterExtractor,
    },
    {
        match: (h) => ['reddit.com', 'www.reddit.com', 'old.reddit.com'].includes(h),
        extractor: redditExtractor,
    },
    {
        match: (h) => ['github.com', 'www.github.com'].includes(h),
        extractor: githubExtractor,
    },
    {
        match: (h) => h === 'news.ycombinator.com',
        extractor: hackerNewsExtractor,
    },
];
23
/**
 * Look up the structured extractor registered for a URL's host.
 *
 * @param {string} url - Absolute URL to classify.
 * @returns {Function|null} The extractor of the first REGISTRY entry whose
 *   matcher accepts the lowercased hostname; null when nothing matches or the
 *   URL cannot be parsed.
 */
export function getDomainExtractor(url) {
    try {
        const host = new URL(url).hostname.toLowerCase();
        const hit = REGISTRY.find((entry) => entry.match(host));
        return hit ? hit.extractor : null;
    }
    catch {
        // Unparseable URL — there is no host to match against.
        return null;
    }
}
40
/**
 * Run the domain extractor that matches `url`, if there is one.
 *
 * @param {string} html - Page HTML handed through to the extractor.
 * @param {string} url - URL used both to select the extractor and as its input.
 * @returns {Promise<object|null>} Whatever the extractor resolves to, or null
 *   when no extractor matches or the extractor throws/rejects.
 */
export async function extractDomainData(html, url) {
    const run = getDomainExtractor(url);
    if (run) {
        try {
            return await run(html, url);
        }
        catch {
            // Extraction is best-effort: a failing extractor yields null below.
        }
    }
    return null;
}
55
+ // ---------------------------------------------------------------------------
56
+ // Helpers
57
+ // ---------------------------------------------------------------------------
58
/** Parse JSON text, yielding null instead of throwing when parsing fails. */
function tryParseJson(text) {
    let parsed = null;
    try {
        parsed = JSON.parse(text);
    }
    catch {
        // Malformed input — keep the null default.
    }
    return parsed;
}
67
/**
 * Strip HTML tags from a string and decode common character references.
 *
 * Tags are removed first, then `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&nbsp;` and
 * numeric references (`&#39;`, `&#x27;`, ...) are decoded in a single pass.
 * Decoding in one pass matters: replacing `&amp;` before the other entities
 * would turn an escaped `&amp;lt;` into `<` instead of the literal `&lt;`.
 *
 * @param {string} str - HTML fragment.
 * @returns {string} Plain text with surrounding whitespace trimmed.
 */
function stripHtml(str) {
    const named = { amp: '&', lt: '<', gt: '>', quot: '"', nbsp: ' ' };
    return str
        .replace(/<[^>]+>/g, '')
        .replace(/&(amp|lt|gt|quot|nbsp|#\d+|#[xX][0-9a-fA-F]+);/g, (whole, ref) => {
            if (ref[0] !== '#')
                return named[ref];
            const isHex = ref[1] === 'x' || ref[1] === 'X';
            const code = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
            // Leave references that do not name a usable code point untouched.
            if (code === 0 || code > 0x10ffff || (code >= 0xd800 && code <= 0xdfff))
                return whole;
            return String.fromCodePoint(code);
        })
        .trim();
}
71
/**
 * Convert a Unix timestamp expressed in seconds into an ISO 8601 string.
 * Throws RangeError (from Date#toISOString) when `sec` is not a valid time.
 */
function unixToIso(sec) {
    const millis = sec * 1000;
    const moment = new Date(millis);
    return moment.toISOString();
}
75
/**
 * Fetch a URL through simpleFetch and parse the response body as JSON.
 *
 * @param {string} url - Endpoint to request.
 * @param {object} [customHeaders] - Extra headers; they override the default
 *   `Accept: application/json` when they define the same key.
 * @returns {Promise<*>} The parsed JSON value, or null if the body is not valid JSON.
 */
async function fetchJson(url, customHeaders) {
    const headers = { Accept: 'application/json', ...customHeaders };
    // Third argument is presumably a timeout in milliseconds — confirm against simpleFetch.
    const { html } = await simpleFetch(url, undefined, 15000, headers);
    return tryParseJson(html);
}
83
+ // ---------------------------------------------------------------------------
84
+ // 1. Twitter / X extractor
85
+ // ---------------------------------------------------------------------------
86
/**
 * Search a nested object/array for the first node satisfying `predicate`.
 *
 * The walk is depth-first in pre-order: a node is tested before its children,
 * and children are visited in Object.values() order. Only non-null objects
 * are tested, and nodes deeper than 12 levels are ignored.
 *
 * @param {*} obj - Root value to search.
 * @param {(node: object) => *} predicate - Truthy result selects the node.
 * @param {number} [depth=0] - Depth assigned to `obj`.
 * @returns {object|null} The first matching node, or null.
 */
function deepFind(obj, predicate, depth = 0) {
    // Explicit stack; children are pushed in reverse so the first child is
    // popped (and fully explored) before its siblings.
    const pending = [[obj, depth]];
    while (pending.length > 0) {
        const [node, level] = pending.pop();
        if (level > 12 || node === null || typeof node !== 'object')
            continue;
        if (predicate(node))
            return node;
        const children = Object.values(node);
        for (let i = children.length - 1; i >= 0; i--) {
            pending.push([children[i], level + 1]);
        }
    }
    return null;
}
99
/**
 * Locate the first node in the __NEXT_DATA__ tree that carries
 * `tweet_results.result.legacy.full_text` and parse that tweet.
 *
 * @param {object} nextData - Parsed __NEXT_DATA__ JSON.
 * @returns {object|null} Result of parseTweetResult, or null when no such node exists.
 */
function parseTweetFromNextData(nextData) {
    const carriesTweetText = (node) => node?.tweet_results?.result?.legacy?.full_text !== undefined;
    const holder = deepFind(nextData, carriesTweetText);
    return holder ? parseTweetResult(holder.tweet_results.result) : null;
}
108
/**
 * Convert a raw tweet `result` node into the flat structure used by the
 * Twitter extractor.
 *
 * @param {object} result - Node holding `legacy` (tweet fields) and optionally
 *   `core.user_results`, `views` and `quoted_status_result`.
 * @returns {{author: object, text: string, timestamp: (string|undefined),
 *   metrics: object, media: string[], quotedTweet: (object|null)}|null}
 *   Parsed tweet, or null when `result.legacy` is missing.
 */
function parseTweetResult(result) {
    const legacy = result?.legacy;
    if (!legacy)
        return null;
    // created_at comes straight from upstream JSON. An unparseable value would
    // make toISOString() throw and discard the whole tweet, so map it to
    // undefined instead.
    const toIso = (raw) => {
        if (!raw)
            return undefined;
        const parsed = new Date(raw);
        return Number.isNaN(parsed.getTime()) ? undefined : parsed.toISOString();
    };
    const userLegacy = result?.core?.user_results?.result?.legacy ||
        result?.user_results?.result?.legacy;
    const author = {
        name: userLegacy?.name || '',
        handle: '@' + (userLegacy?.screen_name || ''),
        verified: userLegacy?.verified || result?.core?.user_results?.result?.is_blue_verified || false,
    };
    // A non-numeric view count must not leak NaN into the metrics.
    const viewCount = Number(result?.views?.count ?? 0);
    const metrics = {
        likes: legacy.favorite_count ?? 0,
        retweets: legacy.retweet_count ?? 0,
        replies: legacy.reply_count ?? 0,
        views: Number.isFinite(viewCount) ? viewCount : 0,
    };
    // Media
    const mediaEntities = legacy.extended_entities?.media || legacy.entities?.media || [];
    const mediaItems = [];
    for (const m of mediaEntities) {
        if (m.media_url_https)
            mediaItems.push(m.media_url_https);
    }
    // Quoted tweet
    let quotedTweet = null;
    if (result.quoted_status_result) {
        const qLegacy = result.quoted_status_result?.result?.legacy;
        const qUserLegacy = result.quoted_status_result?.result?.core?.user_results?.result?.legacy;
        if (qLegacy) {
            quotedTweet = {
                text: qLegacy.full_text || qLegacy.text || '',
                author: {
                    name: qUserLegacy?.name || '',
                    handle: '@' + (qUserLegacy?.screen_name || ''),
                },
                timestamp: toIso(qLegacy.created_at),
            };
        }
    }
    return {
        author,
        text: legacy.full_text || legacy.text || '',
        timestamp: toIso(legacy.created_at),
        metrics,
        media: mediaItems,
        quotedTweet,
    };
}
157
/**
 * Extract a tweet or a profile from a Twitter / X page.
 *
 * A URL whose path contains a `status` segment is treated as a tweet; every
 * other path is treated as a profile. Data is read from the embedded
 * `__NEXT_DATA__` JSON when present; for tweets only, the `og:title` and
 * `og:description` meta tags serve as a fallback.
 *
 * @param {string} html - Raw page HTML.
 * @param {string} url - Absolute page URL (`new URL` throws otherwise).
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 *   Extraction result, or null when nothing usable was found.
 */
async function twitterExtractor(html, url) {
    const urlObj = new URL(url);
    const pathParts = urlObj.pathname.split('/').filter(Boolean);
    const isTweet = pathParts.includes('status');
    const type = isTweet ? 'tweet' : 'profile';
    const domain = 'twitter.com';
    // An unparseable created_at must not throw from toISOString().
    const toIso = (raw) => {
        if (!raw)
            return undefined;
        const parsed = new Date(raw);
        return Number.isNaN(parsed.getTime()) ? undefined : parsed.toISOString();
    };
    // --- Try __NEXT_DATA__ JSON (SSR data) ---
    const nextDataMatch = html.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
    let structured = null;
    if (nextDataMatch) {
        const nextData = tryParseJson(nextDataMatch[1]);
        if (nextData) {
            if (isTweet) {
                const tweetData = parseTweetFromNextData(nextData);
                if (tweetData) {
                    structured = tweetData;
                }
            }
            else {
                // Profile page — extract user info
                const userResult = deepFind(nextData, (v) => v?.user_results?.result?.legacy?.screen_name);
                if (userResult) {
                    const uLegacy = userResult.user_results.result.legacy;
                    structured = {
                        name: uLegacy.name || '',
                        handle: '@' + (uLegacy.screen_name || ''),
                        bio: uLegacy.description || '',
                        followers: uLegacy.followers_count ?? 0,
                        following: uLegacy.friends_count ?? 0,
                        tweets: uLegacy.statuses_count ?? 0,
                        verified: userResult.user_results.result.is_blue_verified || uLegacy.verified || false,
                        location: uLegacy.location || '',
                        created: toIso(uLegacy.created_at),
                    };
                }
            }
        }
    }
    // --- Fallback: parse DOM for tweet text if __NEXT_DATA__ parsing failed ---
    if (!structured && isTweet) {
        // Try to extract from og: tags or article body
        const ogDescMatch = html.match(/<meta[^>]+property="og:description"[^>]+content="([^"]+)"/i);
        const ogTitleMatch = html.match(/<meta[^>]+property="og:title"[^>]+content="([^"]+)"/i);
        if (ogDescMatch || ogTitleMatch) {
            const authorRaw = (ogTitleMatch?.[1] || '').replace(' on X', '').replace(' on Twitter', '').trim();
            let text = '';
            if (ogDescMatch?.[1]) {
                const unescaped = ogDescMatch[1].replace(/&#39;/g, "'").replace(/&amp;/g, '&');
                try {
                    text = decodeURIComponent(unescaped);
                }
                catch {
                    // The description is ordinary text: a literal "%" (e.g. "100% sure")
                    // is not a valid escape sequence, so keep the text undecoded.
                    text = unescaped;
                }
            }
            structured = {
                author: { name: authorRaw, handle: '', verified: false },
                text: stripHtml(text),
                timestamp: undefined,
                metrics: { likes: 0, retweets: 0, replies: 0, views: 0 },
                media: [],
                quotedTweet: null,
            };
        }
    }
    if (!structured)
        return null;
    // Build clean markdown
    let cleanContent;
    if (type === 'tweet') {
        const s = structured;
        const authorLine = s.author?.handle
            ? `**${s.author.name}** (${s.author.handle})`
            : `**${s.author?.name || 'Unknown'}**`;
        const timeLine = s.timestamp ? `\n*${s.timestamp}*` : '';
        const metricsLine = s.metrics
            ? `\n\nšŸ’¬ ${s.metrics.replies} šŸ” ${s.metrics.retweets} ā¤ļø ${s.metrics.likes}${s.metrics.views ? ` šŸ‘ ${s.metrics.views}` : ''}`
            : '';
        const mediaLine = s.media?.length ? `\n\nšŸ“· Media: ${s.media.join(', ')}` : '';
        const quotedLine = s.quotedTweet
            ? `\n\n> **Quoted tweet by ${s.quotedTweet.author?.name || 'unknown'}:** ${s.quotedTweet.text}`
            : '';
        const threadLine = s.thread?.length ? '\n\n**Thread:**\n' + s.thread.map((t, i) => `${i + 2}. ${t.text}`).join('\n') : '';
        cleanContent = `## 🐦 Tweet by ${authorLine}${timeLine}\n\n${s.text}${quotedLine}${threadLine}${metricsLine}${mediaLine}`;
    }
    else {
        const s = structured;
        cleanContent = `## 🐦 @${(s.handle || '').replace('@', '')} on X/Twitter\n\n**${s.name}**\n${s.bio || ''}\n\nšŸ“ ${s.location || 'N/A'} | šŸ‘„ ${s.followers?.toLocaleString() || 0} followers | Following: ${s.following?.toLocaleString() || 0} | Tweets: ${s.tweets?.toLocaleString() || 0}`;
    }
    return { domain, type, structured, cleanContent };
}
239
/**
 * Convert one node of Reddit's comment listing into a compact comment object.
 *
 * @param {object} data - Either a `{kind: 't1', data}` wrapper or the bare
 *   comment payload. `kind: 'more'` placeholders are rejected.
 * @param {number} depth - How many further reply levels to descend into.
 * @returns {{author: string, text: string, score: number, replies: object[]}|null}
 *   Parsed comment with at most 3 replies per level (highest score first),
 *   or null when the node has no body.
 */
function parseRedditComment(data, depth) {
    if (!data || data.kind === 'more')
        return null;
    const payload = data.kind === 't1' ? data.data : data;
    if (!payload || !payload.body)
        return null;
    const childNodes = (depth > 0 && payload.replies && payload.replies.data?.children) || [];
    const replies = [...childNodes]
        .map((node) => parseRedditComment(node, depth - 1))
        .filter(Boolean)
        .sort((a, b) => b.score - a.score)
        .slice(0, 3); // max 3 replies per level
    return {
        author: `u/${payload.author || '[deleted]'}`,
        text: payload.body || '',
        score: payload.score || 0,
        replies,
    };
}
263
/**
 * Extract a Reddit post (with top comments) or a subreddit listing by
 * requesting the page's `.json` representation.
 *
 * Post URLs (`/r/<sub>/comments/...`) and subreddit roots (`/r/<sub>`) are
 * handled; user pages and all other paths return null so the caller can fall
 * back to ordinary HTML extraction.
 *
 * @param {string} _html - Unused; all data comes from the JSON endpoint.
 * @param {string} url - Absolute Reddit URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 */
async function redditExtractor(_html, url) {
    const urlObj = new URL(url);
    const path = urlObj.pathname;
    const domain = 'reddit.com';
    // Build the endpoint from origin + pathname: cutting the raw URL at "?"
    // would leave a "#fragment" in front of the ".json" suffix and break the request.
    const jsonBase = `${urlObj.origin}${path}`.replace(/\/?$/, '');
    // Detect page type
    const isPost = /\/r\/[^/]+\/comments\//.test(path);
    const isSubreddit = /^\/r\/[^/]+\/?$/.test(path);
    const isUser = /^\/(u|user)\/[^/]+/.test(path);
    const type = isPost ? 'post' : isSubreddit ? 'subreddit' : isUser ? 'user' : 'listing';
    if (isPost) {
        // Fetch post data via Reddit JSON API
        const jsonUrl = jsonBase + '.json?limit=25&sort=top';
        const data = await fetchJson(jsonUrl, { 'User-Agent': 'WebPeel/1.0' });
        if (!Array.isArray(data) || data.length < 2)
            return null;
        const postData = data[0]?.data?.children?.[0]?.data;
        if (!postData)
            return null;
        // Parse top comments (max 20)
        const commentChildren = data[1]?.data?.children || [];
        const comments = [];
        for (const child of commentChildren) {
            const c = parseRedditComment(child, 3);
            if (c)
                comments.push(c);
            if (comments.length >= 20)
                break;
        }
        comments.sort((a, b) => b.score - a.score);
        const structured = {
            subreddit: `r/${postData.subreddit}`,
            title: postData.title || '',
            author: `u/${postData.author || '[deleted]'}`,
            score: postData.score ?? 0,
            upvoteRatio: postData.upvote_ratio ?? 1,
            url: postData.url || url,
            selftext: postData.selftext || '',
            commentCount: postData.num_comments ?? 0,
            created: unixToIso(postData.created_utc),
            flair: postData.link_flair_text || null,
            comments,
        };
        // Build clean markdown
        const commentsMd = comments.slice(0, 10).map(c => {
            const repliesMd = c.replies.slice(0, 2).map(r => ` > **${r.author}** (${r.score}): ${r.text.slice(0, 200)}`).join('\n');
            return `**${c.author}** (score: ${c.score})\n${c.text.slice(0, 300)}${repliesMd ? '\n' + repliesMd : ''}`;
        }).join('\n\n---\n\n');
        const selftextSection = structured.selftext
            ? `\n\n${structured.selftext.slice(0, 1000)}`
            : '';
        const cleanContent = `## šŸ“‹ ${structured.subreddit}: ${structured.title}

**Posted by** ${structured.author} | Score: ${structured.score} (${Math.round(structured.upvoteRatio * 100)}% upvoted) | ${structured.commentCount} comments
${structured.flair ? `**Flair:** ${structured.flair}` : ''}
*${structured.created}*${selftextSection}

---

### Top Comments

${commentsMd || '*No comments found.*'}`;
        return { domain, type, structured, cleanContent };
    }
    if (isSubreddit) {
        // Fetch subreddit listing
        const jsonUrl = jsonBase + '.json?limit=15';
        const data = await fetchJson(jsonUrl, { 'User-Agent': 'WebPeel/1.0' });
        if (!data?.data?.children)
            return null;
        const posts = data.data.children
            .filter((c) => c.kind === 't3')
            .map((c) => {
            const d = c.data;
            return {
                title: d.title || '',
                author: `u/${d.author || '[deleted]'}`,
                score: d.score ?? 0,
                commentCount: d.num_comments ?? 0,
                url: `https://reddit.com${d.permalink}`,
                flair: d.link_flair_text || null,
            };
        });
        const subredditName = posts[0]?.url?.match(/\/r\/([^/]+)\//)?.[1] || path.match(/\/r\/([^/]+)/)?.[1] || '';
        const structured = { subreddit: `r/${subredditName}`, posts };
        const cleanContent = `## šŸ“‹ r/${subredditName} — Hot Posts

${posts.map((p, i) => `${i + 1}. **${p.title}**\n ${p.author} | ↑ ${p.score} | šŸ’¬ ${p.commentCount}${p.flair ? ` | ${p.flair}` : ''}\n ${p.url}`).join('\n\n')}`;
        return { domain, type, structured, cleanContent };
    }
    // User or other — fall back to null (let normal HTML extraction handle it)
    return null;
}
355
+ // ---------------------------------------------------------------------------
356
+ // 3. GitHub extractor
357
+ // ---------------------------------------------------------------------------
358
/**
 * Extract structured data for a GitHub URL through the GitHub REST API.
 *
 * Dispatch is by path shape:
 *   - `/user`                      → user profile
 *   - `/owner/repo/issues/<n>`     → issue with up to 20 comments
 *   - `/owner/repo/pull/<n>`       → pull request with up to 20 comments
 *   - any other `/owner/repo/...`  → repository summary with a README excerpt
 *
 * @param {string} _html - Unused; all data comes from api.github.com.
 * @param {string} url - Absolute GitHub URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 *   Extraction result, or null for the site root and for API error responses.
 */
async function githubExtractor(_html, url) {
    const urlObj = new URL(url);
    const pathParts = urlObj.pathname.split('/').filter(Boolean);
    const domain = 'github.com';
    if (pathParts.length === 0)
        return null;
    const ghHeaders = { Accept: 'application/vnd.github.v3+json' };
    // GitHub reports failures (not found, rate limiting, ...) as an object with
    // a string `message`. Checking only for 'Not Found' would render any other
    // error payload as a result full of `undefined` fields.
    const isApiError = (payload) => !payload || typeof payload.message === 'string';
    // User profile: /username (single segment)
    if (pathParts.length === 1) {
        const username = pathParts[0];
        const userData = await fetchJson(`https://api.github.com/users/${username}`, ghHeaders);
        if (isApiError(userData))
            return null;
        const structured = {
            login: userData.login,
            name: userData.name || userData.login,
            bio: userData.bio || '',
            company: userData.company || null,
            location: userData.location || null,
            blog: userData.blog || null,
            followers: userData.followers ?? 0,
            following: userData.following ?? 0,
            publicRepos: userData.public_repos ?? 0,
            created: userData.created_at,
            avatarUrl: userData.avatar_url,
        };
        const cleanContent = `## šŸ‘¤ GitHub: ${structured.name} (@${structured.login})

${structured.bio ? structured.bio + '\n\n' : ''}šŸ“ ${structured.location || 'N/A'} | šŸ’¼ ${structured.company || 'N/A'} | 🌐 ${structured.blog || 'N/A'}
šŸ‘„ ${structured.followers} followers | Following: ${structured.following} | šŸ“¦ ${structured.publicRepos} public repos`;
        return { domain, type: 'user', structured, cleanContent };
    }
    const owner = pathParts[0];
    const repo = pathParts[1];
    // Issue: /owner/repo/issues/123
    if (pathParts[2] === 'issues' && pathParts[3]) {
        const issueNumber = pathParts[3];
        const [issueData, commentsData] = await Promise.all([
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/issues/${issueNumber}`, ghHeaders),
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/issues/${issueNumber}/comments?per_page=20`, ghHeaders),
        ]);
        if (isApiError(issueData))
            return null;
        const comments = Array.isArray(commentsData)
            ? commentsData.map((c) => ({
                author: c.user?.login || 'ghost',
                text: c.body || '',
                created: c.created_at,
            }))
            : [];
        const structured = {
            repo: `${owner}/${repo}`,
            number: issueData.number,
            title: issueData.title || '',
            author: issueData.user?.login || 'ghost',
            state: issueData.state,
            body: issueData.body || '',
            labels: (issueData.labels || []).map((l) => l.name),
            created: issueData.created_at,
            updated: issueData.updated_at,
            commentCount: issueData.comments ?? 0,
            comments,
        };
        const labelStr = structured.labels.length ? structured.labels.join(', ') : 'none';
        const commentsMd = comments.slice(0, 10).map((c) => `**@${c.author}** (${c.created}):\n${c.text.slice(0, 300)}`).join('\n\n---\n\n');
        const cleanContent = `## šŸ› Issue #${structured.number}: ${structured.title}

**Repo:** ${structured.repo} | **State:** ${structured.state} | **Author:** @${structured.author}
**Labels:** ${labelStr} | **Created:** ${structured.created}

${structured.body.slice(0, 800)}

---

### Comments (${structured.commentCount})

${commentsMd || '*No comments.*'}`;
        return { domain, type: 'issue', structured, cleanContent };
    }
    // Pull request: /owner/repo/pull/123
    if (pathParts[2] === 'pull' && pathParts[3]) {
        const prNumber = pathParts[3];
        const [prData, commentsData] = await Promise.all([
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/pulls/${prNumber}`, ghHeaders),
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/issues/${prNumber}/comments?per_page=20`, ghHeaders),
        ]);
        if (isApiError(prData))
            return null;
        const comments = Array.isArray(commentsData)
            ? commentsData.map((c) => ({
                author: c.user?.login || 'ghost',
                text: c.body || '',
                created: c.created_at,
            }))
            : [];
        const structured = {
            repo: `${owner}/${repo}`,
            number: prData.number,
            title: prData.title || '',
            author: prData.user?.login || 'ghost',
            state: prData.state,
            merged: prData.merged ?? false,
            body: prData.body || '',
            labels: (prData.labels || []).map((l) => l.name),
            created: prData.created_at,
            updated: prData.updated_at,
            commentCount: prData.comments ?? 0,
            additions: prData.additions ?? 0,
            deletions: prData.deletions ?? 0,
            changedFiles: prData.changed_files ?? 0,
            headBranch: prData.head?.label || '',
            baseBranch: prData.base?.label || '',
            comments,
        };
        const labelStr = structured.labels.length ? structured.labels.join(', ') : 'none';
        const commentsMd = comments.slice(0, 8).map((c) => `**@${c.author}** (${c.created}):\n${c.text.slice(0, 300)}`).join('\n\n---\n\n');
        const cleanContent = `## šŸ”€ PR #${structured.number}: ${structured.title}

**Repo:** ${structured.repo} | **State:** ${structured.state}${structured.merged ? ' (merged)' : ''} | **Author:** @${structured.author}
**Labels:** ${labelStr} | **${structured.headBranch} → ${structured.baseBranch}**
**Changes:** +${structured.additions} / -${structured.deletions} across ${structured.changedFiles} files

${structured.body.slice(0, 800)}

---

### Comments (${structured.commentCount})

${commentsMd || '*No comments.*'}`;
        return { domain, type: 'pull_request', structured, cleanContent };
    }
    // Repository page: /owner/repo (and no deeper path we handle above)
    if (pathParts.length >= 2) {
        const [repoData, readmeData] = await Promise.all([
            fetchJson(`https://api.github.com/repos/${owner}/${repo}`, ghHeaders),
            // The README is optional: a failed request must not fail the repo lookup.
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/readme`, ghHeaders).catch(() => null),
        ]);
        if (isApiError(repoData))
            return null;
        // README content is base64 encoded
        let readmeText = '';
        if (readmeData?.content) {
            try {
                readmeText = Buffer.from(readmeData.content, 'base64').toString('utf-8').slice(0, 500);
            }
            catch { /* ignore */ }
        }
        const structured = {
            name: `${owner}/${repo}`,
            description: repoData.description || '',
            stars: repoData.stargazers_count ?? 0,
            forks: repoData.forks_count ?? 0,
            language: repoData.language || null,
            topics: repoData.topics || [],
            license: repoData.license?.spdx_id || null,
            openIssues: repoData.open_issues_count ?? 0,
            lastPush: repoData.pushed_at,
            createdAt: repoData.created_at,
            defaultBranch: repoData.default_branch || 'main',
            homepage: repoData.homepage || null,
            archived: repoData.archived || false,
            fork: repoData.fork || false,
            readme: readmeText,
        };
        const topicsStr = structured.topics.length ? structured.topics.join(', ') : 'none';
        const cleanContent = `## šŸ“¦ Repository: ${structured.name}

${structured.description || '*No description.*'}

⭐ ${structured.stars.toLocaleString()} stars | šŸ“ ${structured.forks.toLocaleString()} forks | šŸ’» ${structured.language || 'N/A'} | šŸ“œ ${structured.license || 'N/A'}
šŸ·ļø Topics: ${topicsStr}
šŸ”— ${structured.homepage || 'No homepage'} | Last push: ${structured.lastPush}${structured.archived ? '\nāš ļø **ARCHIVED**' : ''}

${structured.readme ? `### README (excerpt)\n\n${structured.readme}` : ''}`;
        return { domain, type: 'repository', structured, cleanContent };
    }
    return null;
}
536
/**
 * Fetch one Hacker News comment and, recursively, up to 5 replies per level.
 *
 * @param {number|string} id - Item id, interpolated into the Firebase item URL.
 * @param {number} depth - Remaining reply levels; a negative value yields null.
 * @returns {Promise<{author: string, text: string, time: string, replies: object[]}|null>}
 *   The comment, or null when the item is missing, deleted, dead, has no text
 *   after tag stripping, or anything throws while fetching/converting it.
 */
async function fetchHNComment(id, depth) {
    if (depth < 0)
        return null;
    try {
        const item = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${id}.json`);
        const unusable = !item || item.deleted || item.dead;
        const text = unusable ? '' : stripHtml(item.text || '');
        if (!text)
            return null;
        const descend = depth > 0 && Array.isArray(item.kids) && item.kids.length > 0;
        const fetched = descend
            ? await Promise.all(item.kids.slice(0, 5).map((kid) => fetchHNComment(kid, depth - 1)))
            : [];
        const replies = fetched.filter(Boolean);
        return {
            author: item.by || '[deleted]',
            text,
            time: unixToIso(item.time),
            replies,
        };
    }
    catch {
        // Best-effort: a comment that cannot be loaded is simply omitted.
        return null;
    }
}
562
/**
 * Extract a Hacker News item, the front page, or a user profile through the
 * HN Firebase API.
 *
 * Dispatch:
 *   - `/item?id=<n>` (or `/?id=<n>`) → item with up to 15 top-level comments
 *   - `/` or `/news`                 → first 30 ids of `topstories.json`
 *   - `/user?id=<name>`              → user profile
 *
 * @param {string} _html - Unused; all data comes from the API.
 * @param {string} url - Absolute news.ycombinator.com URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 *   Extraction result, or null for unsupported paths and missing items.
 */
async function hackerNewsExtractor(_html, url) {
    const urlObj = new URL(url);
    const path = urlObj.pathname;
    const domain = 'news.ycombinator.com';
    // Story: ?id=12345 or /item?id=12345
    const itemId = urlObj.searchParams.get('id');
    if (itemId && (path === '/' || path === '/item' || path === '')) {
        const storyData = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${itemId}.json`);
        if (!storyData)
            return null;
        // The API's `type` field does not distinguish Ask HN / Show HN posts
        // (they are plain stories), so those are recognised by title prefix.
        // The 'ask' / 'show' type values are still honoured for compatibility.
        const rawTitle = storyData.title || '';
        let type = 'story';
        if (storyData.type === 'job')
            type = 'job';
        else if (storyData.type === 'ask' || /^Ask HN\b/i.test(rawTitle))
            type = 'ask_hn';
        else if (storyData.type === 'show' || /^Show HN\b/i.test(rawTitle))
            type = 'show_hn';
        // Fetch top 15 comments (top-level), 2 levels deep
        const commentIds = Array.isArray(storyData.kids) ? storyData.kids.slice(0, 15) : [];
        const commentResults = await Promise.all(commentIds.map((id) => fetchHNComment(id, 2)));
        const comments = commentResults.filter(Boolean);
        const structured = {
            id: storyData.id,
            title: storyData.title || '',
            author: storyData.by || '[deleted]',
            score: storyData.score ?? 0,
            url: storyData.url || `https://news.ycombinator.com/item?id=${storyData.id}`,
            commentCount: storyData.descendants ?? 0,
            created: unixToIso(storyData.time),
            text: storyData.text ? stripHtml(storyData.text) : null,
            comments,
        };
        const commentsMd = comments.slice(0, 10).map(c => {
            const repliesMd = c.replies.slice(0, 3).map(r => ` > **${r.author}**: ${r.text.slice(0, 200)}`).join('\n');
            return `**${c.author}** (${c.time})\n${c.text.slice(0, 300)}${repliesMd ? '\n' + repliesMd : ''}`;
        }).join('\n\n---\n\n');
        const bodySection = structured.text ? `\n\n${structured.text.slice(0, 500)}` : '';
        const cleanContent = `## 🟠 Hacker News: ${structured.title}

**Author:** ${structured.author} | **Score:** ${structured.score} | **Comments:** ${structured.commentCount}
**Posted:** ${structured.created}
${structured.url !== `https://news.ycombinator.com/item?id=${structured.id}` ? `**Link:** ${structured.url}` : ''}${bodySection}

---

### Top Comments

${commentsMd || '*No comments found.*'}`;
        return { domain, type, structured, cleanContent };
    }
    // Front page / /news — fetch top stories
    if (path === '/' || path === '/news' || path === '') {
        const topIds = await fetchJson('https://hacker-news.firebaseio.com/v0/topstories.json');
        if (!Array.isArray(topIds))
            return null;
        const top30 = topIds.slice(0, 30);
        // A story that fails to load is dropped rather than failing the whole page.
        const storyResults = await Promise.all(top30.map((id) => fetchJson(`https://hacker-news.firebaseio.com/v0/item/${id}.json`).catch(() => null)));
        const stories = storyResults
            .filter((s) => s && s.title)
            .map((s) => ({
            id: s.id,
            title: s.title,
            author: s.by || '[deleted]',
            score: s.score ?? 0,
            commentCount: s.descendants ?? 0,
            url: s.url || `https://news.ycombinator.com/item?id=${s.id}`,
            hnUrl: `https://news.ycombinator.com/item?id=${s.id}`,
        }));
        const structured = { stories };
        const cleanContent = `## 🟠 Hacker News — Front Page

${stories.map((s, i) => `${i + 1}. **${s.title}**\n ↑ ${s.score} | šŸ’¬ ${s.commentCount} | by ${s.author}\n ${s.url}`).join('\n\n')}`;
        return { domain, type: 'frontpage', structured, cleanContent };
    }
    // User page: ?id=username
    const userId = urlObj.searchParams.get('id');
    if (path === '/user' && userId) {
        const userData = await fetchJson(`https://hacker-news.firebaseio.com/v0/user/${userId}.json`);
        if (!userData)
            return null;
        const structured = {
            id: userData.id,
            karma: userData.karma ?? 0,
            about: userData.about ? stripHtml(userData.about) : '',
            created: unixToIso(userData.created),
            submitted: (userData.submitted || []).length,
        };
        const cleanContent = `## 🟠 HN User: ${structured.id}

**Karma:** ${structured.karma} | **Member since:** ${structured.created}
${structured.about ? '\n' + structured.about : ''}`;
        return { domain, type: 'user', structured, cleanContent };
    }
    return null;
}
654
+ //# sourceMappingURL=domain-extractors.js.map