webpeel 0.13.4 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -162
- package/dist/cli-auth.js +7 -7
- package/dist/cli-auth.js.map +1 -1
- package/dist/cli.js +197 -26
- package/dist/cli.js.map +1 -1
- package/dist/core/auto-extract.d.ts +83 -0
- package/dist/core/auto-extract.d.ts.map +1 -0
- package/dist/core/auto-extract.js +565 -0
- package/dist/core/auto-extract.js.map +1 -0
- package/dist/core/deep-fetch.d.ts +75 -0
- package/dist/core/deep-fetch.d.ts.map +1 -0
- package/dist/core/deep-fetch.js +406 -0
- package/dist/core/deep-fetch.js.map +1 -0
- package/dist/core/domain-extractors.d.ts +34 -0
- package/dist/core/domain-extractors.d.ts.map +1 -0
- package/dist/core/domain-extractors.js +654 -0
- package/dist/core/domain-extractors.js.map +1 -0
- package/dist/core/markdown.d.ts +8 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +25 -0
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/quick-answer.d.ts +28 -0
- package/dist/core/quick-answer.d.ts.map +1 -0
- package/dist/core/quick-answer.js +288 -0
- package/dist/core/quick-answer.js.map +1 -0
- package/dist/core/readability.d.ts +58 -0
- package/dist/core/readability.d.ts.map +1 -0
- package/dist/core/readability.js +496 -0
- package/dist/core/readability.js.map +1 -0
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +3 -6
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +70 -5
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/watch-manager.d.ts +140 -0
- package/dist/core/watch-manager.d.ts.map +1 -0
- package/dist/core/watch-manager.js +348 -0
- package/dist/core/watch-manager.js.map +1 -0
- package/dist/core/youtube.d.ts +91 -0
- package/dist/core/youtube.d.ts.map +1 -0
- package/dist/core/youtube.js +380 -0
- package/dist/core/youtube.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +103 -0
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +58 -16
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +19 -1
- package/dist/server/app.js.map +1 -1
- package/dist/server/routes/deep-fetch.d.ts +9 -0
- package/dist/server/routes/deep-fetch.d.ts.map +1 -0
- package/dist/server/routes/deep-fetch.js +38 -0
- package/dist/server/routes/deep-fetch.js.map +1 -0
- package/dist/server/routes/extract.d.ts.map +1 -1
- package/dist/server/routes/extract.js +11 -0
- package/dist/server/routes/extract.js.map +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +45 -19
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/mcp.d.ts +2 -1
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +307 -38
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/quick-answer.d.ts +9 -0
- package/dist/server/routes/quick-answer.d.ts.map +1 -0
- package/dist/server/routes/quick-answer.js +84 -0
- package/dist/server/routes/quick-answer.js.map +1 -0
- package/dist/server/routes/watch.d.ts +16 -0
- package/dist/server/routes/watch.d.ts.map +1 -0
- package/dist/server/routes/watch.js +219 -0
- package/dist/server/routes/watch.js.map +1 -0
- package/dist/server/routes/youtube.d.ts +7 -0
- package/dist/server/routes/youtube.d.ts.map +1 -0
- package/dist/server/routes/youtube.js +87 -0
- package/dist/server/routes/youtube.js.map +1 -0
- package/dist/types.d.ts +18 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +14 -5
- package/package.json +1 -1
|
@@ -0,0 +1,654 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Domain-aware structured extractors for WebPeel.
|
|
3
|
+
*
|
|
4
|
+
* When peel() fetches a URL that matches a known domain, the relevant
|
|
5
|
+
* extractor fires and returns clean structured data + a markdown summary.
|
|
6
|
+
*
|
|
7
|
+
* Supported domains:
|
|
8
|
+
 * - twitter.com / x.com → tweets, threads, profiles
|
|
9
|
+
 * - reddit.com → posts with comments (via JSON API)
|
|
10
|
+
 * - github.com → repos, issues, PRs, users (via GitHub API)
|
|
11
|
+
 * - news.ycombinator.com → stories with comments (via HN Firebase API)
|
|
12
|
+
*/
|
|
13
|
+
import { simpleFetch } from './fetcher.js';
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Registry
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
/**
 * Ordered table mapping hostnames to extractors.
 * Each entry exposes `match(host)` (host is expected to be lowercase) and the
 * `extractor` to run; the first entry whose `match` returns true wins.
 */
const REGISTRY = [
    {
        match: (h) => ['twitter.com', 'x.com', 'www.twitter.com', 'www.x.com'].includes(h),
        extractor: twitterExtractor,
    },
    {
        match: (h) => ['reddit.com', 'www.reddit.com', 'old.reddit.com'].includes(h),
        extractor: redditExtractor,
    },
    {
        match: (h) => ['github.com', 'www.github.com'].includes(h),
        extractor: githubExtractor,
    },
    {
        match: (h) => ['news.ycombinator.com'].includes(h),
        extractor: hackerNewsExtractor,
    },
];
|
|
23
|
+
/**
 * Look up the domain extractor responsible for a URL.
 *
 * @param {string} url - Absolute URL whose hostname selects the extractor.
 * @returns {Function|null} The extractor of the first matching REGISTRY entry,
 *   or null when the URL cannot be parsed or no entry matches.
 */
export function getDomainExtractor(url) {
    try {
        const host = new URL(url).hostname.toLowerCase();
        const entry = REGISTRY.find((candidate) => candidate.match(host));
        if (entry !== undefined) {
            return entry.extractor;
        }
    }
    catch {
        // Unparseable URL - fall through to "no extractor".
    }
    return null;
}
|
|
40
|
+
/**
 * Run the matching domain extractor for a URL, if there is one.
 *
 * @param {string} html - Page HTML handed to the extractor.
 * @param {string} url - URL used both to pick the extractor and as its input.
 * @returns {Promise<object|null>} The extractor's result; null when no
 *   extractor matches or when the extractor throws/rejects.
 */
export async function extractDomainData(html, url) {
    const run = getDomainExtractor(url);
    if (!run) {
        return null;
    }
    let outcome = null;
    try {
        outcome = await run(html, url);
    }
    catch {
        // Extraction is best-effort: any failure is reported as "no data".
        outcome = null;
    }
    return outcome;
}
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
// Helpers
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
/**
 * Parse `text` as JSON without ever throwing.
 *
 * @param {string} text - Candidate JSON document.
 * @returns {*} The parsed value, or null when `JSON.parse` rejects the input.
 */
function tryParseJson(text) {
    let parsed = null;
    try {
        parsed = JSON.parse(text);
    }
    catch {
        parsed = null;
    }
    return parsed;
}
|
|
67
|
+
/** Named HTML entities decoded by stripHtml (lower-case name -> replacement). */
const HTML_ENTITIES = new Map([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', ' '],
]);
/**
 * Strip HTML tags from a string and decode common HTML entities.
 *
 * Tags are removed first, then entities are decoded in a SINGLE pass so that
 * already-escaped text is not decoded twice (`&amp;lt;` becomes `&lt;`, not `<`).
 * Handles the named entities in HTML_ENTITIES plus decimal (`&#39;`) and
 * hexadecimal (`&#x27;`) numeric references; anything unrecognised is left as-is.
 *
 * @param {string} str - HTML fragment.
 * @returns {string} Plain text with surrounding whitespace trimmed.
 */
function stripHtml(str) {
    return str
        .replace(/<[^>]+>/g, '')
        .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity, body) => {
            if (body[0] !== '#') {
                return HTML_ENTITIES.get(body.toLowerCase()) ?? entity;
            }
            const isHex = body[1] === 'x' || body[1] === 'X';
            const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
            // String.fromCodePoint throws outside the Unicode range; keep such references verbatim.
            if (!(codePoint > 0 && codePoint <= 0x10ffff)) {
                return entity;
            }
            return String.fromCodePoint(codePoint);
        })
        .trim();
}
|
|
71
|
+
/**
 * Convert a Unix timestamp expressed in seconds to an ISO 8601 string.
 *
 * @param {number} sec - Seconds since the Unix epoch.
 * @returns {string} ISO 8601 representation (UTC).
 * @throws {RangeError} When `sec` does not yield a valid Date (e.g. undefined).
 */
function unixToIso(sec) {
    const millis = sec * 1000;
    const moment = new Date(millis);
    return moment.toISOString();
}
|
|
75
|
+
/**
 * Fetch a URL through simpleFetch and parse the response body as JSON.
 *
 * @param {string} url - Endpoint to request.
 * @param {object} [customHeaders] - Extra headers; they override the default
 *   `Accept: application/json`.
 * @returns {Promise<*>} Parsed JSON, or null when the body is not valid JSON.
 */
async function fetchJson(url, customHeaders) {
    const headers = { Accept: 'application/json', ...customHeaders };
    // Third and fourth simpleFetch arguments: 15000 and the header map.
    const response = await simpleFetch(url, undefined, 15000, headers);
    return tryParseJson(response.html);
}
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// 1. Twitter / X extractor
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
/**
 * Depth-first, pre-order search of an object graph for the first object
 * (or array) that satisfies `predicate`.
 *
 * @param {*} obj - Root value; non-objects and null never match.
 * @param {(node: object) => boolean} predicate - Test applied to each object node.
 * @param {number} [depth=0] - Current nesting level; nodes deeper than 12 are ignored.
 * @returns {object|null} The first matching node, or null when none is found.
 */
function deepFind(obj, predicate, depth = 0) {
    const isWalkable = typeof obj === 'object' && obj !== null;
    if (!isWalkable || depth > 12) {
        return null;
    }
    if (predicate(obj)) {
        return obj;
    }
    for (const key of Object.keys(obj)) {
        const hit = deepFind(obj[key], predicate, depth + 1);
        if (hit !== null) {
            return hit;
        }
    }
    return null;
}
|
|
99
|
+
/**
 * Locate the first node in parsed __NEXT_DATA__ that carries
 * `tweet_results.result.legacy.full_text` and convert it to tweet data.
 *
 * @param {*} nextData - Parsed __NEXT_DATA__ JSON.
 * @returns {object|null} Result of parseTweetResult, or null when no such node exists.
 */
function parseTweetFromNextData(nextData) {
    const hasFullText = (node) => node?.tweet_results?.result?.legacy?.full_text !== undefined;
    const container = deepFind(nextData, hasFullText);
    if (!container) {
        return null;
    }
    return parseTweetResult(container.tweet_results.result);
}
|
|
108
|
+
/**
 * Convert a `tweet_results.result` node into a flat tweet record.
 *
 * @param {object} result - Tweet node; must expose a `legacy` object.
 * @returns {{author: object, text: string, timestamp: (string|undefined),
 *   metrics: object, media: string[], quotedTweet: (object|null)}|null}
 *   The tweet record, or null when `result.legacy` is missing.
 */
function parseTweetResult(result) {
    const legacy = result?.legacy;
    if (!legacy) {
        return null;
    }
    // created_at strings are handed to Date as-is; absent values stay undefined.
    const toIso = (raw) => (raw ? new Date(raw).toISOString() : undefined);
    const primaryUser = result?.core?.user_results?.result;
    const userLegacy = primaryUser?.legacy || result?.user_results?.result?.legacy;
    const mediaEntities = legacy.extended_entities?.media || legacy.entities?.media || [];
    // Keep only entries that actually carry an HTTPS media URL.
    const media = Array.from(mediaEntities, (item) => item.media_url_https).filter(Boolean);
    let quotedTweet = null;
    const quotedResult = result.quoted_status_result ? result.quoted_status_result.result : undefined;
    const quotedLegacy = quotedResult?.legacy;
    if (quotedLegacy) {
        const quotedUser = quotedResult?.core?.user_results?.result?.legacy;
        quotedTweet = {
            text: quotedLegacy.full_text || quotedLegacy.text || '',
            author: {
                name: quotedUser?.name || '',
                handle: '@' + (quotedUser?.screen_name || ''),
            },
            timestamp: toIso(quotedLegacy.created_at),
        };
    }
    return {
        author: {
            name: userLegacy?.name || '',
            handle: '@' + (userLegacy?.screen_name || ''),
            verified: userLegacy?.verified || primaryUser?.is_blue_verified || false,
        },
        text: legacy.full_text || legacy.text || '',
        timestamp: toIso(legacy.created_at),
        metrics: {
            likes: legacy.favorite_count ?? 0,
            retweets: legacy.retweet_count ?? 0,
            replies: legacy.reply_count ?? 0,
            views: Number(result?.views?.count ?? 0),
        },
        media,
        quotedTweet,
    };
}
|
|
157
|
+
/**
 * Percent-decode `text` when it is valid URI-encoded data; otherwise return it unchanged.
 * decodeURIComponent throws URIError on a literal "%" that is not part of an
 * escape sequence (e.g. "100% agree"), which must not abort extraction.
 */
function safeDecodeURIComponent(text) {
    try {
        return decodeURIComponent(text);
    }
    catch {
        return text;
    }
}
/**
 * Extractor for twitter.com / x.com pages.
 *
 * A URL whose path contains a `status` segment is treated as a tweet; anything
 * else is treated as a profile. Data is read from the embedded __NEXT_DATA__
 * JSON when present; for tweets, Open Graph meta tags are used as a fallback.
 *
 * @param {string} html - Page HTML.
 * @param {string} url - Page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 *   Structured data plus a markdown summary, or null when nothing could be extracted.
 */
async function twitterExtractor(html, url) {
    const urlObj = new URL(url);
    const pathParts = urlObj.pathname.split('/').filter(Boolean);
    const isTweet = pathParts.includes('status');
    const type = isTweet ? 'tweet' : 'profile';
    const domain = 'twitter.com';
    // --- Try __NEXT_DATA__ JSON (SSR data) ---
    const nextDataMatch = html.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
    let structured = null;
    if (nextDataMatch) {
        const nextData = tryParseJson(nextDataMatch[1]);
        if (nextData) {
            if (isTweet) {
                const tweetData = parseTweetFromNextData(nextData);
                if (tweetData) {
                    structured = tweetData;
                }
            }
            else {
                // Profile page - extract user info
                const userResult = deepFind(nextData, (v) => v?.user_results?.result?.legacy?.screen_name);
                if (userResult) {
                    const uLegacy = userResult.user_results.result.legacy;
                    structured = {
                        name: uLegacy.name || '',
                        handle: '@' + (uLegacy.screen_name || ''),
                        bio: uLegacy.description || '',
                        followers: uLegacy.followers_count ?? 0,
                        following: uLegacy.friends_count ?? 0,
                        tweets: uLegacy.statuses_count ?? 0,
                        verified: userResult.user_results.result.is_blue_verified || uLegacy.verified || false,
                        location: uLegacy.location || '',
                        created: uLegacy.created_at ? new Date(uLegacy.created_at).toISOString() : undefined,
                    };
                }
            }
        }
    }
    // --- Fallback: Open Graph meta tags when __NEXT_DATA__ yielded nothing ---
    if (!structured && isTweet) {
        const ogDescMatch = html.match(/<meta[^>]+property="og:description"[^>]+content="([^"]+)"/i);
        const ogTitleMatch = html.match(/<meta[^>]+property="og:title"[^>]+content="([^"]+)"/i);
        if (ogDescMatch || ogTitleMatch) {
            const authorRaw = (ogTitleMatch?.[1] || '').replace(' on X', '').replace(' on Twitter', '').trim();
            // Guarded decode: attribute text containing a bare "%" must not throw.
            const text = ogDescMatch?.[1]
                ? safeDecodeURIComponent(ogDescMatch[1].replace(/&#39;/g, "'").replace(/&amp;/g, '&'))
                : '';
            structured = {
                author: { name: authorRaw, handle: '', verified: false },
                text: stripHtml(text),
                timestamp: undefined,
                metrics: { likes: 0, retweets: 0, replies: 0, views: 0 },
                media: [],
                quotedTweet: null,
            };
        }
    }
    if (!structured)
        return null;
    // Build clean markdown
    let cleanContent;
    if (type === 'tweet') {
        const s = structured;
        const authorLine = s.author?.handle
            ? `**${s.author.name}** (${s.author.handle})`
            : `**${s.author?.name || 'Unknown'}**`;
        const timeLine = s.timestamp ? `\n*${s.timestamp}*` : '';
        const metricsLine = s.metrics
            ? `\n\nš¬ ${s.metrics.replies} š ${s.metrics.retweets} ā¤ļø ${s.metrics.likes}${s.metrics.views ? ` š ${s.metrics.views}` : ''}`
            : '';
        const mediaLine = s.media?.length ? `\n\nš· Media: ${s.media.join(', ')}` : '';
        const quotedLine = s.quotedTweet
            ? `\n\n> **Quoted tweet by ${s.quotedTweet.author?.name || 'unknown'}:** ${s.quotedTweet.text}`
            : '';
        // `thread` is only rendered when present on the structured data; numbering starts at 2.
        const threadLine = s.thread?.length ? '\n\n**Thread:**\n' + s.thread.map((t, i) => `${i + 2}. ${t.text}`).join('\n') : '';
        cleanContent = `## š¦ Tweet by ${authorLine}${timeLine}\n\n${s.text}${quotedLine}${threadLine}${metricsLine}${mediaLine}`;
    }
    else {
        const s = structured;
        cleanContent = `## š¦ @${(s.handle || '').replace('@', '')} on X/Twitter\n\n**${s.name}**\n${s.bio || ''}\n\nš ${s.location || 'N/A'} | š„ ${s.followers?.toLocaleString() || 0} followers | Following: ${s.following?.toLocaleString() || 0} | Tweets: ${s.tweets?.toLocaleString() || 0}`;
    }
    return { domain, type, structured, cleanContent };
}
|
|
239
|
+
/**
 * Convert a Reddit comment node into `{ author, text, score, replies }`.
 *
 * @param {object} data - Either a `t1` listing child (payload under `.data`)
 *   or a bare comment payload.
 * @param {number} depth - Remaining reply levels to descend; replies are only
 *   parsed while depth > 0.
 * @returns {object|null} The comment, or null for missing nodes, `more`
 *   placeholders, and comments without a body. Replies are sorted by score
 *   (descending) and capped at 3 per level.
 */
function parseRedditComment(data, depth) {
    if (!data || data.kind === 'more') {
        return null;
    }
    const payload = data.kind === 't1' ? data.data : data;
    if (!payload?.body) {
        return null;
    }
    let replies = [];
    // `replies` may be a falsy placeholder rather than a listing, hence the truthiness check.
    const childNodes = (depth > 0 && payload.replies && payload.replies.data?.children) || null;
    if (childNodes) {
        replies = Array.from(childNodes, (child) => parseRedditComment(child, depth - 1))
            .filter(Boolean)
            .sort((a, b) => b.score - a.score)
            .slice(0, 3);
    }
    return {
        author: `u/${payload.author || '[deleted]'}`,
        text: payload.body || '',
        score: payload.score || 0,
        replies,
    };
}
|
|
263
|
+
/**
 * Extractor for reddit.com pages, backed by Reddit's `.json` endpoints.
 *
 * Handles post pages (`/r/<sub>/comments/...`) and subreddit listings
 * (`/r/<sub>`). Every other page type returns null so the caller can fall
 * back to normal HTML extraction.
 *
 * @param {string} _html - Unused; data is fetched from the JSON endpoint.
 * @param {string} url - Page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 */
async function redditExtractor(_html, url) {
    const urlObj = new URL(url);
    const path = urlObj.pathname;
    const domain = 'reddit.com';
    // Detect page type
    const isPost = /\/r\/[^/]+\/comments\//.test(path);
    const isSubreddit = /^\/r\/[^/]+\/?$/.test(path);
    const isUser = /^\/(u|user)\/[^/]+/.test(path);
    const type = isPost ? 'post' : isSubreddit ? 'subreddit' : isUser ? 'user' : 'listing';
    // Build the JSON endpoint from the parsed URL so that both the query string
    // and any #fragment are dropped before ".json" is appended.
    const jsonBase = `${urlObj.origin}${path.replace(/\/+$/, '')}`;
    if (isPost) {
        // Fetch post data via Reddit JSON API
        const jsonUrl = `${jsonBase}.json?limit=25&sort=top`;
        const data = await fetchJson(jsonUrl, { 'User-Agent': 'WebPeel/1.0' });
        // A post response is a two-element array: [post listing, comment listing].
        if (!Array.isArray(data) || data.length < 2)
            return null;
        const postData = data[0]?.data?.children?.[0]?.data;
        if (!postData)
            return null;
        // Parse top comments (max 20)
        const commentChildren = data[1]?.data?.children || [];
        const comments = [];
        for (const child of commentChildren) {
            const c = parseRedditComment(child, 3);
            if (c)
                comments.push(c);
            if (comments.length >= 20)
                break;
        }
        comments.sort((a, b) => b.score - a.score);
        const structured = {
            subreddit: `r/${postData.subreddit}`,
            title: postData.title || '',
            author: `u/${postData.author || '[deleted]'}`,
            score: postData.score ?? 0,
            upvoteRatio: postData.upvote_ratio ?? 1,
            url: postData.url || url,
            selftext: postData.selftext || '',
            commentCount: postData.num_comments ?? 0,
            created: unixToIso(postData.created_utc),
            flair: postData.link_flair_text || null,
            comments,
        };
        // Build clean markdown (10 comments, 2 replies each, text excerpts only)
        const commentsMd = comments.slice(0, 10).map(c => {
            const repliesMd = c.replies.slice(0, 2).map(r => ` > **${r.author}** (${r.score}): ${r.text.slice(0, 200)}`).join('\n');
            return `**${c.author}** (score: ${c.score})\n${c.text.slice(0, 300)}${repliesMd ? '\n' + repliesMd : ''}`;
        }).join('\n\n---\n\n');
        const selftextSection = structured.selftext
            ? `\n\n${structured.selftext.slice(0, 1000)}`
            : '';
        const cleanContent = `## š ${structured.subreddit}: ${structured.title}

**Posted by** ${structured.author} | Score: ${structured.score} (${Math.round(structured.upvoteRatio * 100)}% upvoted) | ${structured.commentCount} comments
${structured.flair ? `**Flair:** ${structured.flair}` : ''}
*${structured.created}*${selftextSection}

---

### Top Comments

${commentsMd || '*No comments found.*'}`;
        return { domain, type, structured, cleanContent };
    }
    if (isSubreddit) {
        // Fetch subreddit listing
        const jsonUrl = `${jsonBase}.json?limit=15`;
        const data = await fetchJson(jsonUrl, { 'User-Agent': 'WebPeel/1.0' });
        if (!data?.data?.children)
            return null;
        const posts = data.data.children
            .filter((c) => c.kind === 't3')
            .map((c) => {
            const d = c.data;
            return {
                title: d.title || '',
                author: `u/${d.author || '[deleted]'}`,
                score: d.score ?? 0,
                commentCount: d.num_comments ?? 0,
                url: `https://reddit.com${d.permalink}`,
                flair: d.link_flair_text || null,
            };
        });
        // Prefer the name as it appears in a post permalink; fall back to the request path.
        const subredditName = posts[0]?.url?.match(/\/r\/([^/]+)\//)?.[1] || path.match(/\/r\/([^/]+)/)?.[1] || '';
        const structured = { subreddit: `r/${subredditName}`, posts };
        const cleanContent = `## š r/${subredditName} ā Hot Posts

${posts.map((p, i) => `${i + 1}. **${p.title}**\n ${p.author} | ā ${p.score} | š¬ ${p.commentCount}${p.flair ? ` | ${p.flair}` : ''}\n ${p.url}`).join('\n\n')}`;
        return { domain, type, structured, cleanContent };
    }
    // User or other - fall back to null (let normal HTML extraction handle it)
    return null;
}
|
|
355
|
+
// ---------------------------------------------------------------------------
|
|
356
|
+
// 3. GitHub extractor
|
|
357
|
+
// ---------------------------------------------------------------------------
|
|
358
|
+
/**
 * True when a GitHub API response cannot be used as data: it is missing, not
 * an object, or an error payload (errors carry a top-level string `message`,
 * e.g. "Not Found" or a rate-limit notice).
 */
function isGithubApiError(payload) {
    return !payload || typeof payload !== 'object' || typeof payload.message === 'string';
}
/** Normalize an issue-comments response into `{ author, text, created }` records ([] when not an array). */
function toGithubComments(commentsData) {
    if (!Array.isArray(commentsData)) {
        return [];
    }
    return commentsData.map((c) => ({
        author: c.user?.login || 'ghost',
        text: c.body || '',
        created: c.created_at,
    }));
}
/**
 * Extractor for github.com pages, backed by the GitHub REST API.
 *
 * Path handling:
 *   /<user>                      -> type 'user'
 *   /<owner>/<repo>/issues/<n>   -> type 'issue'
 *   /<owner>/<repo>/pull/<n>     -> type 'pull_request'
 *   /<owner>/<repo>[/...]        -> type 'repository'
 *
 * @param {string} _html - Unused; data is fetched from the API.
 * @param {string} url - Page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 *   null for the site root and whenever the primary API call fails or returns
 *   an error payload.
 */
async function githubExtractor(_html, url) {
    const urlObj = new URL(url);
    const pathParts = urlObj.pathname.split('/').filter(Boolean);
    const domain = 'github.com';
    if (pathParts.length === 0)
        return null;
    const ghHeaders = { Accept: 'application/vnd.github.v3+json' };
    // User profile: /username (single segment)
    if (pathParts.length === 1) {
        const username = pathParts[0];
        const userData = await fetchJson(`https://api.github.com/users/${username}`, ghHeaders);
        if (isGithubApiError(userData))
            return null;
        const structured = {
            login: userData.login,
            name: userData.name || userData.login,
            bio: userData.bio || '',
            company: userData.company || null,
            location: userData.location || null,
            blog: userData.blog || null,
            followers: userData.followers ?? 0,
            following: userData.following ?? 0,
            publicRepos: userData.public_repos ?? 0,
            created: userData.created_at,
            avatarUrl: userData.avatar_url,
        };
        const cleanContent = `## š¤ GitHub: ${structured.name} (@${structured.login})

${structured.bio ? structured.bio + '\n\n' : ''}š ${structured.location || 'N/A'} | š¼ ${structured.company || 'N/A'} | š ${structured.blog || 'N/A'}
š„ ${structured.followers} followers | Following: ${structured.following} | š¦ ${structured.publicRepos} public repos`;
        return { domain, type: 'user', structured, cleanContent };
    }
    const owner = pathParts[0];
    const repo = pathParts[1];
    // Issue: /owner/repo/issues/123
    if (pathParts[2] === 'issues' && pathParts[3]) {
        const issueNumber = pathParts[3];
        const [issueData, commentsData] = await Promise.all([
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/issues/${issueNumber}`, ghHeaders),
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/issues/${issueNumber}/comments?per_page=20`, ghHeaders),
        ]);
        if (isGithubApiError(issueData))
            return null;
        const comments = toGithubComments(commentsData);
        const structured = {
            repo: `${owner}/${repo}`,
            number: issueData.number,
            title: issueData.title || '',
            author: issueData.user?.login || 'ghost',
            state: issueData.state,
            body: issueData.body || '',
            labels: (issueData.labels || []).map((l) => l.name),
            created: issueData.created_at,
            updated: issueData.updated_at,
            commentCount: issueData.comments ?? 0,
            comments,
        };
        const labelStr = structured.labels.length ? structured.labels.join(', ') : 'none';
        const commentsMd = comments.slice(0, 10).map((c) => `**@${c.author}** (${c.created}):\n${c.text.slice(0, 300)}`).join('\n\n---\n\n');
        const cleanContent = `## š Issue #${structured.number}: ${structured.title}

**Repo:** ${structured.repo} | **State:** ${structured.state} | **Author:** @${structured.author}
**Labels:** ${labelStr} | **Created:** ${structured.created}

${structured.body.slice(0, 800)}

---

### Comments (${structured.commentCount})

${commentsMd || '*No comments.*'}`;
        return { domain, type: 'issue', structured, cleanContent };
    }
    // Pull request: /owner/repo/pull/123
    if (pathParts[2] === 'pull' && pathParts[3]) {
        const prNumber = pathParts[3];
        // PR conversation comments are served by the issues comments endpoint.
        const [prData, commentsData] = await Promise.all([
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/pulls/${prNumber}`, ghHeaders),
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/issues/${prNumber}/comments?per_page=20`, ghHeaders),
        ]);
        if (isGithubApiError(prData))
            return null;
        const comments = toGithubComments(commentsData);
        const structured = {
            repo: `${owner}/${repo}`,
            number: prData.number,
            title: prData.title || '',
            author: prData.user?.login || 'ghost',
            state: prData.state,
            merged: prData.merged ?? false,
            body: prData.body || '',
            labels: (prData.labels || []).map((l) => l.name),
            created: prData.created_at,
            updated: prData.updated_at,
            commentCount: prData.comments ?? 0,
            additions: prData.additions ?? 0,
            deletions: prData.deletions ?? 0,
            changedFiles: prData.changed_files ?? 0,
            headBranch: prData.head?.label || '',
            baseBranch: prData.base?.label || '',
            comments,
        };
        const labelStr = structured.labels.length ? structured.labels.join(', ') : 'none';
        const commentsMd = comments.slice(0, 8).map((c) => `**@${c.author}** (${c.created}):\n${c.text.slice(0, 300)}`).join('\n\n---\n\n');
        const cleanContent = `## š PR #${structured.number}: ${structured.title}

**Repo:** ${structured.repo} | **State:** ${structured.state}${structured.merged ? ' (merged)' : ''} | **Author:** @${structured.author}
**Labels:** ${labelStr} | **${structured.headBranch} ā ${structured.baseBranch}**
**Changes:** +${structured.additions} / -${structured.deletions} across ${structured.changedFiles} files

${structured.body.slice(0, 800)}

---

### Comments (${structured.commentCount})

${commentsMd || '*No comments.*'}`;
        return { domain, type: 'pull_request', structured, cleanContent };
    }
    // Repository page: /owner/repo (and any deeper path not handled above)
    if (pathParts.length >= 2) {
        const [repoData, readmeData] = await Promise.all([
            fetchJson(`https://api.github.com/repos/${owner}/${repo}`, ghHeaders),
            // The README is optional: a failed request must not fail the whole extraction.
            fetchJson(`https://api.github.com/repos/${owner}/${repo}/readme`, ghHeaders).catch(() => null),
        ]);
        if (isGithubApiError(repoData))
            return null;
        // README content is base64 encoded; only a 500-character excerpt is kept
        let readmeText = '';
        if (readmeData?.content) {
            try {
                readmeText = Buffer.from(readmeData.content, 'base64').toString('utf-8').slice(0, 500);
            }
            catch { /* ignore: README excerpt is optional */ }
        }
        const structured = {
            name: `${owner}/${repo}`,
            description: repoData.description || '',
            stars: repoData.stargazers_count ?? 0,
            forks: repoData.forks_count ?? 0,
            language: repoData.language || null,
            topics: repoData.topics || [],
            license: repoData.license?.spdx_id || null,
            openIssues: repoData.open_issues_count ?? 0,
            lastPush: repoData.pushed_at,
            createdAt: repoData.created_at,
            defaultBranch: repoData.default_branch || 'main',
            homepage: repoData.homepage || null,
            archived: repoData.archived || false,
            fork: repoData.fork || false,
            readme: readmeText,
        };
        const topicsStr = structured.topics.length ? structured.topics.join(', ') : 'none';
        const cleanContent = `## š¦ Repository: ${structured.name}

${structured.description || '*No description.*'}

ā ${structured.stars.toLocaleString()} stars | š“ ${structured.forks.toLocaleString()} forks | š» ${structured.language || 'N/A'} | š ${structured.license || 'N/A'}
š·ļø Topics: ${topicsStr}
š ${structured.homepage || 'No homepage'} | Last push: ${structured.lastPush}${structured.archived ? '\nā ļø **ARCHIVED**' : ''}

${structured.readme ? `### README (excerpt)\n\n${structured.readme}` : ''}`;
        return { domain, type: 'repository', structured, cleanContent };
    }
    return null;
}
|
|
536
|
+
/**
 * Fetch one Hacker News comment and, while depth allows, its replies.
 *
 * @param {number|string} id - HN item id.
 * @param {number} depth - Remaining reply levels; negative values return null
 *   immediately, 0 fetches the comment without replies.
 * @returns {Promise<{author: string, text: string, time: string, replies: object[]}|null>}
 *   null for deleted/dead/empty comments and on any error. At most the first
 *   5 child ids are followed per comment.
 */
async function fetchHNComment(id, depth) {
    if (depth < 0) {
        return null;
    }
    try {
        const item = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${id}.json`);
        if (!item || item.deleted || item.dead) {
            return null;
        }
        const text = stripHtml(item.text || '');
        if (!text) {
            return null;
        }
        const kidIds = depth > 0 && Array.isArray(item.kids) ? item.kids.slice(0, 5) : [];
        let replies = [];
        if (kidIds.length > 0) {
            const fetched = await Promise.all(kidIds.map((kid) => fetchHNComment(kid, depth - 1)));
            replies = fetched.filter(Boolean);
        }
        return {
            author: item.by || '[deleted]',
            text,
            time: unixToIso(item.time),
            replies,
        };
    }
    catch {
        // Best-effort: a failing comment is simply omitted.
        return null;
    }
}
|
|
562
|
+
/**
 * Classify an HN item as 'job', 'ask_hn', 'show_hn' or 'story'.
 * The HN API has no dedicated type for Ask HN / Show HN submissions (they are
 * plain stories), so those are recognised by their title prefix.
 */
function classifyHNItem(item) {
    if (item.type === 'job')
        return 'job';
    const title = typeof item.title === 'string' ? item.title : '';
    if (item.type === 'ask' || /^Ask HN\b/i.test(title))
        return 'ask_hn';
    if (item.type === 'show' || /^Show HN\b/i.test(title))
        return 'show_hn';
    return 'story';
}
/**
 * Extractor for news.ycombinator.com, backed by the HN Firebase API.
 *
 * Handles item pages (`/item?id=N`), the front page (`/` or `/news`) and user
 * pages (`/user?id=name`); any other path returns null.
 *
 * @param {string} _html - Unused; data is fetched from the API.
 * @param {string} url - Page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}|null>}
 */
async function hackerNewsExtractor(_html, url) {
    const urlObj = new URL(url);
    const path = urlObj.pathname;
    const domain = 'news.ycombinator.com';
    // Story: ?id=12345 or /item?id=12345
    const itemId = urlObj.searchParams.get('id');
    if (itemId && (path === '/' || path === '/item' || path === '')) {
        const storyData = await fetchJson(`https://hacker-news.firebaseio.com/v0/item/${itemId}.json`);
        if (!storyData)
            return null;
        const type = classifyHNItem(storyData);
        // Fetch top 15 comments (top-level), 2 levels deep
        const commentIds = Array.isArray(storyData.kids) ? storyData.kids.slice(0, 15) : [];
        const commentResults = await Promise.all(commentIds.map((id) => fetchHNComment(id, 2)));
        const comments = commentResults.filter(Boolean);
        const structured = {
            id: storyData.id,
            title: storyData.title || '',
            author: storyData.by || '[deleted]',
            score: storyData.score ?? 0,
            url: storyData.url || `https://news.ycombinator.com/item?id=${storyData.id}`,
            commentCount: storyData.descendants ?? 0,
            created: unixToIso(storyData.time),
            text: storyData.text ? stripHtml(storyData.text) : null,
            comments,
        };
        const commentsMd = comments.slice(0, 10).map(c => {
            const repliesMd = c.replies.slice(0, 3).map(r => ` > **${r.author}**: ${r.text.slice(0, 200)}`).join('\n');
            return `**${c.author}** (${c.time})\n${c.text.slice(0, 300)}${repliesMd ? '\n' + repliesMd : ''}`;
        }).join('\n\n---\n\n');
        const bodySection = structured.text ? `\n\n${structured.text.slice(0, 500)}` : '';
        // The "Link" line is only shown when the story points somewhere other than its own HN page.
        const cleanContent = `## š Hacker News: ${structured.title}

**Author:** ${structured.author} | **Score:** ${structured.score} | **Comments:** ${structured.commentCount}
**Posted:** ${structured.created}
${structured.url !== `https://news.ycombinator.com/item?id=${structured.id}` ? `**Link:** ${structured.url}` : ''}${bodySection}

---

### Top Comments

${commentsMd || '*No comments found.*'}`;
        return { domain, type, structured, cleanContent };
    }
    // Front page / /news - fetch top stories
    if (path === '/' || path === '/news' || path === '') {
        const topIds = await fetchJson('https://hacker-news.firebaseio.com/v0/topstories.json');
        if (!Array.isArray(topIds))
            return null;
        const top30 = topIds.slice(0, 30);
        // Individual story failures are tolerated; they are filtered out below.
        const storyResults = await Promise.all(top30.map((id) => fetchJson(`https://hacker-news.firebaseio.com/v0/item/${id}.json`).catch(() => null)));
        const stories = storyResults
            .filter((s) => s && s.title)
            .map((s) => ({
            id: s.id,
            title: s.title,
            author: s.by || '[deleted]',
            score: s.score ?? 0,
            commentCount: s.descendants ?? 0,
            url: s.url || `https://news.ycombinator.com/item?id=${s.id}`,
            hnUrl: `https://news.ycombinator.com/item?id=${s.id}`,
        }));
        const structured = { stories };
        const cleanContent = `## š Hacker News ā Front Page

${stories.map((s, i) => `${i + 1}. **${s.title}**\n ā ${s.score} | š¬ ${s.commentCount} | by ${s.author}\n ${s.url}`).join('\n\n')}`;
        return { domain, type: 'frontpage', structured, cleanContent };
    }
    // User page: /user?id=username (same `id` query parameter as item pages)
    const userId = itemId;
    if (path === '/user' && userId) {
        const userData = await fetchJson(`https://hacker-news.firebaseio.com/v0/user/${userId}.json`);
        if (!userData)
            return null;
        const structured = {
            id: userData.id,
            karma: userData.karma ?? 0,
            about: userData.about ? stripHtml(userData.about) : '',
            created: unixToIso(userData.created),
            submitted: (userData.submitted || []).length,
        };
        const cleanContent = `## š HN User: ${structured.id}

**Karma:** ${structured.karma} | **Member since:** ${structured.created}
${structured.about ? '\n' + structured.about : ''}`;
        return { domain, type: 'user', structured, cleanContent };
    }
    return null;
}
|
|
654
|
+
//# sourceMappingURL=domain-extractors.js.map
|