webpeel 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +11 -657
- package/README.md +246 -325
- package/dist/cli.js +330 -73
- package/dist/cli.js.map +1 -1
- package/dist/core/browser-fetch.d.ts +12 -0
- package/dist/core/browser-fetch.d.ts.map +1 -1
- package/dist/core/browser-fetch.js +70 -17
- package/dist/core/browser-fetch.js.map +1 -1
- package/dist/core/cf-worker-proxy.d.ts +33 -0
- package/dist/core/cf-worker-proxy.d.ts.map +1 -0
- package/dist/core/cf-worker-proxy.js +88 -0
- package/dist/core/cf-worker-proxy.js.map +1 -0
- package/dist/core/chunker.d.ts +47 -0
- package/dist/core/chunker.d.ts.map +1 -0
- package/dist/core/chunker.js +250 -0
- package/dist/core/chunker.js.map +1 -0
- package/dist/core/cloak-fetch.d.ts +43 -0
- package/dist/core/cloak-fetch.d.ts.map +1 -0
- package/dist/core/cloak-fetch.js +141 -0
- package/dist/core/cloak-fetch.js.map +1 -0
- package/dist/core/crawl-checkpoint.d.ts +55 -0
- package/dist/core/crawl-checkpoint.d.ts.map +1 -0
- package/dist/core/crawl-checkpoint.js +105 -0
- package/dist/core/crawl-checkpoint.js.map +1 -0
- package/dist/core/crawler.d.ts +5 -1
- package/dist/core/crawler.d.ts.map +1 -1
- package/dist/core/crawler.js +60 -5
- package/dist/core/crawler.js.map +1 -1
- package/dist/core/cycle-fetch.d.ts +27 -0
- package/dist/core/cycle-fetch.d.ts.map +1 -0
- package/dist/core/cycle-fetch.js +99 -0
- package/dist/core/cycle-fetch.js.map +1 -0
- package/dist/core/domain-extractors.d.ts.map +1 -1
- package/dist/core/domain-extractors.js +754 -14
- package/dist/core/domain-extractors.js.map +1 -1
- package/dist/core/google-cache.d.ts +30 -0
- package/dist/core/google-cache.d.ts.map +1 -0
- package/dist/core/google-cache.js +181 -0
- package/dist/core/google-cache.js.map +1 -0
- package/dist/core/markdown.d.ts +11 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +43 -0
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/peel-tls.d.ts +26 -0
- package/dist/core/peel-tls.d.ts.map +1 -0
- package/dist/core/peel-tls.js +221 -0
- package/dist/core/peel-tls.js.map +1 -0
- package/dist/core/pipeline.d.ts +5 -1
- package/dist/core/pipeline.d.ts.map +1 -1
- package/dist/core/pipeline.js +269 -21
- package/dist/core/pipeline.js.map +1 -1
- package/dist/core/schema-postprocess.d.ts +33 -0
- package/dist/core/schema-postprocess.d.ts.map +1 -0
- package/dist/core/schema-postprocess.js +470 -0
- package/dist/core/schema-postprocess.js.map +1 -0
- package/dist/core/schema-templates.d.ts +20 -0
- package/dist/core/schema-templates.d.ts.map +1 -0
- package/dist/core/schema-templates.js +131 -0
- package/dist/core/schema-templates.js.map +1 -0
- package/dist/core/search-fallback.d.ts +28 -0
- package/dist/core/search-fallback.d.ts.map +1 -0
- package/dist/core/search-fallback.js +185 -0
- package/dist/core/search-fallback.js.map +1 -0
- package/dist/core/search-provider.d.ts +47 -4
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +278 -7
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/stealth-patches.d.ts +58 -0
- package/dist/core/stealth-patches.d.ts.map +1 -0
- package/dist/core/stealth-patches.js +340 -0
- package/dist/core/stealth-patches.js.map +1 -0
- package/dist/core/strategies.d.ts +20 -0
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +284 -48
- package/dist/core/strategies.js.map +1 -1
- package/dist/core/strategy-hooks.d.ts +1 -1
- package/dist/core/strategy-hooks.d.ts.map +1 -1
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +37 -15
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +109 -4
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +29 -0
- package/dist/server/app.js.map +1 -1
- package/dist/server/middleware/rate-limit.d.ts +2 -1
- package/dist/server/middleware/rate-limit.d.ts.map +1 -1
- package/dist/server/middleware/rate-limit.js +24 -8
- package/dist/server/middleware/rate-limit.js.map +1 -1
- package/dist/server/routes/agent.d.ts +4 -0
- package/dist/server/routes/agent.d.ts.map +1 -1
- package/dist/server/routes/agent.js +196 -9
- package/dist/server/routes/agent.js.map +1 -1
- package/dist/server/routes/batch.js +5 -5
- package/dist/server/routes/batch.js.map +1 -1
- package/dist/server/routes/compat.d.ts.map +1 -1
- package/dist/server/routes/compat.js +1 -0
- package/dist/server/routes/compat.js.map +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +60 -6
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +103 -2
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/search.js +1 -1
- package/dist/server/routes/search.js.map +1 -1
- package/dist/types.d.ts +55 -4
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +4 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +55 -125
- package/package.json +15 -1
|
@@ -12,6 +12,50 @@
|
|
|
12
12
|
*/
|
|
13
13
|
import { simpleFetch } from './fetcher.js';
|
|
14
14
|
// ---------------------------------------------------------------------------
|
|
15
|
+
// Helpers
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
/**
 * Resolve a Reddit share URL (/s/CODE) to the URL it redirects to.
 *
 * Share links are short redirects that point at the real post URL. Only a
 * single redirect hop is followed. This helper is best-effort: on any failure
 * (unparseable URL, unsupported protocol, network error, timeout, missing or
 * invalid Location header) the original `url` is returned unchanged and the
 * returned promise does not reject.
 *
 * @param {string} url - Candidate Reddit URL.
 * @returns {Promise<string>} The redirect target, or `url` when it is not a
 *   share link or could not be resolved.
 */
async function resolveRedditShareUrl(url) {
    let urlObj;
    try {
        urlObj = new URL(url);
    }
    catch {
        return url; // Not a parseable absolute URL — nothing to resolve
    }
    // Match /r/subreddit/s/CODE or /s/CODE patterns
    if (!urlObj.pathname.includes('/s/'))
        return url;
    // http.get / https.get throw synchronously for any other protocol
    if (urlObj.protocol !== 'https:' && urlObj.protocol !== 'http:')
        return url;
    try {
        const { default: client } = urlObj.protocol === 'https:'
            ? await import('https')
            : await import('http');
        // `await` here keeps a throw inside the executor within this try/catch
        return await new Promise((resolve) => {
            const req = client.get(url, {
                headers: { 'User-Agent': 'WebPeel/0.17.0 (web data platform; https://webpeel.dev) Node.js' },
                timeout: 10000,
            }, (res) => {
                res.resume(); // Consume response so the socket is released
                const { statusCode } = res;
                const { location } = res.headers;
                const isRedirect = Boolean(statusCode && statusCode >= 300 && statusCode < 400 && location);
                if (!isRedirect) {
                    resolve(url); // No redirect, return original
                    return;
                }
                // Follow redirect (one hop). `new URL(location, url)` handles
                // absolute, protocol-relative and path-relative Location values.
                try {
                    const target = new URL(location, url);
                    const isHttp = target.protocol === 'https:' || target.protocol === 'http:';
                    resolve(isHttp ? target.href : url);
                }
                catch {
                    resolve(url); // Malformed Location header
                }
            });
            req.on('error', () => resolve(url));
            req.on('timeout', () => {
                req.destroy();
                resolve(url);
            });
        });
    }
    catch {
        return url; // On any error, return original URL
    }
}
|
|
58
|
+
// ---------------------------------------------------------------------------
|
|
15
59
|
// Registry
|
|
16
60
|
// ---------------------------------------------------------------------------
|
|
17
61
|
const REGISTRY = [
|
|
@@ -19,6 +63,13 @@ const REGISTRY = [
|
|
|
19
63
|
{ match: (h) => h === 'reddit.com' || h === 'www.reddit.com' || h === 'old.reddit.com', extractor: redditExtractor },
|
|
20
64
|
{ match: (h) => h === 'github.com' || h === 'www.github.com', extractor: githubExtractor },
|
|
21
65
|
{ match: (h) => h === 'news.ycombinator.com', extractor: hackerNewsExtractor },
|
|
66
|
+
{ match: (h) => h === 'en.wikipedia.org' || h === 'www.wikipedia.org' || /\w+\.wikipedia\.org/.test(h), extractor: wikipediaExtractor },
|
|
67
|
+
{ match: (h) => h === 'youtube.com' || h === 'www.youtube.com' || h === 'youtu.be', extractor: youtubeExtractor },
|
|
68
|
+
{ match: (h) => h === 'arxiv.org' || h === 'export.arxiv.org', extractor: arxivExtractor },
|
|
69
|
+
{ match: (h) => h === 'stackoverflow.com' || h === 'www.stackoverflow.com', extractor: stackOverflowExtractor },
|
|
70
|
+
{ match: (h) => h === 'www.npmjs.com' || h === 'npmjs.com', extractor: npmExtractor },
|
|
71
|
+
{ match: (h) => h === 'www.bestbuy.com' || h === 'bestbuy.com', extractor: bestBuyExtractor },
|
|
72
|
+
{ match: (h) => h === 'www.walmart.com' || h === 'walmart.com', extractor: walmartExtractor },
|
|
22
73
|
];
|
|
23
74
|
/**
|
|
24
75
|
* Returns the domain extractor for a URL, or null if none matches.
|
|
@@ -81,6 +132,23 @@ async function fetchJson(url, customHeaders) {
|
|
|
81
132
|
});
|
|
82
133
|
return tryParseJson(result.html);
|
|
83
134
|
}
|
|
135
|
+
/**
 * Fetch JSON via `fetchJson`, retrying with exponential backoff when the
 * failure looks like rate limiting.
 *
 * A failure is retried only when its message contains `429`, `rate` or
 * `Too Many`; any other failure is rethrown immediately. The delay before
 * retry number k (0-based) is `baseDelayMs * 2^k`.
 *
 * @param {string} url - URL passed through to `fetchJson`.
 * @param {object} [headers] - Custom headers passed through to `fetchJson`.
 * @param {number} [retries=2] - Maximum number of retries after the first
 *   attempt. Non-finite, negative or fractional values are normalised to a
 *   non-negative integer so at least one attempt is always made.
 * @param {number} [baseDelayMs=1000] - Base backoff delay in milliseconds.
 * @returns {Promise<*>} Whatever `fetchJson` resolves to.
 * @throws Rethrows the last value thrown by `fetchJson`, unchanged.
 */
async function fetchJsonWithRetry(url, headers, retries = 2, baseDelayMs = 1000) {
    const maxRetries = Number.isFinite(retries) && retries > 0 ? Math.floor(retries) : 0;
    for (let attempt = 0;; attempt++) {
        try {
            return await fetchJson(url, headers);
        }
        catch (e) {
            // Thrown values are not guaranteed to be Error instances; derive the
            // text defensively so the original failure is never masked by a TypeError.
            const message = typeof e?.message === 'string' ? e.message : String(e ?? '');
            // Retry on rate-limit or transient errors
            const isRateLimited = message.includes('429') || message.includes('rate') || message.includes('Too Many');
            if (attempt >= maxRetries || !isRateLimited)
                throw e;
            await new Promise((resolve) => setTimeout(resolve, baseDelayMs * Math.pow(2, attempt)));
        }
    }
}
|
|
84
152
|
// ---------------------------------------------------------------------------
|
|
85
153
|
// 1. Twitter / X extractor
|
|
86
154
|
// ---------------------------------------------------------------------------
|
|
@@ -161,6 +229,86 @@ async function twitterExtractor(html, url) {
|
|
|
161
229
|
const isTweet = pathParts.includes('status');
|
|
162
230
|
const type = isTweet ? 'tweet' : 'profile';
|
|
163
231
|
const domain = 'twitter.com';
|
|
232
|
+
// --- Try FxTwitter API first (works from datacenter IPs, no auth needed) ---
|
|
233
|
+
const username = pathParts[0] || '';
|
|
234
|
+
if (isTweet) {
|
|
235
|
+
const statusId = pathParts[pathParts.indexOf('status') + 1];
|
|
236
|
+
if (statusId && username) {
|
|
237
|
+
try {
|
|
238
|
+
const fxUrl = `https://api.fxtwitter.com/${username}/status/${statusId}`;
|
|
239
|
+
const fxData = await fetchJson(fxUrl);
|
|
240
|
+
if (fxData && fxData.code === 200 && fxData.tweet) {
|
|
241
|
+
const t = fxData.tweet;
|
|
242
|
+
const structured = {
|
|
243
|
+
author: {
|
|
244
|
+
name: t.author?.name || '',
|
|
245
|
+
handle: '@' + (t.author?.screen_name || ''),
|
|
246
|
+
verified: t.author?.verified || false,
|
|
247
|
+
},
|
|
248
|
+
text: t.text || '',
|
|
249
|
+
timestamp: t.created_at ? new Date(t.created_at).toISOString() : undefined,
|
|
250
|
+
metrics: {
|
|
251
|
+
likes: t.likes ?? 0,
|
|
252
|
+
retweets: t.retweets ?? 0,
|
|
253
|
+
replies: t.replies ?? 0,
|
|
254
|
+
views: t.views ?? 0,
|
|
255
|
+
},
|
|
256
|
+
media: (t.media?.all || []).map((m) => m.url).filter(Boolean),
|
|
257
|
+
quotedTweet: t.quote ? {
|
|
258
|
+
text: t.quote.text || '',
|
|
259
|
+
author: { name: t.quote.author?.name || '', handle: '@' + (t.quote.author?.screen_name || '') },
|
|
260
|
+
} : null,
|
|
261
|
+
source: 'fxtwitter',
|
|
262
|
+
};
|
|
263
|
+
const authorLine = `**${structured.author.name}** (${structured.author.handle})`;
|
|
264
|
+
const timeLine = structured.timestamp ? `\n*${structured.timestamp}*` : '';
|
|
265
|
+
const metricsLine = `\n\n💬 ${structured.metrics.replies} 🔁 ${structured.metrics.retweets} ❤️ ${structured.metrics.likes}${structured.metrics.views ? ` 👁 ${structured.metrics.views}` : ''}`;
|
|
266
|
+
const mediaLine = structured.media.length ? `\n\n📷 Media: ${structured.media.join(', ')}` : '';
|
|
267
|
+
const quotedLine = structured.quotedTweet
|
|
268
|
+
? `\n\n> **Quoted tweet by ${structured.quotedTweet.author?.name || 'unknown'}:** ${structured.quotedTweet.text}`
|
|
269
|
+
: '';
|
|
270
|
+
const cleanContent = `## 🐦 Tweet by ${authorLine}${timeLine}\n\n${structured.text}${quotedLine}${metricsLine}${mediaLine}`;
|
|
271
|
+
return { domain, type, structured, cleanContent };
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
catch (e) {
|
|
275
|
+
if (process.env.DEBUG)
|
|
276
|
+
console.debug('[webpeel]', 'FxTwitter API failed:', e instanceof Error ? e.message : e);
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
// --- Try FxTwitter for profiles ---
|
|
281
|
+
if (!isTweet && username) {
|
|
282
|
+
try {
|
|
283
|
+
const fxUrl = `https://api.fxtwitter.com/${username}`;
|
|
284
|
+
const fxData = await fetchJson(fxUrl);
|
|
285
|
+
if (fxData && fxData.code === 200 && fxData.user) {
|
|
286
|
+
const u = fxData.user;
|
|
287
|
+
const structured = {
|
|
288
|
+
name: u.name || '',
|
|
289
|
+
handle: '@' + (u.screen_name || ''),
|
|
290
|
+
bio: u.description || '',
|
|
291
|
+
followers: u.followers ?? 0,
|
|
292
|
+
following: u.following ?? 0,
|
|
293
|
+
tweets: u.tweets ?? 0,
|
|
294
|
+
likes: u.likes ?? 0,
|
|
295
|
+
verified: u.verification?.verified || false,
|
|
296
|
+
location: u.location || '',
|
|
297
|
+
created: u.joined || undefined,
|
|
298
|
+
avatarUrl: u.avatar_url || null,
|
|
299
|
+
bannerUrl: u.banner_url || null,
|
|
300
|
+
website: u.website || null,
|
|
301
|
+
source: 'fxtwitter',
|
|
302
|
+
};
|
|
303
|
+
const cleanContent = `## 🐦 @${(structured.handle || '').replace('@', '')} on X/Twitter\n\n**${structured.name}**${structured.verified ? ' ✓' : ''}\n${structured.bio || ''}\n\n📍 ${structured.location || 'N/A'} | 👥 ${structured.followers?.toLocaleString() || 0} followers | Following: ${structured.following?.toLocaleString() || 0} | Tweets: ${structured.tweets?.toLocaleString() || 0}`;
|
|
304
|
+
return { domain, type: 'profile', structured, cleanContent };
|
|
305
|
+
}
|
|
306
|
+
}
|
|
307
|
+
catch (e) {
|
|
308
|
+
if (process.env.DEBUG)
|
|
309
|
+
console.debug('[webpeel]', 'FxTwitter profile API failed:', e instanceof Error ? e.message : e);
|
|
310
|
+
}
|
|
311
|
+
}
|
|
164
312
|
// --- Try __NEXT_DATA__ JSON (SSR data) ---
|
|
165
313
|
const nextDataMatch = html.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
|
|
166
314
|
let structured = null;
|
|
@@ -262,23 +410,142 @@ function parseRedditComment(data, depth) {
|
|
|
262
410
|
};
|
|
263
411
|
}
|
|
264
412
|
async function redditExtractor(_html, url) {
|
|
265
|
-
|
|
413
|
+
// Resolve Reddit share URLs (/s/CODE) to actual post URLs before any processing
|
|
414
|
+
let workingUrl = url;
|
|
415
|
+
if (url.includes('/s/')) {
|
|
416
|
+
const resolved = await resolveRedditShareUrl(url);
|
|
417
|
+
if (resolved !== url) {
|
|
418
|
+
if (process.env.DEBUG)
|
|
419
|
+
console.debug('[webpeel]', `Reddit share URL resolved: ${url} → ${resolved}`);
|
|
420
|
+
workingUrl = resolved;
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
const urlObj = new URL(workingUrl);
|
|
266
424
|
const path = urlObj.pathname;
|
|
267
425
|
const domain = 'reddit.com';
|
|
426
|
+
// Normalize old.reddit.com → www.reddit.com for JSON API
|
|
427
|
+
const normalizedUrl = workingUrl.replace(/old\.reddit\.com/, 'www.reddit.com');
|
|
428
|
+
const REDDIT_UA = { 'User-Agent': 'WebPeel/0.17.0 (web data platform; https://webpeel.dev) Node.js' };
|
|
268
429
|
// Detect page type
|
|
269
|
-
const isPost = /\/r\/[^/]+\/comments\//.test(path);
|
|
270
|
-
const
|
|
430
|
+
const isPost = /\/r\/[^/]+\/comments\//.test(path) || /^\/comments\//.test(path);
|
|
431
|
+
const isGallery = /\/gallery\//.test(path);
|
|
432
|
+
// Subreddit with any sort/filter: /r/sub, /r/sub/, /r/sub/hot, /r/sub/top, /r/sub/new, /r/sub/rising
|
|
433
|
+
const isSubreddit = /^\/r\/[^/]+\/?$/.test(path) || /^\/r\/[^/]+\/(hot|new|top|rising|controversial|best)\/?$/.test(path);
|
|
271
434
|
const isUser = /^\/(u|user)\/[^/]+/.test(path);
|
|
272
|
-
|
|
435
|
+
// Home/popular/all pages
|
|
436
|
+
const isHomeListing = /^\/(hot|new|top|rising|controversial|best|popular|all)\/?$/.test(path) || path === '/' || path === '';
|
|
437
|
+
const type = isPost || isGallery ? 'post' : isSubreddit ? 'subreddit' : isUser ? 'user' : isHomeListing ? 'listing' : 'listing';
|
|
438
|
+
if (isGallery) {
|
|
439
|
+
// Gallery posts: fetch the gallery JSON and extract the post data
|
|
440
|
+
const galleryJsonUrl = normalizedUrl.split('?')[0].replace(/\/?$/, '') + '.json?limit=25&sort=top';
|
|
441
|
+
const requestedGallerySub = path.match(/\/r\/([^/]+)/)?.[1] || 'unknown';
|
|
442
|
+
let galleryData;
|
|
443
|
+
try {
|
|
444
|
+
galleryData = await fetchJsonWithRetry(galleryJsonUrl, REDDIT_UA);
|
|
445
|
+
}
|
|
446
|
+
catch (e) {
|
|
447
|
+
return {
|
|
448
|
+
domain,
|
|
449
|
+
type: 'post',
|
|
450
|
+
structured: { error: 'Post not found or has been deleted', subreddit: `r/${requestedGallerySub}` },
|
|
451
|
+
cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedGallerySub} could not be found. It may have been deleted or removed.`,
|
|
452
|
+
};
|
|
453
|
+
}
|
|
454
|
+
if (!Array.isArray(galleryData) || galleryData.length < 1) {
|
|
455
|
+
return {
|
|
456
|
+
domain,
|
|
457
|
+
type: 'post',
|
|
458
|
+
structured: { error: 'Post not found', subreddit: `r/${requestedGallerySub}` },
|
|
459
|
+
cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedGallerySub} could not be found. It may have been deleted or removed.`,
|
|
460
|
+
};
|
|
461
|
+
}
|
|
462
|
+
const postData = galleryData[0]?.data?.children?.[0]?.data;
|
|
463
|
+
if (!postData) {
|
|
464
|
+
return {
|
|
465
|
+
domain,
|
|
466
|
+
type: 'post',
|
|
467
|
+
structured: { error: 'Post not found', subreddit: `r/${requestedGallerySub}` },
|
|
468
|
+
cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedGallerySub} could not be found. It may have been deleted or removed.`,
|
|
469
|
+
};
|
|
470
|
+
}
|
|
471
|
+
// Validate subreddit matches the request
|
|
472
|
+
const actualGallerySub = postData.subreddit?.toLowerCase();
|
|
473
|
+
if (requestedGallerySub !== 'unknown' && actualGallerySub && requestedGallerySub.toLowerCase() !== actualGallerySub) {
|
|
474
|
+
return {
|
|
475
|
+
domain,
|
|
476
|
+
type: 'post',
|
|
477
|
+
structured: { error: 'Post not found in requested subreddit', requestedSubreddit: `r/${requestedGallerySub}`, actualSubreddit: `r/${actualGallerySub}` },
|
|
478
|
+
cleanContent: `## ❌ Reddit Post Not Found\n\nThe post was not found in r/${requestedGallerySub}. It may have been deleted or moved.`,
|
|
479
|
+
};
|
|
480
|
+
}
|
|
481
|
+
const structured = {
|
|
482
|
+
subreddit: `r/${postData.subreddit || ''}`,
|
|
483
|
+
title: postData.title || '',
|
|
484
|
+
author: `u/${postData.author || '[deleted]'}`,
|
|
485
|
+
score: postData.score ?? 0,
|
|
486
|
+
upvoteRatio: postData.upvote_ratio ?? 1,
|
|
487
|
+
url: postData.url || url,
|
|
488
|
+
selftext: postData.selftext || '',
|
|
489
|
+
commentCount: postData.num_comments ?? 0,
|
|
490
|
+
created: unixToIso(postData.created_utc),
|
|
491
|
+
flair: postData.link_flair_text || null,
|
|
492
|
+
comments: [],
|
|
493
|
+
isGallery: true,
|
|
494
|
+
};
|
|
495
|
+
const cleanContent = `## 📋 ${structured.subreddit}: ${structured.title}
|
|
496
|
+
|
|
497
|
+
**Posted by** ${structured.author} | Score: ${structured.score} | ${structured.commentCount} comments
|
|
498
|
+
*${structured.created}*
|
|
499
|
+
|
|
500
|
+
*(Gallery post)*`;
|
|
501
|
+
return { domain, type: 'post', structured, cleanContent };
|
|
502
|
+
}
|
|
273
503
|
if (isPost) {
|
|
274
504
|
// Fetch post data via Reddit JSON API
|
|
275
|
-
const jsonUrl =
|
|
276
|
-
const
|
|
277
|
-
|
|
278
|
-
|
|
505
|
+
const jsonUrl = normalizedUrl.split('?')[0].replace(/\/?$/, '') + '.json?limit=25&sort=top';
|
|
506
|
+
const requestedPostSub = path.match(/\/r\/([^/]+)/)?.[1] || 'unknown';
|
|
507
|
+
let data;
|
|
508
|
+
try {
|
|
509
|
+
data = await fetchJsonWithRetry(jsonUrl, REDDIT_UA);
|
|
510
|
+
}
|
|
511
|
+
catch (e) {
|
|
512
|
+
// Post not found or API error — return a "not found" result
|
|
513
|
+
// instead of null (which would trigger browser fallback with wrong content)
|
|
514
|
+
return {
|
|
515
|
+
domain,
|
|
516
|
+
type: 'post',
|
|
517
|
+
structured: { error: 'Post not found or has been deleted', subreddit: `r/${requestedPostSub}` },
|
|
518
|
+
cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedPostSub} could not be found. It may have been deleted or removed.`,
|
|
519
|
+
};
|
|
520
|
+
}
|
|
521
|
+
if (!Array.isArray(data) || data.length < 2) {
|
|
522
|
+
return {
|
|
523
|
+
domain,
|
|
524
|
+
type: 'post',
|
|
525
|
+
structured: { error: 'Post not found', subreddit: `r/${requestedPostSub}` },
|
|
526
|
+
cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedPostSub} could not be found. It may have been deleted or removed.`,
|
|
527
|
+
};
|
|
528
|
+
}
|
|
279
529
|
const postData = data[0]?.data?.children?.[0]?.data;
|
|
280
|
-
if (!postData)
|
|
281
|
-
return
|
|
530
|
+
if (!postData) {
|
|
531
|
+
return {
|
|
532
|
+
domain,
|
|
533
|
+
type: 'post',
|
|
534
|
+
structured: { error: 'Post not found', subreddit: `r/${requestedPostSub}` },
|
|
535
|
+
cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedPostSub} could not be found. It may have been deleted or removed.`,
|
|
536
|
+
};
|
|
537
|
+
}
|
|
538
|
+
// CRITICAL: Validate subreddit matches the request (prevents cross-subreddit ID reuse exploits)
|
|
539
|
+
const actualPostSub = postData.subreddit?.toLowerCase();
|
|
540
|
+
if (requestedPostSub !== 'unknown' && actualPostSub && requestedPostSub.toLowerCase() !== actualPostSub) {
|
|
541
|
+
// Reddit reused the post ID in a different subreddit — return error instead of wrong content
|
|
542
|
+
return {
|
|
543
|
+
domain,
|
|
544
|
+
type: 'post',
|
|
545
|
+
structured: { error: 'Post not found in requested subreddit', requestedSubreddit: `r/${requestedPostSub}`, actualSubreddit: `r/${actualPostSub}` },
|
|
546
|
+
cleanContent: `## ❌ Reddit Post Not Found\n\nThe post was not found in r/${requestedPostSub}. It may have been deleted or moved.`,
|
|
547
|
+
};
|
|
548
|
+
}
|
|
282
549
|
// Parse top comments (max 20)
|
|
283
550
|
const commentChildren = data[1]?.data?.children || [];
|
|
284
551
|
const comments = [];
|
|
@@ -326,8 +593,13 @@ ${commentsMd || '*No comments found.*'}`;
|
|
|
326
593
|
}
|
|
327
594
|
if (isSubreddit) {
|
|
328
595
|
// Fetch subreddit listing
|
|
329
|
-
|
|
330
|
-
const
|
|
596
|
+
// Preserve query params (especially t=day, t=week etc. for sorted views)
|
|
597
|
+
const queryString = urlObj.search || '';
|
|
598
|
+
const sortMatch = path.match(/\/r\/[^/]+\/(hot|new|top|rising|controversial|best)/);
|
|
599
|
+
const sortPath = sortMatch ? `/${sortMatch[1]}` : '';
|
|
600
|
+
const baseSubUrl = normalizedUrl.match(/\/r\/[^/]+/)?.[0] || normalizedUrl.split('?')[0];
|
|
601
|
+
const jsonUrl = `https://www.reddit.com${baseSubUrl}${sortPath}.json?limit=15${queryString ? '&' + queryString.slice(1) : ''}`;
|
|
602
|
+
const data = await fetchJsonWithRetry(jsonUrl, REDDIT_UA);
|
|
331
603
|
if (!data?.data?.children)
|
|
332
604
|
return null;
|
|
333
605
|
const posts = data.data.children
|
|
@@ -350,6 +622,36 @@ ${commentsMd || '*No comments found.*'}`;
|
|
|
350
622
|
${posts.map((p, i) => `${i + 1}. **${p.title}**\n ${p.author} | ↑ ${p.score} | 💬 ${p.commentCount}${p.flair ? ` | ${p.flair}` : ''}\n ${p.url}`).join('\n\n')}`;
|
|
351
623
|
return { domain, type, structured, cleanContent };
|
|
352
624
|
}
|
|
625
|
+
if (isHomeListing) {
|
|
626
|
+
const sortMatch = path.match(/\/(hot|new|top|rising|controversial|best|popular|all)/);
|
|
627
|
+
const sortType = sortMatch ? sortMatch[1] : 'hot';
|
|
628
|
+
const queryString = urlObj.search || '';
|
|
629
|
+
const jsonUrl = `https://www.reddit.com/${sortType}.json?limit=15${queryString ? '&' + queryString.slice(1) : ''}`;
|
|
630
|
+
const data = await fetchJsonWithRetry(jsonUrl, REDDIT_UA);
|
|
631
|
+
if (!data?.data?.children)
|
|
632
|
+
return null;
|
|
633
|
+
const posts = data.data.children
|
|
634
|
+
.filter((c) => c.kind === 't3')
|
|
635
|
+
.map((c) => {
|
|
636
|
+
const d = c.data;
|
|
637
|
+
return {
|
|
638
|
+
title: d.title || '',
|
|
639
|
+
author: `u/${d.author || '[deleted]'}`,
|
|
640
|
+
score: d.score ?? 0,
|
|
641
|
+
commentCount: d.num_comments ?? 0,
|
|
642
|
+
url: `https://reddit.com${d.permalink}`,
|
|
643
|
+
subreddit: `r/${d.subreddit}`,
|
|
644
|
+
flair: d.link_flair_text || null,
|
|
645
|
+
};
|
|
646
|
+
});
|
|
647
|
+
const structured = { sortType, posts, postCount: posts.length };
|
|
648
|
+
const listMd = posts.map((p, i) => {
|
|
649
|
+
const flairTag = p.flair ? ` | ${p.flair}` : '';
|
|
650
|
+
return `${i + 1}. **${p.title}**\n ${p.author} in ${p.subreddit} | ↑ ${p.score} | 💬 ${p.commentCount}${flairTag}\n ${p.url}`;
|
|
651
|
+
}).join('\n\n');
|
|
652
|
+
const cleanContent = `## 📋 Reddit — ${sortType.charAt(0).toUpperCase() + sortType.slice(1)} Posts\n\n${listMd}`;
|
|
653
|
+
return { domain: 'reddit.com', type: 'listing', structured, cleanContent };
|
|
654
|
+
}
|
|
353
655
|
// User or other — fall back to null (let normal HTML extraction handle it)
|
|
354
656
|
return null;
|
|
355
657
|
}
|
|
@@ -499,7 +801,7 @@ ${commentsMd || '*No comments.*'}`;
|
|
|
499
801
|
let readmeText = '';
|
|
500
802
|
if (readmeData?.content) {
|
|
501
803
|
try {
|
|
502
|
-
readmeText = Buffer.from(readmeData.content, 'base64').toString('utf-8').slice(0,
|
|
804
|
+
readmeText = Buffer.from(readmeData.content, 'base64').toString('utf-8').slice(0, 5000);
|
|
503
805
|
}
|
|
504
806
|
catch { /* ignore */ }
|
|
505
807
|
}
|
|
@@ -529,7 +831,7 @@ ${structured.description || '*No description.*'}
|
|
|
529
831
|
🏷️ Topics: ${topicsStr}
|
|
530
832
|
🔗 ${structured.homepage || 'No homepage'} | Last push: ${structured.lastPush}${structured.archived ? '\n⚠️ **ARCHIVED**' : ''}
|
|
531
833
|
|
|
532
|
-
${structured.readme ? `### README
|
|
834
|
+
${structured.readme ? `### README\n\n${structured.readme}` : ''}`;
|
|
533
835
|
return { domain, type: 'repository', structured, cleanContent };
|
|
534
836
|
}
|
|
535
837
|
return null;
|
|
@@ -652,4 +954,442 @@ ${structured.about ? '\n' + structured.about : ''}`;
|
|
|
652
954
|
}
|
|
653
955
|
return null;
|
|
654
956
|
}
|
|
957
|
+
// ---------------------------------------------------------------------------
|
|
958
|
+
// 5. Wikipedia extractor
|
|
959
|
+
// ---------------------------------------------------------------------------
|
|
960
|
+
/**
 * Remove Wikipedia-specific noise from extracted article text.
 *
 * Strips `[edit]` links, numeric citation markers (`[1]`), common inline
 * maintenance tags (`[citation needed]`, `[by whom?]`, …) and the
 * "Learn how and when to remove this message" banner link, then collapses
 * runs of blank lines and trims the result. Other bracketed text is kept.
 *
 * @param {string} content - Extracted article text.
 * @returns {string} Cleaned text; an empty string when `content` is not a
 *   non-empty string.
 */
function cleanWikipediaContent(content) {
    if (typeof content !== 'string' || content.length === 0)
        return '';
    return content
        // Remove [edit] links
        .replace(/\[edit\]/gi, '')
        // Remove citation brackets [1], [2], etc.
        .replace(/\[\d+\]/g, '')
        // Remove [citation needed], [verification], etc.
        .replace(/\[(citation needed|full citation needed|verification needed|verification|improve this article|adding citations[^\]]*|better source needed|non-primary source needed|page needed|needs update|original research\??|according to whom\?|by whom\?|when\?|where\?|who\?|which\?|why\?|clarification needed|dubious[^\]]*|failed verification[^\]]*|unreliable source[^\]]*)\]/gi, '')
        // Remove [Learn how and when to remove this message]
        .replace(/\[Learn how and when to remove this message\]/gi, '')
        // Clean up excess whitespace; lines holding only spaces/tabs count as blank
        .replace(/\n(?:[ \t]*\n){2,}/g, '\n\n')
        .trim();
}
|
|
975
|
+
/**
 * Domain extractor for Wikipedia article pages (`/wiki/Article_Title`).
 *
 * Fetches the REST `page/summary` endpoint for metadata and then the
 * `page/mobile-html` endpoint for the article body, which is reduced to
 * per-section headings and paragraph text. If the mobile-html request fails,
 * the summary extract is used as the content instead.
 *
 * @param {string} _html - Already-fetched page HTML (unused).
 * @param {string} url - Article URL on a `*.wikipedia.org` host.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or `null` when the URL is not an article page
 *   (non-`/wiki/` path, title containing a colon, malformed URL or
 *   percent-escape) or the summary request fails / reports not found.
 */
async function wikipediaExtractor(_html, url) {
    let urlObj;
    let articleTitle;
    try {
        urlObj = new URL(url);
        const pathParts = urlObj.pathname.split('/').filter(Boolean);
        // Only handle article pages: /wiki/Article_Title
        if (pathParts[0] !== 'wiki' || pathParts.length < 2)
            return null;
        // Titles may themselves contain slashes (e.g. /wiki/AC/DC), so keep
        // every segment after "wiki" rather than only the first one.
        articleTitle = decodeURIComponent(pathParts.slice(1).join('/'));
    }
    catch {
        // Unparseable URL or malformed percent-escape — not something we can handle
        return null;
    }
    // Skip special pages (contain a colon, e.g. Special:Random, Talk:Article)
    if (articleTitle.includes(':'))
        return null;
    const lang = urlObj.hostname.split('.')[0] || 'en';
    // encodeURIComponent turns "/" into "%2F", keeping the title a single path segment
    const encodedTitle = encodeURIComponent(articleTitle);
    const apiUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodedTitle}`;
    // Wikipedia REST API requires a descriptive User-Agent (https://meta.wikimedia.org/wiki/User-Agent_policy)
    const wikiHeaders = { 'User-Agent': 'WebPeel/0.17.0 (https://webpeel.dev; jake@jakeliu.me) Node.js', 'Api-User-Agent': 'WebPeel/0.17.0 (https://webpeel.dev; jake@jakeliu.me)' };
    try {
        const data = await fetchJson(apiUrl, wikiHeaders);
        if (!data || data.type === 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found')
            return null;
        // For full article content, use the mobile-html endpoint (mobile-sections is deprecated)
        let fullContent = '';
        try {
            const fullUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodedTitle}`;
            const fullResult = await simpleFetch(fullUrl, undefined, 15000, {
                ...wikiHeaders,
                'Accept': 'text/html',
            });
            if (fullResult?.html) {
                // Parse sections from the mobile HTML
                const sectionMatches = fullResult.html.match(/<section[^>]*>([\s\S]*?)<\/section>/gi) || [];
                for (const section of sectionMatches) {
                    // Extract section heading
                    const headingMatch = section.match(/<h[2-6][^>]*id="([^"]*)"[^>]*class="[^"]*pcs-edit-section-title[^"]*"[^>]*>([\s\S]*?)<\/h[2-6]>/i);
                    const heading = headingMatch ? stripHtml(headingMatch[2]).trim() : '';
                    // Extract paragraphs
                    const paragraphs = section.match(/<p[^>]*>([\s\S]*?)<\/p>/gi) || [];
                    const sectionText = paragraphs.map((p) => stripHtml(p).trim()).filter((t) => t.length > 0).join('\n\n');
                    if (sectionText) {
                        const prefix = heading ? `## ${heading}\n\n` : '';
                        fullContent += `\n\n${prefix}${sectionText}`;
                    }
                }
            }
        }
        catch (e) {
            // mobile-html failed — use summary extract as fallback
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'Wikipedia mobile-html failed, using summary:', e instanceof Error ? e.message : e);
        }
        // Clean Wikipedia-specific noise
        fullContent = cleanWikipediaContent(fullContent);
        const structured = {
            title: data.title || articleTitle.replace(/_/g, ' '),
            description: data.description || '',
            extract: data.extract || '',
            thumbnail: data.thumbnail?.source || null,
            url: data.content_urls?.desktop?.page || url,
            lastModified: data.timestamp || null,
        };
        const cleanContent = `# ${structured.title}\n\n${structured.description ? `*${structured.description}*\n\n` : ''}${fullContent || structured.extract}`;
        return { domain: 'wikipedia.org', type: 'article', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Wikipedia API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
|
1042
|
+
// ---------------------------------------------------------------------------
|
|
1043
|
+
// 6. YouTube extractor (oEmbed API-first)
|
|
1044
|
+
// ---------------------------------------------------------------------------
|
|
1045
|
+
/**
 * Domain extractor for YouTube URLs, built on the oEmbed API.
 *
 * Requests YouTube's oEmbed endpoint for title/channel metadata and, when
 * that succeeds, additionally asks noembed.com for a description. The noembed
 * request is optional: its failure is only logged (when DEBUG is set) and
 * the placeholder text "YouTube video" is used instead.
 *
 * @param {string} _html - Already-fetched page HTML (unused).
 * @param {string} url - YouTube URL to describe.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or `null` when oEmbed fails or returns no title.
 */
async function youtubeExtractor(_html, url) {
    // Try YouTube oEmbed API first (no auth, works without browser)
    try {
        const oembedUrl = `https://www.youtube.com/oembed?url=${encodeURIComponent(url)}&format=json`;
        const oembedData = await fetchJson(oembedUrl);
        if (oembedData && oembedData.title) {
            // Also try noembed for richer data
            let noembedData = null;
            try {
                noembedData = await fetchJson(`https://noembed.com/embed?url=${encodeURIComponent(url)}`);
            }
            catch (e) {
                // Optional enrichment — log for parity with the other extractors and carry on
                if (process.env.DEBUG)
                    console.debug('[webpeel]', 'noembed lookup failed:', e instanceof Error ? e.message : e);
            }
            const structured = {
                title: oembedData.title,
                author: oembedData.author_name || '',
                authorUrl: oembedData.author_url || '',
                thumbnailUrl: oembedData.thumbnail_url || '',
                type: oembedData.type || 'video',
                source: 'oembed',
            };
            // Only emit a Markdown link when there is a target; "[name]()" renders as a broken link
            const channel = structured.authorUrl
                ? `[${structured.author}](${structured.authorUrl})`
                : (structured.author || 'Unknown');
            const cleanContent = `## 🎬 ${structured.title}\n\n**Channel:** ${channel}\n\n${noembedData?.description || 'YouTube video'}`;
            return { domain: 'youtube.com', type: 'video', structured, cleanContent };
        }
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'YouTube oEmbed failed:', e instanceof Error ? e.message : e);
    }
    // Fallback: return null (no HTML parsing implemented)
    return null;
}
|
|
1076
|
+
// ---------------------------------------------------------------------------
|
|
1077
|
+
// 7. ArXiv extractor (ArXiv API)
|
|
1078
|
+
// ---------------------------------------------------------------------------
|
|
1079
|
+
/**
 * Extract paper metadata for an arXiv URL via the arXiv export API.
 *
 * The paper id is taken from `/abs/<id>`, `/pdf/<id>` or `/html/<id>` paths
 * (new-style ids such as `2501.12948` or `2501.12948v2`). The Atom response is
 * parsed with regular expressions, scoped to the first `<entry>` element so
 * that feed-level `<title>` / `<updated>` elements (which describe the query,
 * not the paper) are never mistaken for paper metadata.
 *
 * @param {string} _html - Page HTML (unused; kept for the shared extractor signature).
 * @param {string} url - arXiv page URL. Must be an absolute URL (`new URL` throws otherwise).
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when the URL has no recognizable paper id, the API call fails,
 *   or the response contains no entry / no title.
 */
async function arxivExtractor(_html, url) {
    const { pathname } = new URL(url);
    // Accepted shapes: /abs/2501.12948, /pdf/2501.12948, /html/2501.12948, /abs/2501.12948v2
    const idMatch = pathname.match(/\/(abs|pdf|html)\/(\d{4}\.\d{4,5}(?:v\d+)?)/);
    if (!idMatch)
        return null;
    const paperId = idMatch[2];
    try {
        const apiUrl = `https://export.arxiv.org/api/query?id_list=${paperId}`;
        const result = await simpleFetch(apiUrl, 'WebPeel/0.17.0', 15000, { Accept: 'application/xml' });
        if (!result?.html)
            return null;
        // Restrict parsing to the paper's own <entry>; the surrounding <feed>
        // has its own <title>, <id> and <updated> that precede the entry.
        const entryMatch = result.html.match(/<entry(?:\s[^>]*)?>([\s\S]*?)<\/entry>/);
        if (!entryMatch)
            return null;
        const entryXml = entryMatch[1];
        // `(?:\s[^>]*)?` allows attributes but prevents prefix collisions
        // (e.g. <name> vs. a hypothetical <namespace>).
        const tagPattern = (tag, flags) => new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)</${tag}>`, flags);
        const getTag = (tag) => {
            const match = entryXml.match(tagPattern(tag));
            return match ? stripHtml(match[1]).trim() : '';
        };
        const getAllTags = (tag) => [...entryXml.matchAll(tagPattern(tag, 'g'))]
            .map((m) => stripHtml(m[1]).trim())
            .filter(Boolean);
        // Titles may be hard-wrapped across lines; a markdown heading must be one line.
        const title = getTag('title').replace(/\s+/g, ' ');
        if (!title)
            return null;
        const summary = getTag('summary');
        const published = getTag('published');
        const updated = getTag('updated');
        const authors = getAllTags('name');
        // The pattern matches both <category> and <arxiv:primary_category>,
        // so the primary category shows up twice - de-duplicate, keeping order.
        const categories = [...new Set([...entryXml.matchAll(/category[^>]*term="([^"]+)"/g)].map((m) => m[1]))];
        const doi = getTag('arxiv:doi');
        const journalRef = getTag('arxiv:journal_ref');
        const structured = {
            title,
            authors,
            abstract: summary,
            published: published || undefined,
            updated: updated || undefined,
            categories,
            doi: doi || undefined,
            journalRef: journalRef || undefined,
            paperId,
            pdfUrl: `https://arxiv.org/pdf/${paperId}`,
            absUrl: `https://arxiv.org/abs/${paperId}`,
        };
        const authorLine = authors.length <= 5
            ? authors.join(', ')
            : `${authors.slice(0, 5).join(', ')} et al. (${authors.length} authors)`;
        const cleanContent = `# ${title}\n\n**Authors:** ${authorLine}\n**Published:** ${published.split('T')[0] || 'N/A'}${categories.length ? `\n**Categories:** ${categories.join(', ')}` : ''}${doi ? `\n**DOI:** ${doi}` : ''}${journalRef ? `\n**Journal:** ${journalRef}` : ''}\n\n## Abstract\n\n${summary}\n\n📄 [PDF](${structured.pdfUrl}) | [Abstract](${structured.absUrl})`;
        return { domain: 'arxiv.org', type: 'paper', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'ArXiv API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
|
1141
|
+
// ---------------------------------------------------------------------------
|
|
1142
|
+
// 8. Stack Overflow extractor (StackExchange API)
|
|
1143
|
+
// ---------------------------------------------------------------------------
|
|
1144
|
+
/**
 * Extract a Stack Overflow question and its top-voted answers via the
 * StackExchange API (v2.3, `site=stackoverflow`).
 *
 * The question is required; the answers request is best effort - if it fails
 * the result is still returned with an empty `answers` list. Up to five
 * answers are kept in `structured.answers`; the first three are rendered into
 * the markdown. Bodies are passed through `stripHtml` and truncated
 * (3000 chars for the question, 2000 per answer).
 *
 * @param {string} _html - Page HTML (unused; kept for the shared extractor signature).
 * @param {string} url - Question URL containing `/questions/<id>`. Must be an
 *   absolute URL (`new URL` throws otherwise).
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when the URL has no question id, the API call fails, or the
 *   question is not found.
 */
async function stackOverflowExtractor(_html, url) {
    const { pathname } = new URL(url);
    // Matches /questions/12345/optional-slug
    const questionMatch = pathname.match(/\/questions\/(\d+)/);
    if (!questionMatch)
        return null;
    const questionId = questionMatch[1];
    const apiBase = `https://api.stackexchange.com/2.3/questions/${questionId}`;
    try {
        const data = await fetchJson(`${apiBase}?order=desc&sort=votes&site=stackoverflow&filter=withbody`);
        const q = data?.items?.[0];
        if (!q)
            return null;
        let answers = [];
        try {
            const answersData = await fetchJson(`${apiBase}/answers?order=desc&sort=votes&site=stackoverflow&filter=withbody&pagesize=5`);
            answers = answersData?.items || [];
        }
        catch (e) {
            // Answers are optional, but keep the failure visible when debugging.
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'StackOverflow answers fetch failed (optional):', e instanceof Error ? e.message : e);
        }
        const structured = {
            title: stripHtml(q.title || ''),
            questionId: q.question_id,
            score: q.score || 0,
            views: q.view_count || 0,
            answerCount: q.answer_count || 0,
            isAnswered: q.is_answered || false,
            tags: q.tags || [],
            askedBy: q.owner?.display_name || 'anonymous',
            askedDate: q.creation_date ? new Date(q.creation_date * 1000).toISOString() : undefined,
            acceptedAnswerId: q.accepted_answer_id || null,
            answers: answers.map((a) => ({
                id: a.answer_id,
                score: a.score ?? 0,
                isAccepted: a.is_accepted || false,
                body: stripHtml(a.body || '').substring(0, 2000),
                author: a.owner?.display_name || 'anonymous',
            })),
        };
        const questionBody = stripHtml(q.body || '').substring(0, 3000);
        // Pin the locale so the rendered number does not depend on the host machine.
        const statsLine = `**Score:** ${structured.score} | **Views:** ${structured.views.toLocaleString('en-US')} | **Answers:** ${structured.answerCount}`;
        const tagLine = structured.tags.length ? `**Tags:** ${structured.tags.join(', ')}` : '';
        const askedLine = `**Asked by:** ${structured.askedBy}`;
        // Dropping empty entries avoids a stray blank line when there are no tags.
        const metaBlock = [statsLine, tagLine, askedLine].filter(Boolean).join('\n');
        const answersContent = structured.answers
            .slice(0, 3)
            .map((a) => `\n\n---\n\n### Answer by ${a.author} (Score: ${a.score}${a.isAccepted ? ' ✅ Accepted' : ''})\n\n${a.body}`)
            .join('');
        const cleanContent = `# ${structured.title}\n\n${metaBlock}\n\n## Question\n\n${questionBody}${answersContent}`;
        return { domain: 'stackoverflow.com', type: 'question', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'StackOverflow API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
|
1201
|
+
// ---------------------------------------------------------------------------
|
|
1202
|
+
// 9. NPM extractor (npm registry API)
|
|
1203
|
+
// ---------------------------------------------------------------------------
|
|
1204
|
+
/**
 * Extract package metadata for an npmjs.com package page via the npm
 * registry API, enriched with last-week download counts when available.
 *
 * The downloads request is best effort; when it fails the markdown shows
 * "N/A" while `structured.weeklyDownloads` stays `0` for backward
 * compatibility. The README, if present, is truncated to 5000 characters.
 *
 * @param {string} _html - Page HTML (unused; kept for the shared extractor signature).
 * @param {string} url - Package page URL (`/package/name` or `/package/@scope/name`).
 *   Must be an absolute URL (`new URL` throws otherwise).
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when the URL is not a package page, the registry call fails, or
 *   the registry response has no `name`.
 */
async function npmExtractor(_html, url) {
    const { pathname } = new URL(url);
    // Matches /package/name or /package/@scope/name
    const packageMatch = pathname.match(/\/package\/((?:@[^/]+\/)?[^/]+)/);
    if (!packageMatch)
        return null;
    const packageName = packageMatch[1];
    const encodedName = encodeURIComponent(packageName);
    try {
        const data = await fetchJson(`https://registry.npmjs.org/${encodedName}`);
        if (!data?.name)
            return null;
        const latest = data['dist-tags']?.latest;
        const latestVersion = latest ? data.versions?.[latest] : null;
        let downloads = null;
        try {
            downloads = await fetchJson(`https://api.npmjs.org/downloads/point/last-week/${encodedName}`);
        }
        catch (e) {
            // Download stats are optional, but keep the failure visible when debugging.
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'NPM downloads fetch failed (optional):', e instanceof Error ? e.message : e);
        }
        // Legacy manifests may carry `license` as `{ type, url }` instead of a string.
        const rawLicense = latestVersion?.license || data.license;
        const license = (typeof rawLicense === 'string' ? rawLicense : rawLicense?.type) || 'N/A';
        // Some packages ship a README in the registry document, some do not.
        const readmeText = typeof data.readme === 'string' && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
        const structured = {
            name: data.name,
            description: data.description || '',
            version: latest || 'unknown',
            license,
            homepage: data.homepage || latestVersion?.homepage || null,
            repository: typeof data.repository === 'string' ? data.repository : data.repository?.url || null,
            author: typeof data.author === 'string' ? data.author : data.author?.name || '',
            keywords: data.keywords || [],
            weeklyDownloads: downloads?.downloads || 0,
            dependencies: Object.keys(latestVersion?.dependencies || {}),
            devDependencies: Object.keys(latestVersion?.devDependencies || {}),
            maintainers: (data.maintainers || []).map((m) => m.name || m).slice(0, 10),
            created: data.time?.created || undefined,
            modified: data.time?.modified || undefined,
            readme: readmeText,
        };
        // Show "N/A" when the count is genuinely unknown instead of a misleading "0".
        const downloadsText = typeof downloads?.downloads === 'number'
            ? downloads.downloads.toLocaleString('en-US')
            : 'N/A';
        const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
        const depsLine = structured.dependencies.length
            ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.slice(0, 15).join(', ')}${structured.dependencies.length > 15 ? '...' : ''}`
            : '';
        // Anchor both patterns: an unanchored '.git' would also hit hosts or
        // names such as "www.github.com" or "user.github.io".
        const repoLine = structured.repository
            ? `\n**Repository:** ${structured.repository.replace(/^git\+/, '').replace(/\.git$/, '')}`
            : '';
        const homepageLine = structured.homepage ? `\n**Homepage:** ${structured.homepage}` : '';
        const datesLine = structured.created
            ? `\n**Created:** ${structured.created.split('T')[0] || 'N/A'} | **Last modified:** ${structured.modified?.split('T')[0] || 'N/A'}`
            : '';
        const readmeSection = readmeText ? `\n\n### README\n\n${readmeText}` : '';
        const cleanContent = [
            `# 📦 ${structured.name}@${structured.version}`,
            '',
            structured.description,
            '',
            `**License:** ${structured.license} | **Weekly Downloads:** ${downloadsText}`,
            `**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`,
        ].join('\n');
        return { domain: 'npmjs.com', type: 'package', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'NPM API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
|
1269
|
+
// ---------------------------------------------------------------------------
|
|
1270
|
+
// 10. Best Buy extractor (Best Buy Products API)
|
|
1271
|
+
// ---------------------------------------------------------------------------
|
|
1272
|
+
/**
 * Extract product data for a Best Buy product page via the Best Buy
 * Products API. Requires the `BESTBUY_API_KEY` environment variable; without
 * it the extractor is skipped (returns `null`).
 *
 * @param {string} _html - Page HTML (unused; kept for the shared extractor signature).
 * @param {string} url - Product URL containing the SKU as `/<digits>.p`
 *   (seven or more digits), e.g. `/site/.../6587822.p`.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when no API key is configured, the URL has no SKU, the API call
 *   fails, or the response is an error / has no product name.
 */
async function bestBuyExtractor(_html, url) {
    const apiKey = process.env.BESTBUY_API_KEY;
    if (!apiKey)
        return null; // No API key, skip
    const skuMatch = url.match(/\/(\d{7,})\.p/);
    if (!skuMatch)
        return null;
    const sku = skuMatch[1];
    const fields = 'sku,name,salePrice,regularPrice,onSale,shortDescription,longDescription,image,largeFrontImage,url,customerReviewAverage,customerReviewCount,categoryPath,manufacturer,modelNumber,upc,freeShipping,inStoreAvailability,onlineAvailability,condition,features.feature';
    // The key comes from the environment; encode it so unusual characters cannot corrupt the query string.
    const apiUrl = `https://api.bestbuy.com/v1/products/${sku}.json?apiKey=${encodeURIComponent(apiKey)}&show=${fields}`;
    try {
        const data = await fetchJson(apiUrl);
        // A payload without a name would render as "# undefined"; treat it as a miss.
        if (!data || data.error || !data.name)
            return null;
        // Accept both `features: [{ feature: '...' }, ...]` and
        // `features: { feature: [...] | '...' }`.
        // NOTE(review): the array-of-objects shape is what the Best Buy API
        // docs describe for `features.feature` - confirm against a live response.
        const rawFeatures = Array.isArray(data.features) ? data.features : data.features?.feature;
        let featureItems = [];
        if (Array.isArray(rawFeatures)) {
            featureItems = rawFeatures;
        }
        else if (rawFeatures) {
            featureItems = [rawFeatures];
        }
        const features = featureItems
            .map((f) => (typeof f === 'string' ? f : f?.feature))
            .filter((f) => typeof f === 'string' && f.length > 0);
        // Build clean markdown
        const lines = [`# ${data.name}`, ''];
        if (data.onSale) {
            lines.push(`**Sale Price:** $${data.salePrice} (was $${data.regularPrice})`);
        }
        else {
            lines.push(`**Price:** $${data.regularPrice}`);
        }
        lines.push(`**SKU:** ${data.sku}`);
        if (data.manufacturer)
            lines.push(`**Brand:** ${data.manufacturer}`);
        if (data.modelNumber)
            lines.push(`**Model:** ${data.modelNumber}`);
        if (data.customerReviewAverage) {
            lines.push(`**Rating:** ${data.customerReviewAverage}/5 (${data.customerReviewCount ?? 0} reviews)`);
        }
        lines.push(`**Availability:** ${data.onlineAvailability ? 'In Stock Online' : 'Out of Stock Online'} | ${data.inStoreAvailability ? 'Available In Store' : 'Not Available In Store'}`);
        if (data.freeShipping)
            lines.push('**Free Shipping:** Yes');
        lines.push('');
        if (data.shortDescription)
            lines.push(data.shortDescription);
        lines.push('');
        if (data.longDescription)
            lines.push(data.longDescription);
        if (features.length > 0) {
            lines.push('', '## Features', ...features.map((f) => `- ${f}`));
        }
        const structured = {
            sku: data.sku,
            name: data.name,
            price: data.salePrice || data.regularPrice,
            regularPrice: data.regularPrice,
            onSale: data.onSale,
            brand: data.manufacturer,
            model: data.modelNumber,
            upc: data.upc,
            rating: data.customerReviewAverage,
            reviewCount: data.customerReviewCount,
            image: data.largeFrontImage || data.image,
            url: data.url,
            inStock: data.onlineAvailability,
            freeShipping: data.freeShipping,
            condition: data.condition,
            category: data.categoryPath?.map((c) => c.name).join(' > '),
        };
        return { domain: 'bestbuy.com', type: 'product', structured, cleanContent: lines.join('\n') };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Best Buy API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
|
1346
|
+
// ---------------------------------------------------------------------------
|
|
1347
|
+
// 11. Walmart extractor (Walmart frontend search API)
|
|
1348
|
+
// ---------------------------------------------------------------------------
|
|
1349
|
+
/**
 * Extract product data for a Walmart item page by querying the search
 * endpoint used by Walmart's own frontend. The endpoint is not a public API
 * and is only sometimes reachable; any failure yields `null` so the caller
 * can fall through to other extraction methods.
 *
 * @param {string} _html - Page HTML (unused; kept for the shared extractor signature).
 * @param {string} url - Item URL of the form `/ip/<slug>/<id>` or `/ip/<id>`.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when the URL has no item id, the request fails, or the response
 *   contains no item.
 */
async function walmartExtractor(_html, url) {
    // Match against the path only: a "/" inside the query string or fragment
    // must never be able to supply the item id.
    const path = url.split(/[?#]/, 1)[0];
    // The id must be a complete, purely numeric path segment, so a slug that
    // merely starts with digits (e.g. /ip/12-Pack-Soda) is not mistaken for one.
    const itemMatch = path.match(/\/ip\/(?:[^/]+\/)*(\d+)(?:\/|$)/);
    if (!itemMatch)
        return null;
    const itemId = itemMatch[1];
    const apiUrl = `https://www.walmart.com/orchestra/snb/graphql/Search?query=${itemId}&page=1&affinityOverride=default&limit=1`;
    try {
        // Browser-like headers, mirroring what the site's frontend would send.
        const response = await fetchJson(apiUrl, {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
            'Accept': 'application/json',
            'Referer': 'https://www.walmart.com/',
        });
        const item = response?.data?.search?.searchResult?.itemStacks?.[0]?.items?.[0];
        if (!item)
            return null;
        const lines = [`# ${item.name}`];
        const price = item.priceInfo?.currentPrice?.price;
        if (price) {
            lines.push(`**Price:** $${price}`);
        }
        if (item.averageRating) {
            lines.push(`**Rating:** ${item.averageRating}/5 (${item.numberOfReviews || 0} reviews)`);
        }
        if (item.shortDescription)
            lines.push(item.shortDescription);
        const structured = {
            name: item.name,
            price,
            rating: item.averageRating,
            reviewCount: item.numberOfReviews,
            image: item.imageInfo?.thumbnailUrl,
            itemId,
            inStock: item.availabilityStatusV2?.value === 'IN_STOCK',
        };
        return { domain: 'walmart.com', type: 'product', structured, cleanContent: lines.join('\n') };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Walmart API failed:', e instanceof Error ? e.message : e);
        return null; // API not accessible, fall through to other methods
    }
}
|
|
655
1395
|
//# sourceMappingURL=domain-extractors.js.map
|