webpeel 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/LICENSE +11 -657
  2. package/README.md +246 -325
  3. package/dist/cli.js +330 -73
  4. package/dist/cli.js.map +1 -1
  5. package/dist/core/browser-fetch.d.ts +12 -0
  6. package/dist/core/browser-fetch.d.ts.map +1 -1
  7. package/dist/core/browser-fetch.js +70 -17
  8. package/dist/core/browser-fetch.js.map +1 -1
  9. package/dist/core/cf-worker-proxy.d.ts +33 -0
  10. package/dist/core/cf-worker-proxy.d.ts.map +1 -0
  11. package/dist/core/cf-worker-proxy.js +88 -0
  12. package/dist/core/cf-worker-proxy.js.map +1 -0
  13. package/dist/core/chunker.d.ts +47 -0
  14. package/dist/core/chunker.d.ts.map +1 -0
  15. package/dist/core/chunker.js +250 -0
  16. package/dist/core/chunker.js.map +1 -0
  17. package/dist/core/cloak-fetch.d.ts +43 -0
  18. package/dist/core/cloak-fetch.d.ts.map +1 -0
  19. package/dist/core/cloak-fetch.js +141 -0
  20. package/dist/core/cloak-fetch.js.map +1 -0
  21. package/dist/core/crawl-checkpoint.d.ts +55 -0
  22. package/dist/core/crawl-checkpoint.d.ts.map +1 -0
  23. package/dist/core/crawl-checkpoint.js +105 -0
  24. package/dist/core/crawl-checkpoint.js.map +1 -0
  25. package/dist/core/crawler.d.ts +5 -1
  26. package/dist/core/crawler.d.ts.map +1 -1
  27. package/dist/core/crawler.js +60 -5
  28. package/dist/core/crawler.js.map +1 -1
  29. package/dist/core/cycle-fetch.d.ts +27 -0
  30. package/dist/core/cycle-fetch.d.ts.map +1 -0
  31. package/dist/core/cycle-fetch.js +99 -0
  32. package/dist/core/cycle-fetch.js.map +1 -0
  33. package/dist/core/domain-extractors.d.ts.map +1 -1
  34. package/dist/core/domain-extractors.js +754 -14
  35. package/dist/core/domain-extractors.js.map +1 -1
  36. package/dist/core/google-cache.d.ts +30 -0
  37. package/dist/core/google-cache.d.ts.map +1 -0
  38. package/dist/core/google-cache.js +181 -0
  39. package/dist/core/google-cache.js.map +1 -0
  40. package/dist/core/markdown.d.ts +11 -0
  41. package/dist/core/markdown.d.ts.map +1 -1
  42. package/dist/core/markdown.js +43 -0
  43. package/dist/core/markdown.js.map +1 -1
  44. package/dist/core/peel-tls.d.ts +26 -0
  45. package/dist/core/peel-tls.d.ts.map +1 -0
  46. package/dist/core/peel-tls.js +221 -0
  47. package/dist/core/peel-tls.js.map +1 -0
  48. package/dist/core/pipeline.d.ts +5 -1
  49. package/dist/core/pipeline.d.ts.map +1 -1
  50. package/dist/core/pipeline.js +269 -21
  51. package/dist/core/pipeline.js.map +1 -1
  52. package/dist/core/schema-postprocess.d.ts +33 -0
  53. package/dist/core/schema-postprocess.d.ts.map +1 -0
  54. package/dist/core/schema-postprocess.js +470 -0
  55. package/dist/core/schema-postprocess.js.map +1 -0
  56. package/dist/core/schema-templates.d.ts +20 -0
  57. package/dist/core/schema-templates.d.ts.map +1 -0
  58. package/dist/core/schema-templates.js +131 -0
  59. package/dist/core/schema-templates.js.map +1 -0
  60. package/dist/core/search-fallback.d.ts +28 -0
  61. package/dist/core/search-fallback.d.ts.map +1 -0
  62. package/dist/core/search-fallback.js +185 -0
  63. package/dist/core/search-fallback.js.map +1 -0
  64. package/dist/core/search-provider.d.ts +47 -4
  65. package/dist/core/search-provider.d.ts.map +1 -1
  66. package/dist/core/search-provider.js +278 -7
  67. package/dist/core/search-provider.js.map +1 -1
  68. package/dist/core/stealth-patches.d.ts +58 -0
  69. package/dist/core/stealth-patches.d.ts.map +1 -0
  70. package/dist/core/stealth-patches.js +340 -0
  71. package/dist/core/stealth-patches.js.map +1 -0
  72. package/dist/core/strategies.d.ts +20 -0
  73. package/dist/core/strategies.d.ts.map +1 -1
  74. package/dist/core/strategies.js +284 -48
  75. package/dist/core/strategies.js.map +1 -1
  76. package/dist/core/strategy-hooks.d.ts +1 -1
  77. package/dist/core/strategy-hooks.d.ts.map +1 -1
  78. package/dist/index.d.ts +11 -0
  79. package/dist/index.d.ts.map +1 -1
  80. package/dist/index.js +37 -15
  81. package/dist/index.js.map +1 -1
  82. package/dist/mcp/server.js +109 -4
  83. package/dist/mcp/server.js.map +1 -1
  84. package/dist/server/app.d.ts.map +1 -1
  85. package/dist/server/app.js +29 -0
  86. package/dist/server/app.js.map +1 -1
  87. package/dist/server/middleware/rate-limit.d.ts +2 -1
  88. package/dist/server/middleware/rate-limit.d.ts.map +1 -1
  89. package/dist/server/middleware/rate-limit.js +24 -8
  90. package/dist/server/middleware/rate-limit.js.map +1 -1
  91. package/dist/server/routes/agent.d.ts +4 -0
  92. package/dist/server/routes/agent.d.ts.map +1 -1
  93. package/dist/server/routes/agent.js +196 -9
  94. package/dist/server/routes/agent.js.map +1 -1
  95. package/dist/server/routes/batch.js +5 -5
  96. package/dist/server/routes/batch.js.map +1 -1
  97. package/dist/server/routes/compat.d.ts.map +1 -1
  98. package/dist/server/routes/compat.js +1 -0
  99. package/dist/server/routes/compat.js.map +1 -1
  100. package/dist/server/routes/fetch.d.ts.map +1 -1
  101. package/dist/server/routes/fetch.js +60 -6
  102. package/dist/server/routes/fetch.js.map +1 -1
  103. package/dist/server/routes/mcp.d.ts.map +1 -1
  104. package/dist/server/routes/mcp.js +103 -2
  105. package/dist/server/routes/mcp.js.map +1 -1
  106. package/dist/server/routes/search.js +1 -1
  107. package/dist/server/routes/search.js.map +1 -1
  108. package/dist/types.d.ts +55 -4
  109. package/dist/types.d.ts.map +1 -1
  110. package/dist/types.js +4 -1
  111. package/dist/types.js.map +1 -1
  112. package/llms.txt +55 -125
  113. package/package.json +15 -1
@@ -12,6 +12,50 @@
12
12
  */
13
13
  import { simpleFetch } from './fetcher.js';
14
14
  // ---------------------------------------------------------------------------
15
+ // Helpers
16
+ // ---------------------------------------------------------------------------
17
/**
 * Resolve Reddit share URLs (/s/CODE) to their actual destination.
 * These are short redirect links that point to the real post URL.
 *
 * Resolution is best-effort: the function never throws or rejects. Any
 * failure (unparseable URL, unsupported protocol, network error, timeout,
 * malformed Location header) yields the original `url` unchanged.
 *
 * @param {string} url - Candidate share URL.
 * @returns {Promise<string>} The redirect target (one hop), or `url` itself
 *   when it is not a share link or cannot be resolved.
 */
async function resolveRedditShareUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return url; // Not a parseable URL — nothing to resolve
    }
    // Match /r/subreddit/s/CODE or /s/CODE patterns
    if (!parsed.pathname.includes('/s/'))
        return url;
    // Only http(s) links can be followed with the Node http/https clients
    if (parsed.protocol !== 'https:' && parsed.protocol !== 'http:')
        return url;
    try {
        const { default: https } = await import('https');
        const { default: http } = await import('http');
        const client = parsed.protocol === 'https:' ? https : http;
        // `await` keeps a synchronous throw from client.get() inside this try/catch
        return await new Promise((resolve) => {
            const req = client.get(url, {
                headers: { 'User-Agent': 'WebPeel/0.17.0 (web data platform; https://webpeel.dev) Node.js' },
                timeout: 10000,
            }, (res) => {
                res.resume(); // Consume response so the socket is released
                const { statusCode, headers } = res;
                const isRedirect = Boolean(statusCode) && statusCode >= 300 && statusCode < 400;
                // Follow redirect (one hop)
                if (!isRedirect || !headers.location) {
                    resolve(url); // No redirect, return original
                    return;
                }
                try {
                    // new URL() handles both absolute and relative Location values
                    resolve(new URL(headers.location, url).href);
                }
                catch {
                    resolve(url); // Malformed Location header
                }
            });
            req.on('error', () => resolve(url));
            req.on('timeout', () => {
                req.destroy();
                resolve(url);
            });
        });
    }
    catch {
        return url; // On any error, return original URL
    }
}
58
+ // ---------------------------------------------------------------------------
15
59
  // Registry
16
60
  // ---------------------------------------------------------------------------
17
61
  const REGISTRY = [
@@ -19,6 +63,13 @@ const REGISTRY = [
19
63
  { match: (h) => h === 'reddit.com' || h === 'www.reddit.com' || h === 'old.reddit.com', extractor: redditExtractor },
20
64
  { match: (h) => h === 'github.com' || h === 'www.github.com', extractor: githubExtractor },
21
65
  { match: (h) => h === 'news.ycombinator.com', extractor: hackerNewsExtractor },
66
+ { match: (h) => h === 'en.wikipedia.org' || h === 'www.wikipedia.org' || /\w+\.wikipedia\.org/.test(h), extractor: wikipediaExtractor },
67
+ { match: (h) => h === 'youtube.com' || h === 'www.youtube.com' || h === 'youtu.be', extractor: youtubeExtractor },
68
+ { match: (h) => h === 'arxiv.org' || h === 'export.arxiv.org', extractor: arxivExtractor },
69
+ { match: (h) => h === 'stackoverflow.com' || h === 'www.stackoverflow.com', extractor: stackOverflowExtractor },
70
+ { match: (h) => h === 'www.npmjs.com' || h === 'npmjs.com', extractor: npmExtractor },
71
+ { match: (h) => h === 'www.bestbuy.com' || h === 'bestbuy.com', extractor: bestBuyExtractor },
72
+ { match: (h) => h === 'www.walmart.com' || h === 'walmart.com', extractor: walmartExtractor },
22
73
  ];
23
74
  /**
24
75
  * Returns the domain extractor for a URL, or null if none matches.
@@ -81,6 +132,23 @@ async function fetchJson(url, customHeaders) {
81
132
  });
82
133
  return tryParseJson(result.html);
83
134
  }
135
/**
 * Fetch JSON with exponential backoff retry on 429 / rate-limit errors.
 *
 * Delegates to `fetchJson(url, headers)`. When that call throws an error whose
 * message looks like a rate-limit response ("429", "rate limit", "too many",
 * matched case-insensitively), waits `baseDelayMs * 2^attempt` ms and retries.
 * Any other error, or a rate-limit error on the final attempt, is rethrown.
 *
 * @param {string} url - URL passed through to `fetchJson`.
 * @param {object} [headers] - Custom headers passed through to `fetchJson`.
 * @param {number} [retries=2] - Maximum number of retries after the first attempt.
 *   Negative or non-finite values are treated as 0 (a single attempt).
 * @param {number} [baseDelayMs=1000] - Delay before the first retry, doubled each time.
 * @returns {Promise<*>} Whatever `fetchJson` resolves to.
 */
async function fetchJsonWithRetry(url, headers, retries = 2, baseDelayMs = 1000) {
    // Always make at least one attempt, even for a nonsensical retry count
    const maxRetries = Number.isFinite(retries) ? Math.max(0, Math.floor(retries)) : 0;
    // `\brate` avoids false positives such as "generate" or "separate"
    const looksRateLimited = (err) => /429|\brate(?:[-_ ]?limit|\b)|too many/i.test(String(err?.message ?? err ?? ''));
    for (let attempt = 0;; attempt++) {
        try {
            return await fetchJson(url, headers);
        }
        catch (e) {
            if (attempt >= maxRetries || !looksRateLimited(e))
                throw e;
            await new Promise((resolve) => setTimeout(resolve, baseDelayMs * 2 ** attempt));
        }
    }
}
84
152
  // ---------------------------------------------------------------------------
85
153
  // 1. Twitter / X extractor
86
154
  // ---------------------------------------------------------------------------
@@ -161,6 +229,86 @@ async function twitterExtractor(html, url) {
161
229
  const isTweet = pathParts.includes('status');
162
230
  const type = isTweet ? 'tweet' : 'profile';
163
231
  const domain = 'twitter.com';
232
+ // --- Try FxTwitter API first (works from datacenter IPs, no auth needed) ---
233
+ const username = pathParts[0] || '';
234
+ if (isTweet) {
235
+ const statusId = pathParts[pathParts.indexOf('status') + 1];
236
+ if (statusId && username) {
237
+ try {
238
+ const fxUrl = `https://api.fxtwitter.com/${username}/status/${statusId}`;
239
+ const fxData = await fetchJson(fxUrl);
240
+ if (fxData && fxData.code === 200 && fxData.tweet) {
241
+ const t = fxData.tweet;
242
+ const structured = {
243
+ author: {
244
+ name: t.author?.name || '',
245
+ handle: '@' + (t.author?.screen_name || ''),
246
+ verified: t.author?.verified || false,
247
+ },
248
+ text: t.text || '',
249
+ timestamp: t.created_at ? new Date(t.created_at).toISOString() : undefined,
250
+ metrics: {
251
+ likes: t.likes ?? 0,
252
+ retweets: t.retweets ?? 0,
253
+ replies: t.replies ?? 0,
254
+ views: t.views ?? 0,
255
+ },
256
+ media: (t.media?.all || []).map((m) => m.url).filter(Boolean),
257
+ quotedTweet: t.quote ? {
258
+ text: t.quote.text || '',
259
+ author: { name: t.quote.author?.name || '', handle: '@' + (t.quote.author?.screen_name || '') },
260
+ } : null,
261
+ source: 'fxtwitter',
262
+ };
263
+ const authorLine = `**${structured.author.name}** (${structured.author.handle})`;
264
+ const timeLine = structured.timestamp ? `\n*${structured.timestamp}*` : '';
265
+ const metricsLine = `\n\n💬 ${structured.metrics.replies} 🔁 ${structured.metrics.retweets} ❤️ ${structured.metrics.likes}${structured.metrics.views ? ` 👁 ${structured.metrics.views}` : ''}`;
266
+ const mediaLine = structured.media.length ? `\n\n📷 Media: ${structured.media.join(', ')}` : '';
267
+ const quotedLine = structured.quotedTweet
268
+ ? `\n\n> **Quoted tweet by ${structured.quotedTweet.author?.name || 'unknown'}:** ${structured.quotedTweet.text}`
269
+ : '';
270
+ const cleanContent = `## 🐦 Tweet by ${authorLine}${timeLine}\n\n${structured.text}${quotedLine}${metricsLine}${mediaLine}`;
271
+ return { domain, type, structured, cleanContent };
272
+ }
273
+ }
274
+ catch (e) {
275
+ if (process.env.DEBUG)
276
+ console.debug('[webpeel]', 'FxTwitter API failed:', e instanceof Error ? e.message : e);
277
+ }
278
+ }
279
+ }
280
+ // --- Try FxTwitter for profiles ---
281
+ if (!isTweet && username) {
282
+ try {
283
+ const fxUrl = `https://api.fxtwitter.com/${username}`;
284
+ const fxData = await fetchJson(fxUrl);
285
+ if (fxData && fxData.code === 200 && fxData.user) {
286
+ const u = fxData.user;
287
+ const structured = {
288
+ name: u.name || '',
289
+ handle: '@' + (u.screen_name || ''),
290
+ bio: u.description || '',
291
+ followers: u.followers ?? 0,
292
+ following: u.following ?? 0,
293
+ tweets: u.tweets ?? 0,
294
+ likes: u.likes ?? 0,
295
+ verified: u.verification?.verified || false,
296
+ location: u.location || '',
297
+ created: u.joined || undefined,
298
+ avatarUrl: u.avatar_url || null,
299
+ bannerUrl: u.banner_url || null,
300
+ website: u.website || null,
301
+ source: 'fxtwitter',
302
+ };
303
+ const cleanContent = `## 🐦 @${(structured.handle || '').replace('@', '')} on X/Twitter\n\n**${structured.name}**${structured.verified ? ' ✓' : ''}\n${structured.bio || ''}\n\n📍 ${structured.location || 'N/A'} | 👥 ${structured.followers?.toLocaleString() || 0} followers | Following: ${structured.following?.toLocaleString() || 0} | Tweets: ${structured.tweets?.toLocaleString() || 0}`;
304
+ return { domain, type: 'profile', structured, cleanContent };
305
+ }
306
+ }
307
+ catch (e) {
308
+ if (process.env.DEBUG)
309
+ console.debug('[webpeel]', 'FxTwitter profile API failed:', e instanceof Error ? e.message : e);
310
+ }
311
+ }
164
312
  // --- Try __NEXT_DATA__ JSON (SSR data) ---
165
313
  const nextDataMatch = html.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
166
314
  let structured = null;
@@ -262,23 +410,142 @@ function parseRedditComment(data, depth) {
262
410
  };
263
411
  }
264
412
  async function redditExtractor(_html, url) {
265
- const urlObj = new URL(url);
413
+ // Resolve Reddit share URLs (/s/CODE) to actual post URLs before any processing
414
+ let workingUrl = url;
415
+ if (url.includes('/s/')) {
416
+ const resolved = await resolveRedditShareUrl(url);
417
+ if (resolved !== url) {
418
+ if (process.env.DEBUG)
419
+ console.debug('[webpeel]', `Reddit share URL resolved: ${url} → ${resolved}`);
420
+ workingUrl = resolved;
421
+ }
422
+ }
423
+ const urlObj = new URL(workingUrl);
266
424
  const path = urlObj.pathname;
267
425
  const domain = 'reddit.com';
426
+ // Normalize old.reddit.com → www.reddit.com for JSON API
427
+ const normalizedUrl = workingUrl.replace(/old\.reddit\.com/, 'www.reddit.com');
428
+ const REDDIT_UA = { 'User-Agent': 'WebPeel/0.17.0 (web data platform; https://webpeel.dev) Node.js' };
268
429
  // Detect page type
269
- const isPost = /\/r\/[^/]+\/comments\//.test(path);
270
- const isSubreddit = /^\/r\/[^/]+\/?$/.test(path);
430
+ const isPost = /\/r\/[^/]+\/comments\//.test(path) || /^\/comments\//.test(path);
431
+ const isGallery = /\/gallery\//.test(path);
432
+ // Subreddit with any sort/filter: /r/sub, /r/sub/, /r/sub/hot, /r/sub/top, /r/sub/new, /r/sub/rising
433
+ const isSubreddit = /^\/r\/[^/]+\/?$/.test(path) || /^\/r\/[^/]+\/(hot|new|top|rising|controversial|best)\/?$/.test(path);
271
434
  const isUser = /^\/(u|user)\/[^/]+/.test(path);
272
- const type = isPost ? 'post' : isSubreddit ? 'subreddit' : isUser ? 'user' : 'listing';
435
+ // Home/popular/all pages
436
+ const isHomeListing = /^\/(hot|new|top|rising|controversial|best|popular|all)\/?$/.test(path) || path === '/' || path === '';
437
+ const type = isPost || isGallery ? 'post' : isSubreddit ? 'subreddit' : isUser ? 'user' : isHomeListing ? 'listing' : 'listing';
438
+ if (isGallery) {
439
+ // Gallery posts: fetch the gallery JSON and extract the post data
440
+ const galleryJsonUrl = normalizedUrl.split('?')[0].replace(/\/?$/, '') + '.json?limit=25&sort=top';
441
+ const requestedGallerySub = path.match(/\/r\/([^/]+)/)?.[1] || 'unknown';
442
+ let galleryData;
443
+ try {
444
+ galleryData = await fetchJsonWithRetry(galleryJsonUrl, REDDIT_UA);
445
+ }
446
+ catch (e) {
447
+ return {
448
+ domain,
449
+ type: 'post',
450
+ structured: { error: 'Post not found or has been deleted', subreddit: `r/${requestedGallerySub}` },
451
+ cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedGallerySub} could not be found. It may have been deleted or removed.`,
452
+ };
453
+ }
454
+ if (!Array.isArray(galleryData) || galleryData.length < 1) {
455
+ return {
456
+ domain,
457
+ type: 'post',
458
+ structured: { error: 'Post not found', subreddit: `r/${requestedGallerySub}` },
459
+ cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedGallerySub} could not be found. It may have been deleted or removed.`,
460
+ };
461
+ }
462
+ const postData = galleryData[0]?.data?.children?.[0]?.data;
463
+ if (!postData) {
464
+ return {
465
+ domain,
466
+ type: 'post',
467
+ structured: { error: 'Post not found', subreddit: `r/${requestedGallerySub}` },
468
+ cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedGallerySub} could not be found. It may have been deleted or removed.`,
469
+ };
470
+ }
471
+ // Validate subreddit matches the request
472
+ const actualGallerySub = postData.subreddit?.toLowerCase();
473
+ if (requestedGallerySub !== 'unknown' && actualGallerySub && requestedGallerySub.toLowerCase() !== actualGallerySub) {
474
+ return {
475
+ domain,
476
+ type: 'post',
477
+ structured: { error: 'Post not found in requested subreddit', requestedSubreddit: `r/${requestedGallerySub}`, actualSubreddit: `r/${actualGallerySub}` },
478
+ cleanContent: `## ❌ Reddit Post Not Found\n\nThe post was not found in r/${requestedGallerySub}. It may have been deleted or moved.`,
479
+ };
480
+ }
481
+ const structured = {
482
+ subreddit: `r/${postData.subreddit || ''}`,
483
+ title: postData.title || '',
484
+ author: `u/${postData.author || '[deleted]'}`,
485
+ score: postData.score ?? 0,
486
+ upvoteRatio: postData.upvote_ratio ?? 1,
487
+ url: postData.url || url,
488
+ selftext: postData.selftext || '',
489
+ commentCount: postData.num_comments ?? 0,
490
+ created: unixToIso(postData.created_utc),
491
+ flair: postData.link_flair_text || null,
492
+ comments: [],
493
+ isGallery: true,
494
+ };
495
+ const cleanContent = `## 📋 ${structured.subreddit}: ${structured.title}
496
+
497
+ **Posted by** ${structured.author} | Score: ${structured.score} | ${structured.commentCount} comments
498
+ *${structured.created}*
499
+
500
+ *(Gallery post)*`;
501
+ return { domain, type: 'post', structured, cleanContent };
502
+ }
273
503
  if (isPost) {
274
504
  // Fetch post data via Reddit JSON API
275
- const jsonUrl = url.split('?')[0].replace(/\/?$/, '') + '.json?limit=25&sort=top';
276
- const data = await fetchJson(jsonUrl, { 'User-Agent': 'WebPeel/1.0' });
277
- if (!Array.isArray(data) || data.length < 2)
278
- return null;
505
+ const jsonUrl = normalizedUrl.split('?')[0].replace(/\/?$/, '') + '.json?limit=25&sort=top';
506
+ const requestedPostSub = path.match(/\/r\/([^/]+)/)?.[1] || 'unknown';
507
+ let data;
508
+ try {
509
+ data = await fetchJsonWithRetry(jsonUrl, REDDIT_UA);
510
+ }
511
+ catch (e) {
512
+ // Post not found or API error — return a "not found" result
513
+ // instead of null (which would trigger browser fallback with wrong content)
514
+ return {
515
+ domain,
516
+ type: 'post',
517
+ structured: { error: 'Post not found or has been deleted', subreddit: `r/${requestedPostSub}` },
518
+ cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedPostSub} could not be found. It may have been deleted or removed.`,
519
+ };
520
+ }
521
+ if (!Array.isArray(data) || data.length < 2) {
522
+ return {
523
+ domain,
524
+ type: 'post',
525
+ structured: { error: 'Post not found', subreddit: `r/${requestedPostSub}` },
526
+ cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedPostSub} could not be found. It may have been deleted or removed.`,
527
+ };
528
+ }
279
529
  const postData = data[0]?.data?.children?.[0]?.data;
280
- if (!postData)
281
- return null;
530
+ if (!postData) {
531
+ return {
532
+ domain,
533
+ type: 'post',
534
+ structured: { error: 'Post not found', subreddit: `r/${requestedPostSub}` },
535
+ cleanContent: `## ❌ Reddit Post Not Found\n\nThe post at r/${requestedPostSub} could not be found. It may have been deleted or removed.`,
536
+ };
537
+ }
538
+ // CRITICAL: Validate subreddit matches the request (prevents cross-subreddit ID reuse exploits)
539
+ const actualPostSub = postData.subreddit?.toLowerCase();
540
+ if (requestedPostSub !== 'unknown' && actualPostSub && requestedPostSub.toLowerCase() !== actualPostSub) {
541
+ // Reddit reused the post ID in a different subreddit — return error instead of wrong content
542
+ return {
543
+ domain,
544
+ type: 'post',
545
+ structured: { error: 'Post not found in requested subreddit', requestedSubreddit: `r/${requestedPostSub}`, actualSubreddit: `r/${actualPostSub}` },
546
+ cleanContent: `## ❌ Reddit Post Not Found\n\nThe post was not found in r/${requestedPostSub}. It may have been deleted or moved.`,
547
+ };
548
+ }
282
549
  // Parse top comments (max 20)
283
550
  const commentChildren = data[1]?.data?.children || [];
284
551
  const comments = [];
@@ -326,8 +593,13 @@ ${commentsMd || '*No comments found.*'}`;
326
593
  }
327
594
  if (isSubreddit) {
328
595
  // Fetch subreddit listing
329
- const jsonUrl = url.split('?')[0].replace(/\/?$/, '') + '.json?limit=15';
330
- const data = await fetchJson(jsonUrl, { 'User-Agent': 'WebPeel/1.0' });
596
+ // Preserve query params (especially t=day, t=week etc. for sorted views)
597
+ const queryString = urlObj.search || '';
598
+ const sortMatch = path.match(/\/r\/[^/]+\/(hot|new|top|rising|controversial|best)/);
599
+ const sortPath = sortMatch ? `/${sortMatch[1]}` : '';
600
+ const baseSubUrl = normalizedUrl.match(/\/r\/[^/]+/)?.[0] || normalizedUrl.split('?')[0];
601
+ const jsonUrl = `https://www.reddit.com${baseSubUrl}${sortPath}.json?limit=15${queryString ? '&' + queryString.slice(1) : ''}`;
602
+ const data = await fetchJsonWithRetry(jsonUrl, REDDIT_UA);
331
603
  if (!data?.data?.children)
332
604
  return null;
333
605
  const posts = data.data.children
@@ -350,6 +622,36 @@ ${commentsMd || '*No comments found.*'}`;
350
622
  ${posts.map((p, i) => `${i + 1}. **${p.title}**\n ${p.author} | ↑ ${p.score} | 💬 ${p.commentCount}${p.flair ? ` | ${p.flair}` : ''}\n ${p.url}`).join('\n\n')}`;
351
623
  return { domain, type, structured, cleanContent };
352
624
  }
625
+ if (isHomeListing) {
626
+ const sortMatch = path.match(/\/(hot|new|top|rising|controversial|best|popular|all)/);
627
+ const sortType = sortMatch ? sortMatch[1] : 'hot';
628
+ const queryString = urlObj.search || '';
629
+ const jsonUrl = `https://www.reddit.com/${sortType}.json?limit=15${queryString ? '&' + queryString.slice(1) : ''}`;
630
+ const data = await fetchJsonWithRetry(jsonUrl, REDDIT_UA);
631
+ if (!data?.data?.children)
632
+ return null;
633
+ const posts = data.data.children
634
+ .filter((c) => c.kind === 't3')
635
+ .map((c) => {
636
+ const d = c.data;
637
+ return {
638
+ title: d.title || '',
639
+ author: `u/${d.author || '[deleted]'}`,
640
+ score: d.score ?? 0,
641
+ commentCount: d.num_comments ?? 0,
642
+ url: `https://reddit.com${d.permalink}`,
643
+ subreddit: `r/${d.subreddit}`,
644
+ flair: d.link_flair_text || null,
645
+ };
646
+ });
647
+ const structured = { sortType, posts, postCount: posts.length };
648
+ const listMd = posts.map((p, i) => {
649
+ const flairTag = p.flair ? ` | ${p.flair}` : '';
650
+ return `${i + 1}. **${p.title}**\n ${p.author} in ${p.subreddit} | ↑ ${p.score} | 💬 ${p.commentCount}${flairTag}\n ${p.url}`;
651
+ }).join('\n\n');
652
+ const cleanContent = `## 📋 Reddit — ${sortType.charAt(0).toUpperCase() + sortType.slice(1)} Posts\n\n${listMd}`;
653
+ return { domain: 'reddit.com', type: 'listing', structured, cleanContent };
654
+ }
353
655
  // User or other — fall back to null (let normal HTML extraction handle it)
354
656
  return null;
355
657
  }
@@ -499,7 +801,7 @@ ${commentsMd || '*No comments.*'}`;
499
801
  let readmeText = '';
500
802
  if (readmeData?.content) {
501
803
  try {
502
- readmeText = Buffer.from(readmeData.content, 'base64').toString('utf-8').slice(0, 500);
804
+ readmeText = Buffer.from(readmeData.content, 'base64').toString('utf-8').slice(0, 5000);
503
805
  }
504
806
  catch { /* ignore */ }
505
807
  }
@@ -529,7 +831,7 @@ ${structured.description || '*No description.*'}
529
831
  🏷️ Topics: ${topicsStr}
530
832
  🔗 ${structured.homepage || 'No homepage'} | Last push: ${structured.lastPush}${structured.archived ? '\n⚠️ **ARCHIVED**' : ''}
531
833
 
532
- ${structured.readme ? `### README (excerpt)\n\n${structured.readme}` : ''}`;
834
+ ${structured.readme ? `### README\n\n${structured.readme}` : ''}`;
533
835
  return { domain, type: 'repository', structured, cleanContent };
534
836
  }
535
837
  return null;
@@ -652,4 +954,442 @@ ${structured.about ? '\n' + structured.about : ''}`;
652
954
  }
653
955
  return null;
654
956
  }
957
+ // ---------------------------------------------------------------------------
958
+ // 5. Wikipedia extractor
959
+ // ---------------------------------------------------------------------------
960
/**
 * Remove Wikipedia-specific noise from extracted content.
 *
 * Strips section "[edit]" links, numeric citation markers ("[12]"), footnote
 * group markers ("[note 1]", "[nb 2]"), inline maintenance tags such as
 * "[citation needed]" or "[verification needed]", and the
 * "[Learn how and when to remove this message]" banner link, then collapses
 * runs of three or more newlines and trims the result.
 *
 * @param {string} content - Extracted article text.
 * @returns {string} Cleaned text; an empty string for null/undefined input.
 */
function cleanWikipediaContent(content) {
    if (content == null)
        return '';
    return String(content)
        // Remove [edit] links
        .replace(/\[edit\]/gi, '')
        // Remove citation brackets [1], [2], etc.
        .replace(/\[\d+\]/g, '')
        // Remove footnote group markers [note 1], [nb 2]
        .replace(/\[(?:note|nb)\s?\d+\]/gi, '')
        // Remove inline maintenance tags: [citation needed], [verification needed], etc.
        .replace(/\[(?:citation needed|verification(?: needed)?|(?:better|non-primary) source needed|improve this article|adding citations[^\]]*|when\?|where\?|who\?|which\?|by whom\?|according to whom\?|original research\?|clarification needed|dubious[^\]]*|failed verification[^\]]*|unreliable source[^\]]*)\]/gi, '')
        // Remove [Learn how and when to remove this message]
        .replace(/\[Learn how and when to remove this message\]/gi, '')
        // Clean up excess whitespace
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
975
/**
 * Domain extractor for Wikipedia article pages (/wiki/Article_Title).
 *
 * Fetches the article summary from the Wikipedia REST API, then tries the
 * mobile-html endpoint for the full text. If mobile-html fails or yields no
 * paragraphs, the summary extract is used as the body.
 *
 * @param {string} _html - Page HTML (unused; data comes from the REST API).
 * @param {string} url - Wikipedia article URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or null for non-article URLs, namespaced pages
 *   (titles containing a colon), undecodable titles, or API failure.
 */
async function wikipediaExtractor(_html, url) {
    const urlObj = new URL(url);
    const pathParts = urlObj.pathname.split('/').filter(Boolean);
    // Only handle article pages: /wiki/Article_Title
    if (pathParts[0] !== 'wiki' || pathParts.length < 2)
        return null;
    let articleTitle;
    try {
        // Titles may contain slashes (e.g. "AC/DC"), so keep every segment after /wiki/
        articleTitle = decodeURIComponent(pathParts.slice(1).join('/'));
    }
    catch {
        return null; // Malformed percent-encoding — let normal HTML extraction handle it
    }
    // Skip special pages (contain a colon, e.g. Special:Random, Talk:Article)
    if (articleTitle.includes(':'))
        return null;
    // www.wikipedia.org is the multilingual portal, not a language edition — default to English
    const subdomain = urlObj.hostname.split('.')[0];
    const lang = !subdomain || subdomain === 'www' || subdomain === 'wikipedia' ? 'en' : subdomain;
    const encodedTitle = encodeURIComponent(articleTitle);
    const apiUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodedTitle}`;
    // Wikipedia REST API requires a descriptive User-Agent (https://meta.wikimedia.org/wiki/User-Agent_policy)
    const wikiHeaders = { 'User-Agent': 'WebPeel/0.17.0 (https://webpeel.dev; jake@jakeliu.me) Node.js', 'Api-User-Agent': 'WebPeel/0.17.0 (https://webpeel.dev; jake@jakeliu.me)' };
    try {
        const data = await fetchJson(apiUrl, wikiHeaders);
        if (!data || data.type === 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found')
            return null;
        // For full article content, use the mobile-html endpoint (mobile-sections is deprecated)
        let fullContent = '';
        try {
            const fullUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodedTitle}`;
            const fullResult = await simpleFetch(fullUrl, undefined, 15000, {
                ...wikiHeaders,
                'Accept': 'text/html',
            });
            if (fullResult?.html) {
                // Parse sections from the mobile HTML
                const sectionMatches = fullResult.html.match(/<section[^>]*>([\s\S]*?)<\/section>/gi) || [];
                for (const section of sectionMatches) {
                    // Extract section heading
                    const headingMatch = section.match(/<h[2-6][^>]*id="([^"]*)"[^>]*class="[^"]*pcs-edit-section-title[^"]*"[^>]*>([\s\S]*?)<\/h[2-6]>/i);
                    const heading = headingMatch ? stripHtml(headingMatch[2]).trim() : '';
                    // Extract paragraphs
                    const paragraphs = section.match(/<p[^>]*>([\s\S]*?)<\/p>/gi) || [];
                    const sectionText = paragraphs.map((p) => stripHtml(p).trim()).filter((t) => t.length > 0).join('\n\n');
                    if (sectionText) {
                        const prefix = heading ? `## ${heading}\n\n` : '';
                        fullContent += `\n\n${prefix}${sectionText}`;
                    }
                }
            }
        }
        catch (e) {
            // mobile-html failed — use summary extract as fallback
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'Wikipedia mobile-html failed, using summary:', e instanceof Error ? e.message : e);
        }
        // Clean Wikipedia-specific noise
        fullContent = cleanWikipediaContent(fullContent);
        const structured = {
            title: data.title || articleTitle.replace(/_/g, ' '),
            description: data.description || '',
            extract: data.extract || '',
            thumbnail: data.thumbnail?.source || null,
            url: data.content_urls?.desktop?.page || url,
            lastModified: data.timestamp || null,
        };
        const cleanContent = `# ${structured.title}\n\n${structured.description ? `*${structured.description}*\n\n` : ''}${fullContent || structured.extract}`;
        return { domain: 'wikipedia.org', type: 'article', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Wikipedia API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
1042
+ // ---------------------------------------------------------------------------
1043
+ // 6. YouTube extractor (oEmbed API-first)
1044
+ // ---------------------------------------------------------------------------
1045
/**
 * Domain extractor for YouTube URLs, based on the oEmbed API.
 *
 * Calls YouTube's oEmbed endpoint for title/author/thumbnail, then makes a
 * best-effort call to noembed.com for an optional description.
 *
 * @param {string} _html - Page HTML (unused; no HTML parsing is implemented).
 * @param {string} url - YouTube URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or null when oEmbed fails or returns no title.
 */
async function youtubeExtractor(_html, url) {
    // Try YouTube oEmbed API first (no auth, works without browser)
    try {
        const encodedUrl = encodeURIComponent(url);
        const oembedUrl = `https://www.youtube.com/oembed?url=${encodedUrl}&format=json`;
        const oembedData = await fetchJson(oembedUrl);
        if (oembedData && oembedData.title) {
            // Also try noembed for richer data
            let noembedData = null;
            try {
                noembedData = await fetchJson(`https://noembed.com/embed?url=${encodedUrl}`);
            }
            catch (e) {
                // Optional enrichment — log in debug mode and carry on without it
                if (process.env.DEBUG)
                    console.debug('[webpeel]', 'noembed lookup failed:', e instanceof Error ? e.message : e);
            }
            const structured = {
                title: oembedData.title,
                author: oembedData.author_name || '',
                authorUrl: oembedData.author_url || '',
                thumbnailUrl: oembedData.thumbnail_url || '',
                type: oembedData.type || 'video',
                source: 'oembed',
            };
            // Avoid emitting a broken markdown link ("[name]()" / "[]()") when fields are missing
            const channelName = structured.author || 'Unknown';
            const channelMd = structured.authorUrl ? `[${channelName}](${structured.authorUrl})` : channelName;
            const cleanContent = `## 🎬 ${structured.title}\n\n**Channel:** ${channelMd}\n\n${noembedData?.description || 'YouTube video'}`;
            return { domain: 'youtube.com', type: 'video', structured, cleanContent };
        }
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'YouTube oEmbed failed:', e instanceof Error ? e.message : e);
    }
    // Fallback: return null (no HTML parsing implemented)
    return null;
}
1076
+ // ---------------------------------------------------------------------------
1077
+ // 7. ArXiv extractor (ArXiv API)
1078
+ // ---------------------------------------------------------------------------
1079
/**
 * Extract paper metadata for an arXiv URL via the arXiv export API.
 *
 * Recognises `/abs/<id>`, `/pdf/<id>` and `/html/<id>` paths with new-style
 * IDs (e.g. `2501.12948`, `2501.12948v2`). The API answers with an Atom feed;
 * fields are read with small regexes from the first `<entry>` only, because
 * the feed itself also carries `<title>` and `<updated>` elements that
 * describe the query rather than the paper.
 *
 * @param {string} _html - Page HTML (unused).
 * @param {string} url - arXiv page URL.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when the URL is not a paper URL, the API response has no usable
 *   entry, or any step throws.
 */
async function arxivExtractor(_html, url) {
    try {
        const { pathname } = new URL(url);
        // Extract paper ID from URL patterns:
        // /abs/2501.12948, /pdf/2501.12948, /abs/2501.12948v2
        const idMatch = pathname.match(/\/(abs|pdf|html)\/(\d{4}\.\d{4,5}(?:v\d+)?)/);
        if (!idMatch)
            return null;
        const paperId = idMatch[2];
        // Use ArXiv API
        const apiUrl = `https://export.arxiv.org/api/query?id_list=${paperId}`;
        const result = await simpleFetch(apiUrl, 'WebPeel/0.17.0', 15000, { Accept: 'application/xml' });
        if (!result?.html)
            return null;
        // Restrict parsing to the first <entry>; an unknown ID yields a feed
        // with no entry at all.
        const entryMatch = result.html.match(/<entry(?:\s[^>]*)?>([\s\S]*?)<\/entry>/);
        if (!entryMatch)
            return null;
        const xml = entryMatch[1];
        // The API reports request errors as an entry whose <id> points at /api/errors.
        if (/<id>[^<]*\/api\/errors/.test(xml))
            return null;
        // Parse XML (simple regex-based for these known fields)
        const tagPattern = (tag, flags) => new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)</${tag}>`, flags);
        const getRaw = (tag) => {
            const match = xml.match(tagPattern(tag));
            return match ? stripHtml(match[1]).trim() : '';
        };
        // Single-line fields: the API hard-wraps long values, so fold whitespace.
        const oneLine = (text) => text.replace(/\s+/g, ' ').trim();
        const getTag = (tag) => oneLine(getRaw(tag));
        const getAllTags = (tag) => [...xml.matchAll(tagPattern(tag, 'g'))]
            .map((m) => oneLine(stripHtml(m[1])))
            .filter(Boolean);
        const title = getTag('title');
        if (!title)
            return null;
        const summary = getRaw('summary');
        const published = getTag('published');
        const updated = getTag('updated');
        const authors = getAllTags('name');
        // Only plain <category> elements: <arxiv:primary_category> repeats one
        // of them and would otherwise show up twice.
        const categories = [...new Set([...xml.matchAll(/<category\b[^>]*\bterm="([^"]+)"/g)].map((m) => m[1]))];
        // Extract DOI and journal ref if available
        const doi = getTag('arxiv:doi');
        const journalRef = getTag('arxiv:journal_ref');
        const structured = {
            title,
            authors,
            abstract: summary,
            published: published || undefined,
            updated: updated || undefined,
            categories,
            doi: doi || undefined,
            journalRef: journalRef || undefined,
            paperId,
            pdfUrl: `https://arxiv.org/pdf/${paperId}`,
            absUrl: `https://arxiv.org/abs/${paperId}`,
        };
        const authorLine = authors.length <= 5
            ? authors.join(', ')
            : `${authors.slice(0, 5).join(', ')} et al. (${authors.length} authors)`;
        const cleanContent = `# ${title}\n\n**Authors:** ${authorLine}\n**Published:** ${published?.split('T')[0] || 'N/A'}${categories.length ? `\n**Categories:** ${categories.join(', ')}` : ''}${doi ? `\n**DOI:** ${doi}` : ''}${journalRef ? `\n**Journal:** ${journalRef}` : ''}\n\n## Abstract\n\n${summary}\n\n📄 [PDF](${structured.pdfUrl}) | [Abstract](${structured.absUrl})`;
        return { domain: 'arxiv.org', type: 'paper', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'ArXiv API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
1141
+ // ---------------------------------------------------------------------------
1142
+ // 8. Stack Overflow extractor (StackExchange API)
1143
+ // ---------------------------------------------------------------------------
1144
/**
 * Extract a Stack Overflow question and its top-voted answers through the
 * Stack Exchange API (v2.3, `site=stackoverflow`).
 *
 * Accepts both canonical `/questions/<id>/<slug>` URLs and `/q/<id>` share
 * links. The question and answers requests are independent, so they are
 * issued concurrently; a failed answers request only results in an empty
 * answer list.
 *
 * @param {string} _html - Page HTML (unused).
 * @param {string} url - Stack Overflow question URL.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when the URL carries no question ID, the API returns no item,
 *   or any step throws.
 */
async function stackOverflowExtractor(_html, url) {
    try {
        const { pathname } = new URL(url);
        // Match /questions/12345/optional-slug and the /q/12345 share form
        const questionMatch = pathname.match(/\/(?:questions|q)\/(\d+)/);
        if (!questionMatch)
            return null;
        const questionId = questionMatch[1];
        const apiUrl = `https://api.stackexchange.com/2.3/questions/${questionId}?order=desc&sort=votes&site=stackoverflow&filter=withbody`;
        const answersUrl = `https://api.stackexchange.com/2.3/questions/${questionId}/answers?order=desc&sort=votes&site=stackoverflow&filter=withbody&pagesize=5`;
        // Answers are optional: swallow their failure so it cannot reject the
        // combined wait below.
        const fetchAnswers = async () => {
            try {
                const answersData = await fetchJson(answersUrl);
                return answersData?.items || [];
            }
            catch {
                return [];
            }
        };
        const [data, answers] = await Promise.all([fetchJson(apiUrl), fetchAnswers()]);
        const q = data?.items?.[0];
        if (!q)
            return null;
        const structured = {
            title: stripHtml(q.title || ''),
            questionId: q.question_id,
            score: q.score || 0,
            views: q.view_count || 0,
            answerCount: q.answer_count || 0,
            isAnswered: q.is_answered || false,
            tags: q.tags || [],
            askedBy: q.owner?.display_name || 'anonymous',
            // creation_date is multiplied by 1000 to get milliseconds for Date
            askedDate: q.creation_date ? new Date(q.creation_date * 1000).toISOString() : undefined,
            acceptedAnswerId: q.accepted_answer_id || null,
            answers: answers.map((a) => ({
                id: a.answer_id,
                score: a.score,
                isAccepted: a.is_accepted || false,
                body: stripHtml(a.body || '').substring(0, 2000),
                author: a.owner?.display_name || 'anonymous',
            })),
        };
        const questionBody = stripHtml(q.body || '').substring(0, 3000);
        const tagLine = structured.tags.length ? `**Tags:** ${structured.tags.join(', ')}` : '';
        // Only the first three answers go into the markdown; `structured`
        // keeps all that were fetched.
        const answersContent = structured.answers
            .slice(0, 3)
            .map((a) => {
                const acceptedMark = a.isAccepted ? ' ✅ Accepted' : '';
                return `\n\n---\n\n### Answer by ${a.author} (Score: ${a.score}${acceptedMark})\n\n${a.body}`;
            })
            .join('');
        const cleanContent = `# ${structured.title}\n\n**Score:** ${structured.score} | **Views:** ${structured.views?.toLocaleString()} | **Answers:** ${structured.answerCount}\n${tagLine}\n**Asked by:** ${structured.askedBy}\n\n## Question\n\n${questionBody}${answersContent}`;
        return { domain: 'stackoverflow.com', type: 'question', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'StackOverflow API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
1201
+ // ---------------------------------------------------------------------------
1202
+ // 9. NPM extractor (npm registry API)
1203
+ // ---------------------------------------------------------------------------
1204
/**
 * Extract package metadata for an npmjs.com package page from the npm
 * registry, plus last-week download counts from api.npmjs.org.
 *
 * Recognises `/package/<name>` and `/package/@scope/<name>` paths. The
 * download-count request is best-effort and runs concurrently with the
 * registry request; when it fails, `weeklyDownloads` is 0.
 *
 * @param {string} _html - Page HTML (unused).
 * @param {string} url - npmjs.com package URL.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when the URL is not a package URL, the registry document has no
 *   `name`, or any step throws.
 */
async function npmExtractor(_html, url) {
    try {
        const { pathname } = new URL(url);
        // Match /package/name or /package/@scope/name
        const packageMatch = pathname.match(/\/package\/((?:@[^/]+\/)?[^/]+)/);
        if (!packageMatch)
            return null;
        const packageName = packageMatch[1];
        const apiUrl = `https://registry.npmjs.org/${encodeURIComponent(packageName)}`;
        const downloadsUrl = `https://api.npmjs.org/downloads/point/last-week/${encodeURIComponent(packageName)}`;
        // Download counts are optional: swallow failures so they cannot
        // reject the combined wait below.
        const fetchDownloads = async () => {
            try {
                return await fetchJson(downloadsUrl);
            }
            catch {
                return null;
            }
        };
        const [data, downloads] = await Promise.all([fetchJson(apiUrl), fetchDownloads()]);
        if (!data?.name)
            return null;
        const latest = data['dist-tags']?.latest;
        const latestVersion = latest ? data.versions?.[latest] : null;
        // Older manifests declare the license as `{ type, url }` instead of a
        // string; interpolating that object would print "[object Object]".
        const rawLicense = latestVersion?.license || data.license;
        const license = (typeof rawLicense === 'string' ? rawLicense : rawLicense?.type) || 'N/A';
        // Include README if available (some packages have it, some don't)
        const readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
        const structured = {
            name: data.name,
            description: data.description || '',
            version: latest || 'unknown',
            license,
            homepage: data.homepage || latestVersion?.homepage || null,
            repository: typeof data.repository === 'string' ? data.repository : data.repository?.url || null,
            author: typeof data.author === 'string' ? data.author : data.author?.name || '',
            keywords: data.keywords || [],
            weeklyDownloads: downloads?.downloads || 0,
            dependencies: Object.keys(latestVersion?.dependencies || {}),
            devDependencies: Object.keys(latestVersion?.devDependencies || {}),
            maintainers: (data.maintainers || []).map((m) => m.name || m).slice(0, 10),
            created: data.time?.created || undefined,
            modified: data.time?.modified || undefined,
            readme: readmeText,
        };
        const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
        const depsLine = structured.dependencies.length
            ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.slice(0, 15).join(', ')}${structured.dependencies.length > 15 ? '...' : ''}`
            : '';
        // Anchor both replacements: an unanchored '.git' also matches inside
        // names such as "foo.github.io" and would corrupt the displayed URL.
        const repoLine = structured.repository
            ? `\n**Repository:** ${structured.repository.replace(/^git\+/, '').replace(/\.git$/, '')}`
            : '';
        const homepageLine = structured.homepage ? `\n**Homepage:** ${structured.homepage}` : '';
        const datesLine = structured.created ? `\n**Created:** ${structured.created?.split('T')[0] || 'N/A'} | **Last modified:** ${structured.modified?.split('T')[0] || 'N/A'}` : '';
        const readmeSection = readmeText
            ? `\n\n### README\n\n${readmeText}`
            : '';
        const cleanContent = `# 📦 ${structured.name}@${structured.version}\n\n${structured.description}\n\n**License:** ${structured.license} | **Weekly Downloads:** ${structured.weeklyDownloads?.toLocaleString() || 'N/A'}\n**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
        return { domain: 'npmjs.com', type: 'package', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'NPM API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
1269
+ // ---------------------------------------------------------------------------
1270
+ // 10. Best Buy extractor (Best Buy Products API)
1271
+ // ---------------------------------------------------------------------------
1272
/**
 * Extract product data for a bestbuy.com product page through the Best Buy
 * Products API. Requires the `BESTBUY_API_KEY` environment variable; without
 * it the extractor is skipped.
 *
 * The SKU is taken from the `/<digits>.p` segment of the URL (seven or more
 * digits).
 *
 * @param {string} _html - Page HTML (unused).
 * @param {string} url - Best Buy product URL.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when no API key is set, no SKU is found, the API reports an
 *   error, or the request throws.
 */
async function bestBuyExtractor(_html, url) {
    const apiKey = process.env.BESTBUY_API_KEY;
    if (!apiKey)
        return null; // No API key, skip
    // Extract SKU from URL: /site/.../6587822.p → 6587822
    const skuMatch = url.match(/\/(\d{7,})\.p/);
    if (!skuMatch)
        return null;
    const sku = skuMatch[1];
    const apiUrl = `https://api.bestbuy.com/v1/products/${sku}.json?apiKey=${encodeURIComponent(apiKey)}&show=sku,name,salePrice,regularPrice,onSale,shortDescription,longDescription,image,largeFrontImage,url,customerReviewAverage,customerReviewCount,categoryPath,manufacturer,modelNumber,upc,freeShipping,inStoreAvailability,onlineAvailability,condition,features.feature`;
    try {
        const data = await fetchJson(apiUrl);
        if (!data || data.error)
            return null;
        // `features` is handled in both shapes: an array of `{ feature }`
        // objects, or an object with a `feature` list. The original only
        // read `data.features.feature`, which is undefined on an array.
        const rawFeatures = Array.isArray(data.features) ? data.features : data.features?.feature;
        const featureList = Array.isArray(rawFeatures) ? rawFeatures : rawFeatures ? [rawFeatures] : [];
        const features = featureList
            .map((f) => (typeof f === 'string' ? f : f?.feature))
            .filter(Boolean);
        // Build clean markdown
        const lines = [`# ${data.name}`, ''];
        lines.push(data.onSale
            ? `**Sale Price:** $${data.salePrice} (was $${data.regularPrice})`
            : `**Price:** $${data.regularPrice}`);
        lines.push(`**SKU:** ${data.sku}`);
        if (data.manufacturer)
            lines.push(`**Brand:** ${data.manufacturer}`);
        if (data.modelNumber)
            lines.push(`**Model:** ${data.modelNumber}`);
        if (data.customerReviewAverage) {
            lines.push(`**Rating:** ${data.customerReviewAverage}/5 (${data.customerReviewCount} reviews)`);
        }
        lines.push(`**Availability:** ${data.onlineAvailability ? 'In Stock Online' : 'Out of Stock Online'} | ${data.inStoreAvailability ? 'Available In Store' : 'Not Available In Store'}`);
        if (data.freeShipping)
            lines.push('**Free Shipping:** Yes');
        lines.push('');
        if (data.shortDescription)
            lines.push(data.shortDescription);
        lines.push('');
        if (data.longDescription)
            lines.push(data.longDescription);
        if (features.length) {
            lines.push('', '## Features', ...features.map((f) => `- ${f}`));
        }
        const structured = {
            sku: data.sku,
            name: data.name,
            price: data.salePrice || data.regularPrice,
            regularPrice: data.regularPrice,
            onSale: data.onSale,
            brand: data.manufacturer,
            model: data.modelNumber,
            upc: data.upc,
            rating: data.customerReviewAverage,
            reviewCount: data.customerReviewCount,
            image: data.largeFrontImage || data.image,
            url: data.url,
            inStock: data.onlineAvailability,
            freeShipping: data.freeShipping,
            condition: data.condition,
            category: data.categoryPath?.map((c) => c.name).join(' > '),
        };
        return { domain: 'bestbuy.com', type: 'product', structured, cleanContent: lines.join('\n') };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Best Buy API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
1346
+ // ---------------------------------------------------------------------------
1347
+ // 11. Walmart extractor (Walmart frontend search API)
1348
+ // ---------------------------------------------------------------------------
1349
/**
 * Extract product data for a walmart.com item page by querying the search
 * endpoint used by Walmart's own frontend with the item ID and taking the
 * first returned item.
 *
 * The item ID is the trailing numeric segment of an `/ip/...` path, e.g.
 * `/ip/Product-Name/1234567` or `/ip/1234567`.
 *
 * NOTE(review): the first search hit is assumed to be the requested item;
 * the response is not cross-checked against the ID — confirm whether the
 * payload exposes an ID field that could be compared.
 *
 * @param {string} _html - Page HTML (unused).
 * @param {string} url - Walmart item URL.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when no item ID is found, the response holds no item, or the
 *   request throws.
 */
async function walmartExtractor(_html, url) {
    // Match against the path only: a query string or fragment containing
    // "/<digits>" would otherwise win the greedy match and yield the wrong ID.
    const pathOnly = url.split(/[?#]/, 1)[0];
    // Extract item ID from URL patterns:
    // /ip/Product-Name/1234567 or /ip/1234567
    const itemMatch = pathOnly.match(/\/ip\/(?:.*\/)?(\d+)/);
    if (!itemMatch)
        return null;
    const itemId = itemMatch[1];
    // Try Walmart's BE API (used by their frontend, sometimes accessible)
    const apiUrl = `https://www.walmart.com/orchestra/snb/graphql/Search?query=${itemId}&page=1&affinityOverride=default&limit=1`;
    try {
        const response = await fetchJson(apiUrl, {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
            'Accept': 'application/json',
            'Referer': 'https://www.walmart.com/',
        });
        const item = response?.data?.search?.searchResult?.itemStacks?.[0]?.items?.[0];
        if (!item)
            return null;
        const price = item.priceInfo?.currentPrice?.price;
        const lines = [`# ${item.name}`];
        if (price) {
            lines.push(`**Price:** $${price}`);
        }
        if (item.averageRating) {
            lines.push(`**Rating:** ${item.averageRating}/5 (${item.numberOfReviews || 0} reviews)`);
        }
        if (item.shortDescription)
            lines.push(item.shortDescription);
        const structured = {
            name: item.name,
            price,
            rating: item.averageRating,
            reviewCount: item.numberOfReviews,
            image: item.imageInfo?.thumbnailUrl,
            itemId,
            inStock: item.availabilityStatusV2?.value === 'IN_STOCK',
        };
        return { domain: 'walmart.com', type: 'product', structured, cleanContent: lines.join('\n') };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Walmart API failed:', e instanceof Error ? e.message : e);
        return null; // API not accessible, fall through to other methods
    }
}
655
1395
  //# sourceMappingURL=domain-extractors.js.map