webpeel 0.20.6 → 0.20.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -324,19 +324,87 @@ async function twitterExtractor(html, url) {
324
324
  source: 'fxtwitter',
325
325
  };
326
326
  // Try to fetch recent tweets from Twitter's public syndication endpoint
327
+ // NOTE: simpleFetch sends too many Sec-* headers that trigger 429. Use https directly.
327
328
  let recentTweets = '';
328
329
  try {
329
- const syndicationUrl = `https://syndication.twitter.com/srv/timeline-profile/screen-name/${u.screen_name}`;
330
- const syndicationResult = await simpleFetch(syndicationUrl, 'Mozilla/5.0 (compatible; WebPeel/1.0)', 8000);
331
- if (syndicationResult?.html) {
332
- // Extract tweet texts from the syndication HTML
333
- const tweetMatches = [...syndicationResult.html.matchAll(/"full_text":"((?:[^"\\]|\\.)*)"/g)];
334
- const tweets = tweetMatches
335
- .slice(0, 5)
336
- .map(m => m[1].replace(/\\n/g, ' ').replace(/\\"/g, '"').trim())
337
- .filter(t => t.length > 10 && !t.startsWith('RT @'));
338
- if (tweets.length > 0) {
339
- recentTweets = '\n\n### Recent Tweets\n\n' + tweets.map(t => `> ${t}`).join('\n\n');
330
+ const { default: httpsModule } = await import('https');
331
+ const syndicationHtml = await new Promise((resolve, reject) => {
332
+ const req = httpsModule.request({
333
+ hostname: 'syndication.twitter.com',
334
+ path: `/srv/timeline-profile/screen-name/${u.screen_name}`,
335
+ method: 'GET',
336
+ headers: {
337
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
338
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
339
+ 'Accept-Language': 'en-US,en;q=0.9',
340
+ },
341
+ }, (res) => {
342
+ if (res.statusCode !== 200) {
343
+ reject(new Error(`HTTP ${res.statusCode}`));
344
+ res.resume();
345
+ return;
346
+ }
347
+ let body = '';
348
+ res.on('data', (chunk) => body += chunk.toString());
349
+ res.on('end', () => resolve(body));
350
+ });
351
+ req.on('error', reject);
352
+ setTimeout(() => req.destroy(new Error('timeout')), 12000);
353
+ req.end();
354
+ });
355
+ if (syndicationHtml) {
356
+ // Parse __NEXT_DATA__ JSON from the syndication page for rich tweet data
357
+ const nextDataMatch = syndicationHtml.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
358
+ if (nextDataMatch) {
359
+ const nextData = tryParseJson(nextDataMatch[1]);
360
+ const entries = nextData?.props?.pageProps?.timeline?.entries || [];
361
+ const tweetSections = [];
362
+ for (const entry of entries) {
363
+ if (tweetSections.length >= 8)
364
+ break;
365
+ const tweet = entry?.content?.tweet;
366
+ if (!tweet?.full_text)
367
+ continue;
368
+ const text = tweet.full_text.replace(/\\n/g, '\n').replace(/\\"/g, '"').trim();
369
+ // Skip retweets and pure-URL-only tweets without media
370
+ if (text.startsWith('RT @'))
371
+ continue;
372
+ const media = tweet.extended_entities?.media || tweet.entities?.media || [];
373
+ const isUrlOnly = /^https?:\/\/t\.co\/\S+$/.test(text.trim()) || /^https?:\/\/t\.co\/\S+\s*$/.test(text.trim());
374
+ if (isUrlOnly && media.length === 0)
375
+ continue;
376
+ // Format date
377
+ const dateStr = tweet.created_at ? (() => {
378
+ try {
379
+ return new Date(tweet.created_at).toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' });
380
+ }
381
+ catch {
382
+ return tweet.created_at;
383
+ }
384
+ })() : '';
385
+ const likes = tweet.favorite_count ?? 0;
386
+ const retweets = tweet.retweet_count ?? 0;
387
+ const replies = tweet.reply_count ?? 0;
388
+ const fmtNum = (n) => n >= 1000000 ? (n / 1000000).toFixed(1) + 'M' : n >= 1000 ? (n / 1000).toFixed(1) + 'K' : String(n);
389
+ const mediaLine = media.length > 0 ? `\n📷 ${media.map((m) => m.media_url_https || m.media_url).filter(Boolean).join(', ')}` : '';
390
+ // Clean t.co URLs from text when they have real media
391
+ const cleanText = media.length > 0 ? text.replace(/https?:\/\/t\.co\/\S+/g, '').trim() : text;
392
+ tweetSections.push(`### ${dateStr}\n${cleanText}${mediaLine}\n♻️ ${fmtNum(retweets)} | ❤️ ${fmtNum(likes)} | 💬 ${fmtNum(replies)}`);
393
+ }
394
+ if (tweetSections.length > 0) {
395
+ recentTweets = '\n\n## Recent Tweets\n\n' + tweetSections.join('\n\n---\n\n');
396
+ }
397
+ }
398
+ else {
399
+ // Fallback: simple regex extraction without metrics
400
+ const tweetMatches = [...syndicationHtml.matchAll(/"full_text":"((?:[^"\\]|\\.)*)"/g)];
401
+ const tweets = tweetMatches
402
+ .slice(0, 5)
403
+ .map(m => m[1].replace(/\\n/g, ' ').replace(/\\"/g, '"').trim())
404
+ .filter(t => t.length > 10 && !t.startsWith('RT @'));
405
+ if (tweets.length > 0) {
406
+ recentTweets = '\n\n## Recent Tweets\n\n' + tweets.map(t => `> ${t}`).join('\n\n');
407
+ }
340
408
  }
341
409
  }
342
410
  }
@@ -344,7 +412,7 @@ async function twitterExtractor(html, url) {
344
412
  const websiteLine = structured.website ? `\n🌐 ${structured.website}` : '';
345
413
  const joinedLine = structured.created ? `\n📅 Joined: ${structured.created}` : '';
346
414
  const likesLine = structured.likes ? ` | ❤️ Likes: ${structured.likes?.toLocaleString() || 0}` : '';
347
- const cleanContent = `## 🐦 @${(structured.handle || '').replace('@', '')} on X/Twitter\n\n**${structured.name}**${structured.verified ? ' ✓' : ''}\n\n${structured.bio || ''}\n\n📍 ${structured.location || 'N/A'}${websiteLine}${joinedLine}\n\n👥 **Followers:** ${structured.followers?.toLocaleString() || 0} | **Following:** ${structured.following?.toLocaleString() || 0} | **Tweets:** ${structured.tweets?.toLocaleString() || 0}${likesLine}${recentTweets}`;
415
+ const cleanContent = `# @${(structured.handle || '').replace('@', '')} on X/Twitter\n\n**${structured.name}**${structured.verified ? ' ✓' : ''}\n\n${structured.bio || ''}\n\n📍 ${structured.location || 'N/A'}${websiteLine}${joinedLine}\n👥 Followers: ${structured.followers?.toLocaleString() || 0} | Following: ${structured.following?.toLocaleString() || 0} | Tweets: ${structured.tweets?.toLocaleString() || 0}${likesLine}${recentTweets}`;
348
416
  return { domain, type: 'profile', structured, cleanContent };
349
417
  }
350
418
  }
@@ -1961,7 +2029,16 @@ async function linkedinExtractor(html, url) {
1961
2029
  try {
1962
2030
  const { load } = await import('cheerio');
1963
2031
  const $ = load(html);
1964
- // LinkedIn SSR exposes some data in meta tags and JSON-LD
2032
+ // Detect page type from URL first
2033
+ const urlObj = new URL(url);
2034
+ const pathParts = urlObj.pathname.split('/').filter(Boolean);
2035
+ const pageType = pathParts[0] === 'company' ? 'company'
2036
+ : pathParts[0] === 'in' ? 'profile'
2037
+ : pathParts[0] === 'jobs' ? 'job'
2038
+ : 'page';
2039
+ // Detect if we're on the authwall (LinkedIn redirects unauthenticated requests)
2040
+ const isAuthwall = html.includes('authwall') || html.includes('Join LinkedIn') || html.includes('Sign in') && !html.includes('linkedin.com/in/');
2041
+ // --- Try parsing meta tags / JSON-LD from the HTML ---
1965
2042
  let jsonLd = null;
1966
2043
  $('script[type="application/ld+json"]').each((_, el) => {
1967
2044
  if (jsonLd)
@@ -1974,30 +2051,83 @@ async function linkedinExtractor(html, url) {
1974
2051
  const ogTitle = $('meta[property="og:title"]').attr('content') || '';
1975
2052
  const ogDescription = $('meta[property="og:description"]').attr('content') || '';
1976
2053
  const ogImage = $('meta[property="og:image"]').attr('content') || '';
1977
- const name = jsonLd?.name || ogTitle.replace(/ \| LinkedIn$/, '').trim() || '';
2054
+ const metaDescription = $('meta[name="description"]').attr('content') || '';
2055
+ let name = jsonLd?.name || ogTitle.replace(/ \| LinkedIn$/, '').replace(/Sign Up \| LinkedIn$/, '').trim() || '';
2056
+ // When on authwall, discard authwall-specific meta data
2057
+ let headline = isAuthwall ? (jsonLd?.jobTitle || '') : (jsonLd?.jobTitle || metaDescription?.split('|')?.[0]?.trim() || ogDescription || '');
2058
+ let description = isAuthwall ? (jsonLd?.description || '') : (jsonLd?.description || ogDescription || '');
2059
+ let location = $('[class*="location"]').first().text().trim() || jsonLd?.address?.addressLocality || '';
2060
+ // --- If authwall or no useful data, try direct HTTPS fetch with minimal headers ---
2061
+ // LinkedIn returns rich og: meta tags when fetched with a plain browser UA (no Sec-Fetch-* noise)
2062
+ if (!name || isAuthwall || name.toLowerCase().includes('sign up') || name.toLowerCase().includes('linkedin')) {
2063
+ try {
2064
+ const { default: httpsLI } = await import('https');
2065
+ const { gunzip } = await import('zlib');
2066
+ const linkedInHtml = await new Promise((resolve, reject) => {
2067
+ const req = httpsLI.request({
2068
+ hostname: 'www.linkedin.com',
2069
+ path: urlObj.pathname,
2070
+ method: 'GET',
2071
+ headers: {
2072
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
2073
+ 'Accept': 'text/html,application/xhtml+xml',
2074
+ 'Accept-Language': 'en-US,en;q=0.9',
2075
+ 'Accept-Encoding': 'gzip, deflate',
2076
+ },
2077
+ }, (res) => {
2078
+ if (res.statusCode && res.statusCode >= 400) {
2079
+ reject(new Error(`HTTP ${res.statusCode}`));
2080
+ res.resume();
2081
+ return;
2082
+ }
2083
+ const chunks = [];
2084
+ res.on('data', (chunk) => chunks.push(chunk));
2085
+ res.on('end', () => {
2086
+ const buf = Buffer.concat(chunks);
2087
+ const enc = res.headers['content-encoding'] || '';
2088
+ if (enc === 'gzip') {
2089
+ gunzip(buf, (err, decoded) => err ? reject(err) : resolve(decoded.toString('utf8')));
2090
+ }
2091
+ else {
2092
+ resolve(buf.toString('utf8'));
2093
+ }
2094
+ });
2095
+ });
2096
+ req.on('error', reject);
2097
+ setTimeout(() => req.destroy(new Error('timeout')), 10000);
2098
+ req.end();
2099
+ });
2100
+ if (linkedInHtml) {
2101
+ const $li = load(linkedInHtml);
2102
+ const liOgTitle = $li('meta[property="og:title"]').attr('content') || '';
2103
+ const liOgDesc = $li('meta[property="og:description"]').attr('content') || '';
2104
+ // Only use if it has real profile data (not authwall)
2105
+ if (liOgTitle && !liOgTitle.toLowerCase().includes('sign up') && !liOgTitle.toLowerCase().includes('join linkedin')) {
2106
+ // "Name - Headline | LinkedIn" or "Name | LinkedIn"
2107
+ const titleParts = liOgTitle.replace(/ \| LinkedIn$/, '').split(/\s*[-–]\s*/);
2108
+ if (titleParts[0])
2109
+ name = titleParts[0].trim();
2110
+ if (titleParts[1])
2111
+ headline = titleParts[1].trim();
2112
+ if (liOgDesc)
2113
+ description = liOgDesc;
2114
+ }
2115
+ }
2116
+ }
2117
+ catch { /* direct fetch optional */ }
2118
+ }
1978
2119
  if (!name)
1979
2120
  return null;
1980
- const headline = jsonLd?.jobTitle ||
1981
- $('meta[name="description"]').attr('content')?.split('|')?.[0]?.trim() ||
1982
- ogDescription || '';
1983
- const description = jsonLd?.description || ogDescription || '';
1984
- // Try to detect page type from URL
1985
- const pathParts = new URL(url).pathname.split('/').filter(Boolean);
1986
- const pageType = pathParts[0] === 'company' ? 'company'
1987
- : pathParts[0] === 'in' ? 'profile'
1988
- : pathParts[0] === 'jobs' ? 'job'
1989
- : 'page';
1990
- // Extract any visible structured info from the HTML
1991
- const location = $('[class*="location"]').first().text().trim() ||
1992
- jsonLd?.address?.addressLocality || '';
1993
2121
  const structured = {
1994
2122
  name, headline, description, location, pageType,
1995
2123
  image: ogImage, url,
1996
2124
  };
1997
2125
  const typeLine = pageType === 'company' ? '🏢' : pageType === 'profile' ? '👤' : '🔗';
1998
2126
  const locationLine = location ? `\n📍 ${location}` : '';
1999
- const headlineLine = headline ? `\n*${headline}*` : '';
2000
- const cleanContent = `# ${typeLine} ${name}${headlineLine}${locationLine}\n\n${description}`;
2127
+ const headlineLine = headline && headline !== name ? `\n*${headline}*` : '';
2128
+ const descriptionLine = description ? `\n\n${description}` : '';
2129
+ const authNote = '\n\n⚠️ Full LinkedIn profiles require authentication. Use /v1/session to log in first.';
2130
+ const cleanContent = `# ${typeLine} ${name} — LinkedIn${headlineLine}${locationLine}${descriptionLine}${authNote}`;
2001
2131
  return { domain: 'linkedin.com', type: pageType, structured, cleanContent };
2002
2132
  }
2003
2133
  catch {
@@ -2541,8 +2671,73 @@ async function soundcloudExtractor(_html, url) {
2541
2671
  // 29. Instagram extractor (oEmbed)
2542
2672
  // ---------------------------------------------------------------------------
2543
2673
  async function instagramExtractor(_html, url) {
2674
+ const pathParts = new URL(url).pathname.split('/').filter(Boolean);
2675
+ const contentType = pathParts[0] === 'p' ? 'post' : pathParts[0] === 'reel' ? 'reel' : pathParts[0] === 'tv' ? 'igtv' : pathParts.length === 1 ? 'profile' : 'post';
2676
+ // --- Profile extraction via Instagram internal API (no auth needed) ---
2677
+ if (contentType === 'profile' && pathParts.length === 1) {
2678
+ const username = pathParts[0];
2679
+ try {
2680
+ const apiUrl = `https://www.instagram.com/api/v1/users/web_profile_info/?username=${encodeURIComponent(username)}`;
2681
+ const igHeaders = {
2682
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
2683
+ 'X-IG-App-ID': '936619743392459',
2684
+ 'Accept': '*/*',
2685
+ 'Referer': 'https://www.instagram.com/',
2686
+ };
2687
+ const apiResult = await simpleFetch(apiUrl, igHeaders['User-Agent'], 12000, igHeaders);
2688
+ const data = tryParseJson(apiResult?.html || '');
2689
+ const user = data?.data?.user;
2690
+ if (user && user.username) {
2691
+ const followers = user.edge_followed_by?.count ?? 0;
2692
+ const following = user.edge_follow?.count ?? 0;
2693
+ const postCount = user.edge_owner_to_timeline_media?.count ?? 0;
2694
+ const fmtNum = (n) => n >= 1000000 ? (n / 1000000).toFixed(1) + 'M' : n >= 1000 ? (n / 1000).toFixed(1) + 'K' : String(n);
2695
+ const structured = {
2696
+ username: user.username,
2697
+ fullName: user.full_name || '',
2698
+ bio: user.biography || '',
2699
+ followers,
2700
+ following,
2701
+ posts: postCount,
2702
+ verified: user.is_verified || false,
2703
+ isPrivate: user.is_private || false,
2704
+ profilePic: user.profile_pic_url_hd || user.profile_pic_url || '',
2705
+ externalUrl: user.external_url || (user.bio_links?.[0]?.url) || '',
2706
+ contentType: 'profile',
2707
+ };
2708
+ // Recent posts
2709
+ const edges = user.edge_owner_to_timeline_media?.edges || [];
2710
+ const postSections = [];
2711
+ for (const edge of edges.slice(0, 6)) {
2712
+ const node = edge?.node;
2713
+ if (!node)
2714
+ continue;
2715
+ const caption = node.edge_media_to_caption?.edges?.[0]?.node?.text || '';
2716
+ const likes = node.edge_liked_by?.count ?? node.edge_media_preview_like?.count ?? 0;
2717
+ const comments = node.edge_media_to_comment?.count ?? 0;
2718
+ const isVideo = node.is_video;
2719
+ const mediaType = isVideo ? '🎬' : '📸';
2720
+ const timestamp = node.taken_at_timestamp ? new Date(node.taken_at_timestamp * 1000).toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric' }) : '';
2721
+ const imgUrl = node.thumbnail_src || node.display_url || '';
2722
+ const captionSnippet = caption ? caption.slice(0, 150) + (caption.length > 150 ? '…' : '') : '';
2723
+ postSections.push(`### ${mediaType} ${timestamp}\n${captionSnippet}\n❤️ ${fmtNum(likes)} | 💬 ${fmtNum(comments)}${imgUrl ? `\n🖼 ${imgUrl}` : ''}`);
2724
+ }
2725
+ const verifiedBadge = structured.verified ? ' ✓' : '';
2726
+ const privateBadge = structured.isPrivate ? ' 🔒' : '';
2727
+ const bioLine = structured.bio ? `\n\n${structured.bio}` : '';
2728
+ const externalLine = structured.externalUrl ? `\n🌐 ${structured.externalUrl}` : '';
2729
+ const postsSection = postSections.length > 0 ? '\n\n## Recent Posts\n\n' + postSections.join('\n\n---\n\n') : '';
2730
+ const cleanContent = `# @${structured.username} on Instagram${verifiedBadge}${privateBadge}\n\n**${structured.fullName || structured.username}**${bioLine}${externalLine}\n\n👥 ${fmtNum(followers)} Followers | ${fmtNum(following)} Following | ${fmtNum(postCount)} Posts${postsSection}`;
2731
+ return { domain: 'instagram.com', type: 'profile', structured, cleanContent };
2732
+ }
2733
+ }
2734
+ catch (e) {
2735
+ if (process.env.DEBUG)
2736
+ console.debug('[webpeel]', 'Instagram profile API failed:', e instanceof Error ? e.message : e);
2737
+ }
2738
+ }
2739
+ // --- Post/Reel/IGTV: Try oEmbed API ---
2544
2740
  try {
2545
- // Instagram official oEmbed (no access token needed for basic data)
2546
2741
  const oembedUrl = `https://graph.facebook.com/v22.0/instagram_oembed?url=${encodeURIComponent(url)}&fields=title,author_name,provider_name,thumbnail_url`;
2547
2742
  const data = await fetchJson(oembedUrl);
2548
2743
  // Also try noembed.com as fallback
@@ -2553,8 +2748,6 @@ async function instagramExtractor(_html, url) {
2553
2748
  }
2554
2749
  if (!resolvedData || resolvedData.error)
2555
2750
  return null;
2556
- const pathParts = new URL(url).pathname.split('/').filter(Boolean);
2557
- const contentType = pathParts[0] === 'p' ? 'post' : pathParts[0] === 'reel' ? 'reel' : pathParts[0] === 'tv' ? 'igtv' : pathParts.length === 1 ? 'profile' : 'post';
2558
2751
  const structured = {
2559
2752
  title: resolvedData.title || '',
2560
2753
  author: resolvedData.author_name || '',
@@ -2563,7 +2756,7 @@ async function instagramExtractor(_html, url) {
2563
2756
  contentType,
2564
2757
  provider: 'Instagram',
2565
2758
  };
2566
- const typeEmoji = contentType === 'reel' ? '🎬' : contentType === 'post' ? '📸' : contentType === 'profile' ? '👤' : '📱';
2759
+ const typeEmoji = contentType === 'reel' ? '🎬' : contentType === 'post' ? '📸' : '📱';
2567
2760
  const titleText = structured.title || `Instagram ${contentType} by ${structured.author}`;
2568
2761
  const cleanContent = `## ${typeEmoji} Instagram ${contentType}: ${titleText}\n\n**Creator:** @${structured.author.replace('@', '')}\n**URL:** ${url}`;
2569
2762
  return { domain: 'instagram.com', type: contentType, structured, cleanContent };
@@ -2575,59 +2768,106 @@ async function instagramExtractor(_html, url) {
2575
2768
  }
2576
2769
  }
2577
2770
  // ---------------------------------------------------------------------------
2578
- // 30. PDF extractor (URL-based detection)
2771
+ // 30. PDF extractor (URL-based detection) — downloads and extracts real text
2579
2772
  // ---------------------------------------------------------------------------
2773
+ const PDF_MAX_BYTES = 50 * 1024 * 1024; // 50 MB
2774
+ const PDF_TRUNCATE_CHARS = 100_000;
2580
2775
  async function pdfExtractor(_html, url) {
2581
2776
  try {
2582
2777
  const urlObj = new URL(url);
2583
2778
  const filename = urlObj.pathname.split('/').pop() || 'document.pdf';
2584
2779
  const hostname = urlObj.hostname;
2585
- // Try to get HEAD request metadata
2586
- let contentType = 'application/pdf';
2587
- let contentLength = '';
2780
+ // Download the PDF
2781
+ let buffer;
2782
+ let finalContentType = 'application/pdf';
2588
2783
  try {
2589
- const { default: https } = await import('https');
2590
- const { default: http } = await import('http');
2591
- const client = url.startsWith('https') ? https : http;
2592
- await new Promise((resolve) => {
2593
- const req = client.request(url, { method: 'HEAD', timeout: 5000 }, (res) => {
2594
- contentType = res.headers['content-type'] || 'application/pdf';
2595
- contentLength = res.headers['content-length'] || '';
2596
- resolve();
2597
- res.resume();
2598
- });
2599
- req.on('error', () => resolve());
2600
- req.on('timeout', () => { req.destroy(); resolve(); });
2601
- req.end();
2784
+ const response = await fetch(url, {
2785
+ headers: { 'User-Agent': 'Mozilla/5.0 (compatible; WebPeel/1.0)' },
2786
+ signal: AbortSignal.timeout(30000),
2602
2787
  });
2788
+ if (!response.ok) {
2789
+ if (process.env.DEBUG)
2790
+ console.debug('[webpeel]', `PDF download failed: HTTP ${response.status}`);
2791
+ return null; // Let the normal pipeline handle it
2792
+ }
2793
+ finalContentType = response.headers.get('content-type') || 'application/pdf';
2794
+ // Verify it's actually a PDF (content-type or URL)
2795
+ const isPdf = finalContentType.toLowerCase().includes('pdf') || /\.pdf(\?|$|#)/i.test(url);
2796
+ if (!isPdf)
2797
+ return null;
2798
+ const arrayBuffer = await response.arrayBuffer();
2799
+ buffer = Buffer.from(arrayBuffer);
2800
+ }
2801
+ catch (downloadErr) {
2802
+ if (process.env.DEBUG)
2803
+ console.debug('[webpeel]', 'PDF download error:', downloadErr instanceof Error ? downloadErr.message : downloadErr);
2804
+ return null; // Let the normal pipeline handle it
2603
2805
  }
2604
- catch { /* best-effort */ }
2605
- const isPdf = contentType.toLowerCase().includes('pdf') || /\.pdf(\?|$|#)/i.test(url);
2606
- if (!isPdf)
2806
+ // Size guard
2807
+ if (buffer.length > PDF_MAX_BYTES) {
2808
+ if (process.env.DEBUG)
2809
+ console.debug('[webpeel]', `PDF too large (${buffer.length} bytes), falling back to stub`);
2607
2810
  return null;
2608
- const fileSizeKb = contentLength ? Math.round(parseInt(contentLength) / 1024) : null;
2609
- const structured = {
2610
- filename,
2611
- url,
2612
- contentType,
2613
- fileSizeKb,
2614
- hostname,
2615
- note: 'PDF binary content — text extraction requires a PDF parser. Use a dedicated PDF extraction service for full text content.',
2811
+ }
2812
+ // Extract text via pdf-parse
2813
+ const { extractPdf } = await import('./pdf.js');
2814
+ let pdf;
2815
+ try {
2816
+ pdf = await extractPdf(buffer);
2817
+ }
2818
+ catch (parseErr) {
2819
+ if (process.env.DEBUG)
2820
+ console.debug('[webpeel]', 'PDF parse failed:', parseErr instanceof Error ? parseErr.message : parseErr);
2821
+ return null; // Let the normal pipeline handle it
2822
+ }
2823
+ // Normalize whitespace (pdf-parse emits lots of blank lines)
2824
+ let text = (pdf.text || '')
2825
+ .replace(/\r\n/g, '\n')
2826
+ .replace(/\n{3,}/g, '\n\n')
2827
+ .replace(/[ \t]+/g, ' ')
2828
+ .trim();
2829
+ // Truncate very large documents
2830
+ let truncated = false;
2831
+ if (text.length > PDF_TRUNCATE_CHARS) {
2832
+ text = text.slice(0, PDF_TRUNCATE_CHARS);
2833
+ truncated = true;
2834
+ }
2835
+ if (!text) {
2836
+ // Scanned/image-only PDF — return a clear message rather than empty content
2837
+ const emptyNote = `## 📄 ${filename}\n\n*This PDF appears to be a scanned document (image-only). No extractable text was found.*\n\n**Source:** ${url}`;
2838
+ return {
2839
+ domain: hostname,
2840
+ type: 'pdf',
2841
+ structured: { title: filename, url, pages: pdf.pages, contentType: finalContentType },
2842
+ cleanContent: emptyNote,
2843
+ };
2844
+ }
2845
+ // Build markdown output
2846
+ const titleRaw = pdf.metadata?.title || '';
2847
+ const title = titleRaw || filename.replace(/\.pdf$/i, '') || 'PDF Document';
2848
+ const metaParts = [];
2849
+ if (pdf.metadata?.author)
2850
+ metaParts.push(`**Author:** ${pdf.metadata.author}`);
2851
+ if (pdf.pages)
2852
+ metaParts.push(`**Pages:** ${pdf.pages}`);
2853
+ metaParts.push(`**Source:** ${url}`);
2854
+ const header = titleRaw ? `# ${titleRaw}\n\n` : '';
2855
+ const metaBlock = metaParts.join(' | ') + '\n\n';
2856
+ const truncNote = truncated ? '\n\n*[Content truncated — document exceeds 100,000 characters]*' : '';
2857
+ const cleanContent = header + metaBlock + text + truncNote;
2858
+ return {
2859
+ domain: hostname,
2860
+ type: 'pdf',
2861
+ structured: {
2862
+ title,
2863
+ filename,
2864
+ url,
2865
+ pages: pdf.pages,
2866
+ contentType: finalContentType,
2867
+ ...pdf.metadata,
2868
+ },
2869
+ cleanContent,
2616
2870
  };
2617
- const sizeStr = fileSizeKb ? ` (${fileSizeKb > 1024 ? (fileSizeKb / 1024).toFixed(1) + ' MB' : fileSizeKb + ' KB'})` : '';
2618
- const cleanContent = `## 📄 PDF Document: ${filename}
2619
-
2620
- **URL:** ${url}
2621
- **Host:** ${hostname}${sizeStr ? `\n**Size:** ${sizeStr}` : ''}
2622
-
2623
- > **Note:** This is a PDF document. Binary PDF content cannot be directly extracted as text through standard web fetching. To extract the full text, consider:
2624
- >
2625
- > 1. Use a dedicated PDF extraction service (e.g., Adobe PDF Extract API, pdfminer, PyMuPDF)
2626
- > 2. Download the file and process locally with \`pdf-parse\` (Node.js) or \`pdfplumber\` (Python)
2627
- > 3. For academic PDFs, check if an HTML version is available at the same URL without \`.pdf\`
2628
-
2629
- **Direct download URL:** ${url}`;
2630
- return { domain: hostname, type: 'pdf', structured, cleanContent };
2631
2871
  }
2632
2872
  catch (e) {
2633
2873
  if (process.env.DEBUG)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.20.6",
3
+ "version": "0.20.7",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",