webpeel 0.18.2 → 0.18.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"file":"domain-extractors.d.ts","sourceRoot":"","sources":["../../src/core/domain-extractors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AA0DH,MAAM,WAAW,mBAAmB;IAClC,iDAAiD;IACjD,MAAM,EAAE,MAAM,CAAC;IACf,4EAA4E;IAC5E,IAAI,EAAE,MAAM,CAAC;IACb,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAChC,mDAAmD;IACnD,YAAY,EAAE,MAAM,CAAC;IACrB,uFAAuF;IACvF,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,+EAA+E;AAC/E,MAAM,MAAM,eAAe,GAAG,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,KACR,OAAO,CAAC,mBAAmB,GAAG,IAAI,CAAC,CAAC;AA2CzC;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,eAAe,GAAG,IAAI,CAWtE;AAED;;;GAGG;AACH,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,GACV,OAAO,CAAC,mBAAmB,GAAG,IAAI,CAAC,CAQrC"}
1
+ {"version":3,"file":"domain-extractors.d.ts","sourceRoot":"","sources":["../../src/core/domain-extractors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AA0DH,MAAM,WAAW,mBAAmB;IAClC,iDAAiD;IACjD,MAAM,EAAE,MAAM,CAAC;IACf,4EAA4E;IAC5E,IAAI,EAAE,MAAM,CAAC;IACb,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAChC,mDAAmD;IACnD,YAAY,EAAE,MAAM,CAAC;IACrB,uFAAuF;IACvF,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,+EAA+E;AAC/E,MAAM,MAAM,eAAe,GAAG,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,KACR,OAAO,CAAC,mBAAmB,GAAG,IAAI,CAAC,CAAC;AA6CzC;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,eAAe,GAAG,IAAI,CAWtE;AAED;;;GAGG;AACH,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,GACV,OAAO,CAAC,mBAAmB,GAAG,IAAI,CAAC,CAQrC"}
@@ -90,6 +90,8 @@ const REGISTRY = [
90
90
  { match: (h) => h === 'twitch.tv' || h === 'www.twitch.tv' || h === 'clips.twitch.tv', extractor: twitchExtractor },
91
91
  { match: (h) => h === 'soundcloud.com' || h === 'www.soundcloud.com', extractor: soundcloudExtractor },
92
92
  { match: (h) => h === 'instagram.com' || h === 'www.instagram.com', extractor: instagramExtractor },
93
+ { match: (h) => h === 'www.producthunt.com' || h === 'producthunt.com', extractor: productHuntExtractor },
94
+ { match: (h) => h === 'substack.com' || h === 'www.substack.com', extractor: substackRootExtractor },
93
95
  { match: (_h, url = '') => /\.pdf(\?|$|#)/i.test(url) || /\/pdf\//i.test(url), extractor: pdfExtractor },
94
96
  ];
95
97
  /**
@@ -2633,4 +2635,136 @@ async function pdfExtractor(_html, url) {
2633
2635
  return null;
2634
2636
  }
2635
2637
  }
2638
+ // ---------------------------------------------------------------------------
2639
+ // 31. Product Hunt extractor (RSS/Atom feed)
2640
+ // ---------------------------------------------------------------------------
2641
/**
 * Product Hunt extractor.
 *
 * Ignores the HTML and URL it is handed. Instead it downloads Product Hunt's
 * public Atom feed through `simpleFetch`, turns every `<entry>` into a product
 * record, and renders the records as a numbered markdown list.
 *
 * @param {string} _html - Unused; present to satisfy the extractor signature.
 * @param {string} _url - Unused; present to satisfy the extractor signature.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or `null` when the feed body is empty, contains no
 *   usable entries, or anything throws along the way (best-effort extractor).
 */
async function productHuntExtractor(_html, _url) {
    const FEED_URL = 'https://www.producthunt.com/feed';
    try {
        // Fetch the public Atom feed — no auth required.
        // NOTE(review): the User-Agent pins version 0.17.1 although this code ships
        // in a later release — confirm whether it should track the package version.
        const feedResult = await simpleFetch(FEED_URL, 'WebPeel/0.17.1 (web data platform; https://webpeel.dev) Node.js', 15000, { Accept: 'application/xml, text/xml, */*' });
        if (!feedResult?.html) {
            return null;
        }
        const xml = feedResult.html;
        // Parse Atom entries (Product Hunt uses Atom, not RSS). Attributes on
        // <entry>/<title> (e.g. type="html") are tolerated.
        const entryMatches = [...xml.matchAll(/<entry\b[^>]*>([\s\S]*?)<\/entry>/g)];
        if (!entryMatches.length) {
            return null;
        }
        const products = [];
        for (const match of entryMatches) {
            const entry = match[1];
            const titleMatch = entry.match(/<title\b[^>]*>([\s\S]*?)<\/title>/);
            if (!titleMatch) {
                // An entry without a title cannot be listed; skip it.
                continue;
            }
            const linkMatch = entry.match(/<link[^>]+href="([^"]+)"/);
            const publishedMatch = entry.match(/<published>([\s\S]*?)<\/published>/);
            const authorMatch = entry.match(/<name>([\s\S]*?)<\/name>/);
            const contentMatch = entry.match(/<content[^>]*>([\s\S]*?)<\/content>/);
            const title = stripHtml(titleMatch[1]).trim();
            const link = linkMatch?.[1] || '';
            const published = publishedMatch?.[1]?.trim() || '';
            const author = authorMatch ? stripHtml(authorMatch[1]).trim() : '';
            // Extract tagline from encoded HTML in <content>.
            // Content is HTML-encoded: &lt;p&gt;tagline&lt;/p&gt;...
            let tagline = '';
            // Default to the discussion link so entries lacking <content> still
            // carry a usable directLink.
            let directLink = link;
            if (contentMatch) {
                const decoded = contentMatch[1]
                    .replace(/&lt;/g, '<')
                    .replace(/&gt;/g, '>')
                    .replace(/&amp;/g, '&')
                    .replace(/&quot;/g, '"')
                    .replace(/&#39;/g, "'");
                // First <p> is the tagline.
                const taglineMatch = decoded.match(/<p[^>]*>\s*([\s\S]*?)\s*<\/p>/);
                if (taglineMatch) {
                    tagline = stripHtml(taglineMatch[1]).trim();
                }
                // Extract direct product link (the "Link" href, not the discussion link).
                const linkHrefMatch = decoded.match(/href="(https:\/\/www\.producthunt\.com\/r\/p\/[^"]+)"/);
                directLink = linkHrefMatch?.[1] || link;
            }
            // Format published date nicely. `new Date()` never throws on bad input —
            // it yields an Invalid Date — so validity has to be checked explicitly,
            // otherwise the literal text "Invalid Date" ends up in the output.
            let dateStr = '';
            if (published) {
                const parsed = new Date(published);
                dateStr = Number.isNaN(parsed.getTime())
                    ? published.split('T')[0]
                    : parsed.toLocaleDateString('en-US', { month: 'short', day: 'numeric', year: 'numeric' });
            }
            products.push({ title, link, published: dateStr, tagline, author, directLink });
        }
        if (!products.length) {
            return null;
        }
        // Build clean markdown output.
        const today = new Date().toLocaleDateString('en-US', { month: 'long', day: 'numeric', year: 'numeric' });
        const productList = products.map((p, i) => {
            const taglinePart = p.tagline ? ` — ${p.tagline}` : '';
            const datePart = p.published ? `\n 📅 ${p.published}` : '';
            const authorPart = p.author ? ` by ${p.author}` : '';
            return `${i + 1}. **[${p.title}](${p.link})**${taglinePart}${datePart}${authorPart}`;
        }).join('\n\n');
        const structured = {
            products,
            total: products.length,
            fetchedAt: new Date().toISOString(),
            feedUrl: FEED_URL,
        };
        const cleanContent = `# 🚀 Product Hunt — Featured Products\n\n*Fetched ${today} · ${products.length} products*\n\n${productList}\n\n---\n*Source: [Product Hunt Feed](${FEED_URL})*`;
        return { domain: 'producthunt.com', type: 'feed', structured, cleanContent };
    }
    catch (e) {
        // Best-effort: never let a feed problem break the caller; only surface
        // the reason when DEBUG is set.
        if (process.env.DEBUG) {
            console.debug('[webpeel]', 'Product Hunt extractor failed:', e instanceof Error ? e.message : e);
        }
        return null;
    }
}
2724
+ // ---------------------------------------------------------------------------
2725
+ // 32. Substack root extractor (substack.com homepage)
2726
+ // ---------------------------------------------------------------------------
2727
/**
 * Substack root extractor (substack.com homepage).
 *
 * Performs no network request and does not inspect its arguments: it always
 * resolves to the same static guide explaining that the root homepage yields
 * little extractable content and pointing callers at individual post URLs.
 *
 * @param {string} _html - Unused; present to satisfy the extractor signature.
 * @param {string} _url - Unused; present to satisfy the extractor signature.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}>}
 *   A fixed result with `domain: 'substack.com'` and `type: 'homepage'`.
 */
async function substackRootExtractor(_html, _url) {
    // Machine-readable form of the guidance.
    const guidance = {
        note: 'Substack root homepage is a JS-rendered marketing page with limited extractable content.',
        tip: 'Fetch individual Substack posts directly for full article content.',
        examples: [
            'https://username.substack.com/p/article-slug',
            'https://stratechery.com/2024/...',
        ],
    };
    // Human-readable markdown, assembled line by line and joined with newlines.
    const markdownLines = [
        '# 📰 Substack',
        '',
        "Substack's homepage is a JS-rendered marketing page — there's not much useful content to extract here.",
        '',
        '## ✅ What Works',
        '',
        'Individual Substack posts are **fully server-rendered** and extract cleanly. Try:',
        '',
        '- `https://username.substack.com/p/article-title`',
        '- Any specific newsletter post URL',
        '',
        '## 💡 Examples',
        '',
        '```',
        'https://lethain.substack.com/p/the-art-of-staffing-eng',
        'https://paulgraham.com/articles.html',
        '```',
        '',
        '## 📋 Finding Newsletters',
        '',
        'Browse newsletters at:',
        '- [substack.com/explore](https://substack.com/explore) — discover publications',
        '- [substack.com/leaderboard](https://substack.com/leaderboard) — top newsletters by category',
        '',
        '---',
        '',
        '*WebPeel works best with individual Substack post URLs, not the root homepage.*',
    ];
    return {
        domain: 'substack.com',
        type: 'homepage',
        structured: guidance,
        cleanContent: markdownLines.join('\n'),
    };
}
2636
2770
  //# sourceMappingURL=domain-extractors.js.map