npm - webpeel - Versions diffs - 0.18.2 → 0.18.4 - Mend

webpeel 0.18.2 → 0.18.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/dist/cli.js +163 -0
package/dist/cli.js.map +1 -1
package/dist/core/domain-extractors.d.ts.map +1 -1
package/dist/core/domain-extractors.js +134 -0
package/dist/core/domain-extractors.js.map +1 -1
package/dist/server/app.d.ts.map +1 -1
package/dist/server/app.js +2 -1
package/dist/server/app.js.map +1 -1
package/dist/server/email-service.d.ts +10 -7
package/dist/server/email-service.d.ts.map +1 -1
package/dist/server/email-service.js +58 -122
package/dist/server/email-service.js.map +1 -1
package/dist/server/routes/oauth.d.ts.map +1 -1
package/dist/server/routes/oauth.js +38 -0
package/dist/server/routes/oauth.js.map +1 -1
package/dist/server/routes/stripe.d.ts +7 -0
package/dist/server/routes/stripe.d.ts.map +1 -1
package/dist/server/routes/stripe.js +48 -0
package/dist/server/routes/stripe.js.map +1 -1
package/package.json +3 -1

package/dist/core/domain-extractors.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"domain-extractors.d.ts","sourceRoot":"","sources":["../../src/core/domain-extractors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AA0DH,MAAM,WAAW,mBAAmB;IAClC,iDAAiD;IACjD,MAAM,EAAE,MAAM,CAAC;IACf,4EAA4E;IAC5E,IAAI,EAAE,MAAM,CAAC;IACb,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAChC,mDAAmD;IACnD,YAAY,EAAE,MAAM,CAAC;IACrB,uFAAuF;IACvF,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,+EAA+E;AAC/E,MAAM,MAAM,eAAe,GAAG,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,KACR,OAAO,CAAC,mBAAmB,GAAG,IAAI,CAAC,CAAC;~~AA2CzC~~;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,eAAe,GAAG,IAAI,CAWtE;AAED;;;GAGG;AACH,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,GACV,OAAO,CAAC,mBAAmB,GAAG,IAAI,CAAC,CAQrC"}
1	+ {"version":3,"file":"domain-extractors.d.ts","sourceRoot":"","sources":["../../src/core/domain-extractors.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AA0DH,MAAM,WAAW,mBAAmB;IAClC,iDAAiD;IACjD,MAAM,EAAE,MAAM,CAAC;IACf,4EAA4E;IAC5E,IAAI,EAAE,MAAM,CAAC;IACb,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAChC,mDAAmD;IACnD,YAAY,EAAE,MAAM,CAAC;IACrB,uFAAuF;IACvF,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,+EAA+E;AAC/E,MAAM,MAAM,eAAe,GAAG,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,KACR,OAAO,CAAC,mBAAmB,GAAG,IAAI,CAAC,CAAC;AA6CzC;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,eAAe,GAAG,IAAI,CAWtE;AAED;;;GAGG;AACH,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,GACV,OAAO,CAAC,mBAAmB,GAAG,IAAI,CAAC,CAQrC"}

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -90,6 +90,8 @@ const REGISTRY = [
     { match: (h) => h === 'twitch.tv' || h === 'www.twitch.tv' || h === 'clips.twitch.tv', extractor: twitchExtractor },
     { match: (h) => h === 'soundcloud.com' || h === 'www.soundcloud.com', extractor: soundcloudExtractor },
     { match: (h) => h === 'instagram.com' || h === 'www.instagram.com', extractor: instagramExtractor },
+    { match: (h) => h === 'www.producthunt.com' || h === 'producthunt.com', extractor: productHuntExtractor },
+    { match: (h) => h === 'substack.com' || h === 'www.substack.com', extractor: substackRootExtractor },
     { match: (_h, url = '') => /\.pdf(\?|$|#)/i.test(url) || /\/pdf\//i.test(url), extractor: pdfExtractor },
 ];
 /**
@@ -2633,4 +2635,136 @@ async function pdfExtractor(_html, url) {
         return null;
     }
 }
+// ---------------------------------------------------------------------------
+// 31. Product Hunt extractor (Atom feed)
+// ---------------------------------------------------------------------------
+async function productHuntExtractor(_html, _url) {
+    try {
+        // Fetch the public Atom feed — no auth required
+        const feedResult = await simpleFetch('https://www.producthunt.com/feed', 'WebPeel/0.17.1 (web data platform; https://webpeel.dev) Node.js', 15000, { Accept: 'application/xml, text/xml, */*' });
+        if (!feedResult?.html)
+            return null;
+        const xml = feedResult.html;
+        // Parse Atom entries (Product Hunt uses Atom, not RSS)
+        const entryMatches = [...xml.matchAll(/<entry>([\s\S]*?)<\/entry>/g)];
+        if (!entryMatches.length)
+            return null;
+        const products = [];
+        for (const match of entryMatches) {
+            const entry = match[1];
+            const titleMatch = entry.match(/<title>([\s\S]*?)<\/title>/);
+            const linkMatch = entry.match(/<link[^>]+href="([^"]+)"/);
+            const publishedMatch = entry.match(/<published>([\s\S]*?)<\/published>/);
+            const authorMatch = entry.match(/<name>([\s\S]*?)<\/name>/);
+            const contentMatch = entry.match(/<content[^>]*>([\s\S]*?)<\/content>/);
+            if (!titleMatch)
+                continue;
+            const title = stripHtml(titleMatch[1]).trim();
+            const link = linkMatch?.[1] || '';
+            const published = publishedMatch?.[1]?.trim() || '';
+            const author = authorMatch ? stripHtml(authorMatch[1]).trim() : '';
+            // Extract tagline from encoded HTML in <content>
+            // Content is HTML-encoded: &lt;p&gt;tagline&lt;/p&gt;...
+            let tagline = '';
+            let directLink = '';
+            if (contentMatch) {
+                const decoded = contentMatch[1]
+                    .replace(/&lt;/g, '<')
+                    .replace(/&gt;/g, '>')
+                    .replace(/&amp;/g, '&')
+                    .replace(/&quot;/g, '"')
+                    .replace(/&#39;/g, "'");
+                // First <p> is the tagline
+                const taglineMatch = decoded.match(/<p[^>]*>\s*([\s\S]*?)\s*<\/p>/);
+                if (taglineMatch) {
+                    tagline = stripHtml(taglineMatch[1]).trim();
+                }
+                // Extract direct product link (the "Link" href, not the discussion link)
+                const linkHrefMatch = decoded.match(/href="(https:\/\/www\.producthunt\.com\/r\/p\/[^"]+)"/);
+                directLink = linkHrefMatch?.[1] || link;
+            }
+            // Format published date nicely
+            let dateStr = '';
+            if (published) {
+                try {
+                    const d = new Date(published);
+                    dateStr = d.toLocaleDateString('en-US', { month: 'short', day: 'numeric', year: 'numeric' });
+                }
+                catch {
+                    dateStr = published.split('T')[0];
+                }
+            }
+            products.push({ title, link, published: dateStr, tagline, author, directLink });
+        }
+        if (!products.length)
+            return null;
+        // Build clean markdown output
+        const today = new Date().toLocaleDateString('en-US', { month: 'long', day: 'numeric', year: 'numeric' });
+        const productList = products.map((p, i) => {
+            const taglinePart = p.tagline ? ` — ${p.tagline}` : '';
+            const datePart = p.published ? `\n   📅 ${p.published}` : '';
+            const authorPart = p.author ? ` by ${p.author}` : '';
+            return `${i + 1}. **[${p.title}](${p.link})**${taglinePart}${datePart}${authorPart}`;
+        }).join('\n\n');
+        const structured = {
+            products,
+            total: products.length,
+            fetchedAt: new Date().toISOString(),
+            feedUrl: 'https://www.producthunt.com/feed',
+        };
+        const cleanContent = `# 🚀 Product Hunt — Featured Products\n\n*Fetched ${today} · ${products.length} products*\n\n${productList}\n\n---\n*Source: [Product Hunt Feed](https://www.producthunt.com/feed)*`;
+        return { domain: 'producthunt.com', type: 'feed', structured, cleanContent };
+    }
+    catch (e) {
+        if (process.env.DEBUG)
+            console.debug('[webpeel]', 'Product Hunt extractor failed:', e instanceof Error ? e.message : e);
+        return null;
+    }
+}
+// ---------------------------------------------------------------------------
+// 32. Substack root extractor (substack.com homepage)
+// ---------------------------------------------------------------------------
+/**
+ * Extractor registered for the bare substack.com / www.substack.com hosts.
+ * It performs no fetching or parsing: both arguments are ignored and a fixed,
+ * hand-written guide is returned that points users at individual posts.
+ *
+ * @param {string} _html - Page HTML supplied by the caller (unused).
+ * @param {string} _url - Requested URL (unused).
+ * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}>}
+ *   Always resolves to the same static result with type 'homepage'.
+ */
+async function substackRootExtractor(_html, _url) {
+    // Author's rationale: the substack.com homepage is a JS-rendered marketing
+    // page with no publicly accessible API endpoints worth scraping, so a
+    // helpful guide is returned instead of homepage content.
+    // No sitemap or other network request is made here.
+    // NOTE(review): some example URLs below (stratechery.com, paulgraham.com)
+    // are not *.substack.com addresses — confirm they are intended.
+    const structured = {
+        note: 'Substack root homepage is a JS-rendered marketing page with limited extractable content.',
+        tip: 'Fetch individual Substack posts directly for full article content.',
+        examples: [
+            'https://username.substack.com/p/article-slug',
+            'https://stratechery.com/2024/...',
+        ],
+    };
+    const cleanContent = `# 📰 Substack
+Substack's homepage is a JS-rendered marketing page — there's not much useful content to extract here.
+## ✅ What Works
+Individual Substack posts are **fully server-rendered** and extract cleanly. Try:
+- \`https://username.substack.com/p/article-title\`
+- Any specific newsletter post URL
+## 💡 Examples
+\`\`\`
+https://lethain.substack.com/p/the-art-of-staffing-eng
+https://paulgraham.com/articles.html
+\`\`\`
+## 📋 Finding Newsletters
+Browse newsletters at:
+- [substack.com/explore](https://substack.com/explore) — discover publications
+- [substack.com/leaderboard](https://substack.com/leaderboard) — top newsletters by category
+---
+*WebPeel works best with individual Substack post URLs, not the root homepage.*`;
+    return { domain: 'substack.com', type: 'homepage', structured, cleanContent };
+}
 //# sourceMappingURL=domain-extractors.js.map