webpeel 0.21.65 → 0.21.67
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -28,8 +28,12 @@ export type DomainExtractor = (html: string, url: string) => Promise<DomainExtra
|
|
|
28
28
|
* Returns the domain extractor for a URL, or null if none matches.
|
|
29
29
|
*/
|
|
30
30
|
export declare function getDomainExtractor(url: string): DomainExtractor | null;
|
|
31
|
+
/** Clear the extractor response cache (used in tests). */
|
|
32
|
+
export declare function clearExtractorCache(): void;
|
|
31
33
|
/**
|
|
32
34
|
* Convenience: run the extractor for the URL (if one exists).
|
|
33
|
-
*
|
|
35
|
+
* Wraps _extractDomainDataImpl with a 5-minute LRU cache so that
|
|
36
|
+
* rate-limited API responses fall back to cached results instead of
|
|
37
|
+
* garbage browser rendering.
|
|
34
38
|
*/
|
|
35
39
|
export declare function extractDomainData(html: string, url: string): Promise<DomainExtractResult | null>;
|
|
@@ -131,11 +131,40 @@ export function getDomainExtractor(url) {
|
|
|
131
131
|
}
|
|
132
132
|
return null;
|
|
133
133
|
}
|
|
134
|
+
// ── Extractor Response Cache ──────────────────────────────────────────────
|
|
135
|
+
// Caches successful API responses for 5 minutes to survive rate limits.
|
|
136
|
+
// If the API rate-limits on the next request, we serve from cache instead
|
|
137
|
+
// of falling back to garbage browser rendering (cookie walls, "Loading…").
|
|
138
|
+
// Key: normalized URL (no query/hash), Value: { result, timestamp }
|
|
139
|
+
const EXTRACTOR_CACHE = new Map();
|
|
140
|
+
/** Clear the extractor response cache (used in tests). */
|
|
141
|
+
export function clearExtractorCache() { EXTRACTOR_CACHE.clear(); }
|
|
142
|
+
const CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
|
143
|
+
function getCachedExtractorResult(url) {
|
|
144
|
+
const key = url.replace(/[?#].*$/, '').toLowerCase(); // strip query+hash
|
|
145
|
+
const entry = EXTRACTOR_CACHE.get(key);
|
|
146
|
+
if (entry && Date.now() - entry.ts < CACHE_TTL_MS) {
|
|
147
|
+
return entry.result;
|
|
148
|
+
}
|
|
149
|
+
EXTRACTOR_CACHE.delete(key); // expired — evict
|
|
150
|
+
return null;
|
|
151
|
+
}
|
|
152
|
+
function setCachedExtractorResult(url, result) {
|
|
153
|
+
const key = url.replace(/[?#].*$/, '').toLowerCase();
|
|
154
|
+
EXTRACTOR_CACHE.set(key, { result, ts: Date.now() });
|
|
155
|
+
// Keep cache size bounded at 500 entries (evict oldest)
|
|
156
|
+
if (EXTRACTOR_CACHE.size > 500) {
|
|
157
|
+
const oldest = EXTRACTOR_CACHE.keys().next().value;
|
|
158
|
+
if (oldest)
|
|
159
|
+
EXTRACTOR_CACHE.delete(oldest);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
134
163
|
/**
|
|
135
|
-
*
|
|
164
|
+
* Internal implementation: run the extractor for the URL (if one exists).
|
|
136
165
|
* Returns null when no extractor matches or extraction fails.
|
|
137
166
|
*/
|
|
138
|
-
export async function extractDomainData(html, url) {
|
|
167
|
+
async function _extractDomainDataImpl(html, url) {
|
|
139
168
|
const extractor = getDomainExtractor(url);
|
|
140
169
|
if (!extractor)
|
|
141
170
|
return null;
|
|
@@ -146,6 +175,32 @@ export async function extractDomainData(html, url) {
|
|
|
146
175
|
return null;
|
|
147
176
|
}
|
|
148
177
|
}
|
|
178
|
+
/**
|
|
179
|
+
* Convenience: run the extractor for the URL (if one exists).
|
|
180
|
+
* Wraps _extractDomainDataImpl with a 5-minute LRU cache so that
|
|
181
|
+
* rate-limited API responses fall back to cached results instead of
|
|
182
|
+
* garbage browser rendering.
|
|
183
|
+
*/
|
|
184
|
+
export async function extractDomainData(html, url) {
|
|
185
|
+
// 1. Check fresh cache first
|
|
186
|
+
const cached = getCachedExtractorResult(url);
|
|
187
|
+
if (cached)
|
|
188
|
+
return cached;
|
|
189
|
+
// 2. Try the real extractor
|
|
190
|
+
const result = await _extractDomainDataImpl(html, url);
|
|
191
|
+
if (result && result.cleanContent.length > 20) {
|
|
192
|
+
// 3. Cache the successful result
|
|
193
|
+
setCachedExtractorResult(url, result);
|
|
194
|
+
return result;
|
|
195
|
+
}
|
|
196
|
+
// 4. Extractor failed/returned garbage — check for any stale cache entry
|
|
197
|
+
// (stale structured data beats a browser "Loading…" page)
|
|
198
|
+
const stale = getCachedExtractorResult(url);
|
|
199
|
+
if (stale)
|
|
200
|
+
return stale;
|
|
201
|
+
// 5. Genuinely nothing — return null so the pipeline falls back to fetch
|
|
202
|
+
return result;
|
|
203
|
+
}
|
|
149
204
|
// ---------------------------------------------------------------------------
|
|
150
205
|
// Helpers
|
|
151
206
|
// ---------------------------------------------------------------------------
|
|
@@ -184,6 +239,13 @@ async function fetchJson(url, customHeaders) {
|
|
|
184
239
|
redirect: 'follow',
|
|
185
240
|
});
|
|
186
241
|
clearTimeout(timer);
|
|
242
|
+
// Surface 429 as a thrown error so callers can detect rate-limiting
|
|
243
|
+
// and the cache wrapper can serve stale results instead of garbage.
|
|
244
|
+
if (resp.status === 429) {
|
|
245
|
+
const err = new Error(`429 Too Many Requests: ${url}`);
|
|
246
|
+
err.statusCode = 429;
|
|
247
|
+
throw err;
|
|
248
|
+
}
|
|
187
249
|
const text = await resp.text();
|
|
188
250
|
const parsed = tryParseJson(text);
|
|
189
251
|
if (parsed === null && text.length > 0) {
|
|
@@ -4246,14 +4308,9 @@ async function semanticScholarExtractor(_html, url) {
|
|
|
4246
4308
|
const data = await fetchJson(apiUrl);
|
|
4247
4309
|
if (!data)
|
|
4248
4310
|
return null;
|
|
4249
|
-
//
|
|
4311
|
+
// Rate limited — return null so pipeline falls back to browser rendering
|
|
4250
4312
|
if (data.code === '429' || (data.message && String(data.message).includes('Too Many Requests'))) {
|
|
4251
|
-
return {
|
|
4252
|
-
domain,
|
|
4253
|
-
type: 'paper',
|
|
4254
|
-
structured: { paperId, rateLimited: true },
|
|
4255
|
-
cleanContent: `# Semantic Scholar — Rate Limited\n\n⚠️ API rate limit reached. View paper directly: https://www.semanticscholar.org/paper/${paperId}`,
|
|
4256
|
-
};
|
|
4313
|
+
return null;
|
|
4257
4314
|
}
|
|
4258
4315
|
if (!data.title)
|
|
4259
4316
|
return null;
|
|
@@ -4321,23 +4378,11 @@ async function semanticScholarExtractor(_html, url) {
|
|
|
4321
4378
|
const fields = 'title,authors,year,citationCount,url,openAccessPdf';
|
|
4322
4379
|
const apiUrl = `https://api.semanticscholar.org/graph/v1/paper/search?query=${encodeURIComponent(query)}&limit=10&fields=${fields}`;
|
|
4323
4380
|
const data = await fetchJson(apiUrl);
|
|
4324
|
-
//
|
|
4381
|
+
// Rate limited or no data — return null so pipeline falls back to browser rendering
|
|
4325
4382
|
if (!data)
|
|
4326
4383
|
return null;
|
|
4327
4384
|
if (data.code === '429' || (data.message && String(data.message).includes('Too Many Requests'))) {
|
|
4328
|
-
|
|
4329
|
-
`# 🔍 Semantic Scholar — "${query}"`,
|
|
4330
|
-
'',
|
|
4331
|
-
'⚠️ **Rate limited by Semantic Scholar API.** The free tier has strict limits.',
|
|
4332
|
-
'',
|
|
4333
|
-
`Try again in a few seconds, or search directly: https://www.semanticscholar.org/search?q=${encodeURIComponent(query)}`,
|
|
4334
|
-
].join('\n');
|
|
4335
|
-
return {
|
|
4336
|
-
domain,
|
|
4337
|
-
type: 'search',
|
|
4338
|
-
structured: { query, total: 0, papers: [], rateLimited: true },
|
|
4339
|
-
cleanContent,
|
|
4340
|
-
};
|
|
4385
|
+
return null;
|
|
4341
4386
|
}
|
|
4342
4387
|
if (!Array.isArray(data.data))
|
|
4343
4388
|
return null;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.65",
|
|
3
|
+
"version": "0.21.67",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|