npm - webpeel - Versions diffs - 0.21.20 → 0.21.22 - Mend

webpeel 0.21.20 → 0.21.22

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (3) [hide | show]

package/dist/core/domain-extractors.js +2 -12
package/dist/server/routes/search.js +38 -3
package/package.json +3 -2

package/dist/core/domain-extractors.js CHANGED Viewed

@@ -153,12 +153,7 @@ async function fetchJson(url, customHeaders) {
         Accept: 'application/json',
         ...customHeaders,
     });
-    const parsed = tryParseJson(result.html);
-    // Debug: log GitHub API failures to help diagnose rate limiting issues
-    if (!parsed && url.includes('api.github.com')) {
-        console.error(`[github-debug] fetchJson failed for ${url} — raw response (first 200): ${result.html?.substring(0, 200)}`);
-    }
-    return parsed;
+    return tryParseJson(result.html);
 }
 /** Fetch JSON with exponential backoff retry on 429 / rate-limit errors. */
 async function fetchJsonWithRetry(url, headers, retries = 2, baseDelayMs = 1000) {
@@ -785,13 +780,8 @@ async function githubExtractor(_html, url) {
     const ghHeaders = { Accept: 'application/vnd.github.v3+json' };
     // Use GITHUB_TOKEN if available for higher rate limits (5000/hr vs 60/hr)
     const ghToken = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
-    if (ghToken) {
+    if (ghToken)
         ghHeaders.Authorization = `token ${ghToken}`;
-        console.log(`[github-debug] Using token (prefix: ${ghToken.substring(0, 8)}..., len: ${ghToken.length})`);
-    }
-    else {
-        console.warn('[github-debug] No GITHUB_TOKEN found — using anonymous (60/hr limit)');
-    }
     // User profile: /username (single segment)
     if (pathParts.length === 1) {
         const username = pathParts[0];

package/dist/server/routes/search.js CHANGED Viewed

@@ -29,7 +29,7 @@ export function createSearchRouter(authStore) {
             // scrapeResults=true: fetches full page content for each result (like Firecrawl's scrape_options).
             // Adds `content` field to each result. Significantly increases response time and credits used.
             // Documented in OpenAPI spec under /v1/search parameters.
-            const { q, count, scrapeResults, sources, categories, tbs, country, location } = req.query;
+            const { q, count, scrapeResults, enrich, sources, categories, tbs, country, location } = req.query;
             // --- Search provider (new: BYOK Brave support) ---
             const providerParam = (req.query.provider || '').toLowerCase() || 'auto';
             const validProviders = ['duckduckgo', 'brave', 'stealth', 'google'];
@@ -61,7 +61,8 @@ export function createSearchRouter(authStore) {
             const countryStr = country || '';
             const locationStr = location || '';
             // Build cache key (include all parameters)
-            const cacheKey = `search:${providerId}:${q}:${resultCount}:${sourcesStr}:${shouldScrape}:${categoriesStr}:${tbsStr}:${countryStr}:${locationStr}`;
+            const enrichCount = enrich ? Math.min(Math.max(parseInt(enrich, 10) || 0, 0), 5) : 0;
+            const cacheKey = `search:${providerId}:${q}:${resultCount}:${sourcesStr}:${shouldScrape}:${enrichCount}:${categoriesStr}:${tbsStr}:${countryStr}:${locationStr}`;
             // Check cache
             const cached = cache.get(cacheKey);
             if (cached) {
@@ -133,7 +134,7 @@ export function createSearchRouter(authStore) {
                         });
                     });
                 }
-                // Scrape each result URL if requested
+                // Scrape each result URL if requested (sequential — legacy)
                 if (shouldScrape) {
                     for (const result of results) {
                         try {
@@ -148,6 +149,40 @@ export function createSearchRouter(authStore) {
                         }
                     }
                 }
+                // Enrich top N results in parallel with timeout (fast alternative to scrapeResults)
+                // IMPORTANT: forceBrowser=false, stealth=false to prevent OOM on 512MB containers
+                if (enrichCount > 0 && !shouldScrape) {
+                    const ENRICH_TIMEOUT = 4000; // 4s hard timeout per URL
+                    const toEnrich = results.slice(0, enrichCount);
+                    const enrichResults = await Promise.allSettled(toEnrich.map(async (result) => {
+                        const fetchPromise = peel(result.url, {
+                            format: 'markdown',
+                            maxTokens: 1500,
+                            render: false,
+                            stealth: false,
+                        }).then(peelResult => ({
+                            url: result.url,
+                            content: peelResult.content?.substring(0, 1500) || null,
+                            wordCount: peelResult.content?.trim().split(/\s+/).length || 0,
+                            method: peelResult.method || 'unknown',
+                            fetchTimeMs: peelResult.elapsed || 0,
+                        }));
+                        const timeoutPromise = new Promise(resolve => setTimeout(() => resolve({ url: result.url, content: null, wordCount: 0, method: 'timeout', fetchTimeMs: 0 }), ENRICH_TIMEOUT));
+                        return Promise.race([fetchPromise, timeoutPromise]);
+                    }));
+                    // Merge enrichment data back into results
+                    for (const settled of enrichResults) {
+                        if (settled.status === 'fulfilled' && settled.value.content) {
+                            const match = results.find(r => r.url === settled.value.url);
+                            if (match) {
+                                match.content = settled.value.content;
+                                match.wordCount = settled.value.wordCount;
+                                match.method = settled.value.method;
+                                match.fetchTimeMs = settled.value.fetchTimeMs;
+                            }
+                        }
+                    }
+                }
                 data.web = results;
             }
             // Fetch news results (DDG only — Brave news is not supported via HTML scraping)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.20",
+  "version": "0.21.22",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",
@@ -58,7 +58,8 @@
     "lint": "tsc --noEmit",
     "prepublishOnly": "bash scripts/pre-publish.sh",
     "serve": "node dist/server/app.js",
-    "mcp": "node dist/mcp/server.js"
+    "mcp": "node dist/mcp/server.js",
+    "version": "bash scripts/postversion.sh"
   },
   "repository": {
     "type": "git",