npm - webpeel - Versions diffs - 0.21.62 → 0.21.64 - Mend

webpeel 0.21.62 → 0.21.64

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md +6 -7
package/dist/cli/utils.js +1 -1
package/dist/core/http-fetch.js +6 -2
package/dist/server/routes/fetch-queue.js +88 -0
package/package.json +2 -1

package/README.md CHANGED Viewed

@@ -8,7 +8,6 @@
   <a href="https://github.com/webpeel/webpeel/actions/workflows/ci.yml"><img src="https://github.com/webpeel/webpeel/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
   <a href="https://www.npmjs.com/package/webpeel"><img src="https://img.shields.io/npm/v/webpeel.svg?style=flat-square" alt="npm version"></a>
   <a href="https://pypi.org/project/webpeel/"><img src="https://img.shields.io/pypi/v/webpeel.svg?style=flat-square" alt="PyPI version"></a>
-  <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-yellow.svg?style=flat-square" alt="License: MIT"></a>
   <a href="LICENSE"><img src="https://img.shields.io/badge/license-WebPeel%20SDK-blue.svg?style=flat-square" alt="License"></a>
   <a href="https://webpeel.dev/status"><img src="https://img.shields.io/badge/status-operational-brightgreen.svg?style=flat-square" alt="Status"></a>
 </p>
@@ -305,20 +304,20 @@ webpeel "https://news.ycombinator.com"
 # Search the web
 webpeel search "typescript orm comparison 2025"
-# Extract structured data
-webpeel extract "https://stripe.com/pricing" --schema pricing-schema.json
+# Extract structured data with a JSON schema
+webpeel "https://stripe.com/pricing" --extract-schema pricing-schema.json
-# Crawl a site, save to folder
-webpeel crawl "https://docs.example.com" --output ./docs-dump --max-pages 100
+# Crawl a site
+webpeel crawl "https://docs.example.com" --max-pages 100
 # Screenshot
 webpeel screenshot "https://webpeel.dev" --full-page --output screenshot.png
 # YouTube transcript
-webpeel youtube "https://youtube.com/watch?v=dQw4w9WgXcQ"
+webpeel "https://youtube.com/watch?v=dQw4w9WgXcQ" --json
 # Ask a question about a page
-webpeel qa "https://openai.com/pricing" --question "How much does GPT-4o cost per million tokens?"
+webpeel ask "https://openai.com/pricing" "How much does GPT-4o cost per million tokens?"
 # Output as JSON
 webpeel "https://example.com" --json

package/dist/cli/utils.js CHANGED Viewed

@@ -398,7 +398,7 @@ export function buildCondensedHelp() {
         `    --raw                 Full page (disable auto reader mode)`,
         `    --full                Full page, no budget limit`,
         `    --json                JSON output with metadata`,
-        `    --budget: 4000)`,
+        `    --budget <n>          Token budget (default: 4000 in pipe mode)`,
         `    -q, --question <q>    Ask about the content`,
         `    -s, --silent          No spinner output`,
         '',

package/dist/core/http-fetch.js CHANGED Viewed

@@ -559,10 +559,14 @@ export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeade
         try {
             const requestHeaders = { ...mergedHeaders };
             const validators = getConditionalValidators(currentUrl);
-            if (validators?.etag && !hasHeader(requestHeaders, 'if-none-match')) {
+            // Only send conditional headers if we actually have the cached body
+            // In server/worker mode, the in-memory cache may have been cleared (pod restart)
+            // and sending If-None-Match without a cached body would cause a 304 crash
+            const cachedBody = getCachedResultFor304(currentUrl, url);
+            if (validators?.etag && cachedBody && !hasHeader(requestHeaders, 'if-none-match')) {
                 requestHeaders['If-None-Match'] = validators.etag;
             }
-            if (validators?.lastModified && !hasHeader(requestHeaders, 'if-modified-since')) {
+            if (validators?.lastModified && cachedBody && !hasHeader(requestHeaders, 'if-modified-since')) {
                 requestHeaders['If-Modified-Since'] = validators.lastModified;
             }
             // Use proxy if provided or auto-selected, otherwise use shared connection pool

package/dist/server/routes/fetch-queue.js CHANGED Viewed

@@ -176,6 +176,81 @@ export function createQueueFetchRouter() {
             pollUrl: `/v1/jobs/${jobId}`,
         });
     }
+    /**
+     * GET/POST /v1/fetch/sync — Synchronous fetch, no queue
+     * Returns content inline (no jobId/polling). Much faster for simple pages.
+     * Timeout: 25s max. No fallback to queue — fails fast if timeout exceeded.
+     */
+    async function handleSyncFetch(req, res) {
+        const requestId = req.requestId || randomUUID();
+        const url = validateUrl(req.body?.url || req.query?.url, res, requestId);
+        if (!url)
+            return;
+        const userId = req.auth?.keyInfo?.accountId || req.user?.userId;
+        if (!userId) {
+            res.status(401).json({
+                success: false,
+                error: { type: 'unauthorized', message: 'API key required.' },
+                requestId,
+            });
+            return;
+        }
+        try {
+            // Import peel dynamically to avoid circular deps
+            const { peel } = await import('../../index.js');
+            const options = {
+                format: req.body?.format || req.query?.format || 'markdown',
+                render: req.body?.render === true || req.query?.render === 'true',
+                stealth: req.body?.stealth === true || req.query?.stealth === 'true',
+                budget: req.body?.budget ? Number(req.body.budget) : (req.query?.budget ? Number(req.query.budget) : undefined),
+                selector: req.body?.selector || req.query?.selector,
+                readable: req.body?.readable === true || req.query?.readable === 'true',
+                wait: req.body?.wait ? Number(req.body.wait) : (req.query?.wait ? Number(req.query.wait) : undefined),
+                question: req.body?.question || req.query?.question,
+                timeout: 25000, // 25s max (leave 5s buffer for response)
+            };
+            const result = await peel(url, options);
+            res.json({
+                success: true,
+                ...result,
+                requestId,
+                mode: 'sync',
+            });
+        }
+        catch (err) {
+            const statusCode = err.statusCode || 500;
+            res.status(statusCode >= 400 && statusCode < 600 ? statusCode : 500).json({
+                success: false,
+                error: {
+                    type: err.errorType || 'fetch_error',
+                    message: err.message || 'Fetch failed',
+                },
+                requestId,
+            });
+        }
+    }
+    router.get('/v1/fetch/sync', (req, res) => {
+        // Map query params to body
+        req.body = req.body || {};
+        if (req.query.url)
+            req.body.url = req.query.url;
+        if (req.query.format)
+            req.body.format = req.query.format;
+        if (req.query.render)
+            req.body.render = req.query.render === 'true';
+        if (req.query.stealth)
+            req.body.stealth = req.query.stealth === 'true';
+        if (req.query.budget)
+            req.body.budget = Number(req.query.budget);
+        if (req.query.selector)
+            req.body.selector = req.query.selector;
+        if (req.query.readable)
+            req.body.readable = req.query.readable === 'true';
+        if (req.query.question)
+            req.body.question = req.query.question;
+        void handleSyncFetch(req, res);
+    });
+    router.post('/v1/fetch/sync', (req, res) => void handleSyncFetch(req, res));
     // GET /v1/fetch?url=...  — CLI and backward-compatible GET requests
     // Maps query params into req.body so handleEnqueue works uniformly
     router.get('/v1/fetch', (req, res) => {
@@ -213,6 +288,19 @@ export function createQueueFetchRouter() {
     router.get('/v1/jobs/:id', async (req, res) => {
         const { id } = req.params;
         const requestId = req.requestId || randomUUID();
+        // Auth required — prevent IDOR (unauthenticated access to job results)
+        if (!req.auth?.keyInfo) {
+            res.status(401).json({
+                success: false,
+                error: {
+                    type: 'unauthorized',
+                    message: 'API key required to poll job results.',
+                    docs: 'https://webpeel.dev/docs/errors#unauthorized',
+                },
+                requestId,
+            });
+            return;
+        }
         if (!id || typeof id !== 'string') {
             res.status(400).json({
                 success: false,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.62",
+  "version": "0.21.64",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",
@@ -59,6 +59,7 @@
     "prepublishOnly": "bash scripts/pre-publish.sh",
     "serve": "node dist/server/app.js",
     "mcp": "node dist/mcp/server.js",
+    "preversion": "npm run build && npm test && bash scripts/pre-publish-gate.sh",
     "version": "bash scripts/postversion.sh"
   },
   "repository": {