webpeel 0.21.16 → 0.21.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/fetch.js +11 -0
- package/dist/core/structured-extract.js +19 -1
- package/dist/server/app.js +5 -2
- package/dist/server/auth-store.d.ts +1 -1
- package/dist/server/middleware/auth.js +2 -2
- package/dist/server/middleware/url-validator.js +15 -0
- package/dist/server/routes/stripe.js +2 -0
- package/llms.txt +1 -1
- package/package.json +1 -1
package/dist/cli/commands/fetch.js
CHANGED
|
@@ -148,6 +148,17 @@ export async function runFetch(url, options) {
|
|
|
148
148
|
}
|
|
149
149
|
process.exit(0);
|
|
150
150
|
}
|
|
151
|
+
// --- #4b: Read URL from stdin (pipe mode) if no URL argument provided ---
|
|
152
|
+
if ((!url || url.trim() === '') && !process.stdin.isTTY) {
|
|
153
|
+
try {
|
|
154
|
+
const stdinData = await readStdin();
|
|
155
|
+
const stdinUrl = stdinData.trim().split('\n')[0].trim();
|
|
156
|
+
if (stdinUrl && (stdinUrl.startsWith('http://') || stdinUrl.startsWith('https://'))) {
|
|
157
|
+
url = stdinUrl;
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
catch { /* ignore stdin read errors */ }
|
|
161
|
+
}
|
|
151
162
|
// --- #5: Concise error for missing URL (no help dump) ---
|
|
152
163
|
if (!url || url.trim() === '') {
|
|
153
164
|
if (isJson) {
|
package/dist/core/structured-extract.js
CHANGED
|
@@ -158,6 +158,13 @@ function heuristicExtractString(fieldName, content, pageUrl) {
|
|
|
158
158
|
if (pageUrl)
|
|
159
159
|
return pageUrl;
|
|
160
160
|
}
|
|
161
|
+
// Creator / designer / founder / inventor
|
|
162
|
+
if (/creator|designer|founder|inventor|invented_by|created_by/.test(lf)) {
|
|
163
|
+
const m = content.match(/(?:created?|designed?|founded?|invented?)\s+by\s+([A-Z][^\n,·|–—]+?)(?:\s*[,·|–—]|\s+in\s+\d{4}|\.)/i)
|
|
164
|
+
?? content.match(/(?:creator|designer|founder|inventor)[:\s]+([A-Z][^\n,·|]+?)(?:\s*[,·|–—]|\.)/i);
|
|
165
|
+
if (m?.[1])
|
|
166
|
+
return m[1].replace(/[*_`[\]]/g, '').trim().slice(0, 80);
|
|
167
|
+
}
|
|
161
168
|
// Director (for movies/films)
|
|
162
169
|
if (/director/.test(lf)) {
|
|
163
170
|
const m = content.match(/Director[:\s*]+([^\n|,]+)/i) ?? content.match(/Directed by[:\s]+([^\n|,]+)/i);
|
|
@@ -312,12 +319,23 @@ function heuristicExtractNumber(fieldName, content) {
|
|
|
312
319
|
}
|
|
313
320
|
// Year
|
|
314
321
|
if (/year/.test(lf)) {
|
|
315
|
-
//
|
|
322
|
+
// Explicit "Year: YYYY" label first
|
|
316
323
|
const explicit = content.match(/\bYear[:\s]+(\d{4})\b/i);
|
|
317
324
|
if (explicit?.[1]) {
|
|
318
325
|
const n = parseInt(explicit[1]);
|
|
319
326
|
return isNaN(n) ? null : n;
|
|
320
327
|
}
|
|
328
|
+
// For "created_year" / "founded_year" / "released_year" — look for context
|
|
329
|
+
if (/creat|found|release|launch|start|born|inception/.test(lf)) {
|
|
330
|
+
const ctxMatch = content.match(/(?:created?|founded?|released?|launched?|started?|born|inception)[^\d]*(\b(?:19|20)\d{2}\b)/i)
|
|
331
|
+
?? content.match(/\b(?:in|year)\s+(\b(?:19|20)\d{2}\b)/i)
|
|
332
|
+
?? content.match(/(\b(?:19|20)\d{2}\b)/);
|
|
333
|
+
if (ctxMatch?.[1]) {
|
|
334
|
+
const n = parseInt(ctxMatch[1]);
|
|
335
|
+
return isNaN(n) ? null : n;
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
// Fallback: first year found
|
|
321
339
|
const m = content.match(/\b((?:19|20)\d{2})\b/);
|
|
322
340
|
if (m?.[1]) {
|
|
323
341
|
const n = parseInt(m[1]);
|
package/dist/server/app.js
CHANGED
|
@@ -155,8 +155,11 @@ export function createApp(config = {}) {
|
|
|
155
155
|
return callback(null, true);
|
|
156
156
|
if (corsOrigins.includes(origin))
|
|
157
157
|
return callback(null, origin);
|
|
158
|
-
// Unknown origins: allow (API key clients need cross-origin access) but no credentials
|
|
159
|
-
|
|
158
|
+
// Unknown origins: allow (API key clients need cross-origin access) but no credentials.
|
|
159
|
+
// SECURITY: Return '*' instead of reflecting the origin — wildcard is incompatible with
|
|
160
|
+
// credentials (browsers reject Allow-Credentials + *), prevents origin-specific CORS caching,
|
|
161
|
+
// and avoids security-scanner false positives from reflected origins.
|
|
162
|
+
return callback(null, '*');
|
|
160
163
|
},
|
|
161
164
|
// credentials: set conditionally via post-cors middleware below
|
|
162
165
|
credentials: false,
|
package/dist/server/middleware/auth.js
CHANGED
|
@@ -127,8 +127,8 @@ export function createAuthMiddleware(authStore) {
|
|
|
127
127
|
}
|
|
128
128
|
return;
|
|
129
129
|
}
|
|
130
|
-
// Check limits (only for PostgresAuthStore)
|
|
131
|
-
if (authStore instanceof PostgresAuthStore) {
|
|
130
|
+
// Check limits (only for PostgresAuthStore, skip for admin tier)
|
|
131
|
+
if (authStore instanceof PostgresAuthStore && keyInfo?.tier !== 'admin') {
|
|
132
132
|
// HARD LIMIT: Check burst limit first (per-hour cap)
|
|
133
133
|
const { allowed: burstAllowed, burst } = await authStore.checkBurstLimit(apiKey);
|
|
134
134
|
if (!burstAllowed) {
|
package/dist/server/middleware/url-validator.js
CHANGED
|
@@ -25,6 +25,21 @@ export function validateUrlForSSRF(urlString) {
|
|
|
25
25
|
if (localhostPatterns.some(pattern => hostname === pattern || hostname.endsWith('.' + pattern))) {
|
|
26
26
|
throw new SSRFError('Cannot fetch localhost, private networks, or non-HTTP URLs');
|
|
27
27
|
}
|
|
28
|
+
// SECURITY: Block well-known cloud metadata service hostnames.
|
|
29
|
+
// These hostnames resolve to link-local IPs (169.254.x.x) which are blocked
|
|
30
|
+
// by IP, but hostname-level blocking provides defense-in-depth against DNS
|
|
31
|
+
// rebinding attacks where a domain transiently resolves to a valid IP during
|
|
32
|
+
// validation, then resolves to a private IP for the actual fetch.
|
|
33
|
+
const metadataHostnames = [
|
|
34
|
+
'metadata.google.internal', // GCP: resolves to 169.254.169.254
|
|
35
|
+
'metadata.goog', // GCP alternate
|
|
36
|
+
'metadata.internal', // Generic internal
|
|
37
|
+
'instance-data.ec2.internal', // AWS alternate
|
|
38
|
+
'computeMetadata', // Partial GCP hostname
|
|
39
|
+
];
|
|
40
|
+
if (metadataHostnames.some(m => hostname === m || hostname.endsWith('.' + m))) {
|
|
41
|
+
throw new SSRFError('Cannot fetch localhost, private networks, or non-HTTP URLs');
|
|
42
|
+
}
|
|
28
43
|
// Parse and validate IP addresses
|
|
29
44
|
const ipv4Info = parseIPv4(hostname);
|
|
30
45
|
if (ipv4Info) {
|
package/dist/server/routes/stripe.js
CHANGED
|
@@ -14,6 +14,8 @@ const TIER_LIMITS = {
|
|
|
14
14
|
free: { weekly_limit: 500, burst_limit: 50, rate_limit: 10 },
|
|
15
15
|
pro: { weekly_limit: 1250, burst_limit: 100, rate_limit: 60 },
|
|
16
16
|
max: { weekly_limit: 6250, burst_limit: 500, rate_limit: 200 },
|
|
17
|
+
admin: { weekly_limit: 100000, burst_limit: 10000, rate_limit: 1000 },
|
|
18
|
+
enterprise: { weekly_limit: 50000, burst_limit: 2000, rate_limit: 500 },
|
|
17
19
|
};
|
|
18
20
|
/**
|
|
19
21
|
* Create Stripe Billing Portal router
|
package/llms.txt
CHANGED
|
@@ -39,7 +39,7 @@ webpeel mcp Start MCP server
|
|
|
39
39
|
- Quick answers: Ask questions about any page (no LLM needed)
|
|
40
40
|
- Anti-bot handling: Stealth mode, proxy rotation, graceful degradation
|
|
41
41
|
- Format options: markdown, text, html, clean (AI-optimized)
|
|
42
|
-
- MCP server:
|
|
42
|
+
- MCP server: 7 tools for AI agent integration
|
|
43
43
|
- Site search: Search eBay, Amazon, GitHub, and 20+ sites with structured output
|
|
44
44
|
|
|
45
45
|
## Formats
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.16",
|
|
3
|
+
"version": "0.21.18",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|