webpeel 0.20.2 → 0.20.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +384 -0
- package/dist/server/auth-store.d.ts +27 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/email-service.d.ts +21 -0
- package/dist/server/email-service.js +79 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/auth.d.ts +28 -0
- package/dist/server/middleware/auth.js +221 -0
- package/dist/server/middleware/rate-limit.d.ts +24 -0
- package/dist/server/middleware/rate-limit.js +167 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +186 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +132 -0
- package/dist/server/pg-auth-store.js +472 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/premium/domain-intel.d.ts +16 -0
- package/dist/server/premium/domain-intel.js +133 -0
- package/dist/server/premium/index.d.ts +17 -0
- package/dist/server/premium/index.js +35 -0
- package/dist/server/premium/swr-cache.d.ts +14 -0
- package/dist/server/premium/swr-cache.js +34 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +74 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +229 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +8 -0
- package/dist/server/routes/extract.js +235 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +999 -0
- package/dist/server/routes/health.d.ts +7 -0
- package/dist/server/routes/health.js +19 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +573 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +141 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +816 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +303 -0
- package/dist/server/routes/session.d.ts +15 -0
- package/dist/server/routes/session.js +397 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +294 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1671 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +13 -0
- package/dist/server/sentry.js +38 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/package.json +2 -1
|
@@ -0,0 +1,999 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fetch endpoint with caching
|
|
3
|
+
*/
|
|
4
|
+
import { Router } from 'express';
|
|
5
|
+
import '../types.js'; // Augments Express.Request with requestId
|
|
6
|
+
import { peel } from '../../index.js';
|
|
7
|
+
import { normalizeActions } from '../../core/actions.js';
|
|
8
|
+
import { extractInlineJson } from '../../core/extract-inline.js';
|
|
9
|
+
import { LRUCache } from 'lru-cache';
|
|
10
|
+
import { validateUrlForSSRF, SSRFError } from '../middleware/url-validator.js';
|
|
11
|
+
import { wantsEnvelope, successResponse } from '../utils/response.js';
|
|
12
|
+
import { getSchemaTemplate } from '../../core/schema-templates.js';
|
|
13
|
+
import { quickAnswer } from '../../core/quick-answer.js';
|
|
14
|
+
import { sendUsageAlertEmail } from '../email-service.js';
|
|
15
|
+
// ── Helper: extractive summarizer (TF-IDF-like sentence scoring) ─────────────
|
|
16
|
+
/**
 * Build an extractive summary of `content` by picking its highest-scoring sentences.
 *
 * Sentences are scored by the average relative frequency of their words
 * (words longer than 3 characters), with a small bonus for the first and
 * last sentence. Up to 5 sentences are selected, within roughly
 * `maxWords * 1.3` words, and returned in their original document order.
 *
 * @param {string} content - Text to summarize. Falsy input yields `''`.
 * @param {number} [maxWords=150] - Target word budget for the summary.
 * @returns {string} The summary text.
 */
function extractSummary(content, maxWords = 150) {
    if (!content)
        return '';
    // Only reasonably sized sentences are candidates (41–599 characters).
    const sentences = content
        .split(/(?<=[.!?])\s+/)
        .map(s => s.trim())
        .filter(s => s.length > 40 && s.length < 600);
    if (sentences.length === 0) {
        // No usable sentence boundaries: fall back to a plain word truncation.
        const words = content.split(/\s+/);
        return words.slice(0, maxWords).join(' ') + (words.length > maxWords ? '\u2026' : '');
    }
    if (sentences.length <= 3)
        return sentences.join(' ');
    const tokenize = (text) => text.toLowerCase().split(/\W+/).filter(w => w.length > 3);
    // A Map (not a plain object) keeps words such as "constructor" or "valueOf"
    // from colliding with Object.prototype members, which would turn every
    // score into NaN. The maximum is tracked incrementally so that very large
    // vocabularies are never spread into a single Math.max() call.
    const wordFreq = new Map();
    let maxFreq = 1;
    for (const w of tokenize(content)) {
        const count = (wordFreq.get(w) ?? 0) + 1;
        wordFreq.set(w, count);
        if (count > maxFreq)
            maxFreq = count;
    }
    const scored = sentences.map((sentence, idx) => {
        const words = tokenize(sentence);
        const score = words.reduce((sum, w) => sum + (wordFreq.get(w) ?? 0) / maxFreq, 0) / Math.max(1, words.length);
        const posBonus = idx === 0 ? 0.3 : idx === sentences.length - 1 ? 0.1 : 0;
        return { sentence, score: score + posBonus, idx };
    });
    scored.sort((a, b) => b.score - a.score);
    const selected = [];
    let wc = 0;
    for (const item of scored) {
        const itemWc = item.sentence.split(/\s+/).length;
        // Stop at the first sentence that would exceed the (30% padded) budget.
        if (wc + itemWc > maxWords * 1.3)
            break;
        selected.push(item);
        wc += itemWc;
        if (selected.length >= 5)
            break;
    }
    // Restore document order so the summary reads naturally.
    selected.sort((a, b) => a.idx - b.idx);
    return selected.map(s => s.sentence).join(' ');
}
|
|
55
|
+
// ── Helper: check usage and determine if alert email should be sent ───────────
|
|
56
|
+
/**
 * Decide whether a usage-alert email should be sent for a user.
 *
 * Runs one query through `pgStore.pool` that joins the user row with the
 * current week's `weekly_usage` rows of all the user's API keys, then compares
 * the usage percentage with the user's `alert_threshold`. No alert is due when
 * the user has no threshold set or when `alert_sent_at` is less than 7 days
 * before `now`.
 *
 * @param {{ pool: { query: Function } }} pgStore - Store exposing a `pool.query(sql, params)` method.
 * @param {string|number} userId - Value matched against `users.id`.
 * @param {Date} [now=new Date()] - Reference instant. Used for both the week key
 *   and the 7-day window so the two are always computed from the same clock reading.
 * @returns {Promise<object>} `{ shouldSendAlert: false }` when the user is missing or has
 *   no threshold; otherwise `{ shouldSendAlert, usagePercent, used, total, userEmail,
 *   userName, userTier, alertEmail }`.
 */
async function checkAndTriggerAlert(pgStore, userId, now = new Date()) {
    // Week key in the form "YYYY-Www", computed relative to January 4th (UTC).
    // NOTE(review): presumably must match the format written to weekly_usage.week
    // elsewhere in the project — the formula is intentionally left unchanged.
    const getCurrentWeek = () => {
        const year = now.getUTCFullYear();
        const jan4 = new Date(Date.UTC(year, 0, 4));
        const weekNum = Math.ceil(((now.getTime() - jan4.getTime()) / 86400000 + jan4.getUTCDay() + 1) / 7);
        return `${year}-W${String(weekNum).padStart(2, '0')}`;
    };
    const currentWeek = getCurrentWeek();
    const result = await pgStore.pool.query(`SELECT u.email, u.name, u.tier, u.alert_threshold, u.alert_email, u.alert_sent_at,
u.weekly_limit,
COALESCE(SUM(wu.total_count), 0) AS total_used,
u.weekly_limit + COALESCE(MAX(wu.rollover_credits), 0) AS total_available
FROM users u
LEFT JOIN api_keys ak ON ak.user_id = u.id
LEFT JOIN weekly_usage wu ON wu.api_key_id = ak.id AND wu.week = $2
WHERE u.id = $1
GROUP BY u.id, u.email, u.name, u.tier, u.alert_threshold, u.alert_email, u.alert_sent_at, u.weekly_limit`, [userId, currentWeek]);
    const row = result.rows[0];
    if (!row || !row.alert_threshold)
        return { shouldSendAlert: false };
    // Aggregates are parsed defensively (they may arrive as strings).
    const used = parseInt(row.total_used, 10) || 0;
    const total = parseInt(row.total_available, 10) || row.weekly_limit || 999;
    const usagePercent = total > 0 ? Math.round((used / total) * 100) : 0;
    // Only alert if: crosses threshold AND no alert was sent in the last 7 days
    const lastAlert = row.alert_sent_at ? new Date(row.alert_sent_at) : null;
    const oneWeekAgo = new Date(now.getTime() - 7 * 24 * 60 * 60 * 1000);
    const alreadySentThisWeek = lastAlert !== null && lastAlert > oneWeekAgo;
    return {
        shouldSendAlert: usagePercent >= row.alert_threshold && !alreadySentThisWeek,
        usagePercent,
        used,
        total,
        userEmail: row.email,
        userName: row.name || undefined,
        userTier: row.tier,
        alertEmail: row.alert_email || undefined,
    };
}
|
|
95
|
+
// LLM provider identifiers recognised by this module.
// NOTE(review): presumably validated against the `llmProvider` field of the POST body —
// the usage lies outside this excerpt; confirm.
const VALID_LLM_PROVIDERS = ['openai', 'anthropic', 'google'];
|
|
96
|
+
export function createFetchRouter(authStore) {
|
|
97
|
+
const router = Router();
|
|
98
|
+
// LRU cache: 5 minute TTL, max 500 entries, 100MB total size
|
|
99
|
+
const cache = new LRUCache({
|
|
100
|
+
max: 500,
|
|
101
|
+
ttl: 5 * 60 * 1000, // 5 minutes default
|
|
102
|
+
maxSize: 100 * 1024 * 1024, // 100MB
|
|
103
|
+
sizeCalculation: (entry) => {
|
|
104
|
+
return JSON.stringify(entry).length;
|
|
105
|
+
},
|
|
106
|
+
});
|
|
107
|
+
// GET /v1/fetch — fetch a single URL through peel() and return the result as JSON.
//
// Flow: authenticate → validate url/actions → serve from the in-memory LRU cache when
// allowed → peel() → optional schema extraction → usage logging/metering → optional
// brief/answer/summary post-processing → cache store → respond.
// Every failure is reported as `{ success: false, error, requestId }`.
router.get('/v1/fetch', async (req, res) => {
    // Every error response shares the same envelope shape.
    const sendError = (status, error) => {
        res.status(status).json({ success: false, error, requestId: req.requestId });
    };
    try {
        // Require authentication — API key or JWT session
        const userId = req.auth?.keyInfo?.accountId || req.user?.userId;
        if (!userId) {
            sendError(401, {
                type: 'unauthorized',
                message: 'API key required. Get one free at https://app.webpeel.dev/keys',
                hint: 'Get a free API key at https://app.webpeel.dev/keys',
                docs: 'https://webpeel.dev/docs/errors#unauthorized',
            });
            return;
        }
        const { url, render, wait, format, includeTags, excludeTags, images, location, languages, onlyMainContent, actions, maxAge, storeInCache, stream, noCache, cacheTtl, budget, question, summary, readable, stealth, screenshot, maxTokens, selector, exclude, fullPage, raw, lite, timeout, schema, detail, } = req.query;
        const detailMode = detail || 'standard';
        // Validate URL parameter
        if (!url || typeof url !== 'string') {
            sendError(400, {
                type: 'invalid_request',
                message: 'Missing or invalid "url" parameter.',
                hint: 'Pass a URL as a query parameter: GET /v1/fetch?url=https://example.com',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        // SECURITY: Validate URL format and length
        if (url.length > 2048) {
            sendError(400, {
                type: 'invalid_url',
                message: 'URL too long (max 2048 characters)',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        try {
            // Parse only to verify the URL is well-formed; the original string is used downstream.
            new URL(url);
        }
        catch {
            sendError(400, {
                type: 'invalid_url',
                message: 'Invalid URL format',
                hint: 'Ensure the URL includes a scheme (https://) and a valid hostname',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        // SECURITY: Validate URL to prevent SSRF attacks
        try {
            validateUrlForSSRF(url);
        }
        catch (error) {
            if (error instanceof SSRFError) {
                sendError(400, {
                    type: 'forbidden_url',
                    message: 'Cannot fetch localhost, private networks, or non-HTTP URLs',
                    docs: 'https://webpeel.dev/docs/api-reference#fetch',
                });
                return;
            }
            throw error;
        }
        // Parse actions query param (JSON-encoded array)
        let parsedActions;
        if (actions && typeof actions === 'string') {
            try {
                parsedActions = normalizeActions(JSON.parse(actions));
            }
            catch {
                sendError(400, {
                    type: 'invalid_request',
                    message: 'Invalid "actions" parameter: must be a valid JSON array',
                    docs: 'https://webpeel.dev/docs/api-reference#fetch',
                });
                return;
            }
        }
        // Build cache key. Every parameter that changes the response body must be part of
        // it — including lite, schema and detail, which shape the cached payload.
        const actionsKey = parsedActions ? JSON.stringify(parsedActions) : '';
        const cacheKey = `fetch:${url}:${render}:${wait}:${format}:${includeTags}:${excludeTags}:${images}:${location}:${languages}:${onlyMainContent}:${stream}:${actionsKey}:${budget}:${question}:${summary}:${readable}:${stealth}:${screenshot}:${maxTokens}:${selector}:${exclude}:${fullPage}:${raw}:${lite}:${schema}:${detailMode}`;
        // Cache bypass: ?noCache=true or Cache-Control: no-cache header
        const bypassCache = noCache === 'true' || req.headers['cache-control'] === 'no-cache';
        // Per-request TTL (cacheTtl in seconds, default 300s = 5 min).
        // A non-numeric value falls back to the default instead of becoming NaN.
        const defaultTtlMs = 5 * 60 * 1000;
        const requestedTtlMs = cacheTtl !== undefined ? parseInt(cacheTtl, 10) * 1000 : defaultTtlMs;
        const cacheTtlMs = Number.isNaN(requestedTtlMs) ? defaultTtlMs : requestedTtlMs;
        // Check cache (with maxAge support, in milliseconds; default 2 days)
        const defaultMaxAgeMs = 172800000;
        const requestedMaxAgeMs = maxAge !== undefined ? parseInt(maxAge, 10) : defaultMaxAgeMs;
        const maxAgeMs = Number.isNaN(requestedMaxAgeMs) ? defaultMaxAgeMs : requestedMaxAgeMs;
        if (!bypassCache) {
            const cached = cache.get(cacheKey);
            if (cached && maxAgeMs > 0) {
                const cacheAge = Date.now() - cached.timestamp;
                if (cacheAge < maxAgeMs && cacheAge < cacheTtlMs) {
                    res.setHeader('X-Cache', 'HIT');
                    res.setHeader('X-Cache-Age', Math.floor(cacheAge / 1000).toString());
                    if (wantsEnvelope(req)) {
                        successResponse(res, cached.result, {
                            requestId: req.requestId,
                            cached: true,
                        });
                    }
                    else {
                        res.json(cached.result);
                    }
                    return;
                }
            }
        }
        // Parse options
        const isSoftLimited = req.auth?.softLimited === true;
        const hasExtraUsage = req.auth?.extraUsageAvailable === true;
        // Over quota with no extra usage to charge: expensive features are switched off.
        const quotaExhausted = isSoftLimited && !hasExtraUsage;
        // Parse tag arrays from comma-separated strings
        const splitList = (value) => value.split(',').map(item => item.trim()).filter(Boolean);
        const includeTagsArray = includeTags ? splitList(includeTags) : undefined;
        const excludeTagsArray = excludeTags ? splitList(excludeTags) : undefined;
        const languagesArray = languages ? splitList(languages) : undefined;
        // onlyMainContent is a shortcut for common include tags
        const finalIncludeTags = onlyMainContent === 'true'
            ? ['main', 'article', '.content', '#content']
            : includeTagsArray;
        // When actions are present, force browser mode (skip HTTP fast path)
        const hasActions = parsedActions && parsedActions.length > 0;
        const shouldRender = hasActions || render === 'true';
        const options = {
            // SOFT LIMIT: When over quota AND no extra usage, force HTTP-only
            // If extra usage is available, allow full functionality
            // Exception: actions always require render
            render: (quotaExhausted && !hasActions) ? false : shouldRender,
            wait: quotaExhausted ? 0 : (wait ? parseInt(wait, 10) : undefined),
            format: format || 'markdown',
            stream: stream === 'true',
            includeTags: finalIncludeTags,
            excludeTags: excludeTagsArray,
            images: images === 'true',
            actions: parsedActions,
            location: (location || languagesArray) ? {
                country: location,
                languages: languagesArray,
            } : undefined,
            budget: budget ? parseInt(budget, 10) : undefined,
            question: question,
            readable: readable === 'true',
            stealth: quotaExhausted ? false : stealth === 'true',
            screenshot: quotaExhausted ? false : screenshot === 'true',
            maxTokens: maxTokens ? parseInt(maxTokens, 10) : undefined,
            selector: selector,
            exclude: exclude ? splitList(exclude) : undefined,
            fullPage: fullPage === 'true',
            raw: raw === 'true',
            lite: lite === 'true',
            timeout: timeout ? parseInt(timeout, 10) : undefined,
        };
        // Auto-budget: default to 4000 tokens for API requests when no budget specified
        // Opt-out: budget=0 explicitly disables. Lite mode disables auto-budget.
        if (options.budget === undefined && !options.lite) {
            options.budget = 4000;
            res.setHeader('X-Auto-Budget', '4000');
        }
        // Inform the user if their request was degraded. Reasons are collected and joined
        // so that one downgrade notice does not overwrite another.
        if (quotaExhausted) {
            const degraded = [];
            if (render === 'true' && !hasActions) {
                degraded.push('render=true downgraded to HTTP-only (quota exceeded)');
            }
            if (stealth === 'true') {
                degraded.push('stealth=true downgraded (quota exceeded)');
            }
            if (screenshot === 'true') {
                degraded.push('screenshot=true downgraded (quota exceeded)');
            }
            if (degraded.length > 0) {
                res.setHeader('X-Degraded', degraded.join('; '));
            }
        }
        // Validate wait parameter
        if (options.wait !== undefined && (Number.isNaN(options.wait) || options.wait < 0 || options.wait > 60000)) {
            sendError(400, {
                type: 'invalid_request',
                message: 'Invalid "wait" parameter: must be between 0 and 60000ms',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        // Validate format parameter
        if (!['markdown', 'text', 'html', 'clean'].includes(options.format || '')) {
            sendError(400, {
                type: 'invalid_request',
                message: 'Invalid "format" parameter: must be "markdown", "text", "html", or "clean"',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        if (options.stream === true) {
            // Headers are NOT flushed here: once flushed, every later setHeader()/json()
            // call in this handler would throw ERR_HTTP_HEADERS_SENT.
            res.setHeader('X-Stream', 'true');
        }
        // Fetch content
        const startTime = Date.now();
        const result = await peel(url, options);
        const elapsed = Date.now() - startTime;
        // --- BM25 Schema Template Extraction (GET, no LLM needed) ---
        if (schema && typeof schema === 'string' && result.content) {
            const template = getSchemaTemplate(schema);
            if (template) {
                const { smartExtractSchemaFields } = await import('../../core/schema-postprocess.js');
                result.extracted = smartExtractSchemaFields(result.content, template.fields, quickAnswer, {
                    pageTitle: result.title,
                    pageUrl: result.url,
                    metadata: result.metadata,
                });
            }
        }
        // Determine fetch type from the result method (browser fetches are metered as stealth)
        const fetchType = (result.method === 'stealth' || result.method === 'browser') ? 'stealth' : 'basic';
        // Log request to database (PostgreSQL only)
        const pgStore = authStore;
        // Log usage for BOTH API key auth AND JWT session auth
        const logUserId = req.auth?.keyInfo?.accountId || req.user?.userId;
        if (logUserId && typeof pgStore.pool !== 'undefined') {
            pgStore.pool.query(`INSERT INTO usage_logs
(user_id, endpoint, url, method, processing_time_ms, status_code, ip_address, user_agent, tokens_used)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, [
                logUserId,
                'fetch',
                url,
                fetchType,
                elapsed,
                200,
                req.ip || req.socket.remoteAddress,
                req.get('user-agent'),
                result?.tokens || null,
            ]).catch((err) => {
                console.error('Failed to log request to usage_logs:', err);
            });
        }
        // Track usage (check for trackBurstUsage method to detect PostgresAuthStore)
        if (req.auth?.keyInfo?.key && typeof pgStore.trackBurstUsage === 'function') {
            // Track burst usage (always)
            await pgStore.trackBurstUsage(req.auth.keyInfo.key);
            if (isSoftLimited && hasExtraUsage) {
                // Soft-limited with extra usage available: charge to extra usage.
                // PeelResult doesn't include statusCode, assume success (200).
                const extraResult = await pgStore.trackExtraUsage(req.auth.keyInfo.key, fetchType, url, elapsed, 200);
                if (extraResult.success) {
                    res.setHeader('X-Extra-Usage-Charged', `$${extraResult.cost.toFixed(4)}`);
                    res.setHeader('X-Extra-Usage-New-Balance', extraResult.newBalance.toFixed(2));
                }
                else {
                    // Extra usage failed - fall back to soft limit
                    res.setHeader('X-Degraded', 'Extra usage insufficient, degraded to soft limit');
                }
            }
            else if (!isSoftLimited) {
                // Normal weekly usage tracking
                await pgStore.trackUsage(req.auth.keyInfo.key, fetchType);
            }
            // If soft-limited WITHOUT extra usage, don't track (already over quota)
        }
        // Check usage alert — detached from the request so it never delays the response.
        if (req.auth?.keyInfo?.accountId && typeof pgStore.pool !== 'undefined') {
            const accountId = req.auth.keyInfo.accountId;
            void (async () => {
                try {
                    const alertResult = await checkAndTriggerAlert(pgStore, accountId);
                    if (alertResult.shouldSendAlert && alertResult.usagePercent !== undefined) {
                        await sendUsageAlertEmail({
                            toEmail: alertResult.alertEmail || alertResult.userEmail,
                            userName: alertResult.userName,
                            usagePercent: alertResult.usagePercent,
                            used: alertResult.used,
                            total: alertResult.total,
                            tier: alertResult.userTier,
                        });
                        // Mark alert as sent so we don't spam (rate-limited to once/week)
                        await pgStore.pool.query('UPDATE users SET alert_sent_at = NOW() WHERE id = $1', [accountId]);
                    }
                }
                catch (alertErr) {
                    // Never let alert errors affect the main response
                    console.warn('[alert] Failed to check/send alert:', alertErr);
                }
            })();
        }
        // Build the response on a copy so post-processing never mutates `result`.
        const responseBody = { ...result };
        // Apply ?detail=brief mode: truncate content and prepend TL;DR
        if (detailMode === 'brief' && responseBody.content) {
            const words = responseBody.content.split(/\s+/);
            const truncatedWords = words.slice(0, 500);
            const truncated = truncatedWords.join(' ');
            // Extract TL;DR from first non-empty paragraph (skipping image/link-only lines)
            const firstPara = responseBody.content
                .split(/\n{2,}/)
                .map((p) => p.replace(/^#+\s*/, '').trim())
                .find((p) => p.length > 40 && !p.startsWith('!') && !p.startsWith('['));
            const tldr = firstPara
                ? firstPara.replace(/\s+/g, ' ').slice(0, 300) + (firstPara.length > 300 ? '...' : '')
                : truncated.slice(0, 200) + '...';
            responseBody.content = `**TL;DR:** ${tldr}\n\n---\n\n${truncated}${words.length > 500 ? '\n\n*[Content truncated — use ?detail=full for complete output]*' : ''}`;
            const tokenEstimate = Math.round(truncatedWords.length * 0.75);
            res.setHeader('X-Detail-Mode', 'brief');
            res.setHeader('X-Token-Estimate', tokenEstimate.toString());
        }
        // --- question → answer field (GET) ---
        // When ?question= is provided, run quickAnswer() on the content
        // and expose the result as an `answer` field in the response.
        if (question && typeof question === 'string' && responseBody.content) {
            responseBody.answer = quickAnswer({ question, content: responseBody.content, url: responseBody.url }).answer;
        }
        // --- summary field (GET) ---
        // When ?summary=true, add an extractive summary in a `summary` field.
        if (summary === 'true' && responseBody.content) {
            responseBody.summary = extractSummary(responseBody.content);
        }
        // Cache the final body (unless storeInCache is explicitly false or cache bypass
        // requested) so cache hits return exactly what a fresh request would.
        if (storeInCache !== 'false' && !bypassCache) {
            cache.set(cacheKey, {
                result: responseBody,
                timestamp: Date.now(),
            }, { ttl: cacheTtlMs });
        }
        // Add usage headers (kept for backward compat; also surfaced in envelope metadata)
        res.setHeader('X-Cache', 'MISS');
        res.setHeader('X-Credits-Used', '1');
        res.setHeader('X-Processing-Time', elapsed.toString());
        res.setHeader('X-Fetch-Type', fetchType);
        if (wantsEnvelope(req)) {
            successResponse(res, responseBody, {
                requestId: req.requestId,
                processingTimeMs: elapsed,
                creditsUsed: 1,
                cached: false,
                fetchType,
            });
        }
        else {
            res.json(responseBody);
        }
    }
    catch (error) {
        const err = error;
        // Log error to database (PostgreSQL only), for API-key and JWT users alike
        const pgStore = authStore;
        const failedUserId = req.auth?.keyInfo?.accountId || req.user?.userId;
        if (failedUserId && typeof pgStore.pool !== 'undefined') {
            const fetchType = req.query.render === 'true' ? 'stealth' : 'basic';
            pgStore.pool.query(`INSERT INTO usage_logs
(user_id, endpoint, url, method, status_code, error, ip_address, user_agent, tokens_used)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, [
                failedUserId,
                'fetch',
                req.query.url,
                fetchType,
                500,
                err?.message || 'Unknown error',
                req.ip || req.socket.remoteAddress,
                req.get('user-agent'),
                null,
            ]).catch((logErr) => {
                console.error('Failed to log error to usage_logs:', logErr);
            });
        }
        // A response that has already started cannot be replaced by an error body.
        if (res.headersSent) {
            console.error('Fetch error after headers were sent:', err);
            return;
        }
        // SECURITY: Sanitize error messages to prevent information disclosure
        if (err?.code) {
            // Errors carrying a `code` are treated as WebPeelError from the core library.
            // NOTE(review): Node system errors also carry `code`; confirm their messages
            // are acceptable to expose.
            const safeMessage = String(err.message ?? '').replace(/[<>"']/g, ''); // Remove HTML chars
            const statusCode = err.code === 'TIMEOUT' ? 504
                : err.code === 'BLOCKED' ? 403
                    : err.code === 'NETWORK' ? 502
                        : 500;
            const hints = {
                TIMEOUT: 'Try increasing timeout with ?wait=10000, or use render=true for JS-heavy sites.',
                BLOCKED: 'This site blocks automated requests. Try adding render=true or use stealth mode (costs 5 credits).',
                NETWORK: 'Could not reach the target URL. Verify the URL is correct and the site is online.',
            };
            sendError(statusCode, {
                type: err.code,
                message: safeMessage,
                hint: hints[err.code] || undefined,
                docs: 'https://webpeel.dev/docs/api-reference#errors',
            });
        }
        else {
            // Unexpected error - generic message only
            console.error('Fetch error:', err); // Log full error server-side
            sendError(500, {
                type: 'internal_error',
                message: 'An unexpected error occurred while fetching the URL. If this persists, check https://webpeel.dev/status',
                docs: 'https://webpeel.dev/docs/api-reference#errors',
            });
        }
    }
});
|
|
553
|
+
// -----------------------------------------------------------------------
|
|
554
|
+
// POST /v1/fetch — same as GET but accepts JSON body with extract param
|
|
555
|
+
// POST /v2/scrape — alias with identical behaviour
|
|
556
|
+
// -----------------------------------------------------------------------
|
|
557
|
+
/**
 * Handle POST /v1/fetch and POST /v2/scrape.
 *
 * Reads fetch options from the JSON request body, validates them, serves a
 * cached response when one is fresh, otherwise calls `peel()` and optionally
 * runs schema-template extraction, BYOK inline LLM extraction, quick-answer
 * and summary post-processing before replying.
 *
 * Responses:
 *  - 401 when neither `req.auth.keyInfo.accountId` nor `req.user.userId` is set
 *  - 400 for a missing/overlong/malformed/forbidden url, invalid `actions`,
 *    `format`, `wait`, or missing LLM parameters for inline extraction
 *  - 504 / 403 / 502 / 500 for errors carrying a `code` of TIMEOUT / BLOCKED /
 *    NETWORK / anything else; 500 `internal_error` for errors without a code
 *
 * @param {object} req - Express request (reads `auth`, `user`, `body`, `headers`, `requestId`).
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
async function handlePostFetch(req, res) {
    try {
        // Require authentication — API key or JWT session
        const postUserId = req.auth?.keyInfo?.accountId || req.user?.userId;
        if (!postUserId) {
            sendPostFetchError(req, res, 401, {
                type: 'unauthorized',
                message: 'API key required. Get one free at https://app.webpeel.dev/keys',
                hint: 'Get a free API key at https://app.webpeel.dev/keys',
                docs: 'https://webpeel.dev/docs/errors#unauthorized',
            });
            return;
        }
        // A missing body falls through to the "missing url" 400 below instead of
        // throwing during destructuring.
        const { url, render, wait, format, includeTags, excludeTags, images, location, languages, onlyMainContent, actions: rawActions, storeInCache: storeFlag,
        // Cache control
        noCache: noCacheBody, cacheTtl: cacheTtlBody,
        // Inline extraction (BYOK)
        extract, llmProvider, llmApiKey, llmModel,
        // Firecrawl-compatible formats array
        formats, stream,
        // Extended peel options
        budget, question, summary: summaryParam, readable, stealth, screenshot, maxTokens, selector, exclude, fullPage, raw, lite, timeout, proxies, chunk, device, viewportWidth, viewportHeight, waitUntil, waitSelector, blockResources, cloaked, schema: bodySchema, } = req.body ?? {};
        // --- Validate URL -------------------------------------------------------
        if (!url || typeof url !== 'string') {
            sendPostFetchError(req, res, 400, {
                type: 'invalid_request',
                message: 'Missing or invalid "url" in request body.',
                hint: 'Send JSON: { "url": "https://example.com" }',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        if (url.length > 2048) {
            sendPostFetchError(req, res, 400, {
                type: 'invalid_url',
                message: 'URL too long (max 2048 characters)',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        try {
            new URL(url);
        }
        catch {
            sendPostFetchError(req, res, 400, {
                type: 'invalid_url',
                message: 'Invalid URL format',
                hint: 'Ensure the URL includes a scheme (https://) and a valid hostname',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        try {
            validateUrlForSSRF(url);
        }
        catch (error) {
            if (error instanceof SSRFError) {
                sendPostFetchError(req, res, 400, {
                    type: 'forbidden_url',
                    message: 'Cannot fetch localhost, private networks, or non-HTTP URLs',
                    docs: 'https://webpeel.dev/docs/api-reference#fetch',
                });
                return;
            }
            // Anything other than an SSRF rejection is handled by the outer catch.
            throw error;
        }
        // --- Parse and normalize actions -----------------------------------------
        let postActions;
        if (rawActions !== undefined) {
            try {
                postActions = normalizeActions(rawActions);
            }
            catch (e) {
                sendPostFetchError(req, res, 400, {
                    type: 'invalid_request',
                    message: `Invalid "actions" parameter: ${e.message}`,
                    docs: 'https://webpeel.dev/docs/api-reference#fetch',
                });
                return;
            }
        }
        // --- Resolve inline extract from body or Firecrawl-compatible formats ---
        // Resolved before the cache lookup so that extraction requested through
        // `formats` also bypasses the cache (cached entries never contain `json`).
        const resolvedExtract = resolvePostExtract(extract, formats);
        // Quota state: soft-limited callers without extra usage get degraded fetches.
        const isSoftLimited = req.auth?.softLimited === true;
        const hasExtraUsage = req.auth?.extraUsageAvailable === true;
        const isDegraded = isSoftLimited && !hasExtraUsage;
        // --- Cache bypass and lookup -------------------------------------------
        const postBypassCache = noCacheBody === true || req.headers['cache-control'] === 'no-cache';
        const postCacheTtlMs = typeof cacheTtlBody === 'number' ? cacheTtlBody * 1000 : 5 * 60 * 1000;
        const postActionsKey = postActions ? JSON.stringify(postActions) : '';
        // Every option that can change the fetched or post-processed result must be
        // part of the key; the trailing JSON array carries the extended options and
        // the degraded flag so that differently-shaped requests never share an entry.
        const postExtendedKey = JSON.stringify([
            lite, timeout, proxies, chunk, device, viewportWidth, viewportHeight,
            waitUntil, waitSelector, blockResources, cloaked, bodySchema, isDegraded,
        ]);
        const postCacheKey = `fetch:${url}:${render}:${wait}:${format}:${JSON.stringify(includeTags)}:${JSON.stringify(excludeTags)}:${images}:${location}:${JSON.stringify(languages)}:${onlyMainContent}:${stream}:${postActionsKey}:${budget}:${question}:${summaryParam}:${readable}:${stealth}:${screenshot}:${maxTokens}:${selector}:${JSON.stringify(exclude)}:${fullPage}:${raw}:${postExtendedKey}`;
        if (!postBypassCache && !resolvedExtract) {
            const cached = cache.get(postCacheKey);
            if (cached) {
                const cacheAge = Date.now() - cached.timestamp;
                if (cacheAge < postCacheTtlMs) {
                    res.setHeader('X-Cache', 'HIT');
                    res.setHeader('X-Cache-Age', Math.floor(cacheAge / 1000).toString());
                    if (wantsEnvelope(req)) {
                        successResponse(res, cached.result, {
                            requestId: req.requestId,
                            cached: true,
                        });
                    }
                    else {
                        res.json(cached.result);
                    }
                    return;
                }
            }
        }
        // Validate LLM params if extraction is requested
        if (resolvedExtract && (resolvedExtract.schema || resolvedExtract.prompt)) {
            if (!llmProvider || !VALID_LLM_PROVIDERS.includes(llmProvider)) {
                sendPostFetchError(req, res, 400, {
                    type: 'invalid_request',
                    message: `"llmProvider" is required for inline extraction and must be one of: ${VALID_LLM_PROVIDERS.join(', ')}`,
                    docs: 'https://webpeel.dev/docs/api-reference#fetch',
                });
                return;
            }
            if (!llmApiKey || typeof llmApiKey !== 'string' || llmApiKey.trim().length === 0) {
                sendPostFetchError(req, res, 400, {
                    type: 'invalid_request',
                    message: 'Missing or invalid "llmApiKey" (BYOK required for inline extraction)',
                    hint: 'Pass your LLM provider API key in the "llmApiKey" field',
                    docs: 'https://webpeel.dev/docs/api-reference#fetch',
                });
                return;
            }
        }
        // --- Build PeelOptions ---------------------------------------------------
        const includeTagsArray = Array.isArray(includeTags) ? includeTags : undefined;
        const excludeTagsArray = Array.isArray(excludeTags) ? excludeTags : undefined;
        const languagesArray = Array.isArray(languages) ? languages : undefined;
        const finalIncludeTags = onlyMainContent === true
            ? ['main', 'article', '.content', '#content']
            : includeTagsArray;
        const resolvedFormat = format || 'markdown';
        if (!['markdown', 'text', 'html', 'clean'].includes(resolvedFormat)) {
            sendPostFetchError(req, res, 400, {
                type: 'invalid_request',
                message: 'Invalid "format" parameter: must be "markdown", "text", "html", or "clean"',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        const resolvedWait = typeof wait === 'number' ? wait : undefined;
        if (resolvedWait !== undefined && (Number.isNaN(resolvedWait) || resolvedWait < 0 || resolvedWait > 60000)) {
            sendPostFetchError(req, res, 400, {
                type: 'invalid_request',
                message: 'Invalid "wait" parameter: must be between 0 and 60000ms',
                docs: 'https://webpeel.dev/docs/api-reference#fetch',
            });
            return;
        }
        // When actions are present, force browser mode
        const postHasActions = postActions && postActions.length > 0;
        const postShouldRender = postHasActions || render === true;
        // Normalize exclude: accept string (comma-separated) or string array.
        // Any other type is ignored rather than crashing on `.split`.
        let excludeArray;
        if (Array.isArray(exclude)) {
            excludeArray = exclude;
        }
        else if (typeof exclude === 'string' && exclude) {
            excludeArray = exclude.split(',').map((s) => s.trim()).filter(Boolean);
        }
        const options = {
            render: (isDegraded && !postHasActions) ? false : postShouldRender,
            wait: isDegraded ? 0 : resolvedWait,
            format: resolvedFormat,
            stream: stream === true,
            includeTags: finalIncludeTags,
            excludeTags: excludeTagsArray,
            images: images === true,
            actions: postActions,
            location: (location || languagesArray) ? {
                country: location,
                languages: languagesArray,
            } : undefined,
            budget: typeof budget === 'number' ? budget : undefined,
            question: question,
            readable: readable === true,
            stealth: isDegraded ? false : stealth === true,
            screenshot: isDegraded ? false : screenshot === true,
            maxTokens: typeof maxTokens === 'number' ? maxTokens : undefined,
            selector: selector,
            exclude: excludeArray,
            fullPage: fullPage === true,
            raw: raw === true,
            lite: lite === true,
            timeout: typeof timeout === 'number' ? timeout : undefined,
            proxies: Array.isArray(proxies) ? proxies : undefined,
            device: device,
            viewportWidth: typeof viewportWidth === 'number' ? viewportWidth : undefined,
            viewportHeight: typeof viewportHeight === 'number' ? viewportHeight : undefined,
            waitUntil: waitUntil,
            waitSelector: waitSelector,
            blockResources: Array.isArray(blockResources) ? blockResources : undefined,
        };
        if (cloaked) {
            options.cloaked = cloaked;
        }
        if (chunk) {
            options.chunk = chunk;
        }
        // Auto-budget: default to 4000 tokens for API requests when no budget specified
        // Opt-out: budget=0 explicitly disables. Lite mode disables auto-budget.
        if (options.budget === undefined && !options.lite) {
            options.budget = 4000;
            res.setHeader('X-Auto-Budget', '4000');
        }
        // Report every downgrade in a single header value; separate setHeader calls
        // would overwrite each other and leave only the last reason visible.
        const degradedReasons = [];
        if (isDegraded && render === true && !postHasActions) {
            degradedReasons.push('render=true downgraded to HTTP-only (quota exceeded)');
        }
        if (isDegraded && stealth === true) {
            degradedReasons.push('stealth=true downgraded (quota exceeded)');
        }
        if (isDegraded && screenshot === true) {
            degradedReasons.push('screenshot=true downgraded (quota exceeded)');
        }
        if (degradedReasons.length > 0) {
            res.setHeader('X-Degraded', degradedReasons.join('; '));
        }
        const shouldStream = options.stream === true;
        if (shouldStream) {
            res.setHeader('X-Stream', 'true');
            // NOTE(review): once flushHeaders() has run, Node rejects further
            // setHeader() calls, so the header writes further down look like they
            // would throw for stream requests — confirm and rework the stream path.
            if (typeof res.flushHeaders === 'function') {
                res.flushHeaders();
            }
        }
        // --- Fetch content -------------------------------------------------------
        const startTime = Date.now();
        const result = await peel(url, options);
        const elapsed = Date.now() - startTime;
        // --- BM25 Schema Template Extraction (POST, no LLM needed) ---
        if (bodySchema && typeof bodySchema === 'string' && result.content) {
            const template = getSchemaTemplate(bodySchema);
            if (template) {
                const { quickAnswer: templateQuickAnswer } = await import('../../core/quick-answer.js');
                const { smartExtractSchemaFields } = await import('../../core/schema-postprocess.js');
                const extracted = smartExtractSchemaFields(result.content, template.fields, templateQuickAnswer, {
                    pageTitle: result.title,
                    pageUrl: result.url,
                    metadata: result.metadata,
                });
                result.extracted = extracted;
            }
        }
        // --- Inline extraction (post-fetch) -------------------------------------
        let jsonData;
        let extractTokensUsed;
        if (resolvedExtract && (resolvedExtract.schema || resolvedExtract.prompt) && llmApiKey) {
            const extractResult = await extractInlineJson(result.content, {
                schema: resolvedExtract.schema,
                prompt: resolvedExtract.prompt,
                llmProvider: llmProvider,
                llmApiKey: llmApiKey.trim(),
                llmModel,
            });
            jsonData = extractResult.data;
            extractTokensUsed = extractResult.tokensUsed;
        }
        // --- Usage tracking (same as GET) ----------------------------------------
        // Both 'stealth' and 'browser' fetches are recorded as 'stealth'.
        const fetchType = (result.method === 'stealth' || result.method === 'browser') ? 'stealth' : 'basic';
        const pgStore = authStore;
        if (req.auth?.keyInfo?.accountId && typeof pgStore.pool !== 'undefined') {
            // Fire-and-forget: a logging failure must not fail the request.
            pgStore.pool.query(`INSERT INTO usage_logs
          (user_id, endpoint, url, method, processing_time_ms, status_code, ip_address, user_agent, tokens_used)
          VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, [
                req.auth.keyInfo.accountId,
                'fetch',
                url,
                fetchType,
                elapsed,
                200,
                req.ip || req.socket.remoteAddress,
                req.get('user-agent'),
                result?.tokens || null,
            ]).catch((err) => {
                console.error('Failed to log request to usage_logs:', err);
            });
        }
        if (req.auth?.keyInfo?.key && typeof pgStore.trackBurstUsage === 'function') {
            await pgStore.trackBurstUsage(req.auth.keyInfo.key);
            if (isSoftLimited && hasExtraUsage) {
                const extraResult = await pgStore.trackExtraUsage(req.auth.keyInfo.key, fetchType, url, elapsed, 200);
                if (extraResult.success) {
                    res.setHeader('X-Extra-Usage-Charged', `$${extraResult.cost.toFixed(4)}`);
                    res.setHeader('X-Extra-Usage-New-Balance', extraResult.newBalance.toFixed(2));
                }
                else {
                    res.setHeader('X-Degraded', 'Extra usage insufficient, degraded to soft limit');
                }
            }
            else if (!isSoftLimited) {
                await pgStore.trackUsage(req.auth.keyInfo.key, fetchType);
            }
        }
        // --- question → answer field (POST) ---
        // When question is provided, run quickAnswer() on the fetched content
        // and expose the result as an `answer` field in the response.
        const postAnswerResult = (question && typeof question === 'string' && result.content)
            ? quickAnswer({ question, content: result.content, url: result.url })
            : undefined;
        // --- summary field (POST) ---
        // When summary: true, the output of extractSummary() is returned in a `summary` field.
        const postSummaryText = (summaryParam === true && result.content)
            ? extractSummary(result.content)
            : undefined;
        // --- Build response ------------------------------------------------------
        // Headers kept for backward compat; also surfaced in envelope metadata.
        res.setHeader('X-Cache', 'MISS');
        res.setHeader('X-Credits-Used', '1');
        res.setHeader('X-Processing-Time', elapsed.toString());
        res.setHeader('X-Fetch-Type', fetchType);
        const responseBody = { ...result };
        if (jsonData !== undefined) {
            responseBody.json = jsonData;
        }
        if (extractTokensUsed) {
            responseBody.extractTokensUsed = extractTokensUsed;
        }
        if (postAnswerResult !== undefined) {
            responseBody.answer = postAnswerResult.answer;
        }
        if (postSummaryText !== undefined) {
            responseBody.summary = postSummaryText;
        }
        // Cache the finished body (skip extraction results — they depend on the
        // user's LLM keys). Caching the body rather than the raw peel result keeps
        // `answer` and `summary` present on later cache hits.
        if (storeFlag !== false && !postBypassCache && !resolvedExtract) {
            cache.set(postCacheKey, { result: responseBody, timestamp: Date.now() }, { ttl: postCacheTtlMs });
        }
        if (wantsEnvelope(req)) {
            successResponse(res, responseBody, {
                requestId: req.requestId,
                processingTimeMs: elapsed,
                creditsUsed: 1,
                cached: false,
                fetchType,
            });
        }
        else {
            res.json(responseBody);
        }
    }
    catch (error) {
        const err = error;
        console.error('POST fetch/scrape error:', err);
        // Headers already on the wire: a status/JSON write would throw from inside
        // this catch and surface as an unhandled rejection, so just close the response.
        if (res.headersSent) {
            if (typeof res.end === 'function') {
                res.end();
            }
            return;
        }
        const errorCode = err?.code;
        if (errorCode) {
            // Strip characters that could be interpreted as markup by a client.
            const safeMessage = String(err.message ?? '').replace(/[<>"']/g, '');
            const statusCode = errorCode === 'TIMEOUT' ? 504
                : errorCode === 'BLOCKED' ? 403
                    : errorCode === 'NETWORK' ? 502
                        : 500;
            const hints = {
                TIMEOUT: 'Try increasing timeout, or set render:true for JS-heavy sites.',
                BLOCKED: 'Site blocks automated requests. Try render:true or stealth mode.',
                NETWORK: 'Could not reach the target URL. Verify it is correct and online.',
            };
            sendPostFetchError(req, res, statusCode, {
                type: errorCode,
                message: safeMessage,
                hint: hints[errorCode] || undefined,
                docs: 'https://webpeel.dev/docs/api-reference#errors',
            });
        }
        else {
            // Unexpected error - generic message only
            sendPostFetchError(req, res, 500, {
                type: 'internal_error',
                message: 'An unexpected error occurred. If this persists, check https://webpeel.dev/status',
                docs: 'https://webpeel.dev/docs/api-reference#errors',
            });
        }
    }
}
/**
 * Send the standard failure envelope `{ success: false, error, requestId }`.
 *
 * @param {object} req - Express request; only `requestId` is read.
 * @param {object} res - Express response.
 * @param {number} statusCode - HTTP status to send.
 * @param {object} error - Error descriptor (`type`, `message`, optional `hint`, `docs`).
 */
function sendPostFetchError(req, res, statusCode, error) {
    res.status(statusCode).json({
        success: false,
        error,
        requestId: req.requestId,
    });
}
/**
 * Work out the inline-extraction request for a POST fetch.
 *
 * Uses the `extract` body field when truthy; otherwise looks for the first
 * `json` entry in the Firecrawl-compatible `formats` array and uses its
 * `schema` / `prompt` when it is an object carrying either. A string schema is
 * then resolved as a template name via `getSchemaTemplate`, or failing that
 * parsed as JSON; an unparseable string is left as-is.
 *
 * @param {*} extract - `extract` field from the request body.
 * @param {*} formats - `formats` field from the request body.
 * @returns {*} The resolved extract descriptor, or the original falsy `extract`.
 */
function resolvePostExtract(extract, formats) {
    let resolved = extract;
    if (!resolved && Array.isArray(formats)) {
        const jsonFormat = formats.find((f) => (typeof f === 'object' && f !== null && f.type === 'json') ||
            f === 'json');
        if (jsonFormat && typeof jsonFormat === 'object' && (jsonFormat.schema || jsonFormat.prompt)) {
            resolved = {
                schema: jsonFormat.schema,
                prompt: jsonFormat.prompt,
            };
        }
    }
    // Resolve schema template names (e.g. "product", "article") to field objects
    if (resolved && typeof resolved.schema === 'string') {
        const tmpl = getSchemaTemplate(resolved.schema);
        if (tmpl) {
            resolved = { ...resolved, schema: tmpl.fields };
        }
        else {
            // Try parsing as JSON string
            try {
                resolved = { ...resolved, schema: JSON.parse(resolved.schema) };
            }
            catch { /* not JSON: keep the raw string schema */ }
        }
    }
    return resolved;
}
|
|
996
|
+
router.post('/v1/fetch', handlePostFetch);
|
|
997
|
+
router.post('/v2/scrape', handlePostFetch);
|
|
998
|
+
return router;
|
|
999
|
+
}
|