webpeel 0.20.2 → 0.20.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +384 -0
- package/dist/server/auth-store.d.ts +27 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/email-service.d.ts +21 -0
- package/dist/server/email-service.js +79 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/auth.d.ts +28 -0
- package/dist/server/middleware/auth.js +221 -0
- package/dist/server/middleware/rate-limit.d.ts +24 -0
- package/dist/server/middleware/rate-limit.js +167 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +186 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +132 -0
- package/dist/server/pg-auth-store.js +472 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/premium/domain-intel.d.ts +16 -0
- package/dist/server/premium/domain-intel.js +133 -0
- package/dist/server/premium/index.d.ts +17 -0
- package/dist/server/premium/index.js +35 -0
- package/dist/server/premium/swr-cache.d.ts +14 -0
- package/dist/server/premium/swr-cache.js +34 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +74 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +229 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +8 -0
- package/dist/server/routes/extract.js +235 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +999 -0
- package/dist/server/routes/health.d.ts +7 -0
- package/dist/server/routes/health.js +19 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +573 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +141 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +816 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +303 -0
- package/dist/server/routes/session.d.ts +15 -0
- package/dist/server/routes/session.js +397 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +294 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1671 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +13 -0
- package/dist/server/sentry.js +38 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/package.json +2 -1
|
@@ -0,0 +1,652 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Firecrawl API Compatibility Layer
|
|
3
|
+
*
|
|
4
|
+
* Drop-in replacement for Firecrawl's API - users can switch by ONLY changing the base URL.
|
|
5
|
+
* This is our killer acquisition feature.
|
|
6
|
+
*
|
|
7
|
+
* NOTE: Error responses in this file intentionally use Firecrawl's format:
|
|
8
|
+
* { success: false, error: "Human-readable message" }
|
|
9
|
+
* This is required for Firecrawl drop-in compatibility and differs from the
|
|
10
|
+
* standard WebPeel API error format: { error: "error_code", message: "description" }.
|
|
11
|
+
* Do NOT change this format — it would break Firecrawl-compatible integrations.
|
|
12
|
+
*
|
|
13
|
+
* Implements Firecrawl endpoints:
|
|
14
|
+
* - POST /v1/scrape
|
|
15
|
+
* - POST /v2/scrape (v2 with formats: ["screenshot"] support)
|
|
16
|
+
* - POST /v1/crawl
|
|
17
|
+
* - GET /v1/crawl/:id
|
|
18
|
+
* - POST /v1/search
|
|
19
|
+
* - POST /v1/map
|
|
20
|
+
*/
|
|
21
|
+
import { Router } from 'express';
|
|
22
|
+
import { peel } from '../../index.js';
|
|
23
|
+
import { crawl } from '../../core/crawler.js';
|
|
24
|
+
import { mapDomain } from '../../core/map.js';
|
|
25
|
+
import { takeScreenshot } from '../../core/screenshot.js';
|
|
26
|
+
import { normalizeActions } from '../../core/actions.js';
|
|
27
|
+
import { extractInlineJson } from '../../core/extract-inline.js';
|
|
28
|
+
import { validateUrlForSSRF, SSRFError } from '../middleware/url-validator.js';
|
|
29
|
+
// LLM providers accepted for bring-your-own-key inline extraction in /v1/scrape.
const VALID_LLM_PROVIDERS = ['openai', 'anthropic', 'google'];
|
|
30
|
+
/**
 * Translate a Firecrawl-style `actions` payload into our PageAction format.
 *
 * The conversion itself is handed to the shared normalizeActions helper so
 * every API surface interprets actions the same way.
 *
 * @param {*} actions - Raw `actions` value taken from the request body.
 * @returns Whatever normalizeActions returns for an array input; `undefined`
 *   when `actions` is absent or is not an array.
 */
function mapFirecrawlActions(actions) {
    // Array.isArray is false for every falsy value, so one check covers both guards.
    return Array.isArray(actions) ? normalizeActions(actions) : undefined;
}
|
|
40
|
+
export function createCompatRouter(jobQueue) {
|
|
41
|
+
const router = Router();
|
|
42
|
+
/**
 * POST /v1/scrape - Firecrawl's main scrape endpoint.
 *
 * Translates the Firecrawl request body into options for peel() and wraps the
 * result in Firecrawl's `{ success, data }` envelope.
 *
 * Body fields read here: url, formats, onlyMainContent, includeTags,
 * excludeTags, waitFor, timeout, actions, headers, location, stream, plus the
 * BYOK extraction fields extract, llmProvider, llmApiKey and llmModel.
 *
 * Responses:
 * - 400 when `url` is missing or not a string, when `formats` is not an array,
 *   or when validateUrlForSSRF rejects the URL.
 * - 500 with a generic message for any other failure.
 */
router.post('/v1/scrape', async (req, res) => {
    // With stream=true the headers are flushed before peel() runs. Express'
    // res.json() sets headers and therefore throws once they have been sent,
    // so fall back to writing the serialized payload directly in that case.
    const reply = (status, payload) => {
        if (res.headersSent) {
            res.end(JSON.stringify(payload));
            return;
        }
        res.status(status).json(payload);
    };
    try {
        const {
            url,
            formats = ['markdown'],
            onlyMainContent = true, // Firecrawl defaults to true
            includeTags, excludeTags, waitFor, timeout, actions, headers, location,
            // Inline extraction (BYOK)
            extract: extractParam, llmProvider, llmApiKey, llmModel,
            stream,
        } = req.body;
        // Validate URL
        if (!url || typeof url !== 'string') {
            reply(400, {
                success: false,
                error: 'Missing or invalid "url" parameter',
            });
            return;
        }
        // `formats` is used with .includes() and .find() below; reject anything
        // that is not an array up front instead of failing with a 500 after
        // the page has already been fetched.
        if (!Array.isArray(formats)) {
            reply(400, {
                success: false,
                error: 'Invalid "formats" parameter: expected an array',
            });
            return;
        }
        // SECURITY: Validate URL to prevent SSRF attacks
        try {
            validateUrlForSSRF(url);
        }
        catch (error) {
            if (error instanceof SSRFError) {
                reply(400, {
                    success: false,
                    error: 'blocked_url',
                    message: 'Cannot fetch localhost, private networks, or non-HTTP URLs',
                });
                return;
            }
            throw error;
        }
        // Rendering is requested whenever the caller supplies waitFor or actions.
        const needsRender = waitFor !== undefined || actions !== undefined;
        // Map Firecrawl parameters to our PeelOptions
        // onlyMainContent=true (default) → raw=false (use smart extraction)
        // onlyMainContent=false → raw=true (return everything)
        const options = {
            render: needsRender,
            wait: waitFor,
            timeout: timeout || 30000,
            stream: stream === true,
            includeTags: Array.isArray(includeTags) ? includeTags : undefined,
            excludeTags: Array.isArray(excludeTags) ? excludeTags : undefined,
            raw: onlyMainContent === false,
            actions: mapFirecrawlActions(actions),
            headers,
            screenshot: formats.includes('screenshot'),
            images: formats.includes('images'),
            format: 'markdown', // Always use markdown as base
        };
        // If location is provided, forward only the two fields we map.
        if (location) {
            options.location = {
                country: location.country,
                languages: location.languages,
            };
        }
        if (options.stream) {
            res.setHeader('X-Stream', 'true');
            if (typeof res.flushHeaders === 'function') {
                res.flushHeaders();
            }
        }
        // Execute peel
        const result = await peel(url, options);
        // Build Firecrawl-compatible response. Explicit keys come after the
        // spread so they take precedence over same-named metadata fields.
        const data = {
            markdown: result.content,
            metadata: {
                ...result.metadata,
                title: result.title,
                description: result.metadata?.description || '',
                language: result.metadata?.language || 'en',
                sourceURL: result.url,
                statusCode: 200,
            },
        };
        // Add optional formats
        if (formats.includes('html')) {
            // Re-fetch with HTML format if requested
            const htmlResult = await peel(url, { ...options, format: 'html' });
            data.html = htmlResult.content;
        }
        if (formats.includes('rawHtml')) {
            const rawResult = await peel(url, { ...options, format: 'html', raw: true });
            data.rawHtml = rawResult.content;
        }
        if (formats.includes('links')) {
            data.links = result.links;
        }
        if (formats.includes('screenshot') && result.screenshot) {
            data.screenshot = `data:image/png;base64,${result.screenshot}`;
        }
        if (formats.includes('images') && result.images) {
            data.images = result.images;
        }
        // --- Inline JSON extraction via LLM (BYOK) ---
        // Resolve extract from: (1) top-level extract param, (2) formats array object
        let resolvedExtract;
        if (extractParam && typeof extractParam === 'object' && (extractParam.schema || extractParam.prompt)) {
            resolvedExtract = extractParam;
        }
        if (!resolvedExtract) {
            const jsonFormatObj = formats.find((f) => typeof f === 'object' && f !== null && f.type === 'json' && (f.schema || f.prompt));
            if (jsonFormatObj) {
                resolvedExtract = { schema: jsonFormatObj.schema, prompt: jsonFormatObj.prompt };
            }
        }
        // Only string keys can be trimmed; a non-string or blank key disables
        // the LLM path rather than crashing the request.
        const apiKey = typeof llmApiKey === 'string' ? llmApiKey.trim() : '';
        if (resolvedExtract && apiKey && llmProvider && VALID_LLM_PROVIDERS.includes(llmProvider)) {
            const extractResult = await extractInlineJson(result.content, {
                schema: resolvedExtract.schema,
                prompt: resolvedExtract.prompt,
                llmProvider: llmProvider,
                llmApiKey: apiKey,
                llmModel,
            });
            data.json = extractResult.data;
            data.extractTokensUsed = extractResult.tokensUsed;
        }
        else if (formats.includes('json')) {
            // Fallback: return structured metadata as JSON (no LLM)
            data.json = result.extracted || result.metadata;
        }
        if (formats.includes('branding')) {
            data.branding = result.branding;
        }
        if (formats.includes('summary')) {
            data.summary = result.summary;
        }
        reply(200, {
            success: true,
            data,
        });
    }
    catch (error) {
        console.error('Firecrawl /v1/scrape error:', error);
        reply(500, {
            success: false,
            error: 'An unexpected error occurred. Please try again.',
        });
    }
});
|
|
187
|
+
/**
 * POST /v1/crawl - Firecrawl's crawl endpoint (async).
 *
 * Validates the URL, creates a job on the injected jobQueue, starts crawl()
 * in the background and immediately responds with `{ success, id }`.
 * Progress and the final result are written back through jobQueue.updateJob.
 *
 * Body fields read here: url, limit, maxDepth, includePaths, excludePaths,
 * scrapeOptions, webhook.
 *
 * Responses:
 * - 400 when `url` is missing, not a string, unparseable, or rejected by
 *   validateUrlForSSRF.
 * - 500 with a generic message when job creation or validation fails
 *   unexpectedly.
 */
router.post('/v1/crawl', async (req, res) => {
    try {
        const { url, limit = 100, maxDepth = 3, includePaths = [], excludePaths = [], scrapeOptions = {}, webhook, } = req.body;
        // Validate URL
        if (!url || typeof url !== 'string') {
            res.status(400).json({
                success: false,
                error: 'Missing or invalid "url" parameter',
            });
            return;
        }
        try {
            new URL(url);
        }
        catch {
            res.status(400).json({
                success: false,
                error: 'Invalid URL format',
            });
            return;
        }
        // SECURITY: Validate URL to prevent SSRF attacks
        try {
            validateUrlForSSRF(url);
        }
        catch (error) {
            if (error instanceof SSRFError) {
                res.status(400).json({
                    success: false,
                    error: 'blocked_url',
                    message: 'Cannot fetch localhost, private networks, or non-HTTP URLs',
                });
                return;
            }
            throw error;
        }
        // Create job (with owner for authorization)
        const ownerId = req.auth?.keyInfo?.accountId;
        const job = await jobQueue.createJob('crawl', webhook, ownerId);
        // Best-effort job update: nothing awaits the background task, so a
        // failed update must be logged here rather than left as an unhandled
        // rejection. updateJob is still invoked synchronously.
        const safeUpdate = async (patch) => {
            try {
                await jobQueue.updateJob(job.id, patch);
            }
            catch (updateError) {
                console.error('Firecrawl /v1/crawl job update failed:', updateError);
            }
        };
        // Start crawl in background
        setImmediate(async () => {
            try {
                await jobQueue.updateJob(job.id, { status: 'processing' });
                // Build crawl options. Caller-supplied scrapeOptions are spread
                // FIRST so they can never override the server-controlled fields
                // below (page/depth limits, the authenticated tier, and the
                // progress callback).
                const crawlOptions = {
                    ...(scrapeOptions && typeof scrapeOptions === 'object' ? scrapeOptions : {}),
                    maxPages: limit,
                    maxDepth,
                    tier: req.auth?.tier,
                    onProgress: (progress) => {
                        const total = progress.crawled + progress.queued;
                        void safeUpdate({
                            total,
                            completed: progress.crawled,
                            creditsUsed: progress.crawled,
                        });
                    },
                };
                // Add path filters if provided (non-array values are ignored)
                if (Array.isArray(includePaths) && includePaths.length > 0) {
                    crawlOptions.includePatterns = includePaths;
                }
                if (Array.isArray(excludePaths) && excludePaths.length > 0) {
                    crawlOptions.excludePatterns = excludePaths;
                }
                // Run crawl
                const results = await crawl(url, crawlOptions);
                // Map results to Firecrawl format
                const firecrawlResults = results.map((r) => ({
                    url: r.url,
                    markdown: r.markdown,
                    metadata: {
                        title: r.title,
                        description: '',
                        sourceURL: r.url,
                        statusCode: 200,
                    },
                    links: r.links,
                }));
                // Update job with results
                await jobQueue.updateJob(job.id, {
                    status: 'completed',
                    data: firecrawlResults,
                    total: results.length,
                    completed: results.length,
                    creditsUsed: results.length,
                });
            }
            catch (error) {
                await safeUpdate({
                    status: 'failed',
                    error: error?.message || 'Unknown error',
                });
            }
        });
        // Return job ID immediately (Firecrawl format)
        res.json({
            success: true,
            id: job.id,
        });
    }
    catch (error) {
        console.error('Firecrawl /v1/crawl error:', error);
        res.status(500).json({
            success: false,
            error: 'An unexpected error occurred. Please try again.',
        });
    }
});
|
|
302
|
+
/**
 * GET /v1/crawl/:id - Get crawl job status (Firecrawl format).
 *
 * Looks the job up on the injected jobQueue and reports its progress.
 * Our internal 'processing' status is reported as Firecrawl's 'scraping';
 * every other status is passed through unchanged.
 *
 * Responses:
 * - 404 when the job does not exist, or when it has an owner and the
 *   requester is not that owner. The two cases return the same body so the
 *   existence of another account's job is not revealed.
 * - 500 with a generic message on unexpected failure.
 */
router.get('/v1/crawl/:id', async (req, res) => {
    try {
        const id = req.params.id;
        const job = await jobQueue.getJob(id);
        // SECURITY: Verify the requester owns this job. A job that has an owner
        // is visible only to that owner. A requester with no accountId must
        // not be able to read owned jobs, so the comparison is made even when
        // requestOwnerId is undefined. Jobs created without an owner remain
        // readable by anyone who knows the id.
        const requestOwnerId = req.auth?.keyInfo?.accountId;
        const isForeignJob = Boolean(job?.ownerId) && job.ownerId !== requestOwnerId;
        if (!job || isForeignJob) {
            res.status(404).json({
                success: false,
                error: 'Job not found',
            });
            return;
        }
        // Map our job status to Firecrawl's status format
        const firecrawlStatus = job.status === 'processing' ? 'scraping' : job.status;
        res.json({
            success: true,
            status: firecrawlStatus,
            completed: job.completed || 0,
            total: job.total || 0,
            creditsUsed: job.creditsUsed || 0,
            expiresAt: job.expiresAt,
            data: job.data || [],
        });
    }
    catch (error) {
        console.error('Firecrawl GET /v1/crawl/:id error:', error);
        res.status(500).json({
            success: false,
            error: 'An unexpected error occurred. Please try again.',
        });
    }
});
|
|
345
|
+
/**
 * POST /v1/search - Firecrawl's search endpoint.
 *
 * Fetches DuckDuckGo's HTML results page for `query`, extracts up to `limit`
 * results, then scrapes every result with peel() and returns them in
 * Firecrawl's `{ success, data }` envelope. A result whose scrape fails is
 * still returned, with empty markdown and the error message in its metadata.
 *
 * Body fields read here: query, limit (default 5), scrapeOptions (spread over
 * the default peel options, so the caller may override format and timeout).
 *
 * Responses:
 * - 400 when `query` is missing or not a string.
 * - 500 with a generic message when the search request itself fails.
 */
router.post('/v1/search', async (req, res) => {
    try {
        const { query, limit = 5, scrapeOptions = {}, } = req.body;
        // Validate query
        if (!query || typeof query !== 'string') {
            res.status(400).json({
                success: false,
                error: 'Missing or invalid "query" parameter',
            });
            return;
        }
        // Use our search route logic (DuckDuckGo HTML scraping)
        const { fetch: undiciFetch } = await import('undici');
        const { load } = await import('cheerio');
        const searchUrl = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
        const response = await undiciFetch(searchUrl, {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            },
        });
        if (!response.ok) {
            throw new Error(`Search failed: HTTP ${response.status}`);
        }
        const html = await response.text();
        const $ = load(html);
        const results = [];
        $('.result').each((_i, elem) => {
            // Returning false stops cheerio's iteration once enough results are collected.
            if (results.length >= limit)
                return false;
            const $result = $(elem);
            const title = $result.find('.result__title').text().trim();
            const rawUrl = $result.find('.result__a').attr('href') || '';
            const snippet = $result.find('.result__snippet').text().trim();
            if (!title || !rawUrl)
                return;
            // Extract actual URL from DuckDuckGo redirect
            let url = rawUrl;
            try {
                const ddgUrl = new URL(rawUrl, 'https://duckduckgo.com');
                // searchParams.get() already returns the percent-decoded value;
                // decoding it a second time would corrupt targets containing
                // encoded characters and throw on a literal '%'.
                const uddg = ddgUrl.searchParams.get('uddg');
                if (uddg) {
                    url = uddg;
                }
            }
            catch (e) {
                if (process.env.DEBUG)
                    console.debug('[webpeel]', 'ddg url parse failed:', e instanceof Error ? e.message : e);
            }
            // Validate URL
            try {
                const parsed = new URL(url);
                if (!['http:', 'https:'].includes(parsed.protocol)) {
                    return;
                }
                url = parsed.href;
            }
            catch {
                return;
            }
            // SECURITY: result URLs come from a third-party page and are passed
            // to peel() below, so apply the same SSRF check as the other routes
            // and silently drop anything it blocks.
            try {
                validateUrlForSSRF(url);
            }
            catch (error) {
                if (error instanceof SSRFError) {
                    return;
                }
                throw error;
            }
            results.push({ title, url, snippet });
        });
        // Scrape every result; a failed scrape degrades to the basic search hit.
        const firecrawlResults = await Promise.all(results.map(async (result) => {
            try {
                // Scrape the URL with provided options
                const peelResult = await peel(result.url, {
                    format: 'markdown',
                    timeout: 10000,
                    ...scrapeOptions,
                });
                return {
                    url: result.url,
                    markdown: peelResult.content,
                    metadata: {
                        title: peelResult.title || result.title,
                        description: result.snippet,
                        sourceURL: result.url,
                        statusCode: 200,
                        ...peelResult.metadata,
                    },
                };
            }
            catch (error) {
                // Return basic result if scraping fails
                return {
                    url: result.url,
                    markdown: '',
                    metadata: {
                        title: result.title,
                        description: result.snippet,
                        sourceURL: result.url,
                        error: error.message,
                    },
                };
            }
        }));
        res.json({
            success: true,
            data: firecrawlResults,
        });
    }
    catch (error) {
        console.error('Firecrawl /v1/search error:', error);
        res.status(500).json({
            success: false,
            error: 'An unexpected error occurred. Please try again.',
        });
    }
});
|
|
459
|
+
/**
 * POST /v1/map - Firecrawl's map endpoint.
 *
 * Runs mapDomain() for the requested URL and returns the discovered URLs as
 * `{ success: true, links }`.
 *
 * Body fields read here: url, limit (default 5000, forwarded as maxUrls),
 * search (forwarded unchanged).
 *
 * Responses:
 * - 400 when `url` is missing, not a string, unparseable, or rejected by
 *   validateUrlForSSRF.
 * - 500 with a generic message on any other failure.
 */
router.post('/v1/map', async (req, res) => {
    // Shared shape for every 400 response produced by this handler.
    const reject = (error, extra = {}) => {
        res.status(400).json({ success: false, error, ...extra });
    };
    try {
        const payload = req.body;
        const target = payload.url;
        const maxUrls = payload.limit === undefined ? 5000 : payload.limit;
        const { search } = payload;
        // Validate URL
        if (typeof target !== 'string' || target === '') {
            reject('Missing or invalid "url" parameter');
            return;
        }
        let parseable = true;
        try {
            new URL(target);
        }
        catch {
            parseable = false;
        }
        if (!parseable) {
            reject('Invalid URL format');
            return;
        }
        // SECURITY: Validate URL to prevent SSRF attacks
        try {
            validateUrlForSSRF(target);
        }
        catch (err) {
            // Anything other than an SSRF rejection is handled by the outer catch.
            if (!(err instanceof SSRFError)) {
                throw err;
            }
            reject('blocked_url', {
                message: 'Cannot fetch localhost, private networks, or non-HTTP URLs',
            });
            return;
        }
        // Run mapDomain
        const { urls } = await mapDomain(target, { maxUrls, search });
        res.json({ success: true, links: urls });
    }
    catch (err) {
        console.error('Firecrawl /v1/map error:', err);
        res.status(500).json({
            success: false,
            error: 'An unexpected error occurred. Please try again.',
        });
    }
});
|
|
518
|
+
/**
 * POST /v2/scrape - Firecrawl v2-compatible scrape with screenshot support.
 *
 * When `formats` contains exactly one entry and it is "screenshot" or
 * "screenshot@fullPage", the request is served by takeScreenshot() and only
 * the screenshot plus minimal metadata is returned. Every other combination
 * goes through peel(), like /v1/scrape.
 *
 * Body fields read here: url, formats, onlyMainContent, includeTags,
 * excludeTags, waitFor, timeout, actions, headers, location, stream, plus the
 * screenshot options fullPage, width, height, screenshotFormat and quality.
 *
 * Responses:
 * - 400 when `url` is missing or not a string, when `formats` is not an array,
 *   or when validateUrlForSSRF rejects the URL.
 * - 500 with a generic message for any other failure.
 */
router.post('/v2/scrape', async (req, res) => {
    // With stream=true the headers are flushed before peel() runs. Express'
    // res.json() sets headers and therefore throws once they have been sent,
    // so fall back to writing the serialized payload directly in that case.
    const reply = (status, payload) => {
        if (res.headersSent) {
            res.end(JSON.stringify(payload));
            return;
        }
        res.status(status).json(payload);
    };
    try {
        const {
            url,
            formats = ['markdown'],
            onlyMainContent = true,
            includeTags, excludeTags, waitFor, timeout, actions, headers, location,
            // Screenshot-specific v2 options
            fullPage, width, height, screenshotFormat, quality,
            stream,
        } = req.body;
        // Validate URL
        if (!url || typeof url !== 'string') {
            reply(400, {
                success: false,
                error: 'Missing or invalid "url" parameter',
            });
            return;
        }
        // `formats` drives all branching below (.includes, .length, [0]);
        // reject non-arrays up front instead of misreading a string.
        if (!Array.isArray(formats)) {
            reply(400, {
                success: false,
                error: 'Invalid "formats" parameter: expected an array',
            });
            return;
        }
        // SECURITY: Validate URL to prevent SSRF attacks
        try {
            validateUrlForSSRF(url);
        }
        catch (error) {
            if (error instanceof SSRFError) {
                reply(400, {
                    success: false,
                    error: 'blocked_url',
                    message: 'Cannot fetch localhost, private networks, or non-HTTP URLs',
                });
                return;
            }
            throw error;
        }
        const wantsFullPage = fullPage === true || formats.includes('screenshot@fullPage');
        const wantsScreenshot = formats.includes('screenshot') || formats.includes('screenshot@fullPage');
        // If screenshot-only request, use the dedicated screenshot function
        if (wantsScreenshot && formats.length === 1) {
            const result = await takeScreenshot(url, {
                fullPage: wantsFullPage,
                width: typeof width === 'number' ? width : undefined,
                height: typeof height === 'number' ? height : undefined,
                format: screenshotFormat || 'png',
                quality: typeof quality === 'number' ? quality : undefined,
                waitFor: typeof waitFor === 'number' ? waitFor : undefined,
                timeout: typeof timeout === 'number' ? timeout : 30000,
                actions: mapFirecrawlActions(actions),
                headers,
            });
            reply(200, {
                success: true,
                data: {
                    screenshot: `data:${result.contentType};base64,${result.screenshot}`,
                    metadata: {
                        sourceURL: result.url,
                        statusCode: 200,
                        format: result.format,
                    },
                },
            });
            return;
        }
        // Otherwise, fall through to peel() like v1/scrape. A screenshot
        // always requires rendering.
        const needsRender = waitFor !== undefined || actions !== undefined || wantsScreenshot;
        const options = {
            render: needsRender,
            wait: waitFor,
            timeout: timeout || 30000,
            stream: stream === true,
            includeTags: Array.isArray(includeTags) ? includeTags : undefined,
            excludeTags: Array.isArray(excludeTags) ? excludeTags : undefined,
            raw: onlyMainContent === false,
            actions: mapFirecrawlActions(actions),
            headers,
            screenshot: wantsScreenshot,
            // Honour "screenshot@fullPage" here too, not only in the
            // screenshot-only branch above.
            screenshotFullPage: wantsFullPage,
            images: formats.includes('images'),
            format: 'markdown',
        };
        if (location) {
            options.location = {
                country: location.country,
                languages: location.languages,
            };
        }
        if (options.stream) {
            res.setHeader('X-Stream', 'true');
            if (typeof res.flushHeaders === 'function') {
                res.flushHeaders();
            }
        }
        const result = await peel(url, options);
        // The metadata spread comes last, so fields present in
        // result.metadata override the defaults listed before it.
        const data = {
            markdown: result.content,
            metadata: {
                title: result.title,
                // Null-safe: peel() may return no metadata object.
                description: result.metadata?.description || '',
                language: 'en',
                sourceURL: result.url,
                statusCode: 200,
                ...result.metadata,
            },
        };
        if (formats.includes('html')) {
            const htmlResult = await peel(url, { ...options, format: 'html' });
            data.html = htmlResult.content;
        }
        if (formats.includes('rawHtml')) {
            const rawResult = await peel(url, { ...options, format: 'html', raw: true });
            data.rawHtml = rawResult.content;
        }
        if (formats.includes('links')) {
            data.links = result.links;
        }
        if (wantsScreenshot && result.screenshot) {
            data.screenshot = `data:image/png;base64,${result.screenshot}`;
        }
        if (formats.includes('images') && result.images) {
            data.images = result.images;
        }
        reply(200, {
            success: true,
            data,
        });
    }
    catch (error) {
        console.error('Firecrawl /v2/scrape error:', error);
        reply(500, {
            success: false,
            error: 'An unexpected error occurred. Please try again.',
        });
    }
});
|
|
651
|
+
return router;
|
|
652
|
+
}
|