spectrawl 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/browse/index.js +86 -0
- package/src/crawl.js +20 -3
- package/src/server.js +32 -3
package/package.json
CHANGED
package/src/browse/index.js
CHANGED
|
@@ -164,9 +164,18 @@ class BrowseEngine {
|
|
|
164
164
|
|
|
165
165
|
result.url = page.url()
|
|
166
166
|
result.title = await page.title()
|
|
167
|
+
result.statusCode = null // playwright doesn't expose easily, but we detect blocks below
|
|
167
168
|
result.cached = false
|
|
168
169
|
result.engine = this._engine
|
|
169
170
|
|
|
171
|
+
// Detect block pages (Cloudflare, Akamai, etc.)
|
|
172
|
+
const blockInfo = detectBlockPage(result.content, result.title, result.html, result.url)
|
|
173
|
+
if (blockInfo) {
|
|
174
|
+
result.blocked = true
|
|
175
|
+
result.blockType = blockInfo.type
|
|
176
|
+
result.blockDetail = blockInfo.detail
|
|
177
|
+
}
|
|
178
|
+
|
|
170
179
|
if (!opts.screenshot) {
|
|
171
180
|
this.cache?.set('scrape', url, { content: result.content, url: result.url, title: result.title })
|
|
172
181
|
}
|
|
@@ -288,4 +297,81 @@ class BrowseEngine {
|
|
|
288
297
|
}
|
|
289
298
|
}
|
|
290
299
|
|
|
300
|
+
/**
 * Detect block/challenge pages from CDNs and bot protection services.
 *
 * Every check is a case-insensitive match against the page text, title or
 * HTML. Checks run in the order written below and the first match wins.
 * Arguments that are not strings (including null/undefined) are treated as
 * empty instead of throwing.
 *
 * @param {string} [content] - Page text (callers pass `result.content`).
 * @param {string} [title] - Page title.
 * @param {string} [html] - Page HTML; may be absent.
 * @param {string} [url] - Page URL. Not used by any check at the moment; kept
 *   so the call signature stays stable.
 * @returns {{type: string, detail: string}|null} `{ type, detail }` if a
 *   block page was recognised, `null` if the page looks clean.
 */
function detectBlockPage(content, title, html, url) {
  // Lower-case once so every check below is case-insensitive. Anything that
  // is not a string is treated as empty rather than crashing on .toLowerCase().
  const lower = (value) => (typeof value === 'string' ? value.toLowerCase() : '')
  const text = lower(content)
  const titleLower = lower(title)
  const htmlLower = lower(html)

  // Cloudflare
  if (htmlLower.includes('cf-error-details') || htmlLower.includes('cf_chl_opt') ||
      (text.includes('attention required') && text.includes('cloudflare')) ||
      text.includes('checking if the site connection is secure') ||
      (titleLower.includes('just a moment') && htmlLower.includes('cloudflare')) ||
      (text.includes('ray id:') && text.includes('cloudflare'))) {
    return { type: 'cloudflare', detail: 'Cloudflare bot challenge or block page detected' }
  }

  // Cloudflare RFC 9457 structured error (new format)
  if (htmlLower.includes('application/problem+json') ||
      (text.includes('error 1') && text.includes('cloudflare')) ||
      (htmlLower.includes('"type":') && htmlLower.includes('cloudflare.com/errors/'))) {
    return { type: 'cloudflare-rfc9457', detail: 'Cloudflare structured error response (RFC 9457)' }
  }

  // Akamai
  if ((text.includes('access denied') && htmlLower.includes('akamai')) ||
      htmlLower.includes('akamaighost') ||
      (text.includes('reference #') && text.includes('access denied'))) {
    return { type: 'akamai', detail: 'Akamai bot detection triggered' }
  }

  // AWS WAF
  if ((text.includes('request blocked') && htmlLower.includes('aws')) ||
      htmlLower.includes('awswaf')) {
    return { type: 'aws-waf', detail: 'AWS WAF blocked the request' }
  }

  // Imperva / Incapsula
  if (htmlLower.includes('incapsula') || htmlLower.includes('imperva') ||
      (text.includes('request unsuccessful') && text.includes('incapsula'))) {
    return { type: 'imperva', detail: 'Imperva/Incapsula bot detection triggered' }
  }

  // DataDome
  // 'dd.js' has to be a whole file name: a bare substring test also matched
  // unrelated assets such as 'add.js' or 'odd.json' and reported a block.
  const ddScript = /(?:^|[^a-z0-9_-])dd\.js(?![a-z0-9])/
  if (htmlLower.includes('datadome') || ddScript.test(htmlLower)) {
    return { type: 'datadome', detail: 'DataDome bot detection triggered' }
  }

  // PerimeterX / HUMAN
  if (htmlLower.includes('perimeterx') || htmlLower.includes('px-captcha') ||
      htmlLower.includes('human security')) {
    return { type: 'perimeterx', detail: 'PerimeterX/HUMAN bot detection triggered' }
  }

  // hCaptcha challenge
  if (htmlLower.includes('hcaptcha.com') && htmlLower.includes('h-captcha')) {
    return { type: 'hcaptcha', detail: 'hCaptcha challenge page' }
  }

  // reCAPTCHA challenge (standalone, not embedded): only flagged when the
  // page has little text (< 500 chars) and the title is empty or mentions
  // 'blocked' / 'verify'.
  if (htmlLower.includes('recaptcha') && text.length < 500 &&
      (titleLower === '' || titleLower.includes('blocked') || titleLower.includes('verify'))) {
    return { type: 'recaptcha', detail: 'reCAPTCHA challenge page' }
  }

  // Generic bot detection signals: only considered on very short pages
  // (< 200 chars of text).
  if (text.length < 200 && (
    text.includes('access denied') || text.includes('403 forbidden') ||
    text.includes('bot detected') || text.includes('automated access') ||
    text.includes('please verify you are human') || text.includes('are you a robot'))) {
    return { type: 'generic', detail: 'Generic bot detection or access denied page' }
  }

  return null
}
|
|
376
|
+
|
|
291
377
|
module.exports = { BrowseEngine }
|
package/src/crawl.js
CHANGED
|
@@ -276,15 +276,32 @@ class CrawlEngine {
|
|
|
276
276
|
|
|
277
277
|
async _fetchPage(url, config, cookies) {
|
|
278
278
|
try {
|
|
279
|
-
const
|
|
279
|
+
const browseOpts = {
|
|
280
280
|
stealth: config.stealth,
|
|
281
281
|
_cookies: cookies,
|
|
282
282
|
timeout: config.timeout,
|
|
283
283
|
html: true,
|
|
284
284
|
noCache: true,
|
|
285
|
-
fastMode: true
|
|
286
|
-
}
|
|
285
|
+
fastMode: true
|
|
286
|
+
}
|
|
287
|
+
let result = await this.browseEngine.browse(url, browseOpts)
|
|
288
|
+
|
|
289
|
+
// Auto-retry with full stealth if blocked
|
|
290
|
+
if (result?.blocked && browseOpts.fastMode) {
|
|
291
|
+
console.log(`[crawl] Block detected on ${url} (${result.blockType}) — retrying with full stealth`)
|
|
292
|
+
result = await this.browseEngine.browse(url, {
|
|
293
|
+
...browseOpts,
|
|
294
|
+
fastMode: false,
|
|
295
|
+
stealth: true,
|
|
296
|
+
camoufox: true
|
|
297
|
+
})
|
|
298
|
+
}
|
|
299
|
+
|
|
287
300
|
if (result?.content) {
|
|
301
|
+
// Skip if still blocked after retry
|
|
302
|
+
if (result.blocked) {
|
|
303
|
+
throw new Error(`Blocked by ${result.blockType}: ${result.blockDetail}`)
|
|
304
|
+
}
|
|
288
305
|
const linkSource = result.html || result.content
|
|
289
306
|
return {
|
|
290
307
|
title: result.title || '',
|
package/src/server.js
CHANGED
|
@@ -159,7 +159,15 @@ const server = http.createServer(async (req, res) => {
|
|
|
159
159
|
return error(res, 404, 'Not found')
|
|
160
160
|
} catch (err) {
|
|
161
161
|
console.error('Server error:', err)
|
|
162
|
-
|
|
162
|
+
const status = err.statusCode || 500
|
|
163
|
+
const extra = {}
|
|
164
|
+
if (err.retryable) extra.retryable = true
|
|
165
|
+
if (err.suggestion) extra.suggestion = err.suggestion
|
|
166
|
+
if (err.blocked) {
|
|
167
|
+
extra.retryable = true
|
|
168
|
+
extra.suggestion = 'Retry with stealth:true or use Camoufox engine'
|
|
169
|
+
}
|
|
170
|
+
return error(res, status, err.message, extra)
|
|
163
171
|
}
|
|
164
172
|
})
|
|
165
173
|
|
|
@@ -168,8 +176,29 @@ function json(res, data, status = 200) {
|
|
|
168
176
|
res.end(JSON.stringify(data))
|
|
169
177
|
}
|
|
170
178
|
|
|
171
|
-
|
|
172
|
-
|
|
179
|
+
/**
 * Send an RFC 9457-style structured error response.
 * The body is machine-readable so AI agents consuming the API can act on it.
 *
 * @param {object} res - Response object, handed straight to `json()`.
 * @param {number} status - HTTP status code; also selects the error slug.
 * @param {string} message - Text placed in the body's `detail` field.
 * @param {object} [extra] - Additional members merged into the body last, so
 *   they override the standard fields on a key clash.
 */
function error(res, status, message, extra = {}) {
  // Status code -> slug used in both the `type` URL and the `title`.
  const slugByStatus = {
    400: 'bad-request',
    401: 'unauthorized',
    403: 'forbidden',
    404: 'not-found',
    429: 'rate-limited',
    500: 'internal-error',
    502: 'upstream-error',
    503: 'service-unavailable'
  }
  const slug = slugByStatus[status]

  const payload = Object.assign(
    {
      type: `https://spectrawl.dev/errors/${slug || 'unknown'}`,
      status,
      // Human-readable title: the slug with dashes turned into spaces.
      title: slug ? slug.replace(/-/g, ' ') : 'error',
      detail: message
    },
    extra
  )

  json(res, payload, status)
}
|
|
174
203
|
|
|
175
204
|
function readBody(req) {
|