spectrawl 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/browse/index.js +101 -6
- package/src/crawl.js +25 -5
- package/src/server.js +32 -3
package/package.json
CHANGED
package/src/browse/index.js
CHANGED
|
@@ -127,12 +127,21 @@ class BrowseEngine {
|
|
|
127
127
|
|
|
128
128
|
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
|
|
129
129
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
130
|
+
if (opts.fastMode) {
|
|
131
|
+
// Crawl mode: minimal delays, just enough for lazy-load triggers
|
|
132
|
+
await page.waitForTimeout(400)
|
|
133
|
+
await page.evaluate(() => {
|
|
134
|
+
window.scrollBy({ top: 500, behavior: 'instant' })
|
|
135
|
+
})
|
|
136
|
+
await page.waitForTimeout(200)
|
|
137
|
+
} else {
|
|
138
|
+
// Normal browse: full human-like delays
|
|
139
|
+
await page.waitForTimeout(800 + Math.random() * 1500)
|
|
140
|
+
await page.evaluate(() => {
|
|
141
|
+
window.scrollBy({ top: Math.floor(Math.random() * 400) + 100, behavior: 'smooth' })
|
|
142
|
+
})
|
|
143
|
+
await page.waitForTimeout(300 + Math.random() * 700)
|
|
144
|
+
}
|
|
136
145
|
|
|
137
146
|
const result = {}
|
|
138
147
|
|
|
@@ -155,9 +164,18 @@ class BrowseEngine {
|
|
|
155
164
|
|
|
156
165
|
result.url = page.url()
|
|
157
166
|
result.title = await page.title()
|
|
167
|
+
result.statusCode = null // playwright doesn't expose easily, but we detect blocks below
|
|
158
168
|
result.cached = false
|
|
159
169
|
result.engine = this._engine
|
|
160
170
|
|
|
171
|
+
// Detect block pages (Cloudflare, Akamai, etc.)
|
|
172
|
+
const blockInfo = detectBlockPage(result.content, result.title, result.html, result.url)
|
|
173
|
+
if (blockInfo) {
|
|
174
|
+
result.blocked = true
|
|
175
|
+
result.blockType = blockInfo.type
|
|
176
|
+
result.blockDetail = blockInfo.detail
|
|
177
|
+
}
|
|
178
|
+
|
|
161
179
|
if (!opts.screenshot) {
|
|
162
180
|
this.cache?.set('scrape', url, { content: result.content, url: result.url, title: result.title })
|
|
163
181
|
}
|
|
@@ -279,4 +297,81 @@ class BrowseEngine {
|
|
|
279
297
|
}
|
|
280
298
|
}
|
|
281
299
|
|
|
300
|
+
/**
 * Detect block/challenge pages from CDNs and bot protection services.
 *
 * The checks run in a fixed order (Cloudflare, Cloudflare RFC 9457, Akamai,
 * AWS WAF, Imperva/Incapsula, DataDome, PerimeterX/HUMAN, hCaptcha, reCAPTCHA,
 * generic) and the first one that matches wins. All matching is
 * case-insensitive substring matching unless noted otherwise.
 *
 * @param {string} [content] - Extracted page text.
 * @param {string} [title] - Page title.
 * @param {string} [html] - Raw page HTML; may be absent, in which case only
 *   the text/title based signals can fire.
 * @param {string} [url] - Currently unused; kept so existing callers that pass
 *   four arguments keep working.
 * @returns {{type: string, detail: string}|null} `{ type, detail }` if blocked,
 *   `null` if clean.
 */
function detectBlockPage(content, title, html, url) {
  // Missing/falsy values become ''; truthy non-strings (e.g. a Buffer) are
  // stringified instead of throwing on `.toLowerCase()`.
  const toLower = (value) => {
    if (typeof value === 'string') return value.toLowerCase()
    return value ? String(value).toLowerCase() : ''
  }

  const text = toLower(content)
  const titleLower = toLower(title)
  const htmlLower = toLower(html)

  // Cloudflare
  if (htmlLower.includes('cf-error-details') ||
      htmlLower.includes('cf_chl_opt') ||
      (text.includes('attention required') && text.includes('cloudflare')) ||
      text.includes('checking if the site connection is secure') ||
      (titleLower.includes('just a moment') && htmlLower.includes('cloudflare')) ||
      (text.includes('ray id:') && text.includes('cloudflare'))) {
    return { type: 'cloudflare', detail: 'Cloudflare bot challenge or block page detected' }
  }

  // Cloudflare RFC 9457 structured error (new format)
  if (htmlLower.includes('application/problem+json') ||
      (text.includes('error 1') && text.includes('cloudflare')) ||
      (htmlLower.includes('"type":') && htmlLower.includes('cloudflare.com/errors/'))) {
    return { type: 'cloudflare-rfc9457', detail: 'Cloudflare structured error response (RFC 9457)' }
  }

  // Akamai
  if ((text.includes('access denied') && htmlLower.includes('akamai')) ||
      htmlLower.includes('akamaighost') ||
      (text.includes('reference #') && text.includes('access denied'))) {
    return { type: 'akamai', detail: 'Akamai bot detection triggered' }
  }

  // AWS WAF
  // NOTE: 'aws' is a plain substring test, so it also matches inside longer
  // words; it only counts together with the "request blocked" text.
  if ((text.includes('request blocked') && htmlLower.includes('aws')) ||
      htmlLower.includes('awswaf')) {
    return { type: 'aws-waf', detail: 'AWS WAF blocked the request' }
  }

  // Imperva / Incapsula
  if (htmlLower.includes('incapsula') ||
      htmlLower.includes('imperva') ||
      (text.includes('request unsuccessful') && text.includes('incapsula'))) {
    return { type: 'imperva', detail: 'Imperva/Incapsula bot detection triggered' }
  }

  // DataDome
  // `dd.js` must be a standalone file name: a bare substring test would also
  // flag unrelated scripts such as `add.js` or `odd.js` as a block page.
  const hasDataDomeScript = /(?:^|[^a-z0-9_-])dd\.js/.test(htmlLower)
  if (htmlLower.includes('datadome') || hasDataDomeScript) {
    return { type: 'datadome', detail: 'DataDome bot detection triggered' }
  }

  // PerimeterX / HUMAN
  if (htmlLower.includes('perimeterx') ||
      htmlLower.includes('px-captcha') ||
      htmlLower.includes('human security')) {
    return { type: 'perimeterx', detail: 'PerimeterX/HUMAN bot detection triggered' }
  }

  // hCaptcha challenge
  if (htmlLower.includes('hcaptcha.com') && htmlLower.includes('h-captcha')) {
    return { type: 'hcaptcha', detail: 'hCaptcha challenge page' }
  }

  // reCAPTCHA challenge (standalone, not embedded): only short pages whose
  // title is empty or looks like a verification/blocked notice.
  const titleLooksLikeChallenge =
    titleLower === '' || titleLower.includes('blocked') || titleLower.includes('verify')
  if (htmlLower.includes('recaptcha') && text.length < 500 && titleLooksLikeChallenge) {
    return { type: 'recaptcha', detail: 'reCAPTCHA challenge page' }
  }

  // Generic bot detection signals: restricted to very short pages so that
  // ordinary articles mentioning these phrases are not flagged.
  if (text.length < 200 && (
    text.includes('access denied') || text.includes('403 forbidden') ||
    text.includes('bot detected') || text.includes('automated access') ||
    text.includes('please verify you are human') || text.includes('are you a robot'))) {
    return { type: 'generic', detail: 'Generic bot detection or access denied page' }
  }

  return null
}
|
|
376
|
+
|
|
282
377
|
module.exports = { BrowseEngine }
|
package/src/crawl.js
CHANGED
|
@@ -260,8 +260,10 @@ class CrawlEngine {
|
|
|
260
260
|
const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
|
|
261
261
|
const freeMB = Math.floor(os.freemem() / 1024 / 1024)
|
|
262
262
|
const concurrency = detectConcurrency()
|
|
263
|
-
//
|
|
264
|
-
|
|
263
|
+
// Realistic: ~0.8s per page with fast mode, limited by shared browser pipeline
|
|
264
|
+
// Concurrency helps but not linearly — shared browser bottleneck
|
|
265
|
+
const effectiveConcurrency = Math.min(concurrency, 5) // diminishing returns past 5
|
|
266
|
+
const pagesPerMinute = Math.floor(effectiveConcurrency * 30) // ~2s effective per page with overhead
|
|
265
267
|
return {
|
|
266
268
|
totalRamMB: totalMB,
|
|
267
269
|
freeRamMB: freeMB,
|
|
@@ -274,14 +276,32 @@ class CrawlEngine {
|
|
|
274
276
|
|
|
275
277
|
async _fetchPage(url, config, cookies) {
|
|
276
278
|
try {
|
|
277
|
-
const
|
|
279
|
+
const browseOpts = {
|
|
278
280
|
stealth: config.stealth,
|
|
279
281
|
_cookies: cookies,
|
|
280
282
|
timeout: config.timeout,
|
|
281
283
|
html: true,
|
|
282
|
-
noCache: true
|
|
283
|
-
|
|
284
|
+
noCache: true,
|
|
285
|
+
fastMode: true
|
|
286
|
+
}
|
|
287
|
+
let result = await this.browseEngine.browse(url, browseOpts)
|
|
288
|
+
|
|
289
|
+
// Auto-retry with full stealth if blocked
|
|
290
|
+
if (result?.blocked && browseOpts.fastMode) {
|
|
291
|
+
console.log(`[crawl] Block detected on ${url} (${result.blockType}) — retrying with full stealth`)
|
|
292
|
+
result = await this.browseEngine.browse(url, {
|
|
293
|
+
...browseOpts,
|
|
294
|
+
fastMode: false,
|
|
295
|
+
stealth: true,
|
|
296
|
+
camoufox: true
|
|
297
|
+
})
|
|
298
|
+
}
|
|
299
|
+
|
|
284
300
|
if (result?.content) {
|
|
301
|
+
// Skip if still blocked after retry
|
|
302
|
+
if (result.blocked) {
|
|
303
|
+
throw new Error(`Blocked by ${result.blockType}: ${result.blockDetail}`)
|
|
304
|
+
}
|
|
285
305
|
const linkSource = result.html || result.content
|
|
286
306
|
return {
|
|
287
307
|
title: result.title || '',
|
package/src/server.js
CHANGED
|
@@ -159,7 +159,15 @@ const server = http.createServer(async (req, res) => {
|
|
|
159
159
|
return error(res, 404, 'Not found')
|
|
160
160
|
} catch (err) {
|
|
161
161
|
console.error('Server error:', err)
|
|
162
|
-
|
|
162
|
+
const status = err.statusCode || 500
|
|
163
|
+
const extra = {}
|
|
164
|
+
if (err.retryable) extra.retryable = true
|
|
165
|
+
if (err.suggestion) extra.suggestion = err.suggestion
|
|
166
|
+
if (err.blocked) {
|
|
167
|
+
extra.retryable = true
|
|
168
|
+
extra.suggestion = 'Retry with stealth:true or use Camoufox engine'
|
|
169
|
+
}
|
|
170
|
+
return error(res, status, err.message, extra)
|
|
163
171
|
}
|
|
164
172
|
})
|
|
165
173
|
|
|
@@ -168,8 +176,29 @@ function json(res, data, status = 200) {
|
|
|
168
176
|
res.end(JSON.stringify(data))
|
|
169
177
|
}
|
|
170
178
|
|
|
171
|
-
|
|
172
|
-
|
|
179
|
+
/**
 * RFC 9457-style structured error responses.
 * Machine-readable for AI agents consuming our API.
 *
 * Builds a body of the form `{ type, status, title, detail, ...extra }` and
 * hands it to `json()` together with the status code.
 *
 * @param {object} res - Response object, passed through to `json()` untouched.
 * @param {number|string} status - HTTP status code. Anything that is not an
 *   integer in the 100–999 range is replaced by 500, because callers may pass
 *   an arbitrary `err.statusCode` here.
 * @param {string} message - Human-readable description, emitted as `detail`.
 * @param {object} [extra] - Additional members merged into the body last, so
 *   they can override the standard members.
 * @returns {void}
 */
function error(res, status, message, extra = {}) {
  const errorTypes = {
    400: 'bad-request',
    401: 'unauthorized',
    403: 'forbidden',
    404: 'not-found',
    429: 'rate-limited',
    500: 'internal-error',
    502: 'upstream-error',
    503: 'service-unavailable'
  }

  // Normalise before using the value as a lookup key or status code: a key
  // such as 'constructor' would otherwise resolve to an inherited Object
  // member and make the `.replace` call below throw inside the error path.
  // NOTE(review): json() presumably forwards the status to the HTTP response,
  // which would reject non-numeric values — confirm against json().
  const numericStatus = Number(status)
  const httpStatus =
    Number.isInteger(numericStatus) && numericStatus >= 100 && numericStatus <= 999
      ? numericStatus
      : 500

  // Safe: httpStatus is always an integer here, so only the own keys above can match.
  const slug = errorTypes[httpStatus]

  const body = {
    type: `https://spectrawl.dev/errors/${slug || 'unknown'}`,
    status: httpStatus,
    title: slug ? slug.replace(/-/g, ' ') : 'error',
    detail: message,
    ...extra
  }
  json(res, body, httpStatus)
}
|
|
174
203
|
|
|
175
204
|
function readBody(req) {
|