spectrawl 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.4.3",
3
+ "version": "0.5.0",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -164,9 +164,18 @@ class BrowseEngine {
164
164
 
165
165
  result.url = page.url()
166
166
  result.title = await page.title()
167
+ result.statusCode = null // playwright doesn't expose easily, but we detect blocks below
167
168
  result.cached = false
168
169
  result.engine = this._engine
169
170
 
171
+ // Detect block pages (Cloudflare, Akamai, etc.)
172
+ const blockInfo = detectBlockPage(result.content, result.title, result.html, result.url)
173
+ if (blockInfo) {
174
+ result.blocked = true
175
+ result.blockType = blockInfo.type
176
+ result.blockDetail = blockInfo.detail
177
+ }
178
+
170
179
  if (!opts.screenshot) {
171
180
  this.cache?.set('scrape', url, { content: result.content, url: result.url, title: result.title })
172
181
  }
@@ -288,4 +297,81 @@ class BrowseEngine {
288
297
  }
289
298
  }
290
299
 
300
/**
 * Detect block/challenge pages from CDNs and bot protection services.
 *
 * Two kinds of signal are used:
 *  - challenge-only signals (e.g. `cf_chl_opt`, `px-captcha`, "access denied"
 *    wording) are trusted on their own;
 *  - vendor markers (e.g. `datadome`, `incapsula`, `perimeterx`, `awswaf`,
 *    hCaptcha/reCAPTCHA markup) can also be present in the HTML of pages that
 *    loaded normally on a protected site, so they only count when `content`
 *    is short (challenge pages carry little text of their own).
 *
 * @param {string} [content] - Page content as produced by the caller; matched
 *   case-insensitively and used for the length heuristics.
 * @param {string} [title] - Page title.
 * @param {string} [html] - Raw page HTML, when the caller has it.
 * @param {string} [url] - Not used by the detection; kept so existing call
 *   sites keep working.
 * @returns {{type: string, detail: string}|null} `{ type, detail }` if
 *   blocked, null if clean.
 */
function detectBlockPage(content, title, html, url) {
  // Pages with less text than this are treated as possible standalone challenges.
  const CHALLENGE_TEXT_MAX = 500
  // Generic wording ("access denied", ...) only counts on very short pages.
  const GENERIC_TEXT_MAX = 200
  // Cloudflare 1xxx codes, e.g. "error 1020" or "error code: 1015".
  const CLOUDFLARE_ERROR_CODE = /\berror(?:\s+code)?:?\s+1\d{3}\b/
  // `dd.js` as a file name, so `add.js` / `odd.js` / `dd.json` do not match.
  const DATADOME_SCRIPT = /(?:^|[\/"'=\s])dd\.js\b/

  // Non-string inputs (undefined, null, numbers, ...) count as empty instead of throwing.
  const lower = (value) => (typeof value === 'string' ? value.toLowerCase() : '')
  const hasAny = (haystack, needles) => needles.some((needle) => haystack.includes(needle))

  const text = lower(content)
  const titleLower = lower(title)
  const htmlLower = lower(html)
  const sparse = text.length < CHALLENGE_TEXT_MAX
  const mentionsCloudflare = text.includes('cloudflare') || htmlLower.includes('cloudflare')

  // Cloudflare
  if (hasAny(htmlLower, ['cf-error-details', 'cf_chl_opt']) ||
      (text.includes('attention required') && text.includes('cloudflare')) ||
      text.includes('checking if the site connection is secure') ||
      (titleLower.includes('just a moment') && htmlLower.includes('cloudflare')) ||
      (text.includes('ray id:') && text.includes('cloudflare'))) {
    return { type: 'cloudflare', detail: 'Cloudflare bot challenge or block page detected' }
  }

  // Cloudflare RFC 9457 structured error (new format).
  // The problem+json media type alone is not Cloudflare-specific, so it must
  // come with a Cloudflare mention before the page is labelled as such.
  if ((htmlLower.includes('application/problem+json') && mentionsCloudflare) ||
      (CLOUDFLARE_ERROR_CODE.test(text) && text.includes('cloudflare')) ||
      (htmlLower.includes('"type":') && htmlLower.includes('cloudflare.com/errors/'))) {
    return { type: 'cloudflare-rfc9457', detail: 'Cloudflare structured error response (RFC 9457)' }
  }

  // Akamai
  if ((text.includes('access denied') && htmlLower.includes('akamai')) ||
      htmlLower.includes('akamaighost') ||
      (text.includes('reference #') && text.includes('access denied'))) {
    return { type: 'akamai', detail: 'Akamai bot detection triggered' }
  }

  // AWS WAF
  if ((text.includes('request blocked') && htmlLower.includes('aws')) ||
      (sparse && htmlLower.includes('awswaf'))) {
    return { type: 'aws-waf', detail: 'AWS WAF blocked the request' }
  }

  // Imperva / Incapsula
  if ((sparse && hasAny(htmlLower, ['incapsula', 'imperva'])) ||
      (text.includes('request unsuccessful') && text.includes('incapsula'))) {
    return { type: 'imperva', detail: 'Imperva/Incapsula bot detection triggered' }
  }

  // DataDome
  if (sparse && (htmlLower.includes('datadome') || DATADOME_SCRIPT.test(htmlLower))) {
    return { type: 'datadome', detail: 'DataDome bot detection triggered' }
  }

  // PerimeterX / HUMAN (`px-captcha` is the challenge widget itself, so it is trusted as-is)
  if (htmlLower.includes('px-captcha') ||
      (sparse && hasAny(htmlLower, ['perimeterx', 'human security']))) {
    return { type: 'perimeterx', detail: 'PerimeterX/HUMAN bot detection triggered' }
  }

  // hCaptcha challenge (standalone, not a widget embedded in a content page)
  if (sparse && htmlLower.includes('hcaptcha.com') && htmlLower.includes('h-captcha')) {
    return { type: 'hcaptcha', detail: 'hCaptcha challenge page' }
  }

  // reCAPTCHA challenge (standalone, not embedded)
  if (sparse && htmlLower.includes('recaptcha') &&
      (titleLower === '' || titleLower.includes('blocked') || titleLower.includes('verify'))) {
    return { type: 'recaptcha', detail: 'reCAPTCHA challenge page' }
  }

  // Generic bot detection signals
  if (text.length < GENERIC_TEXT_MAX && hasAny(text, [
    'access denied', '403 forbidden',
    'bot detected', 'automated access',
    'please verify you are human', 'are you a robot'
  ])) {
    return { type: 'generic', detail: 'Generic bot detection or access denied page' }
  }

  return null
}
376
+
291
377
  module.exports = { BrowseEngine }
package/src/crawl.js CHANGED
@@ -276,15 +276,32 @@ class CrawlEngine {
276
276
 
277
277
  async _fetchPage(url, config, cookies) {
278
278
  try {
279
- const result = await this.browseEngine.browse(url, {
279
+ const browseOpts = {
280
280
  stealth: config.stealth,
281
281
  _cookies: cookies,
282
282
  timeout: config.timeout,
283
283
  html: true,
284
284
  noCache: true,
285
- fastMode: true // crawl mode: reduced delays for speed
286
- })
285
+ fastMode: true
286
+ }
287
+ let result = await this.browseEngine.browse(url, browseOpts)
288
+
289
+ // Auto-retry with full stealth if blocked
290
+ if (result?.blocked && browseOpts.fastMode) {
291
+ console.log(`[crawl] Block detected on ${url} (${result.blockType}) — retrying with full stealth`)
292
+ result = await this.browseEngine.browse(url, {
293
+ ...browseOpts,
294
+ fastMode: false,
295
+ stealth: true,
296
+ camoufox: true
297
+ })
298
+ }
299
+
287
300
  if (result?.content) {
301
+ // Skip if still blocked after retry
302
+ if (result.blocked) {
303
+ throw new Error(`Blocked by ${result.blockType}: ${result.blockDetail}`)
304
+ }
288
305
  const linkSource = result.html || result.content
289
306
  return {
290
307
  title: result.title || '',
package/src/server.js CHANGED
@@ -159,7 +159,15 @@ const server = http.createServer(async (req, res) => {
159
159
  return error(res, 404, 'Not found')
160
160
  } catch (err) {
161
161
  console.error('Server error:', err)
162
- return error(res, 500, err.message)
162
+ const status = err.statusCode || 500
163
+ const extra = {}
164
+ if (err.retryable) extra.retryable = true
165
+ if (err.suggestion) extra.suggestion = err.suggestion
166
+ if (err.blocked) {
167
+ extra.retryable = true
168
+ extra.suggestion = 'Retry with stealth:true or use Camoufox engine'
169
+ }
170
+ return error(res, status, err.message, extra)
163
171
  }
164
172
  })
165
173
 
@@ -168,8 +176,29 @@ function json(res, data, status = 200) {
168
176
  res.end(JSON.stringify(data))
169
177
  }
170
178
 
171
- function error(res, status, message) {
172
- json(res, { error: message }, status)
179
/**
 * Send an RFC 9457-style structured error response.
 * Machine-readable for AI agents consuming our API.
 *
 * The body carries `type`, `status`, `title` and `detail`, plus any extension
 * members from `extra` (e.g. `retryable`, `suggestion`). It also keeps an
 * `error` member holding the message, which is the shape the previous
 * `{ error: message }` response used, so older clients keep working.
 *
 * @param {object} res - Response object, handed to `json()` unchanged.
 * @param {number} status - HTTP status. Anything that is not an integer in
 *   400-599 is sent as 500.
 * @param {string} message - Human-readable description, sent as `detail`.
 * @param {object} [extra={}] - Extension members merged into the body. They
 *   cannot replace `type`, `status`, `title` or `detail`.
 */
function error(res, status, message, extra = {}) {
  const errorTypes = {
    400: 'bad-request',
    401: 'unauthorized',
    403: 'forbidden',
    404: 'not-found',
    429: 'rate-limited',
    500: 'internal-error',
    502: 'upstream-error',
    503: 'service-unavailable'
  }

  // `status` may originate from an arbitrary thrown error's `statusCode`
  // (a string code, 0, a 2xx/3xx upstream status, ...). Only genuine error
  // statuses are forwarded; everything else is reported as a 500.
  const code = Number(status)
  const httpStatus = Number.isInteger(code) && code >= 400 && code <= 599 ? code : 500

  const slug = errorTypes[httpStatus]
  const core = {
    type: `https://spectrawl.dev/errors/${slug || 'unknown'}`,
    status: httpStatus,
    title: slug ? slug.replace(/-/g, ' ') : 'error',
    detail: message
  }

  // Spread order matters: extensions may add members (or override the legacy
  // `error` alias) but the core problem members always win.
  const body = { error: message, ...extra, ...core }
  json(res, body, httpStatus)
}
174
203
 
175
204
  function readBody(req) {