spectrawl 0.4.2 → 0.5.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.4.2",
3
+ "version": "0.5.0",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -127,12 +127,21 @@ class BrowseEngine {
127
127
 
128
128
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
129
129
 
130
- // Human-like delays
131
- await page.waitForTimeout(800 + Math.random() * 1500)
132
- await page.evaluate(() => {
133
- window.scrollBy({ top: Math.floor(Math.random() * 400) + 100, behavior: 'smooth' })
134
- })
135
- await page.waitForTimeout(300 + Math.random() * 700)
130
+ if (opts.fastMode) {
131
+ // Crawl mode: minimal delays, just enough for lazy-load triggers
132
+ await page.waitForTimeout(400)
133
+ await page.evaluate(() => {
134
+ window.scrollBy({ top: 500, behavior: 'instant' })
135
+ })
136
+ await page.waitForTimeout(200)
137
+ } else {
138
+ // Normal browse: full human-like delays
139
+ await page.waitForTimeout(800 + Math.random() * 1500)
140
+ await page.evaluate(() => {
141
+ window.scrollBy({ top: Math.floor(Math.random() * 400) + 100, behavior: 'smooth' })
142
+ })
143
+ await page.waitForTimeout(300 + Math.random() * 700)
144
+ }
136
145
 
137
146
  const result = {}
138
147
 
@@ -155,9 +164,18 @@ class BrowseEngine {
155
164
 
156
165
  result.url = page.url()
157
166
  result.title = await page.title()
167
+ result.statusCode = null // HTTP status is not captured here; block pages are detected below instead
158
168
  result.cached = false
159
169
  result.engine = this._engine
160
170
 
171
+ // Detect block pages (Cloudflare, Akamai, etc.)
172
+ const blockInfo = detectBlockPage(result.content, result.title, result.html, result.url)
173
+ if (blockInfo) {
174
+ result.blocked = true
175
+ result.blockType = blockInfo.type
176
+ result.blockDetail = blockInfo.detail
177
+ }
178
+
161
179
  if (!opts.screenshot) {
162
180
  this.cache?.set('scrape', url, { content: result.content, url: result.url, title: result.title })
163
181
  }
@@ -279,4 +297,81 @@ class BrowseEngine {
279
297
  }
280
298
  }
281
299
 
300
+ /**
301
+ * Detect block/challenge pages from CDNs and bot protection services.
302
+ * Returns { type, detail } if blocked, null if clean.
303
+ */
304
+ function detectBlockPage(content, title, html, url) {
305
+ const text = (content || '').toLowerCase()
306
+ const titleLower = (title || '').toLowerCase()
307
+ const htmlLower = (html || '').toLowerCase()
308
+
309
+ // Cloudflare
310
+ if (htmlLower.includes('cf-error-details') || htmlLower.includes('cf_chl_opt') ||
311
+ text.includes('attention required') && text.includes('cloudflare') ||
312
+ text.includes('checking if the site connection is secure') ||
313
+ titleLower.includes('just a moment') && htmlLower.includes('cloudflare') ||
314
+ text.includes('ray id:') && text.includes('cloudflare')) {
315
+ return { type: 'cloudflare', detail: 'Cloudflare bot challenge or block page detected' }
316
+ }
317
+
318
+ // Cloudflare RFC 9457 structured error (new format)
319
+ if (htmlLower.includes('application/problem+json') ||
320
+ text.includes('error 1') && text.includes('cloudflare') ||
321
+ htmlLower.includes('"type":') && htmlLower.includes('cloudflare.com/errors/')) {
322
+ return { type: 'cloudflare-rfc9457', detail: 'Cloudflare structured error response (RFC 9457)' }
323
+ }
324
+
325
+ // Akamai
326
+ if (text.includes('access denied') && htmlLower.includes('akamai') ||
327
+ htmlLower.includes('akamaighost') ||
328
+ text.includes('reference #') && text.includes('access denied')) {
329
+ return { type: 'akamai', detail: 'Akamai bot detection triggered' }
330
+ }
331
+
332
+ // AWS WAF
333
+ if (text.includes('request blocked') && htmlLower.includes('aws') ||
334
+ htmlLower.includes('awswaf')) {
335
+ return { type: 'aws-waf', detail: 'AWS WAF blocked the request' }
336
+ }
337
+
338
+ // Imperva / Incapsula
339
+ if (htmlLower.includes('incapsula') || htmlLower.includes('imperva') ||
340
+ text.includes('request unsuccessful') && text.includes('incapsula')) {
341
+ return { type: 'imperva', detail: 'Imperva/Incapsula bot detection triggered' }
342
+ }
343
+
344
+ // DataDome
345
+ if (htmlLower.includes('datadome') || htmlLower.includes('dd.js')) {
346
+ return { type: 'datadome', detail: 'DataDome bot detection triggered' }
347
+ }
348
+
349
+ // PerimeterX / HUMAN
350
+ if (htmlLower.includes('perimeterx') || htmlLower.includes('px-captcha') ||
351
+ htmlLower.includes('human security')) {
352
+ return { type: 'perimeterx', detail: 'PerimeterX/HUMAN bot detection triggered' }
353
+ }
354
+
355
+ // hCaptcha challenge
356
+ if (htmlLower.includes('hcaptcha.com') && htmlLower.includes('h-captcha')) {
357
+ return { type: 'hcaptcha', detail: 'hCaptcha challenge page' }
358
+ }
359
+
360
+ // reCAPTCHA challenge (standalone, not embedded)
361
+ if (htmlLower.includes('recaptcha') && text.length < 500 &&
362
+ (titleLower === '' || titleLower.includes('blocked') || titleLower.includes('verify'))) {
363
+ return { type: 'recaptcha', detail: 'reCAPTCHA challenge page' }
364
+ }
365
+
366
+ // Generic bot detection signals
367
+ if (text.length < 200 && (
368
+ text.includes('access denied') || text.includes('403 forbidden') ||
369
+ text.includes('bot detected') || text.includes('automated access') ||
370
+ text.includes('please verify you are human') || text.includes('are you a robot'))) {
371
+ return { type: 'generic', detail: 'Generic bot detection or access denied page' }
372
+ }
373
+
374
+ return null
375
+ }
376
+
282
377
  module.exports = { BrowseEngine }
package/src/crawl.js CHANGED
@@ -260,8 +260,10 @@ class CrawlEngine {
260
260
  const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
261
261
  const freeMB = Math.floor(os.freemem() / 1024 / 1024)
262
262
  const concurrency = detectConcurrency()
263
- // Estimate: each page takes ~4s with stealth delays
264
- const pagesPerMinute = concurrency * 15 // ~4s per page
263
+ // Realistic: ~0.8s per page with fast mode, limited by shared browser pipeline
264
+ // Concurrency helps, but not linearly — the shared browser is the bottleneck
265
+ const effectiveConcurrency = Math.min(concurrency, 5) // diminishing returns past 5
266
+ const pagesPerMinute = Math.floor(effectiveConcurrency * 30) // ~2s effective per page with overhead
265
267
  return {
266
268
  totalRamMB: totalMB,
267
269
  freeRamMB: freeMB,
@@ -274,14 +276,32 @@ class CrawlEngine {
274
276
 
275
277
  async _fetchPage(url, config, cookies) {
276
278
  try {
277
- const result = await this.browseEngine.browse(url, {
279
+ const browseOpts = {
278
280
  stealth: config.stealth,
279
281
  _cookies: cookies,
280
282
  timeout: config.timeout,
281
283
  html: true,
282
- noCache: true
283
- })
284
+ noCache: true,
285
+ fastMode: true
286
+ }
287
+ let result = await this.browseEngine.browse(url, browseOpts)
288
+
289
+ // Auto-retry with full stealth if blocked
290
+ if (result?.blocked && browseOpts.fastMode) {
291
+ console.log(`[crawl] Block detected on ${url} (${result.blockType}) — retrying with full stealth`)
292
+ result = await this.browseEngine.browse(url, {
293
+ ...browseOpts,
294
+ fastMode: false,
295
+ stealth: true,
296
+ camoufox: true
297
+ })
298
+ }
299
+
284
300
  if (result?.content) {
301
+ // Skip if still blocked after retry
302
+ if (result.blocked) {
303
+ throw new Error(`Blocked by ${result.blockType}: ${result.blockDetail}`)
304
+ }
285
305
  const linkSource = result.html || result.content
286
306
  return {
287
307
  title: result.title || '',
package/src/server.js CHANGED
@@ -159,7 +159,15 @@ const server = http.createServer(async (req, res) => {
159
159
  return error(res, 404, 'Not found')
160
160
  } catch (err) {
161
161
  console.error('Server error:', err)
162
- return error(res, 500, err.message)
162
+ const status = err.statusCode || 500
163
+ const extra = {}
164
+ if (err.retryable) extra.retryable = true
165
+ if (err.suggestion) extra.suggestion = err.suggestion
166
+ if (err.blocked) {
167
+ extra.retryable = true
168
+ extra.suggestion = 'Retry with stealth:true or use Camoufox engine'
169
+ }
170
+ return error(res, status, err.message, extra)
163
171
  }
164
172
  })
165
173
 
@@ -168,8 +176,29 @@ function json(res, data, status = 200) {
168
176
  res.end(JSON.stringify(data))
169
177
  }
170
178
 
171
- function error(res, status, message) {
172
- json(res, { error: message }, status)
179
+ /**
180
+ * RFC 9457-style structured error responses.
181
+ * Machine-readable for AI agents consuming our API.
182
+ */
183
+ function error(res, status, message, extra = {}) {
184
+ const errorTypes = {
185
+ 400: 'bad-request',
186
+ 401: 'unauthorized',
187
+ 403: 'forbidden',
188
+ 404: 'not-found',
189
+ 429: 'rate-limited',
190
+ 500: 'internal-error',
191
+ 502: 'upstream-error',
192
+ 503: 'service-unavailable'
193
+ }
194
+ const body = {
195
+ type: `https://spectrawl.dev/errors/${errorTypes[status] || 'unknown'}`,
196
+ status,
197
+ title: errorTypes[status] ? errorTypes[status].replace(/-/g, ' ') : 'error',
198
+ detail: message,
199
+ ...extra
200
+ }
201
+ json(res, body, status)
173
202
  }
174
203
 
175
204
  function readBody(req) {