spectrawl 0.4.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.4.3",
3
+ "version": "0.6.0",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/agent.js ADDED
@@ -0,0 +1,295 @@
1
+ /**
2
+ * Spectrawl Agent Engine
3
+ * Natural language browser actions — "click the sign in button", "fill the search box with query".
4
+ * Uses LLM to interpret page DOM and generate Playwright actions.
5
+ */
6
+
7
+ const https = require('https')
8
+
9
/**
 * Drives a browser page from natural-language instructions.
 *
 * Each step: snapshot the page's interactive elements, ask an LLM (Gemini when
 * `apiKey` is set, otherwise OpenAI when `openaiKey` is set) for the next action
 * as JSON, then execute that action on the page.
 */
class AgentEngine {
  /**
   * Attribute stamped on every element listed for the LLM. `_executeAction`
   * resolves `elementIdx` through this attribute, so the index the model saw
   * always refers to the same DOM node that gets clicked/typed into.
   */
  static ELEMENT_INDEX_ATTR = 'data-spectrawl-idx'

  /** CSS selector matching the elements considered interactive. */
  static INTERACTIVE_SELECTOR = [
    'a[href]', 'button', 'input', 'textarea', 'select',
    '[role="button"]', '[role="link"]', '[role="tab"]',
    '[onclick]', '[type="submit"]', 'label'
  ].join(',')

  /** Maximum number of elements described to the LLM per step. */
  static MAX_ELEMENTS = 100

  /**
   * @param {object} browseEngine - must expose `getPage({ url, timeout })` resolving to `{ page, context }`
   * @param {object} [config]
   * @param {string} [config.apiKey] - Gemini API key (falls back to `GEMINI_API_KEY`)
   * @param {string} [config.openaiKey] - OpenAI API key (falls back to `OPENAI_API_KEY`)
   * @param {string} [config.model] - Gemini model name (default `gemini-2.0-flash`)
   */
  constructor(browseEngine, config = {}) {
    this.browseEngine = browseEngine
    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
    this.openaiKey = config.openaiKey || process.env.OPENAI_API_KEY
    this.model = config.model || 'gemini-2.0-flash'
  }

  /**
   * Execute a natural language action on a page.
   * @param {string} url - URL to navigate to
   * @param {string} instruction - what to do (e.g. "click the login button")
   * @param {object} opts - options
   * @param {number} opts.maxSteps - max number of actions to take (default 5)
   * @param {boolean} opts.screenshot - take a PNG screenshot of the viewport after the last step
   * @param {number} opts.timeout - timeout in ms handed to `browseEngine.getPage` (default 30000)
   * @returns {Promise<{success: boolean, url: string, title: string, steps: object[], content: string, screenshot: *, duration: number}>}
   * @throws if the page cannot be obtained or the LLM call/planning fails
   */
  async act(url, instruction, opts = {}) {
    const maxSteps = opts.maxSteps || 5
    const timeout = opts.timeout || 30000
    const startTime = Date.now()
    const steps = []

    const { page, context } = await this.browseEngine.getPage({ url, timeout })

    try {
      // Best-effort wait: a page that never reaches domcontentloaded is still worth inspecting.
      await page.waitForLoadState('domcontentloaded', { timeout: 10000 }).catch(() => {})
      await page.waitForTimeout(1000)

      for (let i = 0; i < maxSteps; i++) {
        const dom = await this._getSimplifiedDOM(page)
        const action = await this._planAction(dom, instruction, steps, page.url())

        if (action.done) {
          steps.push({ step: i + 1, action: 'done', reason: action.reason })
          break
        }

        try {
          const result = await this._executeAction(page, action)
          steps.push({ step: i + 1, ...action, result: result || 'ok' })

          // Randomised pause, then give a possible navigation time to settle.
          await page.waitForTimeout(500 + Math.random() * 1000)
          await page.waitForLoadState('domcontentloaded', { timeout: 5000 }).catch(() => {})
        } catch (err) {
          // Record the failure and let the LLM see it in "Previous steps" on the next round.
          steps.push({ step: i + 1, ...action, error: err.message })
        }
      }

      const finalContent = await page.evaluate(() => document.body?.innerText?.slice(0, 10000) || '')
      const finalUrl = page.url()
      const finalTitle = await page.title()

      let screenshot = null
      if (opts.screenshot) {
        screenshot = await page.screenshot({ type: 'png', fullPage: false })
      }

      return {
        success: steps.some(s => s.action === 'done' || !s.error),
        url: finalUrl,
        title: finalTitle,
        steps,
        content: finalContent,
        screenshot,
        duration: Date.now() - startTime
      }
    } finally {
      await context.close().catch(() => {})
    }
  }

  /**
   * Get a simplified DOM representation for the LLM.
   *
   * Keeps only rendered interactive elements (capped at MAX_ELEMENTS) and stamps
   * each one with ELEMENT_INDEX_ATTR so the `idx` reported here can be resolved
   * back to exactly the same node later. Stamps from a previous step are cleared
   * first so a stale index can never point at an element.
   * @returns {Promise<{title: string, url: string, elements: object[]}>}
   */
  async _getSimplifiedDOM(page) {
    // The callback runs inside the page, so everything it needs is passed as an argument.
    return page.evaluate(({ selectorList, idxAttr, maxElements }) => {
      for (const stale of Array.from(document.querySelectorAll(`[${idxAttr}]`))) {
        stale.removeAttribute(idxAttr)
      }

      const elements = []
      for (const el of Array.from(document.querySelectorAll(selectorList))) {
        if (elements.length >= maxElements) break
        if (!el.offsetParent && el.tagName !== 'INPUT') continue // skip hidden
        const rect = el.getBoundingClientRect()
        if (rect.width === 0 && rect.height === 0) continue

        const idx = elements.length
        el.setAttribute(idxAttr, String(idx))

        const tag = el.tagName.toLowerCase()
        elements.push({
          idx,
          tag,
          type: el.type || '',
          text: (el.textContent || '').trim().slice(0, 80),
          placeholder: el.placeholder || '',
          ariaLabel: el.getAttribute('aria-label') || '',
          href: (el.href || '').slice(0, 100),
          value: el.value || '',
          selector: `[${idxAttr}="${idx}"]`,
          id: el.id || '',
          name: el.name || ''
        })
      }

      return {
        title: document.title,
        url: location.href,
        elements
      }
    }, {
      selectorList: AgentEngine.INTERACTIVE_SELECTOR,
      idxAttr: AgentEngine.ELEMENT_INDEX_ATTR,
      maxElements: AgentEngine.MAX_ELEMENTS
    })
  }

  /** Render one element of the simplified DOM as a single prompt line. */
  _describeElement(e) {
    const attrs = [
      e.type ? ` type="${e.type}"` : '',
      e.id ? ` id="${e.id}"` : '',
      e.name ? ` name="${e.name}"` : ''
    ].join('')
    const label = e.text || e.placeholder || e.ariaLabel || e.href || '(empty)'
    return `[${e.idx}] <${e.tag}${attrs}> ${label}`
  }

  /**
   * Ask LLM to plan the next action.
   * @returns {Promise<object>} the action object (`{done, reason}` or `{action, ...}`)
   * @throws if the LLM answer is not an object
   */
  async _planAction(dom, instruction, previousSteps, currentUrl) {
    const elementLines = dom.elements.map(e => this._describeElement(e)).join('\n')
    const history = previousSteps.length > 0 ? JSON.stringify(previousSteps) : 'none'

    const prompt = `You are a browser automation agent. Given the current page state and instruction, determine the next action.

Current URL: ${currentUrl}
Page title: ${dom.title}

Interactive elements on page:
${elementLines}

Instruction: ${instruction}

Previous steps: ${history}

Respond with a JSON object:
- If the instruction is complete: {"done": true, "reason": "why it's done"}
- To click: {"action": "click", "elementIdx": 5, "reason": "clicking the login button"}
- To type: {"action": "type", "elementIdx": 3, "text": "hello", "reason": "filling search box"}
- To select: {"action": "select", "elementIdx": 7, "value": "option1", "reason": "selecting dropdown"}
- To press a key: {"action": "press", "key": "Enter", "reason": "submitting form"}
- To scroll: {"action": "scroll", "direction": "down", "reason": "loading more content"}

Only return valid JSON. No explanation.`

    const planned = await this._llmCall(prompt)
    // Tolerate a model that wraps its single action in an array.
    const action = Array.isArray(planned) ? planned[0] : planned
    if (!action || typeof action !== 'object') {
      throw new Error('LLM returned a non-object action')
    }
    return action
  }

  /**
   * Resolve the element the LLM referred to by index.
   * @throws {Error} `Element [n] not found` when the index is invalid or no longer on the page
   */
  async _resolveElement(page, elementIdx) {
    const idx = typeof elementIdx === 'string' && elementIdx.trim() !== ''
      ? Number(elementIdx)
      : elementIdx
    if (!Number.isInteger(idx) || idx < 0) {
      throw new Error(`Element [${elementIdx}] not found`)
    }
    const target = await page.$(`[${AgentEngine.ELEMENT_INDEX_ATTR}="${idx}"]`)
    if (!target) throw new Error(`Element [${elementIdx}] not found`)
    return target
  }

  /**
   * Execute a planned action on the page.
   * @returns {Promise<string>} 'clicked' | 'typed' | 'selected' | 'pressed' | 'scrolled'
   * @throws {Error} for unknown actions, unresolvable elements, or page errors
   */
  async _executeAction(page, action) {
    switch (action.action) {
      case 'click': {
        const target = await this._resolveElement(page, action.elementIdx)
        await target.click({ timeout: 5000 })
        return 'clicked'
      }

      case 'type': {
        const target = await this._resolveElement(page, action.elementIdx)
        await target.fill('')
        await target.type(String(action.text ?? ''), { delay: 50 + Math.random() * 100 })
        return 'typed'
      }

      case 'select': {
        const target = await this._resolveElement(page, action.elementIdx)
        await target.selectOption(action.value)
        return 'selected'
      }

      case 'press':
        await page.keyboard.press(action.key)
        return 'pressed'

      case 'scroll':
        await page.evaluate((dir) => {
          window.scrollBy(0, dir === 'up' ? -500 : 500)
        }, action.direction)
        return 'scrolled'

      default:
        throw new Error(`Unknown action: ${action.action}`)
    }
  }

  /** Turn an `{ error: ... }` API payload into a thrown Error instead of an opaque parse failure. */
  _throwOnApiError(response) {
    if (response && response.error) {
      const detail = response.error.message || JSON.stringify(response.error)
      throw new Error(`LLM API error: ${detail}`)
    }
  }

  /** Parse the model's text reply as JSON, with explicit errors for empty/invalid replies. */
  _parseLLMJson(text) {
    if (typeof text !== 'string' || text.trim() === '') {
      throw new Error('Empty LLM response')
    }
    try {
      return JSON.parse(text)
    } catch (err) {
      throw new Error(`LLM returned invalid JSON: ${text.slice(0, 200)}`)
    }
  }

  /**
   * Send the prompt to the configured LLM and return its parsed JSON answer.
   * Gemini is used when `apiKey` is set; otherwise OpenAI when `openaiKey` is set.
   * @throws {Error} when no key is configured, the API reports an error, or the reply is empty/invalid
   */
  async _llmCall(prompt) {
    if (this.apiKey) {
      const url = `https://generativelanguage.googleapis.com/v1beta/models/${this.model}:generateContent?key=${this.apiKey}`
      const body = {
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { responseMimeType: 'application/json', temperature: 0.1 }
      }

      const response = await this._post(url, body)
      this._throwOnApiError(response)
      return this._parseLLMJson(response?.candidates?.[0]?.content?.parts?.[0]?.text)
    } else if (this.openaiKey) {
      const url = 'https://api.openai.com/v1/chat/completions'
      const body = {
        model: 'gpt-4o-mini',
        messages: [{ role: 'user', content: prompt }],
        response_format: { type: 'json_object' },
        temperature: 0.1
      }
      const response = await this._post(url, body, { 'Authorization': `Bearer ${this.openaiKey}` })
      this._throwOnApiError(response)
      return this._parseLLMJson(response?.choices?.[0]?.message?.content)
    }
    throw new Error('No LLM API key configured')
  }

  /**
   * POST a JSON body over HTTPS and resolve with the parsed JSON response.
   * Rejects on network errors, a 30s socket timeout, or a non-JSON response body.
   */
  _post(url, body, extraHeaders = {}) {
    return new Promise((resolve, reject) => {
      const urlObj = new URL(url)
      const data = JSON.stringify(body)
      const opts = {
        hostname: urlObj.hostname,
        port: urlObj.port || undefined,
        path: urlObj.pathname + urlObj.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(data),
          ...extraHeaders
        }
      }
      const req = https.request(opts, res => {
        // Decode as UTF-8 at the stream level so multi-byte characters split
        // across chunks are not corrupted by string concatenation.
        res.setEncoding('utf8')
        let responseData = ''
        res.on('data', chunk => { responseData += chunk })
        res.on('error', reject)
        res.on('end', () => {
          try { resolve(JSON.parse(responseData)) }
          catch (e) { reject(new Error(`Invalid JSON: ${responseData.slice(0, 200)}`)) }
        })
      })
      req.on('error', reject)
      // destroy(err) surfaces the timeout through the 'error' handler above.
      req.setTimeout(30000, () => { req.destroy(new Error('LLM timeout')) })
      req.write(data)
      req.end()
    })
  }
}
294
+
295
+ module.exports = { AgentEngine }
@@ -120,6 +120,40 @@ class BrowseEngine {
120
120
  const context = await this._createContext(browser, opts)
121
121
  const page = await context.newPage()
122
122
 
123
+ // Network request capturing
124
+ const networkRequests = []
125
+ if (opts.captureNetwork) {
126
+ page.on('request', req => {
127
+ const resourceType = req.resourceType()
128
+ if (['xhr', 'fetch'].includes(resourceType)) {
129
+ networkRequests.push({
130
+ url: req.url(),
131
+ method: req.method(),
132
+ resourceType,
133
+ headers: opts.captureNetworkHeaders ? req.headers() : undefined,
134
+ postData: req.postData() || undefined
135
+ })
136
+ }
137
+ })
138
+ page.on('response', async res => {
139
+ const req = res.request()
140
+ const resourceType = req.resourceType()
141
+ if (['xhr', 'fetch'].includes(resourceType)) {
142
+ const existing = networkRequests.find(r => r.url === req.url() && r.method === req.method())
143
+ if (existing) {
144
+ existing.status = res.status()
145
+ existing.contentType = res.headers()['content-type'] || null
146
+ if (opts.captureNetworkBody) {
147
+ try {
148
+ const body = await res.text().catch(() => null)
149
+ if (body && body.length < 50000) existing.body = body
150
+ } catch (e) { /* ignore */ }
151
+ }
152
+ }
153
+ }
154
+ })
155
+ }
156
+
123
157
  try {
124
158
  if (opts._cookies) {
125
159
  await context.addCookies(opts._cookies)
@@ -164,9 +198,23 @@ class BrowseEngine {
164
198
 
165
199
  result.url = page.url()
166
200
  result.title = await page.title()
201
+ result.statusCode = null // playwright doesn't expose easily, but we detect blocks below
167
202
  result.cached = false
168
203
  result.engine = this._engine
169
204
 
205
+ // Attach captured network requests
206
+ if (opts.captureNetwork && networkRequests.length > 0) {
207
+ result.networkRequests = networkRequests
208
+ }
209
+
210
+ // Detect block pages (Cloudflare, Akamai, etc.)
211
+ const blockInfo = detectBlockPage(result.content, result.title, result.html, result.url)
212
+ if (blockInfo) {
213
+ result.blocked = true
214
+ result.blockType = blockInfo.type
215
+ result.blockDetail = blockInfo.detail
216
+ }
217
+
170
218
  if (!opts.screenshot) {
171
219
  this.cache?.set('scrape', url, { content: result.content, url: result.url, title: result.title })
172
220
  }
@@ -288,4 +336,81 @@ class BrowseEngine {
288
336
  }
289
337
  }
290
338
 
339
/**
 * Detect block/challenge pages from CDNs and bot protection services.
 *
 * Purely heuristic: looks for vendor-specific markers in the lower-cased page
 * text, title and HTML. Checks run in a fixed order and the first match wins.
 *
 * @param {string} content - extracted page text
 * @param {string} title - page title
 * @param {string} html - raw page HTML
 * @param {string} url - page URL (accepted for callers; not used by any heuristic yet)
 * @returns {{type: string, detail: string}|null} block info, or null if the page looks clean
 */
function detectBlockPage(content, title, html, url) {
  const text = String(content || '').toLowerCase()
  const titleLower = String(title || '').toLowerCase()
  const htmlLower = String(html || '').toLowerCase()

  // Cloudflare
  if (htmlLower.includes('cf-error-details') || htmlLower.includes('cf_chl_opt') ||
      (text.includes('attention required') && text.includes('cloudflare')) ||
      text.includes('checking if the site connection is secure') ||
      (titleLower.includes('just a moment') && htmlLower.includes('cloudflare')) ||
      (text.includes('ray id:') && text.includes('cloudflare'))) {
    return { type: 'cloudflare', detail: 'Cloudflare bot challenge or block page detected' }
  }

  // Cloudflare RFC 9457 structured error (new format)
  if (htmlLower.includes('application/problem+json') ||
      (text.includes('error 1') && text.includes('cloudflare')) ||
      (htmlLower.includes('"type":') && htmlLower.includes('cloudflare.com/errors/'))) {
    return { type: 'cloudflare-rfc9457', detail: 'Cloudflare structured error response (RFC 9457)' }
  }

  // Akamai
  if ((text.includes('access denied') && htmlLower.includes('akamai')) ||
      htmlLower.includes('akamaighost') ||
      (text.includes('reference #') && text.includes('access denied'))) {
    return { type: 'akamai', detail: 'Akamai bot detection triggered' }
  }

  // AWS WAF
  if ((text.includes('request blocked') && htmlLower.includes('aws')) ||
      htmlLower.includes('awswaf')) {
    return { type: 'aws-waf', detail: 'AWS WAF blocked the request' }
  }

  // Imperva / Incapsula
  if (htmlLower.includes('incapsula') || htmlLower.includes('imperva') ||
      (text.includes('request unsuccessful') && text.includes('incapsula'))) {
    return { type: 'imperva', detail: 'Imperva/Incapsula bot detection triggered' }
  }

  // DataDome. The script name must be matched as a whole file name: a bare
  // substring test for "dd.js" also fires on unrelated files such as "add.js".
  const hasDataDomeScript = /(?:^|[\/"'=\s(])dd\.js\b/.test(htmlLower)
  if (htmlLower.includes('datadome') || hasDataDomeScript) {
    return { type: 'datadome', detail: 'DataDome bot detection triggered' }
  }

  // PerimeterX / HUMAN
  if (htmlLower.includes('perimeterx') || htmlLower.includes('px-captcha') ||
      htmlLower.includes('human security')) {
    return { type: 'perimeterx', detail: 'PerimeterX/HUMAN bot detection triggered' }
  }

  // hCaptcha challenge
  if (htmlLower.includes('hcaptcha.com') && htmlLower.includes('h-captcha')) {
    return { type: 'hcaptcha', detail: 'hCaptcha challenge page' }
  }

  // reCAPTCHA challenge (standalone, not embedded): only on near-empty pages
  // whose title is missing or itself looks like a verification prompt.
  if (htmlLower.includes('recaptcha') && text.length < 500 &&
      (titleLower === '' || titleLower.includes('blocked') || titleLower.includes('verify'))) {
    return { type: 'recaptcha', detail: 'reCAPTCHA challenge page' }
  }

  // Generic bot detection signals, restricted to very short pages so that
  // ordinary articles mentioning these phrases are not flagged.
  if (text.length < 200 && (
    text.includes('access denied') || text.includes('403 forbidden') ||
    text.includes('bot detected') || text.includes('automated access') ||
    text.includes('please verify you are human') || text.includes('are you a robot'))) {
    return { type: 'generic', detail: 'Generic bot detection or access denied page' }
  }

  return null
}
415
+
291
416
  module.exports = { BrowseEngine }
package/src/crawl.js CHANGED
@@ -90,6 +90,29 @@ class CrawlEngine {
90
90
  const failed = []
91
91
  let activeCount = 0
92
92
 
93
+ // Sitemap-based crawling: pre-seed queue with sitemap URLs
94
+ if (config.useSitemap !== false) {
95
+ try {
96
+ const sitemapUrls = await fetchSitemap(startUrl)
97
+ if (sitemapUrls.length > 0) {
98
+ for (const sUrl of sitemapUrls.slice(0, config.maxPages)) {
99
+ const norm = normalizeUrl(sUrl)
100
+ if (!visited.has(norm) && this._inScope(sUrl, baseDomain, basePrefix, config.scope)) {
101
+ if (!config.skipPatterns.some(p => p.test(sUrl))) {
102
+ if (this._matchesFilters(sUrl, config.includePatterns, config.excludePatterns)) {
103
+ visited.add(norm)
104
+ queue.push({ url: sUrl, depth: 0 })
105
+ }
106
+ }
107
+ }
108
+ }
109
+ console.log(`[crawl] Pre-seeded ${Math.min(sitemapUrls.length, config.maxPages)} URLs from sitemap`)
110
+ }
111
+ } catch (e) {
112
+ console.log(`[crawl] Sitemap fetch failed, continuing with link discovery`)
113
+ }
114
+ }
115
+
93
116
  // Process queue with concurrency control
94
117
  const processUrl = async (item) => {
95
118
  const { url, depth } = item
@@ -177,6 +200,14 @@ class CrawlEngine {
177
200
  }).join('\n\n---\n\n')
178
201
  }
179
202
 
203
+ // Webhook notification
204
+ if (config.webhook) {
205
+ sendWebhook(config.webhook, {
206
+ event: 'crawl_complete',
207
+ ...result
208
+ }).catch(err => console.error('[crawl] Webhook failed:', err.message))
209
+ }
210
+
180
211
  return result
181
212
  }
182
213
 
@@ -276,15 +307,32 @@ class CrawlEngine {
276
307
 
277
308
  async _fetchPage(url, config, cookies) {
278
309
  try {
279
- const result = await this.browseEngine.browse(url, {
310
+ const browseOpts = {
280
311
  stealth: config.stealth,
281
312
  _cookies: cookies,
282
313
  timeout: config.timeout,
283
314
  html: true,
284
315
  noCache: true,
285
- fastMode: true // crawl mode: reduced delays for speed
286
- })
316
+ fastMode: true
317
+ }
318
+ let result = await this.browseEngine.browse(url, browseOpts)
319
+
320
+ // Auto-retry with full stealth if blocked
321
+ if (result?.blocked && browseOpts.fastMode) {
322
+ console.log(`[crawl] Block detected on ${url} (${result.blockType}) — retrying with full stealth`)
323
+ result = await this.browseEngine.browse(url, {
324
+ ...browseOpts,
325
+ fastMode: false,
326
+ stealth: true,
327
+ camoufox: true
328
+ })
329
+ }
330
+
287
331
  if (result?.content) {
332
+ // Skip if still blocked after retry
333
+ if (result.blocked) {
334
+ throw new Error(`Blocked by ${result.blockType}: ${result.blockDetail}`)
335
+ }
288
336
  const linkSource = result.html || result.content
289
337
  return {
290
338
  title: result.title || '',
@@ -354,6 +402,93 @@ function resolveUrl(url, base) {
354
402
  }
355
403
  }
356
404
 
405
/**
 * Fetch and parse sitemap.xml for a domain.
 *
 * Tries the conventional sitemap locations on the start URL's origin in order
 * and returns the page URLs from the first one that yields any. Sitemap index
 * files are expanded one level deep.
 *
 * @param {string} startUrl - any URL on the target site
 * @param {object} [opts]
 * @param {function(string, number): Promise<string|null>} [opts.fetcher] - text fetcher (defaults to `fetchText`)
 * @param {number} [opts.timeout=5000] - per-request timeout in ms passed to the fetcher
 * @param {number} [opts.maxSubSitemaps=50] - cap on sub-sitemaps fetched per sitemap, since they are fetched sequentially
 * @returns {Promise<string[]>} de-duplicated page URLs in document order, or [] if no sitemap was usable
 * @throws {TypeError} if `startUrl` is not a valid absolute URL
 */
async function fetchSitemap(startUrl, opts = {}) {
  const { fetcher = fetchText, timeout = 5000, maxSubSitemaps = 50 } = opts
  const base = new URL(startUrl)
  const candidates = [
    `${base.origin}/sitemap.xml`,
    `${base.origin}/sitemap_index.xml`,
    `${base.origin}/sitemap/sitemap.xml`
  ]

  // The sitemap protocol requires URLs to be entity-escaped, so "&" arrives as "&amp;".
  const entities = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" }
  const decodeLoc = raw => raw
    .trim()
    .replace(/^<!\[CDATA\[([\s\S]*?)\]\]>$/, '$1')
    .replace(/&(amp|lt|gt|quot|apos);/g, (_, name) => entities[name])
    .trim()
  const extractLocs = xml =>
    Array.from(xml.matchAll(/<loc>\s*([\s\S]*?)\s*<\/loc>/gi), m => decodeLoc(m[1]))
      .filter(Boolean)

  for (const sitemapUrl of candidates) {
    try {
      const xml = await fetcher(sitemapUrl, timeout)
      if (!xml || (!xml.includes('<url') && !xml.includes('<sitemap'))) continue

      // In an index every <loc> is a sub-sitemap. In a plain urlset only a
      // ".xml" entry is treated as one, so page URLs that merely contain the
      // word "sitemap" are kept as pages rather than fetched and discarded.
      const isIndex = /<sitemapindex[\s>]/i.test(xml)
      const urls = []
      let subSitemapsFetched = 0

      for (const loc of extractLocs(xml)) {
        if (!isIndex && !/\.xml$/i.test(loc)) {
          urls.push(loc)
          continue
        }
        if (subSitemapsFetched >= maxSubSitemaps) continue
        subSitemapsFetched++
        try {
          const subXml = await fetcher(loc, timeout)
          if (subXml) {
            for (const subLoc of extractLocs(subXml)) {
              if (!subLoc.endsWith('.xml')) urls.push(subLoc)
            }
          }
        } catch (e) { /* skip failed sub-sitemaps */ }
      }

      if (urls.length > 0) {
        const unique = [...new Set(urls)]
        console.log(`[crawl] Found sitemap at ${sitemapUrl} with ${unique.length} URLs`)
        return unique
      }
    } catch (e) { /* try next */ }
  }
  return []
}
452
+
453
/**
 * GET a URL and resolve with its body as UTF-8 text.
 *
 * Best-effort by design: every failure (network error, timeout, non-200
 * status, too many redirects, invalid URL, oversized body) resolves to `null`
 * instead of rejecting, so callers can simply try the next candidate.
 *
 * @param {string} url - absolute http(s) URL
 * @param {number} [timeout=5000] - socket timeout in ms
 * @param {number} [redirectsLeft=3] - how many HTTP redirects may still be followed
 * @returns {Promise<string|null>} response body, or null on any failure
 */
function fetchText(url, timeout = 5000, redirectsLeft = 3) {
  const h = url.startsWith('https') ? require('https') : require('http')
  // Upper bound on accepted body length (in characters) so a hostile or
  // misconfigured server cannot make the crawler buffer without limit.
  const maxChars = 50 * 1024 * 1024

  return new Promise((resolve) => {
    let req
    try {
      req = h.get(url, { timeout, headers: { 'User-Agent': 'Spectrawl/1.0 (sitemap crawler)' } }, res => {
        const status = res.statusCode

        if (status >= 300 && status < 400 && res.headers.location) {
          res.resume() // drain so the socket is released
          if (redirectsLeft <= 0) return resolve(null)
          let next
          try {
            next = new URL(res.headers.location, url).href
          } catch (e) {
            return resolve(null)
          }
          return resolve(fetchText(next, timeout, redirectsLeft - 1))
        }

        if (status !== 200) {
          // An unread response keeps its socket busy; discard the body explicitly.
          res.resume()
          return resolve(null)
        }

        // Decode at the stream level so multi-byte characters split across
        // chunks are not corrupted by string concatenation.
        res.setEncoding('utf8')
        let data = ''
        res.on('data', chunk => {
          data += chunk
          if (data.length > maxChars) {
            req.destroy()
            resolve(null)
          }
        })
        res.on('end', () => resolve(data))
        res.on('error', () => resolve(null))
      })
    } catch (e) {
      // h.get throws synchronously for malformed URLs or unsupported protocols.
      return resolve(null)
    }
    req.on('error', () => resolve(null))
    req.setTimeout(timeout, () => { req.destroy(); resolve(null) })
  })
}
466
+
467
/**
 * Send a webhook notification.
 *
 * POSTs `data` as JSON to `webhookUrl`. Delivery problems never reject: they
 * resolve to `false` so a failing webhook cannot break the caller.
 *
 * @param {string} webhookUrl - absolute http(s) URL; `user:pass@` credentials are sent as HTTP Basic auth
 * @param {*} data - JSON-serialisable payload
 * @returns {Promise<boolean>} true only when the endpoint answered with a 2xx status;
 *   false on any other status, network error, or 10s timeout
 * @throws (as a rejected promise) if `webhookUrl` is not a valid URL or `data` cannot be serialised
 */
async function sendWebhook(webhookUrl, data) {
  const h = webhookUrl.startsWith('https') ? require('https') : require('http')
  const body = JSON.stringify(data)
  const urlObj = new URL(webhookUrl)
  const auth = urlObj.username
    ? `${decodeURIComponent(urlObj.username)}:${decodeURIComponent(urlObj.password)}`
    : undefined

  return new Promise((resolve) => {
    const req = h.request({
      hostname: urlObj.hostname,
      port: urlObj.port || undefined,
      path: urlObj.pathname + urlObj.search,
      method: 'POST',
      auth,
      headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(body) }
    }, res => {
      // A response alone is not success: 4xx/5xx means the receiver rejected the event.
      const delivered = res.statusCode >= 200 && res.statusCode < 300
      res.on('error', () => resolve(false))
      res.on('end', () => resolve(delivered))
      res.resume() // body is irrelevant; drain it so 'end' fires and the socket is released
    })
    req.on('error', () => resolve(false))
    req.setTimeout(10000, () => { req.destroy(); resolve(false) })
    req.write(body)
    req.end()
  })
}
491
+
357
492
  function normalizeUrl(url) {
358
493
  try {
359
494
  const u = new URL(url)