spectrawl 0.4.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +578 -67
- package/package.json +1 -1
- package/src/agent.js +295 -0
- package/src/browse/index.js +125 -0
- package/src/crawl.js +138 -3
- package/src/extract.js +314 -0
- package/src/index.js +35 -0
- package/src/server.js +69 -12
package/package.json
CHANGED
package/src/agent.js
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Spectrawl Agent Engine
|
|
3
|
+
* Natural language browser actions — "click the sign in button", "fill the search box with query".
|
|
4
|
+
* Uses LLM to interpret page DOM and generate Playwright actions.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
const https = require('https')
|
|
8
|
+
|
|
9
|
+
// CSS selector for every element the agent may interact with.
// The DOM snapshot (_getSimplifiedDOM) and the executor (_resolveElement) MUST
// query with this same selector: element indices handed to the LLM are
// positions in the document-order result of this query.
const INTERACTIVE_SELECTOR = [
  'a[href]', 'button', 'input', 'textarea', 'select',
  '[role="button"]', '[role="link"]', '[role="tab"]',
  '[onclick]', '[type="submit"]', 'label'
].join(', ')

/**
 * Drives a browser page from natural-language instructions: snapshots the
 * interactive elements, asks an LLM for the next action, executes it, repeats.
 */
class AgentEngine {
  /**
   * @param {object} browseEngine - must expose `getPage({ url, timeout })` resolving to `{ page, context }`
   * @param {object} [config]
   * @param {string} [config.apiKey] - Gemini API key (falls back to GEMINI_API_KEY)
   * @param {string} [config.openaiKey] - OpenAI API key (falls back to OPENAI_API_KEY)
   * @param {string} [config.model] - Gemini model name (default 'gemini-2.0-flash')
   */
  constructor(browseEngine, config = {}) {
    this.browseEngine = browseEngine
    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
    this.openaiKey = config.openaiKey || process.env.OPENAI_API_KEY
    this.model = config.model || 'gemini-2.0-flash'
  }

  /**
   * Execute a natural language action on a page.
   * @param {string} url - URL to navigate to
   * @param {string} instruction - what to do (e.g. "click the login button")
   * @param {object} opts - options
   * @param {number} opts.maxSteps - max number of actions to take (default 5)
   * @param {boolean} opts.screenshot - take screenshot after action
   * @param {number} opts.timeout - timeout in ms passed to browseEngine.getPage (default 30000)
   * @returns {Promise<object>} `{ success, url, title, steps, content, screenshot, duration }`;
   *   `success` is true when at least one recorded step finished without an error.
   * @throws if the page cannot be opened or the LLM call fails (the context is still closed)
   */
  async act(url, instruction, opts = {}) {
    const maxSteps = opts.maxSteps || 5
    const timeout = opts.timeout || 30000
    const startTime = Date.now()
    const steps = []

    // Get a browser page
    const { page, context } = await this.browseEngine.getPage({ url, timeout })

    try {
      // Wait for page to be ready (best effort: a load-state timeout is not fatal)
      await page.waitForLoadState('domcontentloaded', { timeout: 10000 }).catch(() => {})
      await page.waitForTimeout(1000)

      for (let i = 0; i < maxSteps; i++) {
        // Fresh snapshot every step: the previous action may have changed the page
        const dom = await this._getSimplifiedDOM(page)

        // Ask LLM what to do
        const action = await this._planAction(dom, instruction, steps, page.url())

        if (action.done) {
          steps.push({ step: i + 1, action: 'done', reason: action.reason })
          break
        }

        // Execute the action
        try {
          const result = await this._executeAction(page, action)
          steps.push({ step: i + 1, ...action, result: result || 'ok' })

          // Wait for potential navigation/load (randomised pause between actions)
          await page.waitForTimeout(500 + Math.random() * 1000)
          await page.waitForLoadState('domcontentloaded', { timeout: 5000 }).catch(() => {})
        } catch (err) {
          // Record the failure so the LLM sees it in "Previous steps" and can
          // pick a different action on the next iteration.
          steps.push({ step: i + 1, ...action, error: err.message })
        }
      }

      // Get final page state
      const finalContent = await page.evaluate(() => document.body?.innerText?.slice(0, 10000) || '')
      const finalUrl = page.url()
      const finalTitle = await page.title()

      let screenshot = null
      if (opts.screenshot) {
        screenshot = await page.screenshot({ type: 'png', fullPage: false })
      }

      return {
        success: steps.some(s => s.action === 'done' || !s.error),
        url: finalUrl,
        title: finalTitle,
        steps,
        content: finalContent,
        screenshot,
        duration: Date.now() - startTime
      }
    } finally {
      // Always release the browser context, even when planning or evaluation throws
      await context.close().catch(() => {})
    }
  }

  /**
   * Get a simplified DOM representation for the LLM.
   * Strips noise, keeps interactive elements with indices.
   *
   * Each element's `idx` is its position in the document-order result of
   * INTERACTIVE_SELECTOR, counting hidden elements that are left out of the
   * returned list, so indices may have gaps.
   * @returns {Promise<{title: string, url: string, elements: object[]}>} at most 100 elements
   */
  async _getSimplifiedDOM(page) {
    // The callback runs in the browser, so the selector is passed as an argument
    // rather than captured from the Node.js scope.
    return page.evaluate((selector) => {
      const elements = []

      document.querySelectorAll(selector).forEach((el, idx) => {
        if (!el.offsetParent && el.tagName !== 'INPUT') return // skip hidden
        const rect = el.getBoundingClientRect()
        if (rect.width === 0 && rect.height === 0) return

        const tag = el.tagName.toLowerCase()
        const ariaLabel = el.getAttribute('aria-label') || ''
        const name = el.name || ''
        const id = el.id || ''

        // Best-effort selector hint for this element (not guaranteed unique)
        let selector = tag
        if (id) selector = `#${id}`
        else if (name) selector = `${tag}[name="${name}"]`
        else if (ariaLabel) selector = `${tag}[aria-label="${ariaLabel}"]`

        elements.push({
          idx,
          tag,
          type: el.type || '',
          text: (el.textContent || '').trim().slice(0, 80),
          placeholder: el.placeholder || '',
          ariaLabel,
          href: (el.href || '').slice(0, 100),
          value: el.value || '',
          selector,
          id,
          name
        })
      })

      return {
        title: document.title,
        url: location.href,
        elements: elements.slice(0, 100) // cap at 100 elements
      }
    }, INTERACTIVE_SELECTOR)
  }

  /**
   * Ask LLM to plan the next action.
   * @param {object} dom - snapshot from _getSimplifiedDOM
   * @param {string} instruction - the user's instruction
   * @param {object[]} previousSteps - steps recorded so far (serialised into the prompt)
   * @param {string} currentUrl - URL of the page right now
   * @returns {Promise<object>} parsed JSON action, or `{ done: true, reason }`
   */
  async _planAction(dom, instruction, previousSteps, currentUrl) {
    const prompt = `You are a browser automation agent. Given the current page state and instruction, determine the next action.

Current URL: ${currentUrl}
Page title: ${dom.title}

Interactive elements on page:
${dom.elements.map(e => `[${e.idx}] <${e.tag}${e.type ? ` type="${e.type}"` : ''}${e.id ? ` id="${e.id}"` : ''}${e.name ? ` name="${e.name}"` : ''}> ${e.text || e.placeholder || e.ariaLabel || e.href || '(empty)'}`).join('\n')}

Instruction: ${instruction}

Previous steps: ${previousSteps.length > 0 ? JSON.stringify(previousSteps) : 'none'}

Respond with a JSON object:
- If the instruction is complete: {"done": true, "reason": "why it's done"}
- To click: {"action": "click", "elementIdx": 5, "reason": "clicking the login button"}
- To type: {"action": "type", "elementIdx": 3, "text": "hello", "reason": "filling search box"}
- To select: {"action": "select", "elementIdx": 7, "value": "option1", "reason": "selecting dropdown"}
- To press a key: {"action": "press", "key": "Enter", "reason": "submitting form"}
- To scroll: {"action": "scroll", "direction": "down", "reason": "loading more content"}

Only return valid JSON. No explanation.`

    return this._llmCall(prompt)
  }

  /**
   * Execute a planned action on the page.
   * @param {object} page - Playwright page
   * @param {object} action - action object as described in the planning prompt
   * @returns {Promise<string>} 'clicked' | 'typed' | 'selected' | 'pressed' | 'scrolled'
   * @throws {Error} on an unknown action or when the target element does not exist
   */
  async _executeAction(page, action) {
    switch (action.action) {
      case 'click': {
        const target = await this._resolveElement(page, action.elementIdx)
        await target.click({ timeout: 5000 })
        return 'clicked'
      }

      case 'type': {
        const target = await this._resolveElement(page, action.elementIdx)
        // Clear existing content first, then type with a randomised per-key delay
        await target.fill('')
        await target.type(action.text, { delay: 50 + Math.random() * 100 })
        return 'typed'
      }

      case 'select': {
        const target = await this._resolveElement(page, action.elementIdx)
        await target.selectOption(action.value)
        return 'selected'
      }

      case 'press':
        await page.keyboard.press(action.key)
        return 'pressed'

      case 'scroll':
        // Anything other than 'up' scrolls down
        await page.evaluate((dir) => {
          window.scrollBy(0, dir === 'up' ? -500 : 500)
        }, action.direction)
        return 'scrolled'

      default:
        throw new Error(`Unknown action: ${action.action}`)
    }
  }

  /**
   * Map an element index from the DOM snapshot back to a live element handle.
   *
   * The index is looked up in the UNFILTERED query result because that is how
   * _getSimplifiedDOM numbers elements. Filtering by visibility here would
   * shift every index after the first hidden element and target the wrong node.
   * NOTE: if the page mutates between snapshot and action the index can still
   * go stale; the caller re-snapshots before every step to keep that window small.
   * @throws {Error} when no element exists at that index
   */
  async _resolveElement(page, elementIdx) {
    const candidates = await page.$$(INTERACTIVE_SELECTOR)
    const target = candidates[elementIdx]
    if (!target) throw new Error(`Element [${elementIdx}] not found`)
    return target
  }

  /**
   * Send the prompt to the configured LLM and return its parsed JSON reply.
   * Gemini is preferred when a Gemini key is set; otherwise OpenAI is used.
   * @throws {Error} when no key is configured, the reply is empty, or it is not valid JSON
   */
  async _llmCall(prompt) {
    if (this.apiKey) {
      const url = `https://generativelanguage.googleapis.com/v1beta/models/${this.model}:generateContent?key=${this.apiKey}`
      const body = {
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { responseMimeType: 'application/json', temperature: 0.1 }
      }

      const response = await this._post(url, body)
      return this._parseLlmJson(response, response?.candidates?.[0]?.content?.parts?.[0]?.text)
    }

    if (this.openaiKey) {
      const url = 'https://api.openai.com/v1/chat/completions'
      const body = {
        model: 'gpt-4o-mini',
        messages: [{ role: 'user', content: prompt }],
        response_format: { type: 'json_object' },
        temperature: 0.1
      }
      const response = await this._post(url, body, { 'Authorization': `Bearer ${this.openaiKey}` })
      return this._parseLlmJson(response, response?.choices?.[0]?.message?.content)
    }

    throw new Error('No LLM API key configured')
  }

  /**
   * Parse the text payload of an LLM reply, failing with a readable message
   * (including the provider's error message when present) if there is none.
   */
  _parseLlmJson(response, text) {
    if (!text) {
      const apiError = response?.error?.message
      throw new Error(apiError ? `Empty LLM response: ${apiError}` : 'Empty LLM response')
    }
    return JSON.parse(text)
  }

  /**
   * POST a JSON body over HTTPS and resolve with the parsed JSON response.
   * The HTTP status is not inspected; error bodies are returned to the caller.
   * @param {string} url - absolute https URL
   * @param {object} body - JSON-serialisable payload
   * @param {object} [extraHeaders] - headers merged over the defaults
   * @returns {Promise<object>}
   */
  _post(url, body, extraHeaders = {}) {
    return new Promise((resolve, reject) => {
      const urlObj = new URL(url)
      const data = JSON.stringify(body)
      const opts = {
        hostname: urlObj.hostname,
        port: urlObj.port || undefined, // honour explicit ports; undefined -> default 443
        path: urlObj.pathname + urlObj.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(data),
          ...extraHeaders
        }
      }
      const req = https.request(opts, res => {
        // Decode as a stream so multi-byte characters split across chunks stay intact
        res.setEncoding('utf8')
        let responseData = ''
        res.on('data', chunk => { responseData += chunk })
        res.on('error', reject)
        res.on('end', () => {
          try { resolve(JSON.parse(responseData)) }
          catch (e) { reject(new Error(`Invalid JSON: ${responseData.slice(0, 200)}`)) }
        })
      })
      req.on('error', reject)
      // destroy() also emits 'error'; the second reject is a no-op on a settled promise
      req.setTimeout(30000, () => { req.destroy(); reject(new Error('LLM timeout')) })
      req.write(data)
      req.end()
    })
  }
}
|
|
294
|
+
|
|
295
|
+
module.exports = { AgentEngine }
|
package/src/browse/index.js
CHANGED
|
@@ -120,6 +120,40 @@ class BrowseEngine {
|
|
|
120
120
|
const context = await this._createContext(browser, opts)
|
|
121
121
|
const page = await context.newPage()
|
|
122
122
|
|
|
123
|
+
// Network request capturing
|
|
124
|
+
const networkRequests = []
|
|
125
|
+
if (opts.captureNetwork) {
|
|
126
|
+
page.on('request', req => {
|
|
127
|
+
const resourceType = req.resourceType()
|
|
128
|
+
if (['xhr', 'fetch'].includes(resourceType)) {
|
|
129
|
+
networkRequests.push({
|
|
130
|
+
url: req.url(),
|
|
131
|
+
method: req.method(),
|
|
132
|
+
resourceType,
|
|
133
|
+
headers: opts.captureNetworkHeaders ? req.headers() : undefined,
|
|
134
|
+
postData: req.postData() || undefined
|
|
135
|
+
})
|
|
136
|
+
}
|
|
137
|
+
})
|
|
138
|
+
page.on('response', async res => {
|
|
139
|
+
const req = res.request()
|
|
140
|
+
const resourceType = req.resourceType()
|
|
141
|
+
if (['xhr', 'fetch'].includes(resourceType)) {
|
|
142
|
+
const existing = networkRequests.find(r => r.url === req.url() && r.method === req.method())
|
|
143
|
+
if (existing) {
|
|
144
|
+
existing.status = res.status()
|
|
145
|
+
existing.contentType = res.headers()['content-type'] || null
|
|
146
|
+
if (opts.captureNetworkBody) {
|
|
147
|
+
try {
|
|
148
|
+
const body = await res.text().catch(() => null)
|
|
149
|
+
if (body && body.length < 50000) existing.body = body
|
|
150
|
+
} catch (e) { /* ignore */ }
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
})
|
|
155
|
+
}
|
|
156
|
+
|
|
123
157
|
try {
|
|
124
158
|
if (opts._cookies) {
|
|
125
159
|
await context.addCookies(opts._cookies)
|
|
@@ -164,9 +198,23 @@ class BrowseEngine {
|
|
|
164
198
|
|
|
165
199
|
result.url = page.url()
|
|
166
200
|
result.title = await page.title()
|
|
201
|
+
result.statusCode = null // playwright doesn't expose easily, but we detect blocks below
|
|
167
202
|
result.cached = false
|
|
168
203
|
result.engine = this._engine
|
|
169
204
|
|
|
205
|
+
// Attach captured network requests
|
|
206
|
+
if (opts.captureNetwork && networkRequests.length > 0) {
|
|
207
|
+
result.networkRequests = networkRequests
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// Detect block pages (Cloudflare, Akamai, etc.)
|
|
211
|
+
const blockInfo = detectBlockPage(result.content, result.title, result.html, result.url)
|
|
212
|
+
if (blockInfo) {
|
|
213
|
+
result.blocked = true
|
|
214
|
+
result.blockType = blockInfo.type
|
|
215
|
+
result.blockDetail = blockInfo.detail
|
|
216
|
+
}
|
|
217
|
+
|
|
170
218
|
if (!opts.screenshot) {
|
|
171
219
|
this.cache?.set('scrape', url, { content: result.content, url: result.url, title: result.title })
|
|
172
220
|
}
|
|
@@ -288,4 +336,81 @@ class BrowseEngine {
|
|
|
288
336
|
}
|
|
289
337
|
}
|
|
290
338
|
|
|
339
|
+
/**
 * Heuristically recognise block/challenge pages served by CDNs and bot
 * protection services, using case-insensitive substring checks.
 *
 * Detectors run in a fixed order and the first one that matches wins.
 *
 * @param {string} content - visible page text
 * @param {string} title - page title
 * @param {string} html - raw page HTML
 * @param {string} url - page URL (accepted for API symmetry; not inspected)
 * @returns {{type: string, detail: string}|null} match info, or null if the page looks clean
 */
function detectBlockPage(content, title, html, url) {
  const bodyText = (content || '').toLowerCase()
  const pageTitle = (title || '').toLowerCase()
  const markup = (html || '').toLowerCase()

  const inText = needle => bodyText.includes(needle)
  const inTitle = needle => pageTitle.includes(needle)
  const inHtml = needle => markup.includes(needle)

  // Ordered rule table: earlier entries take priority over later ones.
  const detectors = [
    {
      type: 'cloudflare',
      detail: 'Cloudflare bot challenge or block page detected',
      hit: () =>
        inHtml('cf-error-details') ||
        inHtml('cf_chl_opt') ||
        (inText('attention required') && inText('cloudflare')) ||
        inText('checking if the site connection is secure') ||
        (inTitle('just a moment') && inHtml('cloudflare')) ||
        (inText('ray id:') && inText('cloudflare'))
    },
    {
      // Cloudflare RFC 9457 structured error (new format)
      type: 'cloudflare-rfc9457',
      detail: 'Cloudflare structured error response (RFC 9457)',
      hit: () =>
        inHtml('application/problem+json') ||
        (inText('error 1') && inText('cloudflare')) ||
        (inHtml('"type":') && inHtml('cloudflare.com/errors/'))
    },
    {
      type: 'akamai',
      detail: 'Akamai bot detection triggered',
      hit: () =>
        (inText('access denied') && inHtml('akamai')) ||
        inHtml('akamaighost') ||
        (inText('reference #') && inText('access denied'))
    },
    {
      type: 'aws-waf',
      detail: 'AWS WAF blocked the request',
      hit: () => (inText('request blocked') && inHtml('aws')) || inHtml('awswaf')
    },
    {
      type: 'imperva',
      detail: 'Imperva/Incapsula bot detection triggered',
      hit: () =>
        inHtml('incapsula') ||
        inHtml('imperva') ||
        (inText('request unsuccessful') && inText('incapsula'))
    },
    {
      type: 'datadome',
      detail: 'DataDome bot detection triggered',
      hit: () => inHtml('datadome') || inHtml('dd.js')
    },
    {
      type: 'perimeterx',
      detail: 'PerimeterX/HUMAN bot detection triggered',
      hit: () => inHtml('perimeterx') || inHtml('px-captcha') || inHtml('human security')
    },
    {
      type: 'hcaptcha',
      detail: 'hCaptcha challenge page',
      hit: () => inHtml('hcaptcha.com') && inHtml('h-captcha')
    },
    {
      // Standalone reCAPTCHA page only: little text and an empty/suspicious title,
      // so normal pages that merely embed the widget are not flagged.
      type: 'recaptcha',
      detail: 'reCAPTCHA challenge page',
      hit: () =>
        inHtml('recaptcha') &&
        bodyText.length < 500 &&
        (pageTitle === '' || inTitle('blocked') || inTitle('verify'))
    },
    {
      // Generic signals are only trusted on very short pages
      type: 'generic',
      detail: 'Generic bot detection or access denied page',
      hit: () =>
        bodyText.length < 200 &&
        [
          'access denied', '403 forbidden',
          'bot detected', 'automated access',
          'please verify you are human', 'are you a robot'
        ].some(inText)
    }
  ]

  const matched = detectors.find(detector => detector.hit())
  return matched ? { type: matched.type, detail: matched.detail } : null
}
|
|
415
|
+
|
|
291
416
|
module.exports = { BrowseEngine }
|
package/src/crawl.js
CHANGED
|
@@ -90,6 +90,29 @@ class CrawlEngine {
|
|
|
90
90
|
const failed = []
|
|
91
91
|
let activeCount = 0
|
|
92
92
|
|
|
93
|
+
// Sitemap-based crawling: pre-seed queue with sitemap URLs
|
|
94
|
+
if (config.useSitemap !== false) {
|
|
95
|
+
try {
|
|
96
|
+
const sitemapUrls = await fetchSitemap(startUrl)
|
|
97
|
+
if (sitemapUrls.length > 0) {
|
|
98
|
+
for (const sUrl of sitemapUrls.slice(0, config.maxPages)) {
|
|
99
|
+
const norm = normalizeUrl(sUrl)
|
|
100
|
+
if (!visited.has(norm) && this._inScope(sUrl, baseDomain, basePrefix, config.scope)) {
|
|
101
|
+
if (!config.skipPatterns.some(p => p.test(sUrl))) {
|
|
102
|
+
if (this._matchesFilters(sUrl, config.includePatterns, config.excludePatterns)) {
|
|
103
|
+
visited.add(norm)
|
|
104
|
+
queue.push({ url: sUrl, depth: 0 })
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
console.log(`[crawl] Pre-seeded ${Math.min(sitemapUrls.length, config.maxPages)} URLs from sitemap`)
|
|
110
|
+
}
|
|
111
|
+
} catch (e) {
|
|
112
|
+
console.log(`[crawl] Sitemap fetch failed, continuing with link discovery`)
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
93
116
|
// Process queue with concurrency control
|
|
94
117
|
const processUrl = async (item) => {
|
|
95
118
|
const { url, depth } = item
|
|
@@ -177,6 +200,14 @@ class CrawlEngine {
|
|
|
177
200
|
}).join('\n\n---\n\n')
|
|
178
201
|
}
|
|
179
202
|
|
|
203
|
+
// Webhook notification
|
|
204
|
+
if (config.webhook) {
|
|
205
|
+
sendWebhook(config.webhook, {
|
|
206
|
+
event: 'crawl_complete',
|
|
207
|
+
...result
|
|
208
|
+
}).catch(err => console.error('[crawl] Webhook failed:', err.message))
|
|
209
|
+
}
|
|
210
|
+
|
|
180
211
|
return result
|
|
181
212
|
}
|
|
182
213
|
|
|
@@ -276,15 +307,32 @@ class CrawlEngine {
|
|
|
276
307
|
|
|
277
308
|
async _fetchPage(url, config, cookies) {
|
|
278
309
|
try {
|
|
279
|
-
const
|
|
310
|
+
const browseOpts = {
|
|
280
311
|
stealth: config.stealth,
|
|
281
312
|
_cookies: cookies,
|
|
282
313
|
timeout: config.timeout,
|
|
283
314
|
html: true,
|
|
284
315
|
noCache: true,
|
|
285
|
-
fastMode: true
|
|
286
|
-
}
|
|
316
|
+
fastMode: true
|
|
317
|
+
}
|
|
318
|
+
let result = await this.browseEngine.browse(url, browseOpts)
|
|
319
|
+
|
|
320
|
+
// Auto-retry with full stealth if blocked
|
|
321
|
+
if (result?.blocked && browseOpts.fastMode) {
|
|
322
|
+
console.log(`[crawl] Block detected on ${url} (${result.blockType}) — retrying with full stealth`)
|
|
323
|
+
result = await this.browseEngine.browse(url, {
|
|
324
|
+
...browseOpts,
|
|
325
|
+
fastMode: false,
|
|
326
|
+
stealth: true,
|
|
327
|
+
camoufox: true
|
|
328
|
+
})
|
|
329
|
+
}
|
|
330
|
+
|
|
287
331
|
if (result?.content) {
|
|
332
|
+
// Skip if still blocked after retry
|
|
333
|
+
if (result.blocked) {
|
|
334
|
+
throw new Error(`Blocked by ${result.blockType}: ${result.blockDetail}`)
|
|
335
|
+
}
|
|
288
336
|
const linkSource = result.html || result.content
|
|
289
337
|
return {
|
|
290
338
|
title: result.title || '',
|
|
@@ -354,6 +402,93 @@ function resolveUrl(url, base) {
|
|
|
354
402
|
}
|
|
355
403
|
}
|
|
356
404
|
|
|
405
|
+
// The five predefined XML entities; sitemap <loc> values must be entity-escaped.
const SITEMAP_XML_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" }

/**
 * Pull every <loc> value out of a sitemap document as a usable URL string:
 * unwraps CDATA sections, decodes XML entities (e.g. `&amp;` -> `&`) and
 * drops empty entries.
 * @param {string} xml - sitemap or sitemap-index XML
 * @returns {string[]} URLs in document order
 */
function extractSitemapLocs(xml) {
  const locs = []
  for (const match of xml.matchAll(/<loc>\s*(.*?)\s*<\/loc>/gi)) {
    const raw = match[1].trim()
    const cdata = /^<!\[CDATA\[(.*)\]\]>$/.exec(raw)
    // CDATA content is literal; everything else is entity-escaped.
    // Single-pass replace so "&amp;lt;" decodes to "&lt;" and not "<".
    const loc = cdata
      ? cdata[1].trim()
      : raw.replace(/&(amp|lt|gt|quot|apos);/g, (_, name) => SITEMAP_XML_ENTITIES[name])
    if (loc) locs.push(loc)
  }
  return locs
}

/**
 * Fetch and parse sitemap.xml for a domain.
 *
 * Tries `/sitemap.xml`, `/sitemap_index.xml` and `/sitemap/sitemap.xml` on the
 * origin of `startUrl`, in that order, and returns the URLs of the first one
 * that yields any. Entries that look like nested sitemaps (ending in `.xml` or
 * containing "sitemap") are fetched one level deep; nested sitemaps that fail
 * to load are skipped.
 *
 * @param {string} startUrl - any absolute URL on the target site
 * @param {function(string, number): Promise<string|null>} [fetcher] - text fetcher
 *   `(url, timeoutMs)`; defaults to this module's fetchText. Useful for tests
 *   or custom transports.
 * @returns {Promise<string[]>} page URLs found, or [] when no sitemap is usable
 * @throws {TypeError} if `startUrl` is not a valid absolute URL
 */
async function fetchSitemap(startUrl, fetcher = fetchText) {
  const { origin } = new URL(startUrl)
  const candidates = [
    `${origin}/sitemap.xml`,
    `${origin}/sitemap_index.xml`,
    `${origin}/sitemap/sitemap.xml`
  ]

  for (const sitemapUrl of candidates) {
    try {
      const xml = await fetcher(sitemapUrl, 5000)
      // Ignore missing files and non-sitemap responses (e.g. an HTML 404 page served as 200)
      if (!xml || (!xml.includes('<url') && !xml.includes('<sitemap'))) continue

      const urls = []
      for (const loc of extractSitemapLocs(xml)) {
        // If it's a sitemap index entry, fetch the nested sitemap
        if (loc.endsWith('.xml') || loc.includes('sitemap')) {
          try {
            const subXml = await fetcher(loc, 5000)
            if (subXml) {
              for (const subLoc of extractSitemapLocs(subXml)) {
                // Only one level of nesting is followed; deeper sitemaps are ignored
                if (!subLoc.endsWith('.xml')) urls.push(subLoc)
              }
            }
          } catch (e) { /* skip failed sub-sitemaps */ }
        } else {
          urls.push(loc)
        }
      }

      if (urls.length > 0) {
        console.log(`[crawl] Found sitemap at ${sitemapUrl} with ${urls.length} URLs`)
        return urls
      }
    } catch (e) { /* try next candidate location */ }
  }
  return []
}
|
|
452
|
+
|
|
453
|
+
/**
 * GET a URL and resolve with its body as text.
 *
 * Best-effort helper: any non-200 status, network error or timeout resolves
 * to `null` instead of rejecting. Redirects are not followed.
 *
 * @param {string} url - absolute http(s) URL
 * @param {number} [timeout=5000] - socket timeout in milliseconds
 * @returns {Promise<string|null>} response body, or null on any failure
 */
function fetchText(url, timeout = 5000) {
  const h = url.startsWith('https') ? require('https') : require('http')
  return new Promise((resolve) => {
    const req = h.get(url, { timeout, headers: { 'User-Agent': 'Spectrawl/1.0 (sitemap crawler)' } }, res => {
      if (res.statusCode !== 200) {
        // Drain the unread body so the socket is released instead of lingering
        res.resume()
        return resolve(null)
      }
      // Decode as a stream so multi-byte characters split across chunks stay intact
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('end', () => resolve(data))
      // A connection dropped mid-body must not leave the promise pending
      res.on('error', () => resolve(null))
    })
    req.on('error', () => resolve(null))
    // destroy() also emits 'error'; the extra resolve is a no-op on a settled promise
    req.setTimeout(timeout, () => { req.destroy(); resolve(null) })
  })
}
|
|
466
|
+
|
|
467
|
+
/**
 * POST a JSON payload to a webhook endpoint.
 *
 * Delivery is best effort: the promise resolves `true` once any HTTP response
 * has been fully received (the status code is not inspected) and `false` on a
 * network error or after a 10 second timeout. It only rejects when the
 * arguments themselves are unusable (e.g. an unparsable URL).
 *
 * @param {string} webhookUrl - absolute http(s) URL to notify
 * @param {object} data - JSON-serialisable payload
 * @returns {Promise<boolean>} whether a response was received
 */
async function sendWebhook(webhookUrl, data) {
  const useTls = webhookUrl.startsWith('https')
  const transport = useTls ? require('https') : require('http')
  const payload = JSON.stringify(data)
  const { hostname, port, pathname, search } = new URL(webhookUrl)

  const requestOptions = {
    hostname,
    port,
    path: pathname + search,
    method: 'POST',
    headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) }
  }

  return new Promise((resolve) => {
    const onResponse = (res) => {
      // The body is irrelevant, but it has to be consumed for 'end' to fire
      res.on('data', () => {})
      res.on('end', () => resolve(true))
    }

    const req = transport.request(requestOptions, onResponse)
    req.on('error', () => resolve(false))
    req.setTimeout(10000, () => {
      req.destroy()
      resolve(false)
    })
    req.write(payload)
    req.end()
  })
}
|
|
491
|
+
|
|
357
492
|
function normalizeUrl(url) {
|
|
358
493
|
try {
|
|
359
494
|
const u = new URL(url)
|