spectrawl 0.4.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +578 -67
- package/package.json +1 -1
- package/src/agent.js +295 -0
- package/src/browse/index.js +125 -0
- package/src/crawl.js +138 -3
- package/src/extract.js +314 -0
- package/src/index.js +35 -0
- package/src/server.js +69 -12
package/src/extract.js
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Spectrawl Extract Engine
|
|
3
|
+
* Structured data extraction from web pages using LLM + optional CSS/XPath selectors.
|
|
4
|
+
* Inspired by Stagehand's extract() but self-hosted and integrated with Spectrawl's browse engine.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
const https = require('https')
|
|
8
|
+
const http = require('http')
|
|
9
|
+
|
|
10
|
+
/**
 * Baseline options applied to every extract() call; callers override per call.
 * Frozen because it is a shared module-level constant that is only ever spread
 * ({ ...DEFAULT_OPTS, ...opts }) — freezing prevents accidental mutation.
 */
const DEFAULT_OPTS = Object.freeze({
  model: 'gemini-2.0-flash', // default LLM model for extraction
  timeout: 30000,            // page-browse timeout in ms
  selector: null,            // CSS or XPath selector to narrow extraction scope
  instruction: null,         // natural language instruction
  schema: null,              // JSON Schema for structured output
  relevanceFilter: false     // BM25-style relevance filtering
})
|
|
18
|
+
|
|
19
|
+
class ExtractEngine {
  /**
   * @param {object} browseEngine - engine exposing browse(url, opts)
   * @param {object} config - { apiKey, openaiKey, model }
   */
  constructor(browseEngine, config = {}) {
    this.browseEngine = browseEngine
    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
    this.openaiKey = config.openaiKey || process.env.OPENAI_API_KEY
    this.model = config.model || DEFAULT_OPTS.model
  }

  /**
   * Extract structured data from a URL.
   * @param {string} url - URL to extract from
   * @param {object} opts - extraction options
   * @param {string} opts.instruction - what to extract (natural language)
   * @param {object} opts.schema - JSON Schema for the output structure
   * @param {string} opts.selector - CSS/XPath selector to narrow scope
   * @param {boolean} opts.relevanceFilter - filter content by relevance to instruction
   * @param {string} opts.model - LLM model to use (falls back to the engine default)
   * @returns {Promise<{data, url, title, contentLength, duration}>}
   */
  async extract(url, opts = {}) {
    const config = { ...DEFAULT_OPTS, ...opts }
    const startTime = Date.now()

    // Step 1: Browse the page. Raw HTML is only needed when a selector narrows scope.
    const page = await this.browseEngine.browse(url, {
      html: !!config.selector,
      timeout: config.timeout
    })

    let content = page.content || ''

    // Step 2: If selector provided, narrow the content; fall back to the full
    // page content when the selector matches nothing useful.
    if (config.selector && page.html) {
      const narrowed = this._extractBySelector(page.html, config.selector)
      content = (narrowed && narrowed.length >= 10) ? narrowed : page.content
    }

    // Step 3: If relevance filter, keep only the sections most relevant to the instruction.
    if (config.relevanceFilter && config.instruction) {
      content = this._filterByRelevance(content, config.instruction)
    }

    // Step 4: Extract with LLM. Fix: honor opts.model for this call
    // (previously the documented opts.model was ignored and this.model was always used).
    const extracted = await this._llmExtract(content, config.instruction, config.schema, opts.model || this.model)

    return {
      data: extracted,
      url: page.url,
      title: page.title,
      contentLength: content.length,
      duration: Date.now() - startTime
    }
  }

  /**
   * Extract from already-fetched content (no browsing needed).
   * @param {string} content - page content (markdown or text)
   * @param {object} opts - { instruction, schema, relevanceFilter, model }
   * @returns {Promise<{data}>}
   */
  async extractFromContent(content, opts = {}) {
    const config = { ...DEFAULT_OPTS, ...opts }

    if (config.relevanceFilter && config.instruction) {
      content = this._filterByRelevance(content, config.instruction)
    }

    const extracted = await this._llmExtract(content, config.instruction, config.schema, opts.model || this.model)
    return { data: extracted }
  }

  /**
   * Extract content matching a CSS or XPath selector from HTML.
   * XPath selectors are recognized by an `xpath=` prefix or a leading `//`.
   * Returns markdown text, or null when nothing matched.
   */
  _extractBySelector(html, selector) {
    // XPath — extract using regex patterns for common XPath expressions
    if (selector.startsWith('xpath=') || selector.startsWith('//')) {
      const xpath = selector.replace('xpath=', '')
      return this._extractByXPath(html, xpath)
    }

    // CSS selector — use tag/class/id matching
    return this._extractByCSS(html, selector)
  }

  /**
   * Regex-based CSS selector extraction. Handles the common cases:
   * tag, .class, #id, tag.class, tag#id. This is approximate — a regex cannot
   * fully parse HTML — but good enough to narrow extraction scope.
   */
  _extractByCSS(html, selector) {
    let pattern

    if (selector.startsWith('#')) {
      // ID selector
      const id = selector.slice(1)
      pattern = new RegExp(`<[^>]+id=["']${id}["'][^>]*>[\\s\\S]*?(?=<\\/[^>]+>\\s*$)`, 'i')
    } else if (selector.startsWith('.')) {
      // Class selector
      const cls = selector.slice(1)
      pattern = new RegExp(`<[^>]+class=["'][^"']*\\b${cls}\\b[^"']*["'][^>]*>[\\s\\S]*?<\\/`, 'i')
    } else {
      // Tag selector (with optional class/id)
      const parts = selector.split(/([.#])/)
      const tag = parts[0] || 'div'
      pattern = new RegExp(`<${tag}[^>]*>[\\s\\S]*?<\\/${tag}>`, 'gi')
    }

    const matches = html.match(pattern)
    if (!matches) return null

    // Strip HTML tags and return text
    const { extractMarkdown } = require('./search/scraper')
    return extractMarkdown(matches.join('\n'))
  }

  /**
   * Handle common XPath patterns by converting to regex:
   *   //table                   → all <table>...</table>
   *   //div[@class="content"]   → divs whose class contains "content"
   */
  _extractByXPath(html, xpath) {
    const tagMatch = xpath.match(/\/\/(\w+)(?:\[@(\w+)=["']([^"']+)["']\])?/)
    if (!tagMatch) return null

    const [, tag, attr, val] = tagMatch
    let pattern
    if (attr && val) {
      pattern = new RegExp(`<${tag}[^>]*${attr}=["'][^"']*${val}[^"']*["'][^>]*>[\\s\\S]*?<\\/${tag}>`, 'gi')
    } else {
      pattern = new RegExp(`<${tag}[^>]*>[\\s\\S]*?<\\/${tag}>`, 'gi')
    }

    const matches = html.match(pattern)
    if (!matches) return null

    const { extractMarkdown } = require('./search/scraper')
    return extractMarkdown(matches.join('\n'))
  }

  /**
   * BM25-inspired relevance filtering.
   * Splits content into sections, scores each against the query, returns the
   * topK highest-scoring sections in original document order.
   */
  _filterByRelevance(content, query, topK = 5) {
    const queryTerms = this._tokenize(query)
    if (queryTerms.length === 0) return content

    // Split content into sections (by headings or blank-line paragraph breaks)
    const sections = content.split(/\n(?=#{1,6}\s)|(?:\n\n)/).filter(s => s.trim().length > 20)
    if (sections.length <= topK) return content

    // Tokenize each section exactly once. Previously sections were retokenized
    // inside the innermost scoring loop, making this accidentally O(n² · m).
    const sectionTokens = sections.map(s => this._tokenize(s))

    // Document frequency for IDF
    const df = {}
    for (const tokens of sectionTokens) {
      for (const term of new Set(tokens)) {
        df[term] = (df[term] || 0) + 1
      }
    }

    // BM25 constants and average section length (loop-invariant, computed once)
    const k1 = 1.2
    const b = 0.75
    const avgDl = sectionTokens.reduce((a, t) => a + t.length, 0) / sections.length

    // Score each section
    const scored = sections.map((section, i) => {
      const tokens = sectionTokens[i]
      const tf = {}
      for (const term of tokens) {
        tf[term] = (tf[term] || 0) + 1
      }

      let score = 0
      for (const queryTerm of queryTerms) {
        const termFreq = tf[queryTerm] || 0
        const docFreq = df[queryTerm] || 1
        const idf = Math.log((sections.length - docFreq + 0.5) / (docFreq + 0.5) + 1)
        // BM25 formula
        score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * tokens.length / avgDl)))
      }
      return { section, score }
    })

    // Return top sections in original order
    scored.sort((x, y) => y.score - x.score)
    const topSections = new Set(scored.slice(0, topK).map(s => s.section))
    return sections.filter(s => topSections.has(s)).join('\n\n')
  }

  // Lowercase word tokens of length > 2; punctuation treated as whitespace.
  _tokenize(text) {
    return text.toLowerCase()
      .replace(/[^\w\s]/g, ' ')
      .split(/\s+/)
      .filter(t => t.length > 2)
  }

  /**
   * Use LLM to extract structured data from content.
   * Tries Gemini first (when GEMINI_API_KEY is set), then OpenAI.
   * @param {string} content - page content (truncated to ~30k chars)
   * @param {string|null} instruction - natural language instruction
   * @param {object|null} schema - JSON Schema for the output
   * @param {string} model - model name for the Gemini path
   * @throws {Error} when no API key is configured
   */
  async _llmExtract(content, instruction, schema, model = this.model) {
    // Truncate content to avoid token limits
    const maxChars = 30000
    if (content.length > maxChars) {
      content = content.slice(0, maxChars) + '\n...(truncated)'
    }

    const systemPrompt = `You are a data extraction assistant. Extract structured data from the provided web page content.
${schema ? `Return a valid JSON object matching this schema:\n${JSON.stringify(schema, null, 2)}` : 'Return the extracted data as a JSON object.'}
Only return valid JSON. No markdown code fences. No explanation.`

    const userPrompt = instruction
      ? `${instruction}\n\nPage content:\n${content}`
      : `Extract the key information from this page:\n${content}`

    // Try Gemini first (free), fallback to OpenAI
    if (this.apiKey) {
      return this._geminiExtract(systemPrompt, userPrompt, model)
    } else if (this.openaiKey) {
      return this._openaiExtract(systemPrompt, userPrompt)
    } else {
      throw new Error('No LLM API key configured. Set GEMINI_API_KEY or OPENAI_API_KEY.')
    }
  }

  /**
   * Call the Gemini generateContent REST API and parse the JSON result.
   * @param {string} model - Gemini model name used in the endpoint path
   */
  async _geminiExtract(systemPrompt, userPrompt, model = this.model) {
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`

    const body = {
      contents: [{ parts: [{ text: `${systemPrompt}\n\n${userPrompt}` }] }],
      generationConfig: {
        responseMimeType: 'application/json',
        temperature: 0.1
      }
    }

    const response = await this._post(url, body)
    const text = response?.candidates?.[0]?.content?.parts?.[0]?.text
    if (!text) throw new Error('Empty response from Gemini')

    try {
      return JSON.parse(text)
    } catch (e) {
      // Model sometimes wraps JSON in prose — try to salvage the JSON payload
      const jsonMatch = text.match(/\{[\s\S]*\}|\[[\s\S]*\]/)
      if (jsonMatch) return JSON.parse(jsonMatch[0])
      throw new Error(`Failed to parse extraction result: ${text.slice(0, 200)}`)
    }
  }

  /**
   * Call the OpenAI chat completions API in JSON mode and parse the result.
   */
  async _openaiExtract(systemPrompt, userPrompt) {
    const url = 'https://api.openai.com/v1/chat/completions'
    const body = {
      model: 'gpt-4o-mini',
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userPrompt }
      ],
      response_format: { type: 'json_object' },
      temperature: 0.1
    }

    const response = await this._post(url, body, {
      'Authorization': `Bearer ${this.openaiKey}`
    })
    const text = response?.choices?.[0]?.message?.content
    if (!text) throw new Error('Empty response from OpenAI')
    return JSON.parse(text)
  }

  /**
   * POST a JSON body and resolve with the parsed JSON response.
   * Fix: select the transport by URL protocol (previously https was always
   * used, so http:// URLs failed; the file's `http` import was never used)
   * and forward an explicit port from the URL.
   */
  _post(url, body, extraHeaders = {}) {
    return new Promise((resolve, reject) => {
      const urlObj = new URL(url)
      const data = JSON.stringify(body)
      const opts = {
        hostname: urlObj.hostname,
        port: urlObj.port || undefined,
        path: urlObj.pathname + urlObj.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(data),
          ...extraHeaders
        }
      }

      const transport = urlObj.protocol === 'http:' ? http : https
      const req = transport.request(opts, res => {
        let responseData = ''
        res.on('data', chunk => responseData += chunk)
        res.on('end', () => {
          try { resolve(JSON.parse(responseData)) }
          catch (e) { reject(new Error(`Invalid JSON response: ${responseData.slice(0, 200)}`)) }
        })
      })
      req.on('error', reject)
      req.setTimeout(30000, () => { req.destroy(); reject(new Error('LLM request timeout')) })
      req.write(data)
      req.end()
    })
  }
}
|
|
313
|
+
|
|
314
|
+
// Public API of this module: the LLM-backed structured-extraction engine.
module.exports = { ExtractEngine }
|
package/src/index.js
CHANGED
|
@@ -8,6 +8,8 @@ const { BrowseEngine } = require('./browse')
|
|
|
8
8
|
const { AuthManager } = require('./auth')
|
|
9
9
|
const { ActEngine } = require('./act')
|
|
10
10
|
const { CrawlEngine } = require('./crawl')
|
|
11
|
+
const { ExtractEngine } = require('./extract')
|
|
12
|
+
const { AgentEngine } = require('./agent')
|
|
11
13
|
const { Cache } = require('./cache')
|
|
12
14
|
const { EventEmitter, EVENTS } = require('./events')
|
|
13
15
|
const { CookieRefresher } = require('./auth/refresh')
|
|
@@ -38,6 +40,8 @@ class Spectrawl {
|
|
|
38
40
|
this.auth = new AuthManager(this.config.auth)
|
|
39
41
|
this.actEngine = new ActEngine(this.config, this.auth, this.browseEngine)
|
|
40
42
|
this.crawlEngine = new CrawlEngine(this.browseEngine, this.cache)
|
|
43
|
+
this.extractEngine = new ExtractEngine(this.browseEngine, this.config.search)
|
|
44
|
+
this.agentEngine = new AgentEngine(this.browseEngine, this.config.search)
|
|
41
45
|
this.refresher = new CookieRefresher(this.auth, this.events, this.config.auth)
|
|
42
46
|
}
|
|
43
47
|
|
|
@@ -113,6 +117,37 @@ class Spectrawl {
|
|
|
113
117
|
return this.crawlEngine.listJobs()
|
|
114
118
|
}
|
|
115
119
|
|
|
120
|
+
/**
|
|
121
|
+
* Extract structured data from a URL using LLM.
|
|
122
|
+
* @param {string} url - URL to extract from
|
|
123
|
+
* @param {object} opts - { instruction, schema, selector, relevanceFilter, model }
|
|
124
|
+
* @returns {Promise<{data, url, title, contentLength, duration}>}
|
|
125
|
+
*/
|
|
126
|
+
async extract(url, opts = {}) {
|
|
127
|
+
return this.extractEngine.extract(url, opts)
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
/**
|
|
131
|
+
* Extract from already-fetched content (no browsing).
|
|
132
|
+
* @param {string} content - Page content
|
|
133
|
+
* @param {object} opts - { instruction, schema, relevanceFilter }
|
|
134
|
+
* @returns {Promise<{data}>}
|
|
135
|
+
*/
|
|
136
|
+
async extractFromContent(content, opts = {}) {
|
|
137
|
+
return this.extractEngine.extractFromContent(content, opts)
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
/**
|
|
141
|
+
* Execute natural language browser actions.
|
|
142
|
+
* @param {string} url - URL to navigate to
|
|
143
|
+
* @param {string} instruction - what to do (e.g. "click the login button")
|
|
144
|
+
* @param {object} opts - { maxSteps, screenshot, timeout }
|
|
145
|
+
* @returns {Promise<{success, url, title, steps, content, screenshot?, duration}>}
|
|
146
|
+
*/
|
|
147
|
+
async agent(url, instruction, opts = {}) {
|
|
148
|
+
return this.agentEngine.act(url, instruction, opts)
|
|
149
|
+
}
|
|
150
|
+
|
|
116
151
|
/**
|
|
117
152
|
* Perform an authenticated action on a platform.
|
|
118
153
|
* @param {string} platform - Platform name (x, reddit, devto, etc.)
|
package/src/server.js
CHANGED
|
@@ -40,10 +40,12 @@ const server = http.createServer(async (req, res) => {
|
|
|
40
40
|
|
|
41
41
|
if (req.method === 'POST' && path === '/browse') {
|
|
42
42
|
const body = await readBody(req)
|
|
43
|
-
const { url: targetUrl, auth, screenshot, html, stealth
|
|
43
|
+
const { url: targetUrl, auth, screenshot, fullPage, html, stealth, camoufox, noCache,
|
|
44
|
+
captureNetwork, captureNetworkHeaders, captureNetworkBody } = body
|
|
44
45
|
if (!targetUrl) return error(res, 400, 'url is required')
|
|
45
46
|
|
|
46
|
-
const result = await spectrawl.browse(targetUrl, { auth, screenshot, html, stealth
|
|
47
|
+
const result = await spectrawl.browse(targetUrl, { auth, screenshot, fullPage, html, stealth,
|
|
48
|
+
camoufox, noCache, captureNetwork, captureNetworkHeaders, captureNetworkBody })
|
|
47
49
|
|
|
48
50
|
// If screenshot, return as base64
|
|
49
51
|
if (result.screenshot) {
|
|
@@ -55,10 +57,11 @@ const server = http.createServer(async (req, res) => {
|
|
|
55
57
|
if (req.method === 'POST' && path === '/crawl') {
|
|
56
58
|
const body = await readBody(req)
|
|
57
59
|
const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
|
|
58
|
-
includePatterns, excludePatterns, merge, async: asyncMode, concurrency
|
|
60
|
+
includePatterns, excludePatterns, merge, async: asyncMode, concurrency,
|
|
61
|
+
useSitemap, webhook } = body
|
|
59
62
|
if (!targetUrl) return error(res, 400, 'url is required')
|
|
60
63
|
|
|
61
|
-
const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency }
|
|
64
|
+
const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency, useSitemap, webhook }
|
|
62
65
|
|
|
63
66
|
if (asyncMode) {
|
|
64
67
|
// Async mode: return job ID immediately
|
|
@@ -88,6 +91,28 @@ const server = http.createServer(async (req, res) => {
|
|
|
88
91
|
return json(res, job)
|
|
89
92
|
}
|
|
90
93
|
|
|
94
|
+
if (req.method === 'POST' && path === '/extract') {
|
|
95
|
+
const body = await readBody(req)
|
|
96
|
+
const { url: targetUrl, instruction, schema, selector, relevanceFilter, model } = body
|
|
97
|
+
if (!targetUrl) return error(res, 400, 'url is required')
|
|
98
|
+
|
|
99
|
+
const result = await spectrawl.extract(targetUrl, { instruction, schema, selector, relevanceFilter, model })
|
|
100
|
+
return json(res, result)
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
if (req.method === 'POST' && path === '/agent') {
|
|
104
|
+
const body = await readBody(req)
|
|
105
|
+
const { url: targetUrl, instruction, maxSteps, screenshot, timeout } = body
|
|
106
|
+
if (!targetUrl) return error(res, 400, 'url is required')
|
|
107
|
+
if (!instruction) return error(res, 400, 'instruction is required')
|
|
108
|
+
|
|
109
|
+
const result = await spectrawl.agent(targetUrl, instruction, { maxSteps, screenshot, timeout })
|
|
110
|
+
if (result.screenshot) {
|
|
111
|
+
result.screenshot = result.screenshot.toString('base64')
|
|
112
|
+
}
|
|
113
|
+
return json(res, result)
|
|
114
|
+
}
|
|
115
|
+
|
|
91
116
|
if (req.method === 'POST' && path === '/act') {
|
|
92
117
|
const body = await readBody(req)
|
|
93
118
|
const { platform, action, ...params } = body
|
|
@@ -159,7 +184,15 @@ const server = http.createServer(async (req, res) => {
|
|
|
159
184
|
return error(res, 404, 'Not found')
|
|
160
185
|
} catch (err) {
|
|
161
186
|
console.error('Server error:', err)
|
|
162
|
-
|
|
187
|
+
const status = err.statusCode || 500
|
|
188
|
+
const extra = {}
|
|
189
|
+
if (err.retryable) extra.retryable = true
|
|
190
|
+
if (err.suggestion) extra.suggestion = err.suggestion
|
|
191
|
+
if (err.blocked) {
|
|
192
|
+
extra.retryable = true
|
|
193
|
+
extra.suggestion = 'Retry with stealth:true or use Camoufox engine'
|
|
194
|
+
}
|
|
195
|
+
return error(res, status, err.message, extra)
|
|
163
196
|
}
|
|
164
197
|
})
|
|
165
198
|
|
|
@@ -168,8 +201,29 @@ function json(res, data, status = 200) {
|
|
|
168
201
|
res.end(JSON.stringify(data))
|
|
169
202
|
}
|
|
170
203
|
|
|
171
|
-
|
|
172
|
-
|
|
204
|
+
/**
 * RFC 9457-style structured error responses.
 * Machine-readable for AI agents consuming our API.
 * @param {object} res - HTTP response object, forwarded to json()
 * @param {number} status - HTTP status code
 * @param {string} message - human-readable detail text
 * @param {object} extra - additional fields merged into the body (may override defaults)
 */
function error(res, status, message, extra = {}) {
  // Known status codes and their problem-type slugs
  const errorTypes = {
    400: 'bad-request',
    401: 'unauthorized',
    403: 'forbidden',
    404: 'not-found',
    429: 'rate-limited',
    500: 'internal-error',
    502: 'upstream-error',
    503: 'service-unavailable'
  }
  const slug = errorTypes[status]
  const payload = {
    type: `https://spectrawl.dev/errors/${slug || 'unknown'}`,
    status,
    title: slug ? slug.replace(/-/g, ' ') : 'error',
    detail: message,
    ...extra
  }
  json(res, payload, status)
}
|
|
174
228
|
|
|
175
229
|
function readBody(req) {
|
|
@@ -187,11 +241,14 @@ function readBody(req) {
|
|
|
187
241
|
const port = config.port || 3900
|
|
188
242
|
server.listen(port, () => {
|
|
189
243
|
console.log(`🌐 Spectrawl server running on http://localhost:${port}`)
|
|
190
|
-
console.log(` POST /search
|
|
191
|
-
console.log(` POST /browse
|
|
192
|
-
console.log(` POST /
|
|
193
|
-
console.log(`
|
|
194
|
-
console.log(`
|
|
244
|
+
console.log(` POST /search — search the web`)
|
|
245
|
+
console.log(` POST /browse — stealth browse`)
|
|
246
|
+
console.log(` POST /crawl — crawl websites`)
|
|
247
|
+
console.log(` POST /extract — structured data extraction`)
|
|
248
|
+
console.log(` POST /agent — natural language browser actions`)
|
|
249
|
+
console.log(` POST /act — platform actions`)
|
|
250
|
+
console.log(` GET /status — auth health`)
|
|
251
|
+
console.log(` GET /health — server health`)
|
|
195
252
|
})
|
|
196
253
|
|
|
197
254
|
// Graceful shutdown
|