spectrawl 0.4.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/extract.js ADDED
@@ -0,0 +1,314 @@
1
+ /**
2
+ * Spectrawl Extract Engine
3
+ * Structured data extraction from web pages using LLM + optional CSS/XPath selectors.
4
+ * Inspired by Stagehand's extract() but self-hosted and integrated with Spectrawl's browse engine.
5
+ */
6
+
7
+ const https = require('https')
8
+ const http = require('http')
9
+
10
// Defaults merged under caller-supplied opts by extract()/extractFromContent();
// any key present in opts overrides the corresponding default.
const DEFAULT_OPTS = {
  model: 'gemini-2.0-flash', // default Gemini model used for extraction
  timeout: 30000, // page-load timeout in milliseconds
  selector: null, // CSS or XPath selector to narrow extraction scope
  instruction: null, // natural language instruction
  schema: null, // JSON Schema for structured output
  relevanceFilter: false // BM25-style relevance filtering
}
18
+
19
class ExtractEngine {
  /**
   * @param {object} browseEngine - engine exposing browse(url, opts) → { content, html, url, title }
   * @param {object} [config] - { apiKey, openaiKey, model }; API keys fall back to the
   *   GEMINI_API_KEY / OPENAI_API_KEY environment variables
   */
  constructor(browseEngine, config = {}) {
    this.browseEngine = browseEngine
    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
    this.openaiKey = config.openaiKey || process.env.OPENAI_API_KEY
    this.model = config.model || DEFAULT_OPTS.model
  }

  /**
   * Extract structured data from a URL.
   * @param {string} url - URL to extract from
   * @param {object} opts - extraction options
   * @param {string} opts.instruction - what to extract (natural language)
   * @param {object} opts.schema - JSON Schema for the output structure
   * @param {string} opts.selector - CSS/XPath selector to narrow scope
   * @param {boolean} opts.relevanceFilter - filter content by relevance to instruction
   * @param {string} opts.model - LLM model to use
   * @returns {Promise<{data: object, url: string, title: string, contentLength: number, duration: number}>}
   */
  async extract(url, opts = {}) {
    const config = { ...DEFAULT_OPTS, ...opts }
    const startTime = Date.now()

    // Step 1: Browse the page. Raw HTML is only requested when a selector
    // needs it for scope narrowing.
    const page = await this.browseEngine.browse(url, {
      html: !!config.selector,
      timeout: config.timeout
    })

    let content = page.content || ''

    // Step 2: If a selector is provided, narrow the content. Fall back to the
    // full page text when the selector matches nothing useful. The fallback is
    // coerced to '' so a missing page.content can never leave content undefined.
    if (config.selector && page.html) {
      const narrowed = this._extractBySelector(page.html, config.selector)
      content = narrowed && narrowed.length >= 10 ? narrowed : (page.content || '')
    }

    // Step 3: Optional BM25-style relevance filtering against the instruction.
    if (config.relevanceFilter && config.instruction) {
      content = this._filterByRelevance(content, config.instruction)
    }

    // Step 4: Extract with the LLM.
    const extracted = await this._llmExtract(content, config.instruction, config.schema)

    return {
      data: extracted,
      url: page.url,
      title: page.title,
      contentLength: content.length,
      duration: Date.now() - startTime
    }
  }

  /**
   * Extract from already-fetched content (no browsing needed).
   * @param {string} content - page text/markdown to extract from
   * @param {object} opts - { instruction, schema, relevanceFilter, model }
   * @returns {Promise<{data: object}>}
   */
  async extractFromContent(content, opts = {}) {
    const config = { ...DEFAULT_OPTS, ...opts }

    if (config.relevanceFilter && config.instruction) {
      content = this._filterByRelevance(content, config.instruction)
    }

    const extracted = await this._llmExtract(content, config.instruction, config.schema)
    return { data: extracted }
  }

  /**
   * Extract content matching a CSS or XPath selector from raw HTML.
   * XPath selectors are recognized by an `xpath=` prefix or a leading `//`.
   * @returns {string|null} extracted markdown, or null when nothing matched
   */
  _extractBySelector(html, selector) {
    if (selector.startsWith('xpath=') || selector.startsWith('//')) {
      return this._extractByXPath(html, selector.replace('xpath=', ''))
    }
    return this._extractByCSS(html, selector)
  }

  /**
   * Regex-based extraction for simple CSS selectors: tag, .class, #id.
   * Selector fragments are regex-escaped so metacharacters in user-supplied
   * selectors cannot inject patterns or trigger catastrophic backtracking.
   * @returns {string|null}
   */
  _extractByCSS(html, selector) {
    let pattern

    if (selector.startsWith('#')) {
      // ID selector — match from the opening tag carrying the id to the next
      // closing tag. (The previous lookahead anchored to end-of-document and
      // almost never matched a bounded element.)
      const id = this._escapeRegex(selector.slice(1))
      pattern = new RegExp(`<[^>]+id=["']${id}["'][^>]*>[\\s\\S]*?<\\/`, 'i')
    } else if (selector.startsWith('.')) {
      // Class selector — class attribute must contain the class as a whole word.
      const cls = this._escapeRegex(selector.slice(1))
      pattern = new RegExp(`<[^>]+class=["'][^"']*\\b${cls}\\b[^"']*["'][^>]*>[\\s\\S]*?<\\/`, 'i')
    } else {
      // Tag selector; any trailing .class/#id qualifier is ignored.
      const tag = this._escapeRegex(selector.split(/([.#])/)[0] || 'div')
      pattern = new RegExp(`<${tag}[^>]*>[\\s\\S]*?<\\/${tag}>`, 'gi')
    }

    const matches = html.match(pattern)
    if (!matches) return null

    // Strip HTML tags and return text.
    const { extractMarkdown } = require('./search/scraper')
    return extractMarkdown(matches.join('\n'))
  }

  /**
   * Handle common XPath patterns by converting them to a regex:
   *   //table                  → every <table>…</table>
   *   //div[@class="content"]  → <div> whose class attribute contains "content"
   * @returns {string|null}
   */
  _extractByXPath(html, xpath) {
    const tagMatch = xpath.match(/\/\/(\w+)(?:\[@(\w+)=["']([^"']+)["']\])?/)
    if (!tagMatch) return null

    const [, tag, attr, val] = tagMatch
    // tag/attr are \w+ by construction; val is user text and must be escaped.
    const pattern = attr && val
      ? new RegExp(`<${tag}[^>]*${attr}=["'][^"']*${this._escapeRegex(val)}[^"']*["'][^>]*>[\\s\\S]*?<\\/${tag}>`, 'gi')
      : new RegExp(`<${tag}[^>]*>[\\s\\S]*?<\\/${tag}>`, 'gi')

    const matches = html.match(pattern)
    if (!matches) return null

    const { extractMarkdown } = require('./search/scraper')
    return extractMarkdown(matches.join('\n'))
  }

  // Escape regex metacharacters in a string destined for `new RegExp`.
  _escapeRegex(s) {
    return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  }

  /**
   * BM25-inspired relevance filtering.
   * Splits content into sections (markdown headings / blank-line gaps), scores
   * each against the query, and returns the topK sections in original order.
   * @param {string} content
   * @param {string} query
   * @param {number} [topK=5]
   * @returns {string}
   */
  _filterByRelevance(content, query, topK = 5) {
    const queryTerms = this._tokenize(query)
    if (queryTerms.length === 0) return content

    const sections = content.split(/\n(?=#{1,6}\s)|(?:\n\n)/).filter(s => s.trim().length > 20)
    if (sections.length <= topK) return content

    // Tokenize each section exactly once up front. The previous version
    // re-tokenized every section inside the scoring loop (to compute avgDl),
    // making the filter quadratic in the number of sections.
    const sectionTokens = sections.map(s => this._tokenize(s))

    // Document frequency per term, for IDF.
    const df = {}
    for (const tokens of sectionTokens) {
      for (const term of new Set(tokens)) {
        df[term] = (df[term] || 0) + 1
      }
    }

    // BM25 constants and average section length, hoisted out of the loop.
    const k1 = 1.2
    const b = 0.75
    const avgDl = sectionTokens.reduce((sum, t) => sum + t.length, 0) / sections.length

    const scored = sections.map((section, i) => {
      const tokens = sectionTokens[i]
      const tf = {}
      for (const term of tokens) {
        tf[term] = (tf[term] || 0) + 1
      }

      const dl = tokens.length
      let score = 0
      for (const queryTerm of queryTerms) {
        const termFreq = tf[queryTerm] || 0
        const docFreq = df[queryTerm] || 1
        const idf = Math.log((sections.length - docFreq + 0.5) / (docFreq + 0.5) + 1)
        score += idf * ((termFreq * (k1 + 1)) / (termFreq + k1 * (1 - b + b * dl / avgDl)))
      }
      return { section, score }
    })

    // Keep the topK highest-scoring sections, emitted in original order.
    scored.sort((x, y) => y.score - x.score)
    const topSections = new Set(scored.slice(0, topK).map(s => s.section))
    return sections.filter(s => topSections.has(s)).join('\n\n')
  }

  // Lowercase word tokens of length > 2; punctuation is treated as whitespace.
  _tokenize(text) {
    return text.toLowerCase()
      .replace(/[^\w\s]/g, ' ')
      .split(/\s+/)
      .filter(t => t.length > 2)
  }

  /**
   * Use the configured LLM to pull structured JSON out of page content.
   * Prefers Gemini (GEMINI_API_KEY) and falls back to OpenAI (OPENAI_API_KEY).
   * @throws {Error} when no API key is configured
   */
  async _llmExtract(content, instruction, schema) {
    // Truncate to stay under model token limits (local copy; the parameter is
    // not mutated).
    const maxChars = 30000
    const trimmed = content.length > maxChars
      ? content.slice(0, maxChars) + '\n...(truncated)'
      : content

    const systemPrompt = `You are a data extraction assistant. Extract structured data from the provided web page content.
${schema ? `Return a valid JSON object matching this schema:\n${JSON.stringify(schema, null, 2)}` : 'Return the extracted data as a JSON object.'}
Only return valid JSON. No markdown code fences. No explanation.`

    const userPrompt = instruction
      ? `${instruction}\n\nPage content:\n${trimmed}`
      : `Extract the key information from this page:\n${trimmed}`

    // Try Gemini first (free tier), fall back to OpenAI.
    if (this.apiKey) {
      return this._geminiExtract(systemPrompt, userPrompt)
    } else if (this.openaiKey) {
      return this._openaiExtract(systemPrompt, userPrompt)
    } else {
      throw new Error('No LLM API key configured. Set GEMINI_API_KEY or OPENAI_API_KEY.')
    }
  }

  /**
   * Call the Gemini generateContent API and parse its JSON reply.
   * The API key is sent via the x-goog-api-key header instead of the query
   * string, so it cannot leak into URL logs or proxies.
   */
  async _geminiExtract(systemPrompt, userPrompt) {
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${this.model}:generateContent`

    const body = {
      contents: [{ parts: [{ text: `${systemPrompt}\n\n${userPrompt}` }] }],
      generationConfig: {
        responseMimeType: 'application/json',
        temperature: 0.1
      }
    }

    const response = await this._post(url, body, { 'x-goog-api-key': this.apiKey })
    const text = response?.candidates?.[0]?.content?.parts?.[0]?.text
    if (!text) throw new Error('Empty response from Gemini')

    try {
      return JSON.parse(text)
    } catch (e) {
      // The model occasionally wraps JSON in prose; salvage the first
      // JSON-looking span before giving up.
      const jsonMatch = text.match(/\{[\s\S]*\}|\[[\s\S]*\]/)
      if (jsonMatch) return JSON.parse(jsonMatch[0])
      throw new Error(`Failed to parse extraction result: ${text.slice(0, 200)}`)
    }
  }

  /** Call the OpenAI chat completions API in JSON mode. */
  async _openaiExtract(systemPrompt, userPrompt) {
    const url = 'https://api.openai.com/v1/chat/completions'
    const body = {
      model: 'gpt-4o-mini',
      messages: [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userPrompt }
      ],
      response_format: { type: 'json_object' },
      temperature: 0.1
    }

    const response = await this._post(url, body, {
      'Authorization': `Bearer ${this.openaiKey}`
    })
    const text = response?.choices?.[0]?.message?.content
    if (!text) throw new Error('Empty response from OpenAI')
    return JSON.parse(text)
  }

  /**
   * Minimal HTTPS JSON POST helper.
   * Rejects on network error, timeout, non-2xx status, or a non-JSON body.
   * (Previously error-status payloads resolved as successes and surfaced later
   * as a misleading "Empty response" error.)
   * @returns {Promise<object>} parsed JSON response body
   */
  _post(url, body, extraHeaders = {}) {
    return new Promise((resolve, reject) => {
      const urlObj = new URL(url)
      const data = JSON.stringify(body)
      const opts = {
        hostname: urlObj.hostname,
        path: urlObj.pathname + urlObj.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(data),
          ...extraHeaders
        }
      }

      const req = https.request(opts, res => {
        let responseData = ''
        res.on('data', chunk => { responseData += chunk })
        res.on('end', () => {
          if (res.statusCode < 200 || res.statusCode >= 300) {
            return reject(new Error(`LLM API error ${res.statusCode}: ${responseData.slice(0, 200)}`))
          }
          try {
            resolve(JSON.parse(responseData))
          } catch (e) {
            reject(new Error(`Invalid JSON response: ${responseData.slice(0, 200)}`))
          }
        })
      })
      req.on('error', reject)
      req.setTimeout(30000, () => { req.destroy(); reject(new Error('LLM request timeout')) })
      req.write(data)
      req.end()
    })
  }
}
313
+
314
+ module.exports = { ExtractEngine }
package/src/index.js CHANGED
@@ -8,6 +8,8 @@ const { BrowseEngine } = require('./browse')
8
8
  const { AuthManager } = require('./auth')
9
9
  const { ActEngine } = require('./act')
10
10
  const { CrawlEngine } = require('./crawl')
11
+ const { ExtractEngine } = require('./extract')
12
+ const { AgentEngine } = require('./agent')
11
13
  const { Cache } = require('./cache')
12
14
  const { EventEmitter, EVENTS } = require('./events')
13
15
  const { CookieRefresher } = require('./auth/refresh')
@@ -38,6 +40,8 @@ class Spectrawl {
38
40
  this.auth = new AuthManager(this.config.auth)
39
41
  this.actEngine = new ActEngine(this.config, this.auth, this.browseEngine)
40
42
  this.crawlEngine = new CrawlEngine(this.browseEngine, this.cache)
43
+ this.extractEngine = new ExtractEngine(this.browseEngine, this.config.search)
44
+ this.agentEngine = new AgentEngine(this.browseEngine, this.config.search)
41
45
  this.refresher = new CookieRefresher(this.auth, this.events, this.config.auth)
42
46
  }
43
47
 
@@ -113,6 +117,37 @@ class Spectrawl {
113
117
  return this.crawlEngine.listJobs()
114
118
  }
115
119
 
120
+ /**
121
+ * Extract structured data from a URL using LLM.
122
+ * @param {string} url - URL to extract from
123
+ * @param {object} opts - { instruction, schema, selector, relevanceFilter, model }
124
+ * @returns {Promise<{data, url, title, contentLength, duration}>}
125
+ */
126
+ async extract(url, opts = {}) {
127
+ return this.extractEngine.extract(url, opts)
128
+ }
129
+
130
+ /**
131
+ * Extract from already-fetched content (no browsing).
132
+ * @param {string} content - Page content
133
+ * @param {object} opts - { instruction, schema, relevanceFilter }
134
+ * @returns {Promise<{data}>}
135
+ */
136
+ async extractFromContent(content, opts = {}) {
137
+ return this.extractEngine.extractFromContent(content, opts)
138
+ }
139
+
140
+ /**
141
+ * Execute natural language browser actions.
142
+ * @param {string} url - URL to navigate to
143
+ * @param {string} instruction - what to do (e.g. "click the login button")
144
+ * @param {object} opts - { maxSteps, screenshot, timeout }
145
+ * @returns {Promise<{success, url, title, steps, content, screenshot?, duration}>}
146
+ */
147
+ async agent(url, instruction, opts = {}) {
148
+ return this.agentEngine.act(url, instruction, opts)
149
+ }
150
+
116
151
  /**
117
152
  * Perform an authenticated action on a platform.
118
153
  * @param {string} platform - Platform name (x, reddit, devto, etc.)
package/src/server.js CHANGED
@@ -40,10 +40,12 @@ const server = http.createServer(async (req, res) => {
40
40
 
41
41
  if (req.method === 'POST' && path === '/browse') {
42
42
  const body = await readBody(req)
43
- const { url: targetUrl, auth, screenshot, html, stealth } = body
43
+ const { url: targetUrl, auth, screenshot, fullPage, html, stealth, camoufox, noCache,
44
+ captureNetwork, captureNetworkHeaders, captureNetworkBody } = body
44
45
  if (!targetUrl) return error(res, 400, 'url is required')
45
46
 
46
- const result = await spectrawl.browse(targetUrl, { auth, screenshot, html, stealth })
47
+ const result = await spectrawl.browse(targetUrl, { auth, screenshot, fullPage, html, stealth,
48
+ camoufox, noCache, captureNetwork, captureNetworkHeaders, captureNetworkBody })
47
49
 
48
50
  // If screenshot, return as base64
49
51
  if (result.screenshot) {
@@ -55,10 +57,11 @@ const server = http.createServer(async (req, res) => {
55
57
  if (req.method === 'POST' && path === '/crawl') {
56
58
  const body = await readBody(req)
57
59
  const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
58
- includePatterns, excludePatterns, merge, async: asyncMode, concurrency } = body
60
+ includePatterns, excludePatterns, merge, async: asyncMode, concurrency,
61
+ useSitemap, webhook } = body
59
62
  if (!targetUrl) return error(res, 400, 'url is required')
60
63
 
61
- const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency }
64
+ const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency, useSitemap, webhook }
62
65
 
63
66
  if (asyncMode) {
64
67
  // Async mode: return job ID immediately
@@ -88,6 +91,28 @@ const server = http.createServer(async (req, res) => {
88
91
  return json(res, job)
89
92
  }
90
93
 
94
+ if (req.method === 'POST' && path === '/extract') {
95
+ const body = await readBody(req)
96
+ const { url: targetUrl, instruction, schema, selector, relevanceFilter, model } = body
97
+ if (!targetUrl) return error(res, 400, 'url is required')
98
+
99
+ const result = await spectrawl.extract(targetUrl, { instruction, schema, selector, relevanceFilter, model })
100
+ return json(res, result)
101
+ }
102
+
103
+ if (req.method === 'POST' && path === '/agent') {
104
+ const body = await readBody(req)
105
+ const { url: targetUrl, instruction, maxSteps, screenshot, timeout } = body
106
+ if (!targetUrl) return error(res, 400, 'url is required')
107
+ if (!instruction) return error(res, 400, 'instruction is required')
108
+
109
+ const result = await spectrawl.agent(targetUrl, instruction, { maxSteps, screenshot, timeout })
110
+ if (result.screenshot) {
111
+ result.screenshot = result.screenshot.toString('base64')
112
+ }
113
+ return json(res, result)
114
+ }
115
+
91
116
  if (req.method === 'POST' && path === '/act') {
92
117
  const body = await readBody(req)
93
118
  const { platform, action, ...params } = body
@@ -159,7 +184,15 @@ const server = http.createServer(async (req, res) => {
159
184
  return error(res, 404, 'Not found')
160
185
  } catch (err) {
161
186
  console.error('Server error:', err)
162
- return error(res, 500, err.message)
187
+ const status = err.statusCode || 500
188
+ const extra = {}
189
+ if (err.retryable) extra.retryable = true
190
+ if (err.suggestion) extra.suggestion = err.suggestion
191
+ if (err.blocked) {
192
+ extra.retryable = true
193
+ extra.suggestion = 'Retry with stealth:true or use Camoufox engine'
194
+ }
195
+ return error(res, status, err.message, extra)
163
196
  }
164
197
  })
165
198
 
@@ -168,8 +201,29 @@ function json(res, data, status = 200) {
168
201
  res.end(JSON.stringify(data))
169
202
  }
170
203
 
171
- function error(res, status, message) {
172
- json(res, { error: message }, status)
204
/**
 * RFC 9457-style structured error responses.
 * Machine-readable for AI agents consuming our API.
 * @param {object} res - HTTP response to write to
 * @param {number} status - HTTP status code
 * @param {string} message - human-readable detail message
 * @param {object} [extra] - additional machine-readable fields (retryable, suggestion, ...)
 */
function error(res, status, message, extra = {}) {
  const errorTypes = {
    400: 'bad-request', 401: 'unauthorized', 403: 'forbidden', 404: 'not-found',
    429: 'rate-limited', 500: 'internal-error', 502: 'upstream-error', 503: 'service-unavailable'
  }
  const slug = errorTypes[status]
  const body = {
    type: `https://spectrawl.dev/errors/${slug || 'unknown'}`,
    status,
    title: slug ? slug.replace(/-/g, ' ') : 'error',
    detail: message,
    ...extra
  }
  json(res, body, status)
}
174
228
 
175
229
  function readBody(req) {
@@ -187,11 +241,14 @@ function readBody(req) {
187
241
  const port = config.port || 3900
188
242
  server.listen(port, () => {
189
243
  console.log(`🌐 Spectrawl server running on http://localhost:${port}`)
190
- console.log(` POST /search — search the web`)
191
- console.log(` POST /browse — stealth browse`)
192
- console.log(` POST /act — platform actions`)
193
- console.log(` GET /status — auth health`)
194
- console.log(` GET /health — server health`)
244
+ console.log(` POST /search — search the web`)
245
+ console.log(` POST /browse — stealth browse`)
246
+ console.log(` POST /crawl — crawl websites`)
247
+ console.log(` POST /extract — structured data extraction`)
248
+ console.log(` POST /agent — natural language browser actions`)
249
+ console.log(` POST /act — platform actions`)
250
+ console.log(` GET /status — auth health`)
251
+ console.log(` GET /health — server health`)
195
252
  })
196
253
 
197
254
  // Graceful shutdown