spectrawl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ const https = require('https')
2
+
3
/**
 * Serper.dev — 2500 free Google SERP queries.
 * Requires SERPER_API_KEY in config or env.
 */
async function serperSearch(query, config = {}) {
  const apiKey = config.apiKey || process.env.SERPER_API_KEY
  if (!apiKey) throw new Error('Serper API key not configured')

  // Serper caps `num` per request; default to a single page of 10.
  const payload = JSON.stringify({
    q: query,
    num: config.maxResults || 10
  })

  const headers = {
    'X-API-KEY': apiKey,
    'Content-Type': 'application/json'
  }

  const data = await postJson('https://google.serper.dev/search', payload, headers)

  const organic = data.organic
  if (!organic) return []

  // Normalize to the shared result shape used by all engines.
  return organic.map((item) => ({
    url: item.link,
    title: item.title,
    snippet: item.snippet || '',
    engine: 'serper'
  }))
}
31
+
32
/**
 * POST a JSON body over HTTPS and resolve with the parsed JSON response.
 *
 * @param {string} url - absolute https URL
 * @param {string} body - pre-serialized JSON payload
 * @param {object} [headers] - extra request headers (merged with defaults)
 * @returns {Promise<object>} parsed response body
 * Rejects on network error, 10s timeout, or a non-JSON response body.
 */
function postJson(url, body, headers = {}) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      // BUG FIX: an explicit port in the URL was previously ignored.
      port: urlObj.port || 443,
      // BUG FIX: the query string was dropped (pathname only).
      path: urlObj.pathname + urlObj.search,
      method: 'POST',
      headers: {
        ...headers,
        'User-Agent': 'Spectrawl/0.1.0',
        // Byte length, not string length — the body may contain multibyte chars.
        'Content-Length': Buffer.byteLength(body)
      }
    }

    const req = https.request(opts, (res) => {
      let data = ''
      res.on('data', chunk => data += chunk)
      res.on('end', () => {
        try {
          resolve(JSON.parse(data))
        } catch (e) {
          // Include a snippet of the raw body so API errors are diagnosable.
          reject(new Error(`Serper API returned invalid JSON: ${data.slice(0, 200)}`))
        }
      })
    })
    req.on('error', reject)
    req.setTimeout(10000, () => { req.destroy(); reject(new Error('Serper API timeout')) })
    req.write(body)
    req.end()
  })
}

module.exports = { serperSearch }
@@ -0,0 +1,104 @@
1
+ const { ddgSearch } = require('./engines/ddg')
2
+ const { braveSearch } = require('./engines/brave')
3
+ const { serperSearch } = require('./engines/serper')
4
+ const { searxngSearch } = require('./engines/searxng')
5
+ const { googleCseSearch } = require('./engines/google-cse')
6
+ const { jinaSearch } = require('./engines/jina')
7
+ const { scrapeUrls } = require('./scraper')
8
+ const { Summarizer } = require('./summarizer')
9
+
10
// Engine registry: maps the names used in the cascade config to their
// search functions. Fallback order is decided by SearchEngine's
// `cascade` list, not by key order here.
const ENGINES = {
  searxng: searxngSearch,
  ddg: ddgSearch,
  brave: braveSearch,
  serper: serperSearch,
  'google-cse': googleCseSearch,
  jina: jinaSearch
}
18
+
19
/**
 * Cascading meta-search: queries engines in order until enough results
 * are collected, dedupes by URL, optionally scrapes the top hits for
 * full content, and optionally summarizes with an LLM.
 */
class SearchEngine {
  /**
   * @param {object} config - { cascade, scrapeTop, llm, <engineName>: per-engine config }
   * @param {object} [cache] - optional cache exposing get(ns, key) / set(ns, key, value)
   */
  constructor(config = {}, cache) {
    this.config = config
    this.cache = cache
    this.cascade = config.cascade || ['ddg', 'brave', 'serper']
    // BUG FIX: use ?? instead of || so an explicit `scrapeTop: 0`
    // disables scraping. `||` silently promoted 0 back to 3, which was
    // inconsistent with how `opts.scrapeTop ?? this.scrapeTop` is
    // handled inside search().
    this.scrapeTop = config.scrapeTop ?? 3
    this.summarizer = config.llm ? new Summarizer(config.llm) : null
  }

  /**
   * Search using the cascade strategy.
   * Tries free/unlimited engines first, escalates to quota-limited ones if needed.
   *
   * @param {string} query - non-empty search query
   * @param {object} [opts] - { minResults, scrapeTop, summarize }
   * @returns {Promise<{answer: ?string, sources: object[], cached: boolean}>}
   * @throws {Error} when query is missing or blank
   */
  async search(query, opts = {}) {
    if (!query || !query.trim()) {
      throw new Error('Search query is required')
    }

    // Check cache first — keyed on query plus the full opts object so
    // different scrape/summarize settings don't collide.
    const cacheKey = `${query}:${JSON.stringify(opts)}`
    const cached = this.cache?.get('search', cacheKey)
    if (cached) return { ...cached, cached: true }

    let results = []
    const minResults = opts.minResults || 5

    // Cascade through engines until we have enough results. A failing
    // engine is logged and skipped rather than aborting the search.
    for (const engineName of this.cascade) {
      const engine = ENGINES[engineName]
      if (!engine) continue

      try {
        const engineResults = await engine(query, this.config[engineName] || {})
        results = dedupeResults([...results, ...engineResults])

        if (results.length >= minResults) break
      } catch (err) {
        console.warn(`Search engine ${engineName} failed:`, err.message)
      }
    }

    // Scrape top N results for full content
    const scrapeCount = opts.scrapeTop ?? this.scrapeTop
    if (scrapeCount > 0 && results.length > 0) {
      const urls = results.slice(0, scrapeCount).map(r => r.url)
      const scraped = await scrapeUrls(urls)

      for (const result of results) {
        const scrapedContent = scraped[result.url]
        if (scrapedContent) {
          result.fullContent = scrapedContent
        }
      }
    }

    // LLM summarization (optional — requires both the opt and an llm config)
    let answer = null
    if (opts.summarize && this.config.llm) {
      answer = await this._summarize(query, results)
    }

    const response = { answer, sources: results, cached: false }

    // Cache the result (stored with cached: false; the flag is flipped on hits)
    this.cache?.set('search', cacheKey, response)

    return response
  }

  // Delegate to the configured summarizer, if any.
  async _summarize(query, results) {
    if (!this.summarizer) return null
    return this.summarizer.summarize(query, results)
  }
}
94
+
95
/**
 * Drop duplicate results, keeping the first occurrence of each URL.
 */
function dedupeResults(results) {
  const byUrl = new Map()
  for (const result of results) {
    if (!byUrl.has(result.url)) {
      byUrl.set(result.url, result)
    }
  }
  return [...byUrl.values()]
}
103
+
104
+ module.exports = { SearchEngine }
@@ -0,0 +1,170 @@
1
+ const https = require('https')
2
+ const http = require('http')
3
+ const { URL } = require('url')
4
+ const { jinaExtract } = require('./engines/jina')
5
+
6
/**
 * Scrape URLs for full content.
 * Dual engine approach (like tavily-open):
 * 1. Jina Reader (fast, AI-optimized markdown) — if available
 * 2. Readability (built-in, no deps) — fallback
 * 3. Browser (Camoufox/Playwright) — for JS-heavy/blocked pages
 *
 * Returns a { url: content } map; failed URLs are simply omitted.
 */
async function scrapeUrls(urls, opts = {}) {
  const timeout = opts.timeout || 10000
  const batchSize = opts.concurrent || 3
  const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'

  const results = {}
  let offset = 0

  // Process in fixed-size batches so we never have more than
  // `batchSize` in-flight requests at once.
  while (offset < urls.length) {
    const batch = urls.slice(offset, offset + batchSize)
    const contents = await Promise.all(
      batch.map((url) => scrapeUrl(url, { timeout, engine }).catch(() => null))
    )

    contents.forEach((content, idx) => {
      if (content) {
        results[batch[idx]] = content
      }
    })

    offset += batchSize
  }

  return results
}
33
+
34
/**
 * Scrape a single URL, preferring Jina Reader and falling back to the
 * built-in fetch + readability pipeline.
 */
async function scrapeUrl(url, opts = {}) {
  const { timeout = 10000, engine = 'auto' } = opts

  // Prefer Jina when allowed: it returns cleaner, AI-friendly markdown.
  if (engine === 'auto' || engine === 'jina') {
    try {
      const result = await jinaExtract(url)
      // Ignore trivially short extractions — they usually indicate a
      // blocked or empty page, which the fallback may handle better.
      if (result.content && result.content.length > 100) {
        return result.content
      }
    } catch (e) {
      // Jina unavailable or failed — fall through to readability.
    }
  }

  // Readability fallback: raw fetch + HTML→markdown conversion.
  const html = await fetchPage(url, timeout)
  return extractMarkdown(html)
}
53
+
54
/**
 * Fetch a page body over HTTP(S), following up to `redirects` redirects.
 *
 * @param {string} url - absolute http/https URL
 * @param {number} [timeout] - per-request timeout in ms
 * @param {number} [redirects] - remaining redirect budget
 * @returns {Promise<string>} the raw HTML body
 * Rejects on non-200 status, network error, timeout, or redirect-loop.
 */
function fetchPage(url, timeout = 10000, redirects = 3) {
  return new Promise((resolve, reject) => {
    if (redirects <= 0) return reject(new Error('Too many redirects'))

    const urlObj = new URL(url)
    const client = urlObj.protocol === 'https:' ? https : http

    const opts = {
      hostname: urlObj.hostname,
      // BUG FIX: an explicit port in the URL (e.g. http://host:8080/)
      // was previously ignored, connecting to the default port instead.
      port: urlObj.port || (urlObj.protocol === 'https:' ? 443 : 80),
      path: urlObj.pathname + urlObj.search,
      method: 'GET',
      headers: {
        // Browser-like headers to reduce trivial bot-blocking.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        // 'identity' so we never have to gunzip the response ourselves.
        'Accept-Encoding': 'identity'
      }
    }

    const req = client.request(opts, (res) => {
      // Follow redirects, resolving relative Location headers against
      // the current URL.
      if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
        const redirectUrl = new URL(res.headers.location, url).toString()
        return fetchPage(redirectUrl, timeout, redirects - 1).then(resolve).catch(reject)
      }

      if (res.statusCode !== 200) {
        return reject(new Error(`HTTP ${res.statusCode}`))
      }

      let data = ''
      res.on('data', chunk => data += chunk)
      res.on('end', () => resolve(data))
    })

    req.on('error', reject)
    req.setTimeout(timeout, () => { req.destroy(); reject(new Error('Scrape timeout')) })
    req.end()
  })
}
93
+
94
/**
 * Extract content as clean markdown (improved over basic readability).
 * Handles: headings, lists, code blocks, tables, links, bold/italic.
 *
 * @param {string} html - raw HTML document or fragment
 * @returns {string} markdown, truncated to ~15k characters
 */
function extractMarkdown(html) {
  // Remove noise
  let content = html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<!--[\s\S]*?-->/g, '')
    .replace(/<nav[\s\S]*?<\/nav>/gi, '')
    .replace(/<footer[\s\S]*?<\/footer>/gi, '')
    .replace(/<header[\s\S]*?<\/header>/gi, '')
    .replace(/<aside[\s\S]*?<\/aside>/gi, '')
    .replace(/<iframe[\s\S]*?<\/iframe>/gi, '')
    .replace(/<svg[\s\S]*?<\/svg>/gi, '')

  // Try to find main content
  const mainMatch = content.match(/<main[\s\S]*?<\/main>/i) ||
    content.match(/<article[\s\S]*?<\/article>/i) ||
    content.match(/<div[^>]*(?:content|article|post|entry|main)[^>]*>[\s\S]*?<\/div>/i)

  if (mainMatch) content = mainMatch[0]

  // Convert to markdown
  content = content
    // Headings
    .replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n# $1\n')
    .replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n## $1\n')
    .replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n### $1\n')
    .replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n#### $1\n')
    .replace(/<h5[^>]*>([\s\S]*?)<\/h5>/gi, '\n##### $1\n')
    .replace(/<h6[^>]*>([\s\S]*?)<\/h6>/gi, '\n###### $1\n')
    // Bold/italic
    .replace(/<(?:strong|b)>([\s\S]*?)<\/(?:strong|b)>/gi, '**$1**')
    .replace(/<(?:em|i)>([\s\S]*?)<\/(?:em|i)>/gi, '*$1*')
    // Links
    .replace(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)')
    // Code blocks
    .replace(/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, '\n```\n$1\n```\n')
    .replace(/<code>([\s\S]*?)<\/code>/gi, '`$1`')
    // Lists
    .replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n')
    .replace(/<\/?[ou]l[^>]*>/gi, '\n')
    // Table (basic)
    .replace(/<tr[^>]*>([\s\S]*?)<\/tr>/gi, '|$1|\n')
    .replace(/<t[hd][^>]*>([\s\S]*?)<\/t[hd]>/gi, ' $1 |')
    // Paragraphs/breaks
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/p>/gi, '\n\n')
    .replace(/<p[^>]*>/gi, '')
    // Strip remaining tags
    .replace(/<[^>]+>/g, '')
    // Decode entities. BUG FIX: `&amp;` must be decoded LAST — decoding
    // it first turned double-encoded text like `&amp;lt;` into `<`
    // instead of the correct `&lt;`.
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&mdash;/g, '—')
    .replace(/&ndash;/g, '–')
    // Generic numeric entities (decimal); invalid code points are left as-is.
    .replace(/&#(\d+);/g, (match, code) => {
      const cp = Number(code)
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : match
    })
    .replace(/&amp;/g, '&')
    // Clean whitespace
    .replace(/\n{3,}/g, '\n\n')
    .replace(/[ \t]+/g, ' ')
    .replace(/^ +/gm, '')
    .trim()

  // Truncate so downstream LLM contexts stay bounded
  if (content.length > 15000) {
    content = content.slice(0, 15000) + '\n\n...(truncated)'
  }

  return content
}
169
+
170
+ module.exports = { scrapeUrls, scrapeUrl, extractMarkdown }
@@ -0,0 +1,156 @@
1
+ const https = require('https')
2
+
3
/**
 * LLM summarization for search results.
 * Supports: openai, anthropic, minimax, xai, ollama
 */
class Summarizer {
  /**
   * @param {object} config - { provider, model, apiKey, baseUrl }
   */
  constructor(config = {}) {
    this.provider = config.provider || 'openai'
    // BUG FIX: the default model must match the provider. Previously
    // every provider defaulted to 'gpt-4o-mini', so the per-provider
    // fallbacks in _anthropic/_ollama were dead code and Anthropic was
    // sent an OpenAI model name.
    const defaultModels = {
      openai: 'gpt-4o-mini',
      anthropic: 'claude-3-5-haiku-20241022',
      ollama: 'llama3'
    }
    this.model = config.model || defaultModels[this.provider] || 'gpt-4o-mini'
    this.apiKey = config.apiKey || process.env[this._envKey()]
    this.baseUrl = config.baseUrl || null
  }

  // Env var name holding the API key for the configured provider.
  _envKey() {
    const keys = {
      openai: 'OPENAI_API_KEY',
      anthropic: 'ANTHROPIC_API_KEY',
      minimax: 'MINIMAX_API_KEY',
      xai: 'XAI_API_KEY'
    }
    return keys[this.provider] || 'OPENAI_API_KEY'
  }

  /**
   * Summarize search results into a cited answer for `query`.
   * Returns null (never throws) when no key is configured or the
   * provider call fails — summarization is best-effort.
   */
  async summarize(query, sources) {
    // BUG FIX: Ollama runs locally and needs no API key; previously the
    // ollama provider was unusable without a pointless key being set.
    if (!this.apiKey && this.provider !== 'ollama') return null

    const context = sources
      .slice(0, 5)
      .map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0, 1000)}`)
      .join('\n\n')

    const prompt = `Based on the following search results, provide a concise answer to the query: "${query}"

Include citations as [1], [2], etc. referencing the source numbers below.
Be direct and factual. If the sources don't contain enough information, say so.

Sources:
${context}

Answer:`

    try {
      return await this._call(prompt)
    } catch (err) {
      console.warn('Summarization failed:', err.message)
      return null
    }
  }

  // Dispatch to the provider-specific transport.
  async _call(prompt) {
    switch (this.provider) {
      case 'openai':
      case 'minimax':
      case 'xai':
        return this._openaiCompatible(prompt)
      case 'anthropic':
        return this._anthropic(prompt)
      case 'ollama':
        return this._ollama(prompt)
      default:
        return this._openaiCompatible(prompt)
    }
  }

  // OpenAI-style /chat/completions (also used by Minimax and xAI).
  async _openaiCompatible(prompt) {
    const urls = {
      openai: 'https://api.openai.com/v1/chat/completions',
      minimax: 'https://api.minimax.chat/v1/text/chatcompletion_v2',
      xai: 'https://api.x.ai/v1/chat/completions'
    }
    const url = this.baseUrl || urls[this.provider] || urls.openai

    const body = JSON.stringify({
      model: this.model,
      messages: [
        { role: 'system', content: 'You are a concise search assistant. Answer with citations.' },
        { role: 'user', content: prompt }
      ],
      max_tokens: 500,
      temperature: 0.3
    })

    const data = await postJson(url, body, {
      'Authorization': `Bearer ${this.apiKey}`,
      'Content-Type': 'application/json'
    })

    return data.choices?.[0]?.message?.content || null
  }

  // Anthropic Messages API.
  async _anthropic(prompt) {
    const url = this.baseUrl || 'https://api.anthropic.com/v1/messages'
    const body = JSON.stringify({
      model: this.model,
      max_tokens: 500,
      messages: [{ role: 'user', content: prompt }]
    })

    const data = await postJson(url, body, {
      'x-api-key': this.apiKey,
      'anthropic-version': '2023-06-01',
      'Content-Type': 'application/json'
    })

    return data.content?.[0]?.text || null
  }

  // Local Ollama /api/generate (non-streaming).
  async _ollama(prompt) {
    const url = this.baseUrl || 'http://localhost:11434/api/generate'
    const body = JSON.stringify({
      model: this.model,
      prompt,
      stream: false
    })

    const data = await postJson(url, body, {
      'Content-Type': 'application/json'
    })

    return data.response || null
  }
}
124
+
125
/**
 * Minimal JSON-over-POST helper shared by all providers.
 * Resolves with the parsed response body; rejects on network error,
 * 30s timeout, or a non-JSON response.
 */
function postJson(url, body, headers = {}) {
  return new Promise((resolve, reject) => {
    const target = new URL(url)
    const secure = target.protocol === 'https:'
    const client = secure ? https : require('http')

    const requestOpts = {
      hostname: target.hostname,
      port: target.port || (secure ? 443 : 80),
      path: target.pathname + target.search,
      method: 'POST',
      headers: {
        ...headers,
        'Content-Length': Buffer.byteLength(body)
      }
    }

    const req = client.request(requestOpts, (res) => {
      const chunks = []
      res.on('data', (chunk) => chunks.push(chunk))
      res.on('end', () => {
        const raw = chunks.join('')
        try {
          resolve(JSON.parse(raw))
        } catch (e) {
          reject(new Error(`Invalid JSON: ${raw.slice(0, 200)}`))
        }
      })
    })

    req.on('error', reject)
    req.setTimeout(30000, () => { req.destroy(); reject(new Error('LLM timeout')) })
    req.write(body)
    req.end()
  })
}

module.exports = { Summarizer }
package/src/server.js ADDED
@@ -0,0 +1,111 @@
1
+ const http = require('http')
2
+ const { Spectrawl } = require('./index')
3
+ const { loadConfig } = require('./config')
4
+
5
const config = loadConfig()
const spectrawl = new Spectrawl()

/**
 * Plain-node HTTP API in front of Spectrawl.
 * Routes: GET /health, GET /status, POST /search, POST /browse, POST /act.
 */
const server = http.createServer(async (req, res) => {
  // Permissive CORS so browser clients can call the API directly.
  res.setHeader('Access-Control-Allow-Origin', '*')
  res.setHeader('Access-Control-Allow-Methods', 'GET, POST, OPTIONS')
  res.setHeader('Access-Control-Allow-Headers', 'Content-Type, Authorization')

  if (req.method === 'OPTIONS') {
    res.writeHead(204)
    return res.end()
  }

  const { pathname } = new URL(req.url, `http://${req.headers.host}`)

  try {
    const route = `${req.method} ${pathname}`

    if (route === 'GET /health') {
      return json(res, { status: 'ok', version: '0.1.0' })
    }

    if (route === 'GET /status') {
      const accounts = await spectrawl.status()
      return json(res, { accounts })
    }

    if (route === 'POST /search') {
      const { query, summarize, scrapeTop, minResults } = await readBody(req)
      if (!query) return error(res, 400, 'query is required')

      const results = await spectrawl.search(query, { summarize, scrapeTop, minResults })
      return json(res, results)
    }

    if (route === 'POST /browse') {
      const { url: targetUrl, auth, screenshot, html, stealth } = await readBody(req)
      if (!targetUrl) return error(res, 400, 'url is required')

      const result = await spectrawl.browse(targetUrl, { auth, screenshot, html, stealth })

      // Screenshots come back as Buffers; serialize for the JSON response.
      if (result.screenshot) {
        result.screenshot = result.screenshot.toString('base64')
      }
      return json(res, result)
    }

    if (route === 'POST /act') {
      const { platform, action, ...params } = await readBody(req)
      if (!platform || !action) return error(res, 400, 'platform and action are required')

      const result = await spectrawl.act(platform, action, params)
      return json(res, result)
    }

    return error(res, 404, 'Not found')
  } catch (err) {
    console.error('Server error:', err)
    return error(res, 500, err.message)
  }
})
70
+
71
/**
 * Write `data` as a JSON response with the given status (default 200).
 */
function json(res, data, status = 200) {
  const payload = JSON.stringify(data)
  res.writeHead(status, { 'Content-Type': 'application/json' })
  res.end(payload)
}

/**
 * Write a JSON error envelope: { error: message }.
 */
function error(res, status, message) {
  json(res, { error: message }, status)
}
79
+
80
/**
 * Collect the full request body and parse it as JSON.
 * Rejects with "Invalid JSON body" on malformed (or empty) input,
 * and propagates stream errors.
 */
function readBody(req) {
  return new Promise((resolve, reject) => {
    const chunks = []
    req.on('data', (chunk) => chunks.push(chunk))
    req.on('end', () => {
      try {
        resolve(JSON.parse(chunks.join('')))
      } catch (e) {
        reject(new Error('Invalid JSON body'))
      }
    })
    req.on('error', reject)
  })
}
91
+
92
// Bind to the configured port (default 3900) and print a route summary.
// NOTE(review): assumes loadConfig() may supply `port`; 0 would fall back
// to 3900 because of `||` — confirm an ephemeral port is never intended.
const port = config.port || 3900
server.listen(port, () => {
  console.log(`🌐 Spectrawl server running on http://localhost:${port}`)
  console.log(`   POST /search — search the web`)
  console.log(`   POST /browse — stealth browse`)
  console.log(`   POST /act — platform actions`)
  console.log(`   GET /status — auth health`)
  console.log(`   GET /health — server health`)
})
101
+
102
// Graceful shutdown — shared by SIGTERM (container stop) and SIGINT
// (Ctrl+C). Previously the identical handler body was duplicated for
// both signals; a repeated signal also re-entered spectrawl.close().
let shuttingDown = false
async function shutdown() {
  if (shuttingDown) return // ignore repeated signals while closing
  shuttingDown = true
  await spectrawl.close()
  server.close()
}

process.on('SIGTERM', shutdown)
process.on('SIGINT', shutdown)