spectrawl 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +179 -0
- package/index.d.ts +90 -0
- package/package.json +53 -0
- package/src/act/adapters/devto.js +103 -0
- package/src/act/adapters/hashnode.js +89 -0
- package/src/act/adapters/ih.js +251 -0
- package/src/act/adapters/linkedin.js +106 -0
- package/src/act/adapters/reddit.js +160 -0
- package/src/act/adapters/x.js +202 -0
- package/src/act/form-filler.js +94 -0
- package/src/act/index.js +159 -0
- package/src/act/rate-limiter.js +143 -0
- package/src/auth/index.js +132 -0
- package/src/auth/refresh.js +111 -0
- package/src/browse/camoufox.js +164 -0
- package/src/browse/index.js +278 -0
- package/src/browse/install-stealth.js +188 -0
- package/src/cache.js +82 -0
- package/src/cli.js +160 -0
- package/src/config.js +65 -0
- package/src/events.js +57 -0
- package/src/index.js +108 -0
- package/src/mcp.js +195 -0
- package/src/search/engines/brave.js +62 -0
- package/src/search/engines/ddg.js +192 -0
- package/src/search/engines/google-cse.js +50 -0
- package/src/search/engines/jina.js +76 -0
- package/src/search/engines/searxng.js +69 -0
- package/src/search/engines/serper.js +64 -0
- package/src/search/index.js +104 -0
- package/src/search/scraper.js +170 -0
- package/src/search/summarizer.js +156 -0
- package/src/server.js +111 -0
package/src/mcp.js
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP (Model Context Protocol) server for Spectrawl.
|
|
3
|
+
* Exposes search, browse, act, auth, and status as MCP tools.
|
|
4
|
+
* Communicates over stdio (standard MCP transport).
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
const { Spectrawl } = require('./index')
|
|
8
|
+
|
|
9
|
+
// Tool manifest returned verbatim in the `tools/list` JSON-RPC response.
// Each entry is an MCP tool definition: a unique `name`, a human-readable
// `description`, and a JSON Schema (`inputSchema`) describing its arguments.
const TOOLS = [
  {
    name: 'web_search',
    description: 'Search the web using free API cascade (DuckDuckGo, Brave, Serper). Returns results with optional LLM summary and full page content.',
    inputSchema: {
      type: 'object',
      properties: {
        query: { type: 'string', description: 'Search query' },
        summarize: { type: 'boolean', description: 'Generate LLM summary with citations', default: false },
        scrapeTop: { type: 'number', description: 'Number of top results to scrape for full content', default: 3 },
        minResults: { type: 'number', description: 'Minimum results before trying next engine', default: 5 }
      },
      required: ['query']
    }
  },
  {
    name: 'web_browse',
    description: 'Browse a URL with stealth anti-detection. Extracts text content, optionally takes screenshots. Supports authenticated sessions.',
    inputSchema: {
      type: 'object',
      properties: {
        url: { type: 'string', description: 'URL to browse' },
        auth: { type: 'string', description: 'Platform name to use stored auth (e.g. "reddit", "x")' },
        screenshot: { type: 'boolean', description: 'Take a screenshot', default: false },
        html: { type: 'boolean', description: 'Return raw HTML', default: false },
        stealth: { type: 'boolean', description: 'Force stealth browser mode', default: false }
      },
      required: ['url']
    }
  },
  {
    name: 'web_act',
    description: 'Perform an authenticated action on a platform (post, comment, like, etc). Supports X/Twitter, Reddit, Dev.to.',
    inputSchema: {
      type: 'object',
      properties: {
        platform: { type: 'string', description: 'Platform name', enum: ['x', 'reddit', 'devto', 'hashnode', 'linkedin', 'ih'] },
        action: { type: 'string', description: 'Action to perform (post, comment, like, delete)' },
        account: { type: 'string', description: 'Account handle (e.g. @myhandle)' },
        text: { type: 'string', description: 'Text content for post/comment' },
        title: { type: 'string', description: 'Title (for Reddit/Dev.to posts)' },
        subreddit: { type: 'string', description: 'Subreddit name (Reddit only)' },
        tags: { type: 'array', items: { type: 'string' }, description: 'Tags (Dev.to only)' }
      },
      required: ['platform', 'action']
    }
  },
  {
    name: 'web_auth',
    description: 'Manage platform authentication. Add, remove, or list accounts.',
    inputSchema: {
      type: 'object',
      properties: {
        action: { type: 'string', description: 'Auth action', enum: ['list', 'add', 'remove'] },
        platform: { type: 'string', description: 'Platform name' },
        account: { type: 'string', description: 'Account handle' }
      },
      required: ['action']
    }
  },
  {
    // Takes no arguments: reports on every stored account.
    name: 'web_status',
    description: 'Check health status of all authenticated accounts. Shows cookie expiry, OAuth status, and issues.',
    inputSchema: {
      type: 'object',
      properties: {},
      required: []
    }
  }
]
|
|
79
|
+
|
|
80
|
+
/**
 * MCP server speaking newline-delimited JSON-RPC 2.0 over stdio.
 *
 * Incoming stdin bytes are buffered and split on '\n'; each complete line is
 * parsed and dispatched. Tool calls are delegated to a Spectrawl instance and
 * results are serialized back as MCP text content.
 */
class MCPServer {
  /**
   * @param {string} [configPath] - Optional config path forwarded to Spectrawl.
   */
  constructor(configPath) {
    this.spectrawl = new Spectrawl(configPath)
    this._buffer = '' // holds the trailing partial line between stdin chunks
  }

  /** Begin reading JSON-RPC messages from stdin; closes Spectrawl on EOF. */
  async start() {
    process.stdin.setEncoding('utf8')
    process.stdin.on('data', (chunk) => {
      this._buffer += chunk
      this._processBuffer()
    })
    process.stdin.on('end', () => {
      this.spectrawl.close()
    })
  }

  /** Split buffered input into complete lines and dispatch each message. */
  _processBuffer() {
    const lines = this._buffer.split('\n')
    // The final element is an incomplete line (or '') — keep it for later.
    this._buffer = lines.pop() || ''

    for (const line of lines) {
      if (!line.trim()) continue
      let msg
      try {
        msg = JSON.parse(line)
      } catch (e) {
        continue // not JSON — ignore stray output on the pipe
      }
      // _handleMessage is async; without this catch a rejection (e.g. a
      // malformed tools/call with no params) would surface as an unhandled
      // promise rejection and could kill the process. Answer requests that
      // carry an id with a JSON-RPC internal-error response instead.
      this._handleMessage(msg).catch((err) => {
        if (msg.id !== undefined) {
          this._send({
            jsonrpc: '2.0',
            id: msg.id,
            error: { code: -32603, message: err.message }
          })
        }
      })
    }
  }

  /**
   * Dispatch one parsed JSON-RPC message.
   * Handles the MCP lifecycle (initialize / notifications/initialized),
   * tool listing, and tool invocation; any other *request* (a message with
   * an id) gets a -32601 "method not found" error as JSON-RPC requires.
   */
  async _handleMessage(msg) {
    if (msg.method === 'initialize') {
      this._send({
        jsonrpc: '2.0',
        id: msg.id,
        result: {
          protocolVersion: '2024-11-05',
          capabilities: { tools: {} },
          serverInfo: { name: 'spectrawl', version: '0.1.0' }
        }
      })
    } else if (msg.method === 'tools/list') {
      this._send({
        jsonrpc: '2.0',
        id: msg.id,
        result: { tools: TOOLS }
      })
    } else if (msg.method === 'tools/call') {
      const result = await this._handleToolCall(msg.params.name, msg.params.arguments || {})
      this._send({
        jsonrpc: '2.0',
        id: msg.id,
        result: {
          content: [{ type: 'text', text: JSON.stringify(result, null, 2) }]
        }
      })
    } else if (msg.method === 'notifications/initialized') {
      // Client acknowledged init, nothing to do
    } else if (msg.id !== undefined) {
      // Unknown request method — notifications (no id) are silently dropped.
      this._send({
        jsonrpc: '2.0',
        id: msg.id,
        error: { code: -32601, message: `Method not found: ${msg.method}` }
      })
    }
  }

  /**
   * Route a tool invocation to the matching Spectrawl API.
   * Never throws: failures are reported as `{ error }` so the client receives
   * a well-formed tool result rather than a protocol-level error.
   */
  async _handleToolCall(name, args) {
    try {
      switch (name) {
        case 'web_search':
          return await this.spectrawl.search(args.query, {
            summarize: args.summarize,
            scrapeTop: args.scrapeTop,
            minResults: args.minResults
          })
        case 'web_browse':
          return await this.spectrawl.browse(args.url, {
            auth: args.auth,
            screenshot: args.screenshot,
            html: args.html,
            stealth: args.stealth
          })
        case 'web_act':
          return await this.spectrawl.act(args.platform, args.action, {
            account: args.account,
            text: args.text,
            title: args.title,
            subreddit: args.subreddit,
            tags: args.tags
          })
        case 'web_auth':
          if (args.action === 'list') return await this.spectrawl.status()
          if (args.action === 'remove') {
            await this.spectrawl.auth.remove(args.platform, args.account)
            return { removed: `${args.platform}/${args.account}` }
          }
          // Interactive login cannot run over stdio — point at the CLI.
          return { error: 'Use CLI for adding accounts: spectrawl login <platform>' }
        case 'web_status':
          return await this.spectrawl.status()
        default:
          return { error: `Unknown tool: ${name}` }
      }
    } catch (err) {
      return { error: err.message }
    }
  }

  /** Write one JSON-RPC message to stdout, newline-delimited. */
  _send(msg) {
    process.stdout.write(JSON.stringify(msg) + '\n')
  }
}
|
|
188
|
+
|
|
189
|
+
// Start a server immediately when this file is executed directly
// (`node src/mcp.js`); when require()d, only export the API below.
if (require.main === module) {
  new MCPServer().start()
}

module.exports = { MCPServer, TOOLS }
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
|
|
3
|
+
/**
 * Query the Brave Search API (2000 queries/month free tier).
 * Requires BRAVE_API_KEY in config or env.
 *
 * @param {string} query - Search query.
 * @param {{apiKey?: string, maxResults?: number}} [config]
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 * @throws {Error} when no API key is configured.
 */
async function braveSearch(query, config = {}) {
  const key = config.apiKey || process.env.BRAVE_API_KEY
  if (!key) throw new Error('Brave API key not configured')

  const count = String(config.maxResults || 10)
  const qs = new URLSearchParams({ q: query, count })
  const endpoint = `https://api.search.brave.com/res/v1/web/search?${qs}`

  const headers = {
    'X-Subscription-Token': key,
    'Accept': 'application/json'
  }
  const data = await fetchJson(endpoint, headers)

  const hits = data.web?.results
  if (!hits) return []

  const toResult = (r) => ({
    url: r.url,
    title: r.title,
    snippet: r.description || '',
    engine: 'brave'
  })
  return hits.map(toResult)
}
|
|
31
|
+
|
|
32
|
+
/**
 * GET `url` and resolve with the parsed JSON response body.
 *
 * @param {string} url - Absolute https URL to fetch.
 * @param {Object<string, string>} [headers] - Extra request headers
 *   (merged under a fixed Spectrawl User-Agent).
 * @returns {Promise<object>} Parsed JSON body.
 * @throws on network errors, a 10s timeout, a non-2xx status, or
 *   an unparseable body.
 */
function fetchJson(url, headers = {}) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      path: urlObj.pathname + urlObj.search,
      method: 'GET',
      headers: {
        ...headers,
        'User-Agent': 'Spectrawl/0.1.0'
      }
    }

    const req = https.request(opts, (res) => {
      let data = ''
      res.on('data', chunk => data += chunk)
      res.on('end', () => {
        // Surface HTTP-level failures (401 bad key, 429 rate limit) explicitly.
        // Brave returns JSON error payloads, so without this check they would
        // parse successfully and look like an empty result set to the caller.
        if (res.statusCode < 200 || res.statusCode >= 300) {
          return reject(new Error(`Brave API HTTP ${res.statusCode}: ${data.slice(0, 200)}`))
        }
        try {
          resolve(JSON.parse(data))
        } catch (e) {
          reject(new Error(`Brave API returned invalid JSON: ${data.slice(0, 200)}`))
        }
      })
    })
    req.on('error', reject)
    req.setTimeout(10000, () => { req.destroy(); reject(new Error('Brave API timeout')) })
    req.end()
  })
}

module.exports = { braveSearch }
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
const { URL } = require('url')
|
|
3
|
+
|
|
4
|
+
/**
 * DuckDuckGo search — free, unlimited, no API key needed.
 * Tries the instant-answer JSON API first, then falls back to scraping the
 * HTML endpoint (which filters ads); returns [] when both strategies fail
 * or come up empty.
 *
 * @param {string} query
 * @param {{maxResults?: number}} [config]
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 */
async function ddgSearch(query, config = {}) {
  const maxResults = config.maxResults || 10

  const strategies = [ddgJsonApi, ddgHtmlSearch]
  for (const strategy of strategies) {
    try {
      const hits = await strategy(query, maxResults)
      if (hits.length > 0) return hits
    } catch (e) {
      // Best effort: a failed strategy just means we try the next one.
    }
  }

  return []
}
|
|
25
|
+
|
|
26
|
+
/**
 * Query DDG's instant-answer JSON API and flatten its abstract, related
 * topics (including one level of nested topic groups), and direct results
 * into the common { url, title, snippet, engine } shape, capped at
 * maxResults entries.
 */
async function ddgJsonApi(query, maxResults) {
  const url = `https://api.duckduckgo.com/?q=${encodeURIComponent(query)}&format=json&no_html=1&skip_disambig=1`
  const data = await fetchJson(url)

  const results = []

  // Topics and direct results share the FirstURL/Text shape — one adder
  // replaces three identical push blocks; it no-ops once the cap is hit.
  const addEntry = (entry) => {
    if (results.length >= maxResults) return
    if (entry.FirstURL && entry.Text) {
      results.push({
        url: entry.FirstURL,
        title: entry.Text.slice(0, 100),
        snippet: entry.Text,
        engine: 'ddg'
      })
    }
  }

  if (data.AbstractURL && data.Abstract) {
    results.push({
      url: data.AbstractURL,
      title: data.Heading || query,
      snippet: data.Abstract,
      engine: 'ddg'
    })
  }

  for (const topic of data.RelatedTopics || []) {
    addEntry(topic)
    for (const sub of topic.Topics || []) {
      addEntry(sub)
    }
  }

  for (const r of data.Results || []) {
    addEntry(r)
  }

  return results
}
|
|
84
|
+
|
|
85
|
+
/**
 * Scrape the DuckDuckGo HTML endpoint (html.duckduckgo.com) and pair each
 * organic result link with its snippet by position. Ad links are dropped.
 */
async function ddgHtmlSearch(query, maxResults) {
  const endpoint = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`
  const page = await fetchHtml(endpoint)

  // Fresh regex literals per call — /g regexes are stateful via lastIndex.
  const anchorRe = /<a[^>]+class="result__a"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/g
  const snippetRe = /<a[^>]+class="result__snippet"[^>]*>([\s\S]*?)<\/a>/g

  const links = []
  for (let m = anchorRe.exec(page); m !== null; m = anchorRe.exec(page)) {
    const target = decodeUddg(m[1])
    // Filter ads — DDG routes them through duckduckgo.com/y.js.
    if (isAd(target)) continue
    links.push({ url: target, title: stripHtml(m[2]) })
  }

  const snippets = []
  for (let m = snippetRe.exec(page); m !== null; m = snippetRe.exec(page)) {
    snippets.push(stripHtml(m[1]))
  }

  return links.slice(0, maxResults).map((link, i) => ({
    url: link.url,
    title: link.title,
    snippet: snippets[i] || '',
    engine: 'ddg'
  }))
}
|
|
119
|
+
|
|
120
|
+
/**
 * Heuristic ad filter for DDG HTML results.
 * Empty hrefs and anything routed through DDG's ad machinery (y.js, or URLs
 * carrying ad_provider=/ad_domain= markers) count as ads. Plain
 * //duckduckgo.com/l/? redirects may be organic, so they pass through.
 */
function isAd(url) {
  if (!url) return true
  const adMarkers = ['duckduckgo.com/y.js', 'ad_provider=', 'ad_domain=']
  return adMarkers.some((marker) => url.includes(marker))
}
|
|
134
|
+
|
|
135
|
+
/**
 * Unwrap DuckDuckGo redirect links of the form
 * //duckduckgo.com/l/?uddg=<encoded-target>&... to the real target URL.
 *
 * Returns the input unchanged when no uddg parameter is present, and falls
 * back to the raw value when the encoding is malformed — decodeURIComponent
 * throws URIError on truncated percent-escapes, and one bad link must not
 * abort parsing the whole result page.
 */
function decodeUddg(url) {
  if (url.includes('uddg=')) {
    const match = url.match(/uddg=([^&]+)/)
    if (match) {
      try {
        return decodeURIComponent(match[1])
      } catch (e) {
        return url // malformed escape sequence — keep the raw link
      }
    }
  }
  return url
}
|
|
142
|
+
|
|
143
|
+
/**
 * GET `url` and resolve with the parsed JSON body.
 * Rejects on network errors or an unparseable response.
 */
function fetchJson(url) {
  return new Promise((resolve, reject) => {
    const { hostname, pathname, search } = new URL(url)
    const options = {
      hostname,
      path: pathname + search,
      headers: { 'User-Agent': 'Spectrawl/0.1.0' }
    }
    const onResponse = (res) => {
      const chunks = []
      res.on('data', (chunk) => chunks.push(chunk))
      res.on('end', () => {
        const body = chunks.join('')
        try {
          resolve(JSON.parse(body))
        } catch (e) {
          reject(new Error('Invalid JSON from DDG API'))
        }
      })
    }
    https.get(options, onResponse).on('error', reject)
  })
}
|
|
160
|
+
|
|
161
|
+
/**
 * GET `url` with browser-like headers and resolve with the raw HTML body.
 * Rejects only on network errors; any HTTP status resolves with its body.
 */
function fetchHtml(url) {
  return new Promise((resolve, reject) => {
    const { hostname, pathname, search } = new URL(url)
    const headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      'Accept': 'text/html',
      'Accept-Language': 'en-US,en;q=0.9'
    }
    const request = https.get({ hostname, path: pathname + search, headers }, (res) => {
      const chunks = []
      res.on('data', (chunk) => chunks.push(chunk))
      res.on('end', () => resolve(chunks.join('')))
    })
    request.on('error', reject)
  })
}
|
|
179
|
+
|
|
180
|
+
/**
 * Reduce an HTML fragment to plain text: strip tags, decode the common
 * HTML entities DDG emits in titles/snippets, and collapse whitespace.
 *
 * Fixes the previous entity chain, which replaced literal characters with
 * themselves (e.g. /&/ -> '&') — identity no-ops that never decoded
 * `&amp;`, `&lt;`, etc. `&amp;` is decoded LAST so double-encoded input
 * like `&amp;lt;` does not collapse all the way to `<`.
 */
function stripHtml(html) {
  return html
    .replace(/<[^>]+>/g, '')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#x27;/g, "'")
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, '&')
    .replace(/\s+/g, ' ')
    .trim()
}
|
|
191
|
+
|
|
192
|
+
// Public API: only the cascading entry point is exported; the JSON/HTML
// strategies and fetch helpers stay module-private.
module.exports = { ddgSearch }
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
|
|
3
|
+
/**
 * Google Custom Search Engine — 100 queries/day free.
 * Requires GOOGLE_CSE_KEY and GOOGLE_CSE_ID in config or env.
 *
 * @param {string} query
 * @param {{apiKey?: string, cseId?: string, maxResults?: number}} [config]
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 * @throws {Error} when the API key or engine ID is missing.
 */
async function googleCseSearch(query, config = {}) {
  const apiKey = config.apiKey || process.env.GOOGLE_CSE_KEY
  const cseId = config.cseId || process.env.GOOGLE_CSE_ID
  if (!apiKey || !cseId) throw new Error('Google CSE key/ID not configured')

  // The API rejects num > 10, so clamp whatever the caller asked for.
  const num = String(Math.min(config.maxResults || 10, 10))
  const qs = new URLSearchParams({ key: apiKey, cx: cseId, q: query, num })

  const payload = await fetchJson(`https://www.googleapis.com/customsearch/v1?${qs}`)
  const items = payload.items || []

  return items.map((item) => ({
    url: item.link,
    title: item.title,
    snippet: item.snippet || '',
    engine: 'google-cse'
  }))
}
|
|
31
|
+
|
|
32
|
+
/** GET `url` (as Spectrawl/0.1.0) and resolve with the parsed JSON body. */
function fetchJson(url) {
  return new Promise((resolve, reject) => {
    const { hostname, pathname, search } = new URL(url)
    const req = https.get({
      hostname,
      path: pathname + search,
      headers: { 'User-Agent': 'Spectrawl/0.1.0' }
    }, (res) => {
      const chunks = []
      res.on('data', (chunk) => chunks.push(chunk))
      res.on('end', () => {
        try {
          resolve(JSON.parse(chunks.join('')))
        } catch (e) {
          reject(new Error('Invalid JSON from Google CSE'))
        }
      })
    })
    req.on('error', reject)
  })
}

module.exports = { googleCseSearch }
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
|
|
3
|
+
/**
 * Jina Reader — AI-optimized content extraction.
 * Fetches https://r.jina.ai/<url> for clean markdown output. A JINA_API_KEY
 * (config or env) is optional; when present it is sent as a Bearer token.
 *
 * @returns {Promise<{content: string, title: string, url: string, description: string}>}
 */
async function jinaExtract(url, config = {}) {
  const apiKey = config.apiKey || process.env.JINA_API_KEY

  const headers = {
    'Accept': 'application/json',
    'User-Agent': 'Spectrawl/0.1.0'
  }
  if (apiKey) headers['Authorization'] = `Bearer ${apiKey}`

  const payload = await fetchJson(`https://r.jina.ai/${url}`, headers)
  // Responses nest under `data` when JSON; plain-text fallbacks are flat.
  const body = payload.data || {}

  return {
    content: body.content || payload.content || '',
    title: body.title || payload.title || '',
    url: body.url || url,
    description: body.description || ''
  }
}
|
|
27
|
+
|
|
28
|
+
/**
 * Jina Search — search + extract in one call via https://s.jina.ai/<query>.
 * An optional JINA_API_KEY (config or env) is sent as a Bearer token.
 *
 * @returns {Promise<Array<{url: string, title: string, snippet: string, fullContent: string, engine: string}>>}
 */
async function jinaSearch(query, config = {}) {
  const apiKey = config.apiKey || process.env.JINA_API_KEY
  const limit = config.maxResults || 5

  const headers = {
    'Accept': 'application/json',
    'User-Agent': 'Spectrawl/0.1.0'
  }
  if (apiKey) headers['Authorization'] = `Bearer ${apiKey}`

  const payload = await fetchJson(`https://s.jina.ai/${encodeURIComponent(query)}`, headers)

  return (payload.data || []).slice(0, limit).map((hit) => ({
    url: hit.url,
    title: hit.title || '',
    snippet: hit.description || '',
    fullContent: hit.content || '',
    engine: 'jina'
  }))
}
|
|
54
|
+
|
|
55
|
+
/**
 * GET `url` with the given headers and resolve with the parsed JSON body.
 * Jina sometimes answers with plain text/markdown instead of JSON; in that
 * case resolve with a { content, title, url } shell wrapping the raw body
 * rather than rejecting.
 */
function fetchJson(url, headers = {}) {
  return new Promise((resolve, reject) => {
    const { hostname, pathname, search } = new URL(url)
    https.get({ hostname, path: pathname + search, headers }, (res) => {
      const chunks = []
      res.on('data', (chunk) => chunks.push(chunk))
      res.on('end', () => {
        const body = chunks.join('')
        try {
          resolve(JSON.parse(body))
        } catch (e) {
          resolve({ content: body, title: '', url })
        }
      })
    }).on('error', reject)
  })
}

module.exports = { jinaExtract, jinaSearch }
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
const http = require('http')
|
|
2
|
+
const https = require('https')
|
|
3
|
+
|
|
4
|
+
/**
 * SearXNG — self-hosted metasearch engine.
 * Aggregates 70+ search engines (Google, Bing, DDG, etc.)
 * Free, unlimited, no API key needed.
 *
 * Requires a SearXNG instance running (self-hosted or public); defaults to
 * http://localhost:8888. Docker quick start:
 *   docker run -d -p 8888:8080 searxng/searxng
 */
async function searxngSearch(query, config = {}) {
  const baseUrl = config.url || process.env.SEARXNG_URL || 'http://localhost:8888'
  const limit = config.maxResults || 10

  const params = new URLSearchParams({
    q: query,
    format: 'json',
    categories: config.categories || 'general',
    pageno: '1'
  })
  // An absent/empty engines list means "use every configured engine".
  if (config.engines) params.set('engines', config.engines)

  const payload = await fetchJson(`${baseUrl}/search?${params}`)
  if (!payload.results) return []

  return payload.results.slice(0, limit).map((hit) => ({
    url: hit.url,
    title: hit.title || '',
    snippet: hit.content || '',
    engine: hit.engine || 'searxng',
    engines: hit.engines || [],
    score: hit.score || 0
  }))
}
|
|
44
|
+
|
|
45
|
+
/**
 * GET `url` over http or https (SearXNG is often plain http on localhost)
 * and resolve with the parsed JSON body.
 */
function fetchJson(url) {
  return new Promise((resolve, reject) => {
    const target = new URL(url)
    const client = target.protocol === 'https:' ? https : http

    const options = {
      hostname: target.hostname,
      port: target.port,
      path: target.pathname + target.search,
      headers: {
        'Accept': 'application/json',
        'User-Agent': 'Spectrawl/0.1.0'
      }
    }
    client.get(options, (res) => {
      const chunks = []
      res.on('data', (chunk) => chunks.push(chunk))
      res.on('end', () => {
        const body = chunks.join('')
        try {
          resolve(JSON.parse(body))
        } catch (e) {
          reject(new Error(`SearXNG returned invalid JSON: ${body.slice(0, 200)}`))
        }
      })
    }).on('error', reject)
  })
}

module.exports = { searxngSearch }
|