spectrawl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/mcp.js ADDED
@@ -0,0 +1,195 @@
1
+ /**
2
+ * MCP (Model Context Protocol) server for Spectrawl.
3
+ * Exposes search, browse, act, auth, and status as MCP tools.
4
+ * Communicates over stdio (standard MCP transport).
5
+ */
6
+
7
+ const { Spectrawl } = require('./index')
8
+
9
/**
 * Catalogue of MCP tools advertised by this server. Each entry carries the
 * tool name, a human-readable description, and a JSON Schema (`inputSchema`)
 * describing the accepted arguments.
 */
const TOOLS = (() => {
  // Tiny schema builders; key order mirrors the serialized form (type, description, extras).
  const str = (description, extra) => ({ type: 'string', description, ...extra })
  const flag = (description) => ({ type: 'boolean', description, default: false })
  const num = (description, fallback) => ({ type: 'number', description, default: fallback })
  const tool = (name, description, properties, required) => ({
    name,
    description,
    inputSchema: { type: 'object', properties, required }
  })

  return [
    tool(
      'web_search',
      'Search the web using free API cascade (DuckDuckGo, Brave, Serper). Returns results with optional LLM summary and full page content.',
      {
        query: str('Search query'),
        summarize: flag('Generate LLM summary with citations'),
        scrapeTop: num('Number of top results to scrape for full content', 3),
        minResults: num('Minimum results before trying next engine', 5)
      },
      ['query']
    ),
    tool(
      'web_browse',
      'Browse a URL with stealth anti-detection. Extracts text content, optionally takes screenshots. Supports authenticated sessions.',
      {
        url: str('URL to browse'),
        auth: str('Platform name to use stored auth (e.g. "reddit", "x")'),
        screenshot: flag('Take a screenshot'),
        html: flag('Return raw HTML'),
        stealth: flag('Force stealth browser mode')
      },
      ['url']
    ),
    tool(
      'web_act',
      'Perform an authenticated action on a platform (post, comment, like, etc). Supports X/Twitter, Reddit, Dev.to.',
      {
        platform: str('Platform name', { enum: ['x', 'reddit', 'devto', 'hashnode', 'linkedin', 'ih'] }),
        action: str('Action to perform (post, comment, like, delete)'),
        account: str('Account handle (e.g. @myhandle)'),
        text: str('Text content for post/comment'),
        title: str('Title (for Reddit/Dev.to posts)'),
        subreddit: str('Subreddit name (Reddit only)'),
        tags: { type: 'array', items: { type: 'string' }, description: 'Tags (Dev.to only)' }
      },
      ['platform', 'action']
    ),
    tool(
      'web_auth',
      'Manage platform authentication. Add, remove, or list accounts.',
      {
        action: str('Auth action', { enum: ['list', 'add', 'remove'] }),
        platform: str('Platform name'),
        account: str('Account handle')
      },
      ['action']
    ),
    tool(
      'web_status',
      'Check health status of all authenticated accounts. Shows cookie expiry, OAuth status, and issues.',
      {},
      []
    )
  ]
})()
79
+
80
/**
 * MCP server that reads newline-delimited JSON-RPC 2.0 messages from stdin,
 * forwards tool calls to a Spectrawl instance, and writes one JSON response
 * per line to stdout.
 */
class MCPServer {
  /**
   * @param {string} [configPath] - Passed straight through to the Spectrawl constructor.
   */
  constructor(configPath) {
    this.spectrawl = new Spectrawl(configPath)
    // Trailing partial line carried over between stdin chunks.
    this._buffer = ''
  }

  /**
   * Attach stdin listeners. Incoming chunks are buffered and processed line
   * by line; when stdin ends the Spectrawl instance is closed.
   */
  async start() {
    process.stdin.setEncoding('utf8')
    process.stdin.on('data', (chunk) => {
      this._buffer += chunk
      this._processBuffer()
    })
    process.stdin.on('end', () => {
      this.spectrawl.close()
    })
  }

  /**
   * Split the buffer into complete lines, keep the unfinished tail for the
   * next chunk, and dispatch every line that parses as JSON.
   */
  _processBuffer() {
    const lines = this._buffer.split('\n')
    this._buffer = lines.pop() || ''

    for (const line of lines) {
      if (!line.trim()) continue

      let msg
      try {
        msg = JSON.parse(line)
      } catch (e) {
        // Not JSON, ignore
        continue
      }

      // _handleMessage is async: a plain try/catch would miss its failures and
      // leave an unhandled rejection, so route them into a JSON-RPC error.
      Promise.resolve()
        .then(() => this._handleMessage(msg))
        .catch((err) => {
          const reason = err && err.message ? err.message : String(err)
          this._sendError(msg && msg.id, -32603, reason)
        })
    }
  }

  /**
   * Handle one decoded JSON-RPC message.
   *
   * Supported methods: `initialize`, `tools/list`, `tools/call`, `ping` and
   * the `notifications/initialized` notification. Any other request (a
   * message with an id) is answered with "method not found"; other
   * notifications and non-object payloads are ignored.
   *
   * @param {object} msg - Parsed JSON-RPC message.
   */
  async _handleMessage(msg) {
    if (!msg || typeof msg !== 'object') return

    const { id, method } = msg

    switch (method) {
      case 'initialize':
        this._send({
          jsonrpc: '2.0',
          id,
          result: {
            protocolVersion: '2024-11-05',
            capabilities: { tools: {} },
            serverInfo: { name: 'spectrawl', version: '0.1.0' }
          }
        })
        return

      case 'tools/list':
        this._send({ jsonrpc: '2.0', id, result: { tools: TOOLS } })
        return

      case 'ping':
        this._send({ jsonrpc: '2.0', id, result: {} })
        return

      case 'tools/call': {
        const params = msg.params
        if (!params || typeof params !== 'object' || typeof params.name !== 'string') {
          this._sendError(id, -32602, 'tools/call requires params.name')
          return
        }
        const result = await this._handleToolCall(params.name, params.arguments || {})
        // JSON.stringify(undefined) yields undefined, which would drop `text` entirely.
        const text = JSON.stringify(result === undefined ? null : result, null, 2)
        this._send({
          jsonrpc: '2.0',
          id,
          result: { content: [{ type: 'text', text }] }
        })
        return
      }

      case 'notifications/initialized':
        // Client acknowledged init, nothing to do
        return

      default:
        // Only requests get an answer; _sendError skips messages without an id.
        if (typeof method === 'string') {
          this._sendError(id, -32601, `Method not found: ${method}`)
        }
    }
  }

  /**
   * Run a tool by name against the Spectrawl instance.
   *
   * Never throws: failures are returned as `{ error: message }`.
   *
   * @param {string} name - Tool name from TOOLS.
   * @param {object} args - Tool arguments supplied by the client.
   * @returns {Promise<*>} Tool result, or an `{ error }` object.
   */
  async _handleToolCall(name, args) {
    try {
      switch (name) {
        case 'web_search':
          return await this.spectrawl.search(args.query, {
            summarize: args.summarize,
            scrapeTop: args.scrapeTop,
            minResults: args.minResults
          })
        case 'web_browse':
          return await this.spectrawl.browse(args.url, {
            auth: args.auth,
            screenshot: args.screenshot,
            html: args.html,
            stealth: args.stealth
          })
        case 'web_act':
          return await this.spectrawl.act(args.platform, args.action, {
            account: args.account,
            text: args.text,
            title: args.title,
            subreddit: args.subreddit,
            tags: args.tags
          })
        case 'web_auth':
          if (args.action === 'list') return await this.spectrawl.status()
          if (args.action === 'remove') {
            // Both are optional in the schema; refuse to call remove(undefined, undefined).
            if (!args.platform || !args.account) {
              return { error: 'platform and account are required to remove an account' }
            }
            await this.spectrawl.auth.remove(args.platform, args.account)
            return { removed: `${args.platform}/${args.account}` }
          }
          return { error: 'Use CLI for adding accounts: spectrawl login <platform>' }
        case 'web_status':
          return await this.spectrawl.status()
        default:
          return { error: `Unknown tool: ${name}` }
      }
    } catch (err) {
      return { error: err && err.message ? err.message : String(err) }
    }
  }

  /**
   * Send a JSON-RPC error response. Messages without an id are notifications
   * and must not be answered, so nothing is written for them.
   *
   * @param {string|number|null|undefined} id - Id of the request being answered.
   * @param {number} code - JSON-RPC error code.
   * @param {string} message - Short error description.
   */
  _sendError(id, code, message) {
    if (id === undefined || id === null) return
    this._send({ jsonrpc: '2.0', id, error: { code, message } })
  }

  /**
   * Write one message to stdout as a single JSON line.
   *
   * @param {object} msg - JSON-serialisable message.
   */
  _send(msg) {
    process.stdout.write(JSON.stringify(msg) + '\n')
  }
}
188
+
189
// Serve on stdio only when this file is the process entry point,
// not when it is loaded by another module.
if (require.main === module) {
  new MCPServer().start()
}

module.exports = { MCPServer, TOOLS }
@@ -0,0 +1,62 @@
1
+ const https = require('https')
2
+
3
/**
 * Brave Search API — 2000 queries/month free tier.
 * Requires BRAVE_API_KEY in config or env.
 *
 * @param {string} query - Search terms.
 * @param {{apiKey?: string, maxResults?: number}} [config] - `maxResults`
 *   defaults to 10 and is clamped to 1..20 before being sent as `count`.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 *   Normalised results, or an empty array when the response has no web results.
 * @throws {Error} When no API key is available, or when the request fails.
 */
async function braveSearch(query, config = {}) {
  const apiKey = config.apiKey || process.env.BRAVE_API_KEY
  if (!apiKey) throw new Error('Brave API key not configured')

  // Brave caps `count` at 20; larger values make the whole request fail,
  // so clamp instead of silently ending up with no results.
  const requested = Math.trunc(Number(config.maxResults)) || 10
  const count = Math.min(Math.max(requested, 1), 20)

  const params = new URLSearchParams({
    q: query,
    count: String(count)
  })

  const data = await fetchJson(`https://api.search.brave.com/res/v1/web/search?${params}`, {
    'X-Subscription-Token': apiKey,
    'Accept': 'application/json'
  })

  const hits = data?.web?.results
  if (!Array.isArray(hits)) return []

  return hits.map(r => ({
    url: r.url,
    title: r.title,
    snippet: r.description || '',
    engine: 'brave'
  }))
}
31
+
32
/**
 * GET a URL over HTTPS and parse the response body as JSON.
 *
 * @param {string} url - Absolute https URL.
 * @param {Object<string, string>} [headers] - Extra request headers; the
 *   User-Agent is always set to the Spectrawl agent string.
 * @returns {Promise<*>} Parsed JSON body.
 * @throws {Error} On network failure, a 10 s socket timeout, a non-2xx
 *   status, or a body that is not valid JSON.
 */
function fetchJson(url, headers = {}) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      path: urlObj.pathname + urlObj.search,
      method: 'GET',
      headers: {
        ...headers,
        'User-Agent': 'Spectrawl/0.1.0'
      }
    }

    const req = https.request(opts, (res) => {
      // Decode as a stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', (chunk) => { data += chunk })
      res.on('error', reject)
      res.on('end', () => {
        // Surface HTTP failures instead of handing an error payload to the caller.
        if (res.statusCode < 200 || res.statusCode >= 300) {
          reject(new Error(`Brave API returned HTTP ${res.statusCode}: ${data.slice(0, 200)}`))
          return
        }
        try {
          resolve(JSON.parse(data))
        } catch (e) {
          reject(new Error(`Brave API returned invalid JSON: ${data.slice(0, 200)}`))
        }
      })
    })
    req.on('error', reject)
    // destroy(err) emits 'error' with err, which rejects through the handler above.
    req.setTimeout(10000, () => { req.destroy(new Error('Brave API timeout')) })
    req.end()
  })
}
61
+
62
+ module.exports = { braveSearch }
@@ -0,0 +1,192 @@
1
+ const https = require('https')
2
+ const { URL } = require('url')
3
+
4
/**
 * DuckDuckGo search — free, unlimited, no API key needed.
 * Tries the JSON API first, then the HTML endpoint; the first strategy that
 * yields at least one result wins. A failing strategy is skipped silently.
 *
 * @param {string} query - Search terms.
 * @param {{maxResults?: number}} [config] - `maxResults` defaults to 10.
 * @returns {Promise<Array<object>>} Results from the first successful
 *   strategy, or an empty array when every strategy fails or comes back empty.
 */
async function ddgSearch(query, config = {}) {
  const limit = config.maxResults || 10

  // Ordered by preference: instant-answer JSON first, HTML scrape second.
  const strategies = [ddgJsonApi, ddgHtmlSearch]

  for (const runStrategy of strategies) {
    try {
      const found = await runStrategy(query, limit)
      if (found.length > 0) return found
    } catch (e) { /* fall through */ }
  }

  return []
}
25
+
26
/**
 * Query the DuckDuckGo JSON endpoint and flatten the answer into result
 * objects: the abstract first, then RelatedTopics (including nested Topics
 * groups), then Results.
 *
 * @param {string} query - Search terms.
 * @param {number} maxResults - Collection stops once this many results exist
 *   (the abstract, when present, is always included).
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 */
async function ddgJsonApi(query, maxResults) {
  const endpoint = `https://api.duckduckgo.com/?q=${encodeURIComponent(query)}&format=json&no_html=1&skip_disambig=1`
  const payload = await fetchJson(endpoint)

  const hits = []
  const isFull = () => hits.length >= maxResults

  // Entries without both a URL and text (e.g. group headers) are skipped.
  const addEntry = (entry) => {
    if (!(entry.FirstURL && entry.Text)) return
    hits.push({
      url: entry.FirstURL,
      title: entry.Text.slice(0, 100),
      snippet: entry.Text,
      engine: 'ddg'
    })
  }

  if (payload.AbstractURL && payload.Abstract) {
    hits.push({
      url: payload.AbstractURL,
      title: payload.Heading || query,
      snippet: payload.Abstract,
      engine: 'ddg'
    })
  }

  if (payload.RelatedTopics) {
    for (const topic of payload.RelatedTopics) {
      if (isFull()) break
      addEntry(topic)
      if (!topic.Topics) continue
      for (const nested of topic.Topics) {
        if (isFull()) break
        addEntry(nested)
      }
    }
  }

  if (payload.Results) {
    for (const direct of payload.Results) {
      if (isFull()) break
      addEntry(direct)
    }
  }

  return hits
}
84
+
85
/**
 * Scrape the DuckDuckGo HTML endpoint for result links and snippets.
 *
 * Each snippet is attached to the result link that precedes it in the markup
 * (and comes before the next result link). Pairing by position keeps snippets
 * aligned even when ad results are filtered out or a result has no snippet.
 *
 * @param {string} query - Search terms.
 * @param {number} maxResults - Maximum number of results to return.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 */
async function ddgHtmlSearch(query, maxResults) {
  const url = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`
  const html = await fetchHtml(url)

  const resultRegex = /<a[^>]+class="result__a"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/g
  const snippetRegex = /<a[^>]+class="result__snippet"[^>]*>([\s\S]*?)<\/a>/g

  let match

  // Remember where every anchor and snippet starts so they can be paired by position.
  const anchors = []
  while ((match = resultRegex.exec(html)) !== null) {
    anchors.push({ at: match.index, href: match[1], label: match[2] })
  }

  const snippets = []
  while ((match = snippetRegex.exec(html)) !== null) {
    snippets.push({ at: match.index, body: match[1] })
  }

  const results = []
  let cursor = 0

  for (let i = 0; i < anchors.length && results.length < maxResults; i++) {
    const anchor = anchors[i]
    const nextAt = i + 1 < anchors.length ? anchors[i + 1].at : Infinity

    // Discard snippets located before this anchor; they belong to nothing we track.
    while (cursor < snippets.length && snippets[cursor].at < anchor.at) cursor++

    // Claim the snippet (if any) that sits between this anchor and the next one.
    let snippet = ''
    if (cursor < snippets.length && snippets[cursor].at < nextAt) {
      snippet = stripHtml(snippets[cursor].body)
      cursor++
    }

    const target = decodeUddg(anchor.href)
    // Filter ads — DDG ads go through duckduckgo.com/y.js
    if (isAd(target)) continue

    results.push({
      url: target,
      title: stripHtml(anchor.label),
      snippet,
      engine: 'ddg'
    })
  }

  return results
}
119
+
120
/**
 * Decide whether a DuckDuckGo result URL is an ad.
 *
 * A missing URL counts as an ad. Otherwise the URL is an ad when it contains
 * one of the known ad markers; everything else — including
 * `//duckduckgo.com/l/?` redirect links — is treated as organic.
 *
 * @param {string} url - Result URL.
 * @returns {boolean} True when the result should be dropped.
 */
function isAd(url) {
  if (!url) return true

  const adMarkers = ['duckduckgo.com/y.js', 'ad_provider=', 'ad_domain=']
  return adMarkers.some((marker) => url.includes(marker))
}
134
+
135
/**
 * Extract the real destination from a DuckDuckGo redirect link.
 *
 * When the URL carries a `uddg=` value, that value is percent-decoded and
 * returned. A URL without `uddg=`, a non-string input, or a value with
 * malformed percent-encoding is returned unchanged.
 *
 * @param {string} url - Link taken from the DuckDuckGo results markup.
 * @returns {string} Decoded destination URL, or the input unchanged.
 */
function decodeUddg(url) {
  if (typeof url !== 'string') return url

  const match = url.match(/uddg=([^&]+)/)
  if (!match) return url

  try {
    return decodeURIComponent(match[1])
  } catch (e) {
    // decodeURIComponent throws URIError on malformed escapes; one bad link
    // must not abort the whole result page, so keep the raw URL.
    return url
  }
}
142
+
143
/**
 * GET a URL over HTTPS and parse the response body as JSON.
 *
 * @param {string} url - Absolute https URL.
 * @returns {Promise<*>} Parsed JSON body.
 * @throws {Error} On network failure, a 10 s socket timeout, a non-2xx
 *   status, or a body that is not valid JSON.
 */
function fetchJson(url) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const req = https.get({
      hostname: urlObj.hostname,
      path: urlObj.pathname + urlObj.search,
      headers: { 'User-Agent': 'Spectrawl/0.1.0' }
    }, res => {
      // Decode as a stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => {
        if (res.statusCode < 200 || res.statusCode >= 300) {
          reject(new Error(`DDG API returned HTTP ${res.statusCode}`))
          return
        }
        try { resolve(JSON.parse(data)) }
        catch (e) { reject(new Error('Invalid JSON from DDG API')) }
      })
    })
    req.on('error', reject)
    // Without a timeout a stalled connection would keep the search pending forever.
    req.setTimeout(10000, () => { req.destroy(new Error('DDG API timeout')) })
  })
}
160
+
161
/**
 * GET a URL over HTTPS with browser-like headers and return the body as text.
 *
 * @param {string} url - Absolute https URL.
 * @returns {Promise<string>} Response body decoded as UTF-8.
 * @throws {Error} On network failure, a 10 s socket timeout, or a non-2xx status.
 */
function fetchHtml(url) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const req = https.get({
      hostname: urlObj.hostname,
      path: urlObj.pathname + urlObj.search,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html',
        'Accept-Language': 'en-US,en;q=0.9'
      }
    }, res => {
      // Decode as a stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => {
        if (res.statusCode < 200 || res.statusCode >= 300) {
          reject(new Error(`DDG HTML search returned HTTP ${res.statusCode}`))
          return
        }
        resolve(data)
      })
    })
    req.on('error', reject)
    // Without a timeout a stalled connection would keep the search pending forever.
    req.setTimeout(10000, () => { req.destroy(new Error('DDG HTML search timeout')) })
  })
}
179
+
180
/**
 * Turn an HTML fragment into plain text: remove tags, decode character
 * references, collapse whitespace runs to single spaces, and trim.
 *
 * Entities are decoded in a single pass so already-escaped text is decoded
 * exactly once (`&amp;lt;` becomes `&lt;`, not `<`). Supported references:
 * `&amp; &lt; &gt; &quot; &apos; &nbsp;` plus decimal (`&#39;`) and hex
 * (`&#x27;`) numeric forms. Unknown or out-of-range references are left as is.
 *
 * @param {string} html - HTML fragment.
 * @returns {string} Plain text.
 */
function stripHtml(html) {
  const namedEntities = new Map([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', ' ']
  ])

  const decodeEntity = (entity, body) => {
    if (body[0] !== '#') {
      return namedEntities.has(body) ? namedEntities.get(body) : entity
    }
    const isHex = body[1] === 'x' || body[1] === 'X'
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10)
    // String.fromCodePoint throws outside the Unicode range; keep such text verbatim.
    if (!(codePoint > 0 && codePoint <= 0x10ffff)) return entity
    return String.fromCodePoint(codePoint)
  }

  return html
    .replace(/<[^>]+>/g, '')
    .replace(/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g, decodeEntity)
    .replace(/\s+/g, ' ')
    .trim()
}
191
+
192
+ module.exports = { ddgSearch }
@@ -0,0 +1,50 @@
1
+ const https = require('https')
2
+
3
/**
 * Google Custom Search Engine — 100 queries/day free.
 * Requires GOOGLE_CSE_KEY and GOOGLE_CSE_ID in config or env.
 *
 * @param {string} query - Search terms.
 * @param {{apiKey?: string, cseId?: string, maxResults?: number}} [config] -
 *   `maxResults` defaults to 10 and is capped at 10.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 *   Normalised results, or an empty array when the response has no `items`.
 * @throws {Error} When the API key or engine id is missing.
 */
async function googleCseSearch(query, config = {}) {
  const cseId = config.cseId || process.env.GOOGLE_CSE_ID
  const apiKey = config.apiKey || process.env.GOOGLE_CSE_KEY
  if (!(apiKey && cseId)) throw new Error('Google CSE key/ID not configured')

  // Google caps at 10
  const pageSize = Math.min(config.maxResults || 10, 10)

  const search = new URLSearchParams()
  search.append('key', apiKey)
  search.append('cx', cseId)
  search.append('q', query)
  search.append('num', String(pageSize))

  const payload = await fetchJson(`https://www.googleapis.com/customsearch/v1?${search}`)
  if (!payload.items) return []

  const normalised = []
  for (const item of payload.items) {
    normalised.push({
      url: item.link,
      title: item.title,
      snippet: item.snippet || '',
      engine: 'google-cse'
    })
  }
  return normalised
}
31
+
32
/**
 * GET a URL over HTTPS and parse the response body as JSON.
 *
 * @param {string} url - Absolute https URL.
 * @returns {Promise<*>} Parsed JSON body.
 * @throws {Error} On network failure, a 10 s socket timeout, a non-2xx
 *   status, or a body that is not valid JSON.
 */
function fetchJson(url) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const req = https.get({
      hostname: urlObj.hostname,
      path: urlObj.pathname + urlObj.search,
      headers: { 'User-Agent': 'Spectrawl/0.1.0' }
    }, res => {
      // Decode as a stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => {
        // Quota and auth failures arrive as JSON with a non-2xx status; report
        // them instead of letting them look like an empty result set.
        if (res.statusCode < 200 || res.statusCode >= 300) {
          reject(new Error(`Google CSE returned HTTP ${res.statusCode}: ${data.slice(0, 200)}`))
          return
        }
        try { resolve(JSON.parse(data)) }
        catch (e) { reject(new Error('Invalid JSON from Google CSE')) }
      })
    })
    req.on('error', reject)
    // Without a timeout a stalled connection would keep the search pending forever.
    req.setTimeout(10000, () => { req.destroy(new Error('Google CSE timeout')) })
  })
}
49
+
50
+ module.exports = { googleCseSearch }
@@ -0,0 +1,76 @@
1
+ const https = require('https')
2
+
3
/**
 * Jina Reader — AI-optimized content extraction.
 * The target URL is appended to `https://r.jina.ai/` and the reply is
 * normalised into a flat object.
 *
 * @param {string} url - Page to extract.
 * @param {{apiKey?: string}} [config] - Optional API key; falls back to the
 *   JINA_API_KEY environment variable. Sent as a Bearer token when present.
 * @returns {Promise<{content: string, title: string, url: string, description: string}>}
 *   Missing fields default to empty strings; `url` defaults to the input URL.
 */
async function jinaExtract(url, config = {}) {
  const token = config.apiKey || process.env.JINA_API_KEY

  const headers = {
    'Accept': 'application/json',
    'User-Agent': 'Spectrawl/0.1.0',
    ...(token ? { 'Authorization': `Bearer ${token}` } : {})
  }

  const reply = await fetchJson(`https://r.jina.ai/${url}`, headers)

  // Structured replies nest the fields under `data`; the flat form carries
  // `content` and `title` at the top level.
  const nested = reply.data
  const content = nested?.content || reply.content || ''
  const title = nested?.title || reply.title || ''
  const resolvedUrl = nested?.url || url
  const description = nested?.description || ''

  return { content, title, url: resolvedUrl, description }
}
27
+
28
/**
 * Jina Search — search + extract in one call.
 * The URL-encoded query is appended to `https://s.jina.ai/`.
 *
 * @param {string} query - Search terms.
 * @param {{apiKey?: string, maxResults?: number}} [config] - `maxResults`
 *   defaults to 5; the API key falls back to the JINA_API_KEY environment
 *   variable and is sent as a Bearer token when present.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, fullContent: string, engine: string}>>}
 *   At most `maxResults` entries; empty when the reply has no `data`.
 */
async function jinaSearch(query, config = {}) {
  const limit = config.maxResults || 5
  const token = config.apiKey || process.env.JINA_API_KEY

  const headers = {
    'Accept': 'application/json',
    'User-Agent': 'Spectrawl/0.1.0'
  }
  if (token) headers['Authorization'] = `Bearer ${token}`

  const reply = await fetchJson(`https://s.jina.ai/${encodeURIComponent(query)}`, headers)
  const hits = reply.data || []

  return hits.slice(0, limit).map(({ url, title, description, content }) => ({
    url,
    title: title || '',
    snippet: description || '',
    fullContent: content || '',
    engine: 'jina'
  }))
}
54
+
55
/**
 * GET a URL over HTTPS and return the parsed JSON body. When a successful
 * response is not valid JSON, the raw text is wrapped as
 * `{ content, title: '', url }` instead of failing.
 *
 * @param {string} url - Absolute https URL.
 * @param {Object<string, string>} [headers] - Request headers, sent as given.
 * @returns {Promise<*>} Parsed JSON, or the plain-text wrapper object.
 * @throws {Error} On network failure, a 30 s socket timeout, or a non-2xx status.
 */
function fetchJson(url, headers = {}) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const req = https.get({
      hostname: urlObj.hostname,
      path: urlObj.pathname + urlObj.search,
      headers
    }, res => {
      // Decode as a stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => {
        // An error page must not be passed off as extracted page content.
        if (res.statusCode < 200 || res.statusCode >= 300) {
          reject(new Error(`Jina returned HTTP ${res.statusCode}: ${data.slice(0, 200)}`))
          return
        }
        try { resolve(JSON.parse(data)) }
        catch (e) {
          // Jina sometimes returns plain text/markdown
          resolve({ content: data, title: '', url })
        }
      })
    })
    req.on('error', reject)
    // Generous idle timeout so a stalled connection cannot hang the caller indefinitely.
    req.setTimeout(30000, () => { req.destroy(new Error('Jina request timeout')) })
  })
}
75
+
76
+ module.exports = { jinaExtract, jinaSearch }
@@ -0,0 +1,69 @@
1
+ const http = require('http')
2
+ const https = require('https')
3
+
4
/**
 * SearXNG — self-hosted metasearch engine.
 * Aggregates 70+ search engines (Google, Bing, DDG, etc.)
 * Free, unlimited, no API key needed.
 *
 * Requires a SearXNG instance running (self-hosted or public).
 * Default: http://localhost:8888 (local Docker instance)
 *
 * Docker quick start:
 *   docker run -d -p 8888:8080 searxng/searxng
 *
 * @param {string} query - Search terms.
 * @param {{url?: string, maxResults?: number, engines?: string, categories?: string}} [config]
 *   `url` falls back to SEARXNG_URL and then the local default; trailing
 *   slashes are ignored. `maxResults` defaults to 10, `categories` to
 *   "general"; an empty `engines` means all engines.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string, engines: string[], score: number}>>}
 *   At most `maxResults` entries; empty when the reply has no result list.
 */
async function searxngSearch(query, config = {}) {
  const rawBase = config.url || process.env.SEARXNG_URL || 'http://localhost:8888'
  // Drop trailing slashes so "http://host/" does not become "http://host//search".
  const baseUrl = String(rawBase).replace(/\/+$/, '')
  const maxResults = config.maxResults || 10
  const engines = config.engines || '' // empty = all engines
  const categories = config.categories || 'general'

  const params = new URLSearchParams({
    q: query,
    format: 'json',
    categories,
    pageno: '1'
  })

  if (engines) params.set('engines', engines)

  const url = `${baseUrl}/search?${params}`
  const data = await fetchJson(url)

  // Anything other than an array (missing, error payload) means no results.
  if (!data || !Array.isArray(data.results)) return []

  return data.results.slice(0, maxResults).map(r => ({
    url: r.url,
    title: r.title || '',
    snippet: r.content || '',
    engine: r.engine || 'searxng',
    engines: r.engines || [],
    score: r.score || 0
  }))
}
44
+
45
/**
 * GET a URL over HTTP or HTTPS (chosen from the URL's protocol) and parse the
 * response body as JSON.
 *
 * @param {string} url - Absolute http(s) URL.
 * @returns {Promise<*>} Parsed JSON body.
 * @throws {Error} On network failure, a 10 s socket timeout, a non-2xx
 *   status, or a body that is not valid JSON.
 */
function fetchJson(url) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const client = urlObj.protocol === 'https:' ? https : http

    const req = client.get({
      hostname: urlObj.hostname,
      port: urlObj.port,
      path: urlObj.pathname + urlObj.search,
      headers: {
        'Accept': 'application/json',
        'User-Agent': 'Spectrawl/0.1.0'
      }
    }, res => {
      // Decode as a stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => {
        if (res.statusCode < 200 || res.statusCode >= 300) {
          reject(new Error(`SearXNG returned HTTP ${res.statusCode}: ${data.slice(0, 200)}`))
          return
        }
        try { resolve(JSON.parse(data)) }
        catch (e) { reject(new Error(`SearXNG returned invalid JSON: ${data.slice(0, 200)}`)) }
      })
    })
    req.on('error', reject)
    // Without a timeout an unresponsive instance would keep the search pending forever.
    req.setTimeout(10000, () => { req.destroy(new Error('SearXNG request timeout')) })
  })
}
68
+
69
+ module.exports = { searxngSearch }