spectrawl 0.3.10 → 0.3.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +94 -62
- package/package.json +1 -1
- package/src/act/adapters/devto.js +18 -3
- package/src/search/index.js +11 -9
- package/src/search/summarizer.js +2 -2
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
The unified web layer for AI agents. Search, browse, authenticate, and act on platforms — one tool, self-hosted, free.
|
|
4
4
|
|
|
5
|
-
**
|
|
5
|
+
**5,000 free searches/month** with Google-quality results via Gemini Grounded Search. Better answers than Tavily. Self-hosted.
|
|
6
6
|
|
|
7
7
|
## What It Does
|
|
8
8
|
|
|
@@ -12,6 +12,40 @@ AI agents need to interact with the web. That means searching, browsing pages, l
|
|
|
12
12
|
npm install spectrawl
|
|
13
13
|
```
|
|
14
14
|
|
|
15
|
+
## Real Output
|
|
16
|
+
|
|
17
|
+
Here's actual output from Spectrawl vs Tavily on the same query:
|
|
18
|
+
|
|
19
|
+
**Query:** `"best open source AI agent frameworks 2025"`
|
|
20
|
+
|
|
21
|
+
### Spectrawl (free)
|
|
22
|
+
```
|
|
23
|
+
Time: 16.8s | Sources: 19
|
|
24
|
+
|
|
25
|
+
Answer: The leading open-source AI agent frameworks for 2025 include AutoGen,
|
|
26
|
+
CrewAI, LangChain, LangGraph, and Semantic Kernel [1, 2, 3]. AutoGen is
|
|
27
|
+
recognized for enabling complex multi-agent conversations, while CrewAI
|
|
28
|
+
focuses on orchestrating collaborative AI agents [1, 2]. LangChain and its
|
|
29
|
+
component LangGraph provide robust tools for building sophisticated agent
|
|
30
|
+
workflows and state management [1, 2, 3]. Semantic Kernel, developed by
|
|
31
|
+
Microsoft, integrates large language models with conventional programming
|
|
32
|
+
languages [1, 2, 3].
|
|
33
|
+
|
|
34
|
+
Other prominent frameworks include LlamaIndex, Haystack, BabyAGI, AgentGPT,
|
|
35
|
+
SuperAGI, MetaGPT, and Open Interpreter [1, 2].
|
|
36
|
+
```
|
|
37
|
+
**12 frameworks named, inline citations, 19 sources**
|
|
38
|
+
|
|
39
|
+
### Tavily ($0.01/query)
|
|
40
|
+
```
|
|
41
|
+
Time: 2s | Sources: 10
|
|
42
|
+
|
|
43
|
+
Answer: In 2025, LangGraph and Microsoft's AutoGen + Semantic Kernel are
|
|
44
|
+
top open-source AI agent frameworks, favored for their robust orchestration
|
|
45
|
+
and enterprise security features.
|
|
46
|
+
```
|
|
47
|
+
**3 frameworks named, no citations, 10 sources**
|
|
48
|
+
|
|
15
49
|
## Quick Start
|
|
16
50
|
|
|
17
51
|
```bash
|
|
@@ -23,32 +57,40 @@ export GEMINI_API_KEY=your-free-key # Get one at aistudio.google.com
|
|
|
23
57
|
const { Spectrawl } = require('spectrawl')
|
|
24
58
|
const web = new Spectrawl()
|
|
25
59
|
|
|
26
|
-
// Deep search —
|
|
27
|
-
const result = await web.deepSearch('
|
|
28
|
-
console.log(result.answer) // AI-generated answer with citations
|
|
60
|
+
// Deep search — returns sources for your agent/LLM to process
|
|
61
|
+
const result = await web.deepSearch('how to build an MCP server in Node.js')
|
|
29
62
|
console.log(result.sources) // [{ title, url, content, score }]
|
|
30
63
|
|
|
31
|
-
//
|
|
64
|
+
// With AI summary (opt-in — uses extra Gemini call)
|
|
65
|
+
const withAnswer = await web.deepSearch('query', { summarize: true })
|
|
66
|
+
console.log(withAnswer.answer) // AI-generated answer with [1] [2] citations
|
|
67
|
+
|
|
68
|
+
// Fast mode — snippets only, skip scraping
|
|
32
69
|
const fast = await web.deepSearch('query', { mode: 'fast' })
|
|
33
70
|
|
|
34
|
-
// Basic search — raw results
|
|
71
|
+
// Basic search — raw results
|
|
35
72
|
const basic = await web.search('query')
|
|
36
73
|
```
|
|
37
74
|
|
|
38
|
-
|
|
75
|
+
> **Why no summary by default?** Your agent already has an LLM. If we summarize AND your agent summarizes, you're paying two LLMs for one answer. We return rich sources — your agent does the rest.
|
|
76
|
+
|
|
77
|
+
## vs Tavily
|
|
39
78
|
|
|
40
79
|
| | Tavily | Spectrawl |
|
|
41
80
|
|---|---|---|
|
|
42
|
-
| Speed | ~2s | ~
|
|
43
|
-
|
|
|
44
|
-
|
|
|
45
|
-
|
|
|
46
|
-
| Cost | $0.01/query | **Free** ✅ |
|
|
81
|
+
| Speed | ~2s ✅ | ~7-17s |
|
|
82
|
+
| Answer quality | Generic (3 items) | **Detailed** (12+ items) ✅ |
|
|
83
|
+
| Inline citations | ❌ | **[1] [2] [3]** ✅ |
|
|
84
|
+
| Results per query | 10 | **12-19** ✅ |
|
|
85
|
+
| Cost | $0.01/query | **Free** (5K/mo) ✅ |
|
|
47
86
|
| Self-hosted | No | **Yes** ✅ |
|
|
87
|
+
| Source ranking | No | **Domain trust scoring** ✅ |
|
|
48
88
|
| Stealth scraping | No | **Yes** ✅ |
|
|
49
89
|
| Auth + posting | No | **24 adapters** ✅ |
|
|
50
90
|
| Cached repeats | No | **<1ms** ✅ |
|
|
51
91
|
|
|
92
|
+
Spectrawl wins on answer quality, result volume, features, and cost. Tavily wins on speed.
|
|
93
|
+
|
|
52
94
|
## Search
|
|
53
95
|
|
|
54
96
|
Default cascade: **Gemini Grounded → Brave → DDG**
|
|
@@ -70,10 +112,10 @@ Gemini Grounded Search gives you Google-quality results through the Gemini API.
|
|
|
70
112
|
|
|
71
113
|
```
|
|
72
114
|
Query → Gemini Grounded + DDG (parallel)
|
|
73
|
-
→ Merge & deduplicate (12-
|
|
115
|
+
→ Merge & deduplicate (12-19 results)
|
|
74
116
|
→ Source quality ranking (boost GitHub/SO/Reddit, penalize SEO spam)
|
|
75
117
|
→ Parallel scraping (Jina → readability → Playwright fallback)
|
|
76
|
-
→ AI
|
|
118
|
+
→ Returns sources to your agent (AI summary opt-in with summarize: true)
|
|
77
119
|
```
|
|
78
120
|
|
|
79
121
|
### What you get without any keys
|
|
@@ -92,56 +134,55 @@ Stealth browsing with anti-detection. Three tiers (auto-detected):
|
|
|
92
134
|
const page = await web.browse('https://example.com')
|
|
93
135
|
console.log(page.content) // extracted text/markdown
|
|
94
136
|
console.log(page.screenshot) // PNG buffer (if requested)
|
|
95
|
-
|
|
96
|
-
// With screenshot
|
|
97
|
-
const page = await web.browse('https://example.com', { screenshot: true })
|
|
98
137
|
```
|
|
99
138
|
|
|
100
139
|
Auto-fallback: if Jina and readability return too little content (<200 chars), Spectrawl renders the page with Playwright and extracts from the rendered DOM. Tavily can't do this — it fails on JS-heavy pages.
|
|
101
140
|
|
|
102
141
|
## Auth
|
|
103
142
|
|
|
104
|
-
Persistent cookie storage (SQLite), multi-account management, automatic
|
|
143
|
+
Persistent cookie storage (SQLite), multi-account management, automatic expiry detection.
|
|
105
144
|
|
|
106
145
|
```js
|
|
107
|
-
//
|
|
108
|
-
await web.auth.
|
|
146
|
+
// Add account
|
|
147
|
+
await web.auth.add('x', { account: '@myhandle', method: 'cookie', cookies })
|
|
109
148
|
|
|
110
149
|
// Check health
|
|
111
|
-
const accounts = await web.
|
|
150
|
+
const accounts = await web.auth.getStatus()
|
|
112
151
|
// [{ platform: 'x', account: '@myhandle', status: 'valid', expiresAt: '...' }]
|
|
113
152
|
```
|
|
114
153
|
|
|
154
|
+
Cookie refresh cron fires `cookie_expiring` and `cookie_expired` events before accounts go stale.
|
|
155
|
+
|
|
115
156
|
## Act — 24 Platform Adapters
|
|
116
157
|
|
|
117
|
-
Post to
|
|
158
|
+
Post to 24+ platforms with one API:
|
|
118
159
|
|
|
119
160
|
```js
|
|
120
|
-
await web.act('
|
|
161
|
+
await web.act('github', 'create-issue', { repo: 'user/repo', title: 'Bug report', body: '...' })
|
|
121
162
|
await web.act('reddit', 'post', { subreddit: 'node', title: '...', text: '...' })
|
|
122
|
-
await web.act('
|
|
163
|
+
await web.act('devto', 'post', { title: '...', body: '...', tags: ['ai'] })
|
|
164
|
+
await web.act('huggingface', 'create-repo', { name: 'my-model', type: 'model' })
|
|
123
165
|
```
|
|
124
166
|
|
|
167
|
+
**Live tested:** GitHub ✅, Reddit ✅, Dev.to ✅, HuggingFace ✅, X (reads) ✅
|
|
168
|
+
|
|
125
169
|
| Platform | Auth Method | Actions |
|
|
126
170
|
|----------|-------------|---------|
|
|
127
|
-
| X/Twitter |
|
|
128
|
-
| Reddit | Cookie API
|
|
129
|
-
| Dev.to | REST API | post |
|
|
171
|
+
| X/Twitter | Cookie + OAuth 1.0a | post |
|
|
172
|
+
| Reddit | Cookie API | post, comment, delete |
|
|
173
|
+
| Dev.to | REST API key | post, update |
|
|
130
174
|
| Hashnode | GraphQL API | post |
|
|
131
175
|
| LinkedIn | Cookie API (Voyager) | post |
|
|
132
|
-
| IndieHackers | Browser automation | post, comment
|
|
133
|
-
| Medium | REST API | post
|
|
176
|
+
| IndieHackers | Browser automation | post, comment |
|
|
177
|
+
| Medium | REST API | post |
|
|
134
178
|
| GitHub | REST v3 | repo, file, issue, release |
|
|
135
|
-
| Discord | Bot API
|
|
136
|
-
| Product Hunt | GraphQL v2 | launch, comment
|
|
137
|
-
| Hacker News | Cookie
|
|
138
|
-
| YouTube | Data API v3 | comment
|
|
139
|
-
| Quora | Browser automation | answer
|
|
179
|
+
| Discord | Bot API | send, thread |
|
|
180
|
+
| Product Hunt | GraphQL v2 | launch, comment |
|
|
181
|
+
| Hacker News | Cookie API | submit, comment |
|
|
182
|
+
| YouTube | Data API v3 | comment |
|
|
183
|
+
| Quora | Browser automation | answer |
|
|
140
184
|
| HuggingFace | Hub API | repo, model card, upload |
|
|
141
185
|
| BetaList | REST API | submit |
|
|
142
|
-
| AlternativeTo | Browser automation | submit |
|
|
143
|
-
| SaaSHub | Browser automation | submit |
|
|
144
|
-
| DevHunt | Browser automation | submit |
|
|
145
186
|
| **14 Directories** | Generic adapter | submit |
|
|
146
187
|
|
|
147
188
|
Built-in rate limiting, content dedup (MD5, 24h window), and a dead-letter queue for retries.
|
|
@@ -156,11 +197,9 @@ Spectrawl ranks results by domain trust — something Tavily doesn't do:
|
|
|
156
197
|
|
|
157
198
|
```js
|
|
158
199
|
const web = new Spectrawl({
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
block: ['spamsite.com']
|
|
163
|
-
}
|
|
200
|
+
sourceRanker: {
|
|
201
|
+
boost: ['github.com', 'news.ycombinator.com'],
|
|
202
|
+
block: ['spamsite.com']
|
|
164
203
|
}
|
|
165
204
|
})
|
|
166
205
|
```
|
|
@@ -174,14 +213,14 @@ npx spectrawl serve --port 3900
|
|
|
174
213
|
```
|
|
175
214
|
POST /search { "query": "...", "summarize": true }
|
|
176
215
|
POST /browse { "url": "...", "screenshot": true }
|
|
177
|
-
POST /act { "platform": "
|
|
178
|
-
GET /status
|
|
179
|
-
GET /health
|
|
216
|
+
POST /act { "platform": "github", "action": "create-issue", ... }
|
|
217
|
+
GET /status — auth account health
|
|
218
|
+
GET /health — server health
|
|
180
219
|
```
|
|
181
220
|
|
|
182
221
|
## MCP Server
|
|
183
222
|
|
|
184
|
-
Works with any MCP-compatible agent
|
|
223
|
+
Works with any MCP-compatible agent (Claude, Cursor, OpenClaw, LangChain):
|
|
185
224
|
|
|
186
225
|
```bash
|
|
187
226
|
npx spectrawl mcp
|
|
@@ -208,19 +247,12 @@ npx spectrawl install-stealth # download Camoufox browser
|
|
|
208
247
|
{
|
|
209
248
|
"search": {
|
|
210
249
|
"cascade": ["gemini-grounded", "brave", "ddg"],
|
|
211
|
-
"scrapeTop":
|
|
250
|
+
"scrapeTop": 5
|
|
212
251
|
},
|
|
213
252
|
"cache": {
|
|
214
253
|
"searchTtl": 3600,
|
|
215
254
|
"scrapeTtl": 86400
|
|
216
255
|
},
|
|
217
|
-
"proxy": {
|
|
218
|
-
"localPort": 8080,
|
|
219
|
-
"strategy": "round-robin",
|
|
220
|
-
"upstreams": [
|
|
221
|
-
{ "url": "http://user:pass@proxy.example.com:8080" }
|
|
222
|
-
]
|
|
223
|
-
},
|
|
224
256
|
"rateLimit": {
|
|
225
257
|
"x": { "postsPerHour": 3 },
|
|
226
258
|
"reddit": { "postsPerHour": 5 }
|
|
@@ -231,14 +263,14 @@ npx spectrawl install-stealth # download Camoufox browser
|
|
|
231
263
|
## Environment Variables
|
|
232
264
|
|
|
233
265
|
```
|
|
234
|
-
GEMINI_API_KEY
|
|
235
|
-
BRAVE_API_KEY Brave Search
|
|
236
|
-
SERPER_API_KEY Serper.dev
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
OPENAI_API_KEY
|
|
241
|
-
ANTHROPIC_API_KEY
|
|
266
|
+
GEMINI_API_KEY Free — primary search + summarization (aistudio.google.com)
|
|
267
|
+
BRAVE_API_KEY Brave Search (2,000 free/month)
|
|
268
|
+
SERPER_API_KEY Serper.dev (2,500 trial queries)
|
|
269
|
+
GITHUB_TOKEN For GitHub adapter
|
|
270
|
+
DEVTO_API_KEY For Dev.to adapter
|
|
271
|
+
HF_TOKEN For HuggingFace adapter
|
|
272
|
+
OPENAI_API_KEY Alternative LLM for summarization
|
|
273
|
+
ANTHROPIC_API_KEY Alternative LLM for summarization
|
|
242
274
|
```
|
|
243
275
|
|
|
244
276
|
## License
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.
|
|
3
|
+
"version": "0.3.12",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
package/src/act/adapters/devto.js
CHANGED
|
@@ -82,16 +82,31 @@ function jsonRequest(method, url, body, headers) {
|
|
|
82
82
|
const urlObj = new URL(url)
|
|
83
83
|
const opts = {
|
|
84
84
|
hostname: urlObj.hostname,
|
|
85
|
-
path: urlObj.pathname,
|
|
85
|
+
path: urlObj.pathname + urlObj.search,
|
|
86
86
|
method,
|
|
87
|
-
headers: {
|
|
87
|
+
headers: {
|
|
88
|
+
...headers,
|
|
89
|
+
'Content-Length': Buffer.byteLength(body),
|
|
90
|
+
'User-Agent': 'Spectrawl/0.3',
|
|
91
|
+
'Accept': 'application/json'
|
|
92
|
+
}
|
|
88
93
|
}
|
|
89
94
|
const req = https.request(opts, res => {
|
|
95
|
+
// Handle redirects
|
|
96
|
+
if ([301, 302, 307, 308].includes(res.statusCode) && res.headers.location) {
|
|
97
|
+
return jsonRequest(method, res.headers.location, body, headers).then(resolve).catch(reject)
|
|
98
|
+
}
|
|
90
99
|
let data = ''
|
|
91
100
|
res.on('data', c => data += c)
|
|
92
101
|
res.on('end', () => {
|
|
102
|
+
if (!data && (res.statusCode >= 200 && res.statusCode < 300)) {
|
|
103
|
+
return resolve({ success: true, statusCode: res.statusCode })
|
|
104
|
+
}
|
|
105
|
+
if (res.statusCode >= 400) {
|
|
106
|
+
return reject(new Error(`Dev.to API ${res.statusCode}: ${data.slice(0, 200)}`))
|
|
107
|
+
}
|
|
93
108
|
try { resolve(JSON.parse(data)) }
|
|
94
|
-
catch (e) { reject(new Error(`Invalid Dev.to response: ${data.slice(0, 200)}`)) }
|
|
109
|
+
catch (e) { reject(new Error(`Invalid Dev.to response (${res.statusCode}): ${data.slice(0, 200)}`)) }
|
|
95
110
|
})
|
|
96
111
|
})
|
|
97
112
|
req.on('error', reject)
|
package/src/search/index.js
CHANGED
|
@@ -29,7 +29,7 @@ class SearchEngine {
|
|
|
29
29
|
this.config = config
|
|
30
30
|
this.cache = cache
|
|
31
31
|
this.cascade = config.cascade || ['ddg', 'brave', 'serper']
|
|
32
|
-
this.scrapeTop = config.scrapeTop ||
|
|
32
|
+
this.scrapeTop = config.scrapeTop || 5
|
|
33
33
|
this.summarizer = config.llm ? new Summarizer(config.llm) : null
|
|
34
34
|
|
|
35
35
|
// Gemini-powered features (free tier)
|
|
@@ -188,16 +188,18 @@ class SearchEngine {
|
|
|
188
188
|
}
|
|
189
189
|
}
|
|
190
190
|
|
|
191
|
-
// Step 6: Summarize with citations
|
|
191
|
+
// Step 6: Summarize with citations (opt-in — most agents have their own LLM)
|
|
192
192
|
let answer = null
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
193
|
+
if (opts.summarize === true) {
|
|
194
|
+
const summarizer = this.summarizer || (process.env.GEMINI_API_KEY ? new Summarizer({
|
|
195
|
+
provider: 'gemini',
|
|
196
|
+
model: 'gemini-2.5-flash',
|
|
197
|
+
apiKey: process.env.GEMINI_API_KEY
|
|
198
|
+
}) : null)
|
|
198
199
|
|
|
199
|
-
|
|
200
|
-
|
|
200
|
+
if (summarizer) {
|
|
201
|
+
answer = await summarizer.summarize(query, results)
|
|
202
|
+
}
|
|
201
203
|
}
|
|
202
204
|
|
|
203
205
|
const response = {
|
package/src/search/summarizer.js
CHANGED
|
@@ -39,8 +39,8 @@ class Summarizer {
|
|
|
39
39
|
if (!this.apiKey) return null
|
|
40
40
|
|
|
41
41
|
const context = sources
|
|
42
|
-
.slice(0,
|
|
43
|
-
.map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0,
|
|
42
|
+
.slice(0, 8)
|
|
43
|
+
.map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0, 1500)}`)
|
|
44
44
|
.join('\n\n')
|
|
45
45
|
|
|
46
46
|
const prompt = `Answer this question directly: "${query}"
|