spectrawl 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.js +12 -0
- package/src/search/index.js +119 -0
- package/src/search/query-expander.js +122 -0
- package/src/search/reranker.js +114 -0
- package/src/search/summarizer.js +16 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.2.1",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
package/src/index.js
CHANGED
|
@@ -34,6 +34,18 @@ class Spectrawl {
|
|
|
34
34
|
return this.searchEngine.search(query, opts)
|
|
35
35
|
}
|
|
36
36
|
|
|
37
|
+
/**
|
|
38
|
+
* Deep search — Tavily-equivalent "advanced" mode.
|
|
39
|
+
* Query expansion → parallel search → rerank → scrape → AI answer with citations.
|
|
40
|
+
* Requires GEMINI_API_KEY (free tier) or configured LLM.
|
|
41
|
+
* @param {string} query - Search query
|
|
42
|
+
* @param {object} opts - { scrapeTop, expand, rerank }
|
|
43
|
+
* @returns {Promise<{answer, sources[], queries[], cached}>}
|
|
44
|
+
*/
|
|
45
|
+
async deepSearch(query, opts = {}) {
|
|
46
|
+
return this.searchEngine.deepSearch(query, opts)
|
|
47
|
+
}
|
|
48
|
+
|
|
37
49
|
/**
|
|
38
50
|
* Browse a URL with stealth and optional auth.
|
|
39
51
|
* @param {string} url - URL to browse
|
package/src/search/index.js
CHANGED
|
@@ -6,6 +6,8 @@ const { googleCseSearch } = require('./engines/google-cse')
|
|
|
6
6
|
const { jinaSearch } = require('./engines/jina')
|
|
7
7
|
const { scrapeUrls } = require('./scraper')
|
|
8
8
|
const { Summarizer } = require('./summarizer')
|
|
9
|
+
const { Reranker } = require('./reranker')
|
|
10
|
+
const { QueryExpander } = require('./query-expander')
|
|
9
11
|
|
|
10
12
|
const ENGINES = {
|
|
11
13
|
searxng: searxngSearch,
|
|
@@ -23,6 +25,11 @@ class SearchEngine {
|
|
|
23
25
|
this.cascade = config.cascade || ['ddg', 'brave', 'serper']
|
|
24
26
|
this.scrapeTop = config.scrapeTop || 3
|
|
25
27
|
this.summarizer = config.llm ? new Summarizer(config.llm) : null
|
|
28
|
+
|
|
29
|
+
// Gemini-powered features (free tier)
|
|
30
|
+
const geminiKey = config.geminiKey || process.env.GEMINI_API_KEY
|
|
31
|
+
this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
|
|
32
|
+
this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
|
|
26
33
|
}
|
|
27
34
|
|
|
28
35
|
/**
|
|
@@ -86,6 +93,118 @@ class SearchEngine {
|
|
|
86
93
|
return response
|
|
87
94
|
}
|
|
88
95
|
|
|
96
|
+
/**
|
|
97
|
+
* Deep search — Tavily-equivalent "advanced" mode.
|
|
98
|
+
* Query expansion → parallel search → merge/dedup → rerank → scrape top N → summarize with citations.
|
|
99
|
+
*
|
|
100
|
+
* Returns: { answer, sources: [{title, url, content, score}], cached }
|
|
101
|
+
*/
|
|
102
|
+
async deepSearch(query, opts = {}) {
|
|
103
|
+
if (!query || !query.trim()) {
|
|
104
|
+
throw new Error('Search query is required')
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
// Check cache
|
|
108
|
+
const cacheKey = `deep:${query}:${JSON.stringify(opts)}`
|
|
109
|
+
const cached = this.cache?.get('search', cacheKey)
|
|
110
|
+
if (cached) return { ...cached, cached: true }
|
|
111
|
+
|
|
112
|
+
// Step 1: Query expansion
|
|
113
|
+
let queries = [query]
|
|
114
|
+
if (this.expander && opts.expand !== false) {
|
|
115
|
+
queries = await this.expander.expand(query)
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// Step 2: Search across all query variants (with stagger to avoid rate limits)
|
|
119
|
+
const resultSets = []
|
|
120
|
+
for (const q of queries) {
|
|
121
|
+
try {
|
|
122
|
+
const r = await this._rawSearch(q, opts)
|
|
123
|
+
resultSets.push(r)
|
|
124
|
+
} catch (e) {
|
|
125
|
+
resultSets.push([])
|
|
126
|
+
}
|
|
127
|
+
// Small delay between queries to avoid rate limiting
|
|
128
|
+
if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
// Step 3: Merge and deduplicate
|
|
132
|
+
let results = this.expander
|
|
133
|
+
? this.expander.mergeResults(resultSets)
|
|
134
|
+
: dedupeResults(resultSets.flat())
|
|
135
|
+
|
|
136
|
+
// Step 4: Rerank by relevance
|
|
137
|
+
if (this.reranker && opts.rerank !== false) {
|
|
138
|
+
results = await this.reranker.rerank(query, results)
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
// Step 5: Parallel scrape top N for full content
|
|
142
|
+
const scrapeCount = opts.scrapeTop ?? this.scrapeTop ?? 5
|
|
143
|
+
if (scrapeCount > 0 && results.length > 0) {
|
|
144
|
+
const urls = results.slice(0, scrapeCount).map(r => r.url)
|
|
145
|
+
const scraped = await scrapeUrls(urls)
|
|
146
|
+
|
|
147
|
+
for (const result of results) {
|
|
148
|
+
const scrapedContent = scraped[result.url]
|
|
149
|
+
if (scrapedContent) {
|
|
150
|
+
result.fullContent = scrapedContent
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// Step 6: Summarize with citations
|
|
156
|
+
let answer = null
|
|
157
|
+
const summarizer = this.summarizer || (this.reranker ? new Summarizer({
|
|
158
|
+
provider: 'gemini',
|
|
159
|
+
model: 'gemini-2.0-flash',
|
|
160
|
+
apiKey: process.env.GEMINI_API_KEY
|
|
161
|
+
}) : null)
|
|
162
|
+
|
|
163
|
+
if (summarizer) {
|
|
164
|
+
answer = await summarizer.summarize(query, results)
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
const response = {
|
|
168
|
+
answer,
|
|
169
|
+
sources: results.map(r => ({
|
|
170
|
+
title: r.title,
|
|
171
|
+
url: r.url,
|
|
172
|
+
snippet: r.snippet,
|
|
173
|
+
content: r.fullContent?.slice(0, 2000) || r.snippet || '',
|
|
174
|
+
score: r.score || null
|
|
175
|
+
})),
|
|
176
|
+
queries, // show which queries were used
|
|
177
|
+
cached: false
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
this.cache?.set('search', cacheKey, response)
|
|
181
|
+
return response
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/**
|
|
185
|
+
* Raw search without reranking or summarization.
|
|
186
|
+
* Used internally by deepSearch for parallel query variants.
|
|
187
|
+
*/
|
|
188
|
+
async _rawSearch(query, opts = {}) {
|
|
189
|
+
let results = []
|
|
190
|
+
const minResults = opts.minResults || 5
|
|
191
|
+
|
|
192
|
+
for (const engineName of this.cascade) {
|
|
193
|
+
const engine = ENGINES[engineName]
|
|
194
|
+
if (!engine) continue
|
|
195
|
+
|
|
196
|
+
try {
|
|
197
|
+
const engineResults = await engine(query, this.config[engineName] || {})
|
|
198
|
+
results = dedupeResults([...results, ...engineResults])
|
|
199
|
+
if (results.length >= minResults) break
|
|
200
|
+
} catch (err) {
|
|
201
|
+
continue
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
return results
|
|
206
|
+
}
|
|
207
|
+
|
|
89
208
|
async _summarize(query, results) {
|
|
90
209
|
if (!this.summarizer) return null
|
|
91
210
|
return this.summarizer.summarize(query, results)
|
|
package/src/search/query-expander.js
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
|
|
3
|
+
/**
 * Query expansion — generates variant queries to catch what one search misses.
 * "best CRM" → ["top CRM software 2026", "CRM comparison startups", "best CRM for small business"]
 * Merges and deduplicates results across all variants.
 */
class QueryExpander {
  /**
   * @param {object} [config]
   * @param {string} [config.provider='gemini'] - 'gemini' targets the Gemini REST API;
   *   any other value is sent to the OpenAI chat-completions endpoint.
   * @param {string} [config.model='gemini-2.0-flash'] - Model name passed to the provider.
   * @param {string} [config.apiKey] - API key; falls back to process.env.GEMINI_API_KEY.
   * @param {number} [config.variants=3] - How many alternative queries to ask for.
   */
  constructor(config = {}) {
    this.provider = config.provider || 'gemini'
    this.model = config.model || 'gemini-2.0-flash'
    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
    this.variants = config.variants || 3
  }

  /**
   * Expand a query into multiple search variants.
   *
   * Never rejects: without an API key, or when the LLM call or parsing fails,
   * the result is just [query].
   *
   * @param {string} query - The user's original query.
   * @returns {Promise<string[]>} The original query first, followed by at most
   *   this.variants distinct, non-empty alternative queries.
   */
  async expand(query) {
    if (!this.apiKey) return [query]

    const prompt = [
      `Generate ${this.variants} alternative search queries for: "${query}"`,
      '',
      'Requirements:',
      '- Each should find different but relevant results',
      '- Include synonyms, related terms, different phrasings',
      '- One should be more specific, one broader, one from a different angle',
      '',
      'Respond with ONLY a JSON array of strings. No explanation.',
      'Example: ["alternative query 1", "alternative query 2", "alternative query 3"]'
    ].join('\n')

    try {
      const text = await this._call(prompt)
      // The model may wrap the array in prose or code fences; grab the first [...] span.
      const match = text.match(/\[[\s\S]*?\]/)
      if (!match) return [query]

      const parsed = JSON.parse(match[0])
      if (!Array.isArray(parsed)) return [query]

      // LLM output is untrusted: keep only non-empty strings, and drop repeats
      // (including a repeat of the original query) so no search is wasted on them.
      const seen = new Set([String(query).trim().toLowerCase()])
      const variants = []
      for (const candidate of parsed) {
        if (typeof candidate !== 'string') continue
        const cleaned = candidate.trim()
        const key = cleaned.toLowerCase()
        if (!cleaned || seen.has(key)) continue
        seen.add(key)
        variants.push(cleaned)
        if (variants.length >= this.variants) break
      }

      return [query, ...variants]
    } catch (err) {
      console.warn('Query expansion failed:', err.message)
      return [query]
    }
  }

  /**
   * Merge and deduplicate results from multiple queries.
   * Results are keyed by lower-cased URL; the highest-scored version of each URL
   * wins (a missing score counts as 0) and keeps the position of its first sighting.
   * Missing result sets, and entries without a string url, are ignored.
   *
   * @param {Array<Array<object>>} resultSets - One result list per query variant.
   * @returns {object[]} Deduplicated results.
   */
  mergeResults(resultSets) {
    const seen = new Map() // url → result

    for (const results of resultSets) {
      if (results == null) continue // tolerate a variant that produced nothing

      for (const r of results) {
        const rawUrl = r?.url
        if (typeof rawUrl !== 'string' || !rawUrl) continue
        const url = rawUrl.toLowerCase()

        const existing = seen.get(url)
        if (!existing || (r.score || 0) > (existing.score || 0)) {
          seen.set(url, r)
        }
      }
    }

    return Array.from(seen.values())
  }

  /**
   * Send the prompt to the configured provider and return the raw text reply.
   * Provider 'gemini' posts to the generateContent endpoint; anything else posts
   * to the OpenAI chat-completions endpoint with a Bearer token.
   *
   * @param {string} prompt
   * @returns {Promise<string>} Reply text, or '[]' when the response carries none.
   */
  async _call(prompt) {
    if (this.provider === 'gemini') {
      const model = this.model || 'gemini-2.0-flash'
      const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
      const body = JSON.stringify({
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { temperature: 0.7, maxOutputTokens: 200 }
      })
      const data = await postJson(url, body)
      return data.candidates?.[0]?.content?.parts?.[0]?.text || '[]'
    }

    const url = 'https://api.openai.com/v1/chat/completions'
    const body = JSON.stringify({
      model: this.model,
      messages: [{ role: 'user', content: prompt }],
      max_tokens: 200,
      temperature: 0.7
    })
    const data = await postJson(url, body, { 'Authorization': `Bearer ${this.apiKey}` })
    return data.choices?.[0]?.message?.content || '[]'
  }
}
|
|
93
|
+
|
|
94
|
+
/**
 * POST an already-serialized JSON body over HTTPS and resolve with the parsed JSON reply.
 *
 * @param {string} url - Absolute https URL; its query string and any explicit port are preserved.
 * @param {string} body - Request payload, already JSON-stringified.
 * @param {object} [extraHeaders] - Extra headers, merged over the JSON defaults.
 * @returns {Promise<object>} The parsed response body. Rejects on a network error,
 *   a socket idle for 15 s, a non-JSON body, or a non-2xx status.
 */
function postJson(url, body, extraHeaders = {}) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      // Honor an explicit port in the URL; undefined lets https use its default.
      port: urlObj.port || undefined,
      path: urlObj.pathname + urlObj.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(body),
        ...extraHeaders
      }
    }
    const req = https.request(opts, res => {
      // Decode at the stream level so a multi-byte character split across two
      // chunks is not corrupted by per-chunk string conversion.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => {
        let parsed
        try {
          parsed = JSON.parse(data)
        } catch (e) {
          reject(new Error(`Invalid response: ${data.slice(0, 200)}`))
          return
        }
        // Surface API errors (bad key, quota, ...) instead of handing callers an
        // error payload that merely looks like an empty answer.
        const status = res.statusCode
        if (status < 200 || status >= 300) {
          reject(new Error(`HTTP ${status}: ${parsed?.error?.message || data.slice(0, 200)}`))
          return
        }
        resolve(parsed)
      })
    })
    req.on('error', reject)
    // destroy(err) emits 'error' with this error, so the promise rejects exactly once.
    req.setTimeout(15000, () => req.destroy(new Error('Expander timeout')))
    req.write(body)
    req.end()
  })
}
|
|
121
|
+
|
|
122
|
+
module.exports = { QueryExpander }
|
|
package/src/search/reranker.js
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
|
|
3
|
+
/**
 * AI result reranker — scores search results by relevance.
 * Uses Gemini Flash by default (free, fast).
 * This is Tavily's secret sauce: AI-scored relevance, not raw search order.
 */
class Reranker {
  /**
   * @param {object} [config] - { provider, model, apiKey }. Defaults: provider 'gemini',
   *   model 'gemini-2.0-flash', apiKey from process.env.GEMINI_API_KEY.
   */
  constructor(config = {}) {
    const { provider, model, apiKey } = config
    this.provider = provider || 'gemini'
    this.model = model || 'gemini-2.0-flash'
    this.apiKey = apiKey || process.env.GEMINI_API_KEY
  }

  /**
   * Ask the model to score the first 20 results against the query and return
   * them highest score first, each copied with a `score` field. Results past
   * the 20th are appended in their original order with score 0.
   *
   * The input is returned untouched when there is no API key, when there are
   * fewer than two results, when the model returns the wrong number of scores,
   * or when the call fails.
   */
  async rerank(query, results) {
    const nothingToRank = !this.apiKey || results.length <= 1
    if (nothingToRank) return results

    const head = results.slice(0, 20) // Max 20 results to rerank

    const listing = head
      .map((item, idx) => `[${idx}] ${item.title}\n${(item.snippet || item.content || '').slice(0, 200)}`)
      .join('\n\n')

    const prompt = [
      "Score each search result's relevance to the query on a scale of 0.0 to 1.0.",
      '',
      `Query: "${query}"`,
      '',
      'Results:',
      listing,
      '',
      'Respond with ONLY a JSON array of scores, one per result. Example: [0.95, 0.72, 0.31]',
      'No explanation, just the array.'
    ].join('\n')

    try {
      const reply = await this._call(prompt)
      // Accept only a bare numeric array: digits, dots, commas and whitespace.
      const found = reply.match(/\[[\d.,\s]+\]/)
      const scores = JSON.parse(found ? found[0] : '[]')

      // A score count that does not line up with the batch cannot be trusted.
      if (scores.length !== head.length) return results

      const ranked = head.map((item, idx) => ({ ...item, score: scores[idx] || 0 }))
      ranked.sort((left, right) => right.score - left.score)

      // Everything beyond the batch limit goes after the ranked block, unscored.
      for (const item of results.slice(20)) {
        ranked.push({ ...item, score: 0 })
      }

      return ranked
    } catch (err) {
      console.warn('Reranking failed, using original order:', err.message)
      return results
    }
  }

  /**
   * Send the prompt to the configured provider and return its text reply,
   * or '[]' when the response carries no text.
   */
  async _call(prompt) {
    if (this.provider !== 'gemini') {
      // Fallback: OpenAI-compatible
      const endpoint = this.provider === 'minimax'
        ? 'https://api.minimax.chat/v1/text/chatcompletion_v2'
        : 'https://api.openai.com/v1/chat/completions'

      const payload = JSON.stringify({
        model: this.model,
        messages: [{ role: 'user', content: prompt }],
        max_tokens: 200,
        temperature: 0
      })
      const reply = await postJson(endpoint, payload, {
        'Authorization': `Bearer ${this.apiKey}`
      })
      return reply.choices?.[0]?.message?.content || '[]'
    }

    const modelName = this.model || 'gemini-2.0-flash'
    const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${modelName}:generateContent?key=${this.apiKey}`
    const payload = JSON.stringify({
      contents: [{ parts: [{ text: prompt }] }],
      generationConfig: { temperature: 0, maxOutputTokens: 200 }
    })
    const reply = await postJson(endpoint, payload)
    return reply.candidates?.[0]?.content?.parts?.[0]?.text || '[]'
  }
}
|
|
85
|
+
|
|
86
|
+
/**
 * POST an already-serialized JSON body over HTTPS and resolve with the parsed JSON reply.
 *
 * @param {string} url - Absolute https URL; its query string and any explicit port are preserved.
 * @param {string} body - Request payload, already JSON-stringified.
 * @param {object} [extraHeaders] - Extra headers, merged over the JSON defaults.
 * @returns {Promise<object>} The parsed response body. Rejects on a network error,
 *   a socket idle for 15 s, a non-JSON body, or a non-2xx status.
 */
function postJson(url, body, extraHeaders = {}) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      // Honor an explicit port in the URL; undefined lets https use its default.
      port: urlObj.port || undefined,
      path: urlObj.pathname + urlObj.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(body),
        ...extraHeaders
      }
    }
    const req = https.request(opts, res => {
      // Decode at the stream level so a multi-byte character split across two
      // chunks is not corrupted by per-chunk string conversion.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => {
        let parsed
        try {
          parsed = JSON.parse(data)
        } catch (e) {
          reject(new Error(`Invalid response: ${data.slice(0, 200)}`))
          return
        }
        // Surface API errors (bad key, quota, ...) instead of handing callers an
        // error payload that merely looks like an empty answer.
        const status = res.statusCode
        if (status < 200 || status >= 300) {
          reject(new Error(`HTTP ${status}: ${parsed?.error?.message || data.slice(0, 200)}`))
          return
        }
        resolve(parsed)
      })
    })
    req.on('error', reject)
    // destroy(err) emits 'error' with this error, so the promise rejects exactly once.
    req.setTimeout(15000, () => req.destroy(new Error('Reranker timeout')))
    req.write(body)
    req.end()
  })
}
|
|
113
|
+
|
|
114
|
+
module.exports = { Reranker }
|
package/src/search/summarizer.js
CHANGED
|
@@ -17,7 +17,8 @@ class Summarizer {
|
|
|
17
17
|
openai: 'OPENAI_API_KEY',
|
|
18
18
|
anthropic: 'ANTHROPIC_API_KEY',
|
|
19
19
|
minimax: 'MINIMAX_API_KEY',
|
|
20
|
-
xai: 'XAI_API_KEY'
|
|
20
|
+
xai: 'XAI_API_KEY',
|
|
21
|
+
gemini: 'GEMINI_API_KEY'
|
|
21
22
|
}
|
|
22
23
|
return keys[this.provider] || 'OPENAI_API_KEY'
|
|
23
24
|
}
|
|
@@ -56,6 +57,8 @@ Answer:`
|
|
|
56
57
|
return this._openaiCompatible(prompt)
|
|
57
58
|
case 'anthropic':
|
|
58
59
|
return this._anthropic(prompt)
|
|
60
|
+
case 'gemini':
|
|
61
|
+
return this._gemini(prompt)
|
|
59
62
|
case 'ollama':
|
|
60
63
|
return this._ollama(prompt)
|
|
61
64
|
default:
|
|
@@ -106,6 +109,18 @@ Answer:`
|
|
|
106
109
|
return data.content?.[0]?.text || null
|
|
107
110
|
}
|
|
108
111
|
|
|
112
|
+
async _gemini(prompt) {
|
|
113
|
+
const model = this.model || 'gemini-2.0-flash'
|
|
114
|
+
const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
|
|
115
|
+
const body = JSON.stringify({
|
|
116
|
+
contents: [{ parts: [{ text: prompt }] }],
|
|
117
|
+
generationConfig: { temperature: 0.3, maxOutputTokens: 500 }
|
|
118
|
+
})
|
|
119
|
+
|
|
120
|
+
const data = await postJson(url, body, { 'Content-Type': 'application/json' })
|
|
121
|
+
return data.candidates?.[0]?.content?.parts?.[0]?.text || null
|
|
122
|
+
}
|
|
123
|
+
|
|
109
124
|
async _ollama(prompt) {
|
|
110
125
|
const url = this.baseUrl || 'http://localhost:11434/api/generate'
|
|
111
126
|
const body = JSON.stringify({
|