spectrawl 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/search/engines/gemini-grounded.js +130 -0
- package/src/search/index.js +31 -19
- package/src/search/scraper.js +14 -11
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.
|
|
3
|
+
"version": "0.3.2",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
|
|
3
|
+
/**
 * Gemini Grounded Search — uses Google's Gemini API with built-in Google Search.
 * Free tier: 1,500 req/day for Flash.
 * Returns both an AI answer AND the search results it found.
 *
 * This is basically free Google search + AI summarization in one call.
 *
 * @param {string} query - The search query.
 * @param {object} [config] - Optional settings.
 * @param {string} [config.apiKey] - Gemini API key; falls back to GEMINI_API_KEY env var.
 * @param {string} [config.model='gemini-2.0-flash'] - Gemini model to call.
 * @returns {Promise<Array<{title: string, url: string, redirectUrl: string, snippet: string, source: string, confidence?: number}>>}
 *   Search results; when non-empty, carries the AI answer on a non-index
 *   `_groundedAnswer` property of the array.
 * @throws {Error} When no API key is available, or the API returns an error payload.
 */
async function geminiGroundedSearch(query, config = {}) {
  const apiKey = config.apiKey || process.env.GEMINI_API_KEY
  if (!apiKey) throw new Error('GEMINI_API_KEY required for grounded search')

  const model = config.model || 'gemini-2.0-flash'
  const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${apiKey}`

  const body = JSON.stringify({
    contents: [{
      parts: [{ text: `Search the web and provide relevant results for: ${query}` }]
    }],
    // google_search tool enables server-side Google grounding
    tools: [{ google_search: {} }],
    generationConfig: {
      temperature: 0.1,
      maxOutputTokens: 1000
    }
  })

  const data = await post(url, body)

  if (data.error) {
    throw new Error(`Gemini grounded search: ${data.error.message}`)
  }

  // Extract grounding metadata (search results) from the first candidate
  const candidate = data.candidates?.[0]
  const grounding = candidate?.groundingMetadata
  const chunks = grounding?.groundingChunks || []
  const answer = candidate?.content?.parts?.map(p => p.text).filter(Boolean).join('\n') || ''

  // Gemini returns vertexaisearch redirect URIs, not the real page URLs
  const rawResults = chunks.map((chunk, i) => ({
    title: chunk.web?.title || `Result ${i + 1}`,
    redirectUrl: chunk.web?.uri || '',
    snippet: '',
    source: 'gemini-grounded'
  })).filter(r => r.redirectUrl)

  // Follow redirects to get real URLs (parallel; resolveRedirect has its own timeout,
  // and any failure falls back to the redirect URL)
  const resolved = await Promise.all(
    rawResults.map(r => resolveRedirect(r.redirectUrl).catch(() => r.redirectUrl))
  )

  const results = rawResults.map((r, i) => ({
    ...r,
    url: resolved[i] || r.redirectUrl
  }))

  // Add confidence scores from grounding supports. A support may reference
  // several chunks; keep the highest score seen per result.
  const supports = grounding?.groundingSupports || []
  for (const support of supports) {
    const indices = support.groundingChunkIndices || []
    const scores = support.confidenceScores || []
    indices.forEach((idx, j) => {
      // FIX: the previous truthiness check (`scores[j]`) silently skipped a
      // legitimate confidence score of exactly 0; test for a finite number instead.
      if (results[idx] && Number.isFinite(scores[j])) {
        results[idx].confidence = Math.max(results[idx].confidence || 0, scores[j])
      }
    })
  }

  // Attach the AI answer as metadata (non-index array property, so it
  // survives iteration but not JSON serialization of the array)
  if (results.length > 0) {
    results._groundedAnswer = answer
  }

  return results
}
|
|
77
|
+
|
|
78
|
+
/**
 * Follow a redirect URL one hop to get the actual destination.
 * Never rejects: resolves with the redirect target for 3xx responses with a
 * Location header, and with the original URL on any error, timeout, or
 * non-redirect response.
 *
 * @param {string} url - Absolute URL to probe with a HEAD request.
 * @returns {Promise<string>} The resolved destination URL (always a string).
 */
function resolveRedirect(url) {
  return new Promise(resolve => {
    let urlObj
    try {
      urlObj = new URL(url)
    } catch {
      // Malformed input — hand it back unchanged rather than rejecting
      resolve(url)
      return
    }
    const client = urlObj.protocol === 'https:' ? https : require('http')
    const req = client.request({
      hostname: urlObj.hostname,
      // FIX: an explicit port in the URL was previously dropped, sending the
      // request to the protocol default port instead
      port: urlObj.port || undefined,
      path: urlObj.pathname + urlObj.search,
      method: 'HEAD',
      headers: { 'User-Agent': 'Spectrawl/0.3' }
    }, res => {
      if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
        // FIX: Location may be a relative reference (RFC 7231 allows it) —
        // resolve it against the request URL before returning
        try {
          resolve(new URL(res.headers.location, url).href)
        } catch {
          resolve(res.headers.location)
        }
      } else {
        resolve(url)
      }
      // FIX: drain the response so the socket is released promptly
      res.resume()
    })
    req.on('error', () => resolve(url))
    req.setTimeout(3000, () => { req.destroy(); resolve(url) })
    req.end()
  })
}
|
|
102
|
+
|
|
103
|
+
/**
 * POST a pre-serialized JSON body to `url` over HTTPS and parse the JSON
 * response.
 *
 * @param {string} url - Absolute HTTPS URL to POST to.
 * @param {string} body - JSON-encoded request payload.
 * @returns {Promise<object>} The parsed JSON response body.
 * @throws {Error} On network failure, a 15s timeout, or an unparseable response.
 */
function post(url, body) {
  return new Promise((resolve, reject) => {
    const { hostname, pathname, search } = new URL(url)
    const req = https.request({
      hostname,
      path: pathname + search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(body)
      }
    }, res => {
      const pieces = []
      res.on('data', chunk => pieces.push(chunk))
      res.on('end', () => {
        const raw = pieces.join('')
        try {
          resolve(JSON.parse(raw))
        } catch {
          reject(new Error(`Invalid Gemini response: ${raw.slice(0, 200)}`))
        }
      })
    })
    req.on('error', reject)
    req.setTimeout(15000, () => { req.destroy(); reject(new Error('Gemini grounded search timeout')) })
    req.write(body)
    req.end()
  })
}
|
|
129
|
// Public API of this engine module
module.exports = { geminiGroundedSearch }
|
package/src/search/index.js
CHANGED
|
@@ -4,6 +4,7 @@ const { serperSearch } = require('./engines/serper')
|
|
|
4
4
|
const { searxngSearch } = require('./engines/searxng')
|
|
5
5
|
const { googleCseSearch } = require('./engines/google-cse')
|
|
6
6
|
const { jinaSearch } = require('./engines/jina')
|
|
7
|
+
const { geminiGroundedSearch } = require('./engines/gemini-grounded')
|
|
7
8
|
const { scrapeUrls } = require('./scraper')
|
|
8
9
|
const { Summarizer } = require('./summarizer')
|
|
9
10
|
const { Reranker } = require('./reranker')
|
|
@@ -15,7 +16,9 @@ const ENGINES = {
|
|
|
15
16
|
brave: braveSearch,
|
|
16
17
|
serper: serperSearch,
|
|
17
18
|
'google-cse': googleCseSearch,
|
|
18
|
-
jina: jinaSearch
|
|
19
|
+
jina: jinaSearch,
|
|
20
|
+
'gemini-grounded': geminiGroundedSearch,
|
|
21
|
+
gemini: geminiGroundedSearch
|
|
19
22
|
}
|
|
20
23
|
|
|
21
24
|
class SearchEngine {
|
|
@@ -105,41 +108,49 @@ class SearchEngine {
|
|
|
105
108
|
}
|
|
106
109
|
|
|
107
110
|
// Check cache
|
|
108
|
-
const cacheKey = `deep:${
|
|
111
|
+
const cacheKey = `deep:${opts.mode || 'full'}:${query}`
|
|
109
112
|
const cached = this.cache?.get('search', cacheKey)
|
|
110
113
|
if (cached) return { ...cached, cached: true }
|
|
111
114
|
|
|
112
|
-
// Step 1: Query expansion
|
|
115
|
+
// Step 1: Query expansion (skip if using Gemini grounded — it searches Google natively)
|
|
113
116
|
let queries = [query]
|
|
114
|
-
|
|
117
|
+
const usesGrounded = this.cascade.includes('gemini-grounded') || this.cascade.includes('gemini')
|
|
118
|
+
if (this.expander && opts.expand !== false && !usesGrounded) {
|
|
115
119
|
queries = await this.expander.expand(query)
|
|
116
120
|
}
|
|
117
121
|
|
|
118
|
-
// Step 2: Search across all query variants
|
|
122
|
+
// Step 2: Search across all query variants
|
|
123
|
+
// When using Gemini Grounded, also run DDG in parallel for volume
|
|
119
124
|
const resultSets = []
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
125
|
+
if (usesGrounded) {
|
|
126
|
+
// Parallel: Gemini for quality + DDG for volume
|
|
127
|
+
const [groundedResults, ddgResults] = await Promise.all([
|
|
128
|
+
this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(() => []),
|
|
129
|
+
this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(() => [])
|
|
130
|
+
])
|
|
131
|
+
resultSets.push(groundedResults, ddgResults)
|
|
132
|
+
} else {
|
|
133
|
+
for (const q of queries) {
|
|
134
|
+
try {
|
|
135
|
+
const r = await this._rawSearch(q, opts)
|
|
136
|
+
resultSets.push(r)
|
|
137
|
+
} catch (e) {
|
|
138
|
+
resultSets.push([])
|
|
139
|
+
}
|
|
140
|
+
if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
|
|
126
141
|
}
|
|
127
|
-
// Small delay between queries to avoid rate limiting
|
|
128
|
-
if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
|
|
129
142
|
}
|
|
130
143
|
|
|
131
144
|
// Step 3: Merge and deduplicate
|
|
132
|
-
let results =
|
|
133
|
-
? this.expander.mergeResults(resultSets)
|
|
134
|
-
: dedupeResults(resultSets.flat())
|
|
145
|
+
let results = dedupeResults(resultSets.flat())
|
|
135
146
|
|
|
136
147
|
// Step 4: Rerank by relevance
|
|
137
148
|
if (this.reranker && opts.rerank !== false) {
|
|
138
149
|
results = await this.reranker.rerank(query, results)
|
|
139
150
|
}
|
|
140
151
|
|
|
141
|
-
// Step 5: Parallel scrape top N for full content
|
|
142
|
-
const scrapeCount = opts.scrapeTop ?? this.scrapeTop ?? 5
|
|
152
|
+
// Step 5: Parallel scrape top N for full content (skip in fast mode)
|
|
153
|
+
const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
|
|
143
154
|
if (scrapeCount > 0 && results.length > 0) {
|
|
144
155
|
const urls = results.slice(0, scrapeCount).map(r => r.url)
|
|
145
156
|
const scraped = await scrapeUrls(urls)
|
|
@@ -188,8 +199,9 @@ class SearchEngine {
|
|
|
188
199
|
async _rawSearch(query, opts = {}) {
|
|
189
200
|
let results = []
|
|
190
201
|
const minResults = opts.minResults || 5
|
|
202
|
+
const cascade = opts.engines || this.cascade
|
|
191
203
|
|
|
192
|
-
for (const engineName of
|
|
204
|
+
for (const engineName of cascade) {
|
|
193
205
|
const engine = ENGINES[engineName]
|
|
194
206
|
if (!engine) continue
|
|
195
207
|
|
package/src/search/scraper.js
CHANGED
|
@@ -16,17 +16,20 @@ async function scrapeUrls(urls, opts = {}) {
|
|
|
16
16
|
const concurrent = opts.concurrent || 3
|
|
17
17
|
const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
|
|
18
18
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
const
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
19
|
+
// All URLs in parallel (with per-URL timeout)
|
|
20
|
+
const promises = urls.map(url => {
|
|
21
|
+
const p = scrapeUrl(url, { timeout, engine }).catch(() => null)
|
|
22
|
+
// Hard timeout per URL
|
|
23
|
+
const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 1000))
|
|
24
|
+
return Promise.race([p, timer])
|
|
25
|
+
})
|
|
26
|
+
const allResults = await Promise.all(promises)
|
|
27
|
+
|
|
28
|
+
urls.forEach((url, idx) => {
|
|
29
|
+
if (allResults[idx]) {
|
|
30
|
+
results[url] = allResults[idx]
|
|
31
|
+
}
|
|
32
|
+
})
|
|
30
33
|
|
|
31
34
|
return results
|
|
32
35
|
}
|