spectrawl 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/search/engines/gemini-grounded.js +47 -7
- package/src/search/index.js +26 -18
- package/src/search/scraper.js +40 -5
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.1",
|
|
3
|
+
"version": "0.3.3",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
|
package/src/search/engines/gemini-grounded.js
CHANGED
|
@@ -37,19 +37,34 @@ async function geminiGroundedSearch(query, config = {}) {
|
|
|
37
37
|
const chunks = grounding?.groundingChunks || []
|
|
38
38
|
const answer = candidate?.content?.parts?.map(p => p.text).filter(Boolean).join('\n') || ''
|
|
39
39
|
|
|
40
|
-
//
|
|
41
|
-
const
|
|
40
|
+
// Resolve redirect URLs to actual URLs (parallel, with timeout)
|
|
41
|
+
const rawResults = chunks.map((chunk, i) => ({
|
|
42
42
|
title: chunk.web?.title || `Result ${i + 1}`,
|
|
43
|
-
|
|
44
|
-
snippet: '',
|
|
43
|
+
redirectUrl: chunk.web?.uri || '',
|
|
44
|
+
snippet: '',
|
|
45
45
|
source: 'gemini-grounded'
|
|
46
|
-
})).filter(r => r.
|
|
46
|
+
})).filter(r => r.redirectUrl)
|
|
47
47
|
|
|
48
|
-
//
|
|
48
|
+
// Follow redirects to get real URLs
|
|
49
|
+
const resolved = await Promise.all(
|
|
50
|
+
rawResults.map(r => resolveRedirect(r.redirectUrl).catch(() => r.redirectUrl))
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
const results = rawResults.map((r, i) => ({
|
|
54
|
+
...r,
|
|
55
|
+
url: resolved[i] || r.redirectUrl
|
|
56
|
+
}))
|
|
57
|
+
|
|
58
|
+
// Add confidence scores from grounding supports
|
|
49
59
|
const supports = grounding?.groundingSupports || []
|
|
50
60
|
for (const support of supports) {
|
|
51
61
|
const indices = support.groundingChunkIndices || []
|
|
52
|
-
|
|
62
|
+
const scores = support.confidenceScores || []
|
|
63
|
+
indices.forEach((idx, j) => {
|
|
64
|
+
if (results[idx] && scores[j]) {
|
|
65
|
+
results[idx].confidence = Math.max(results[idx].confidence || 0, scores[j])
|
|
66
|
+
}
|
|
67
|
+
})
|
|
53
68
|
}
|
|
54
69
|
|
|
55
70
|
// Attach the AI answer as metadata
|
|
@@ -60,6 +75,31 @@ async function geminiGroundedSearch(query, config = {}) {
|
|
|
60
75
|
return results
|
|
61
76
|
}
|
|
62
77
|
|
|
78
|
+
/**
|
|
79
|
+
* Follow a redirect URL to get the actual destination.
|
|
80
|
+
*/
|
|
81
|
+
function resolveRedirect(url) {
|
|
82
|
+
return new Promise((resolve, reject) => {
|
|
83
|
+
const urlObj = new URL(url)
|
|
84
|
+
const client = urlObj.protocol === 'https:' ? https : require('http')
|
|
85
|
+
const req = client.request({
|
|
86
|
+
hostname: urlObj.hostname,
|
|
87
|
+
path: urlObj.pathname + urlObj.search,
|
|
88
|
+
method: 'HEAD',
|
|
89
|
+
headers: { 'User-Agent': 'Spectrawl/0.3' }
|
|
90
|
+
}, res => {
|
|
91
|
+
if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
|
|
92
|
+
resolve(res.headers.location)
|
|
93
|
+
} else {
|
|
94
|
+
resolve(url)
|
|
95
|
+
}
|
|
96
|
+
})
|
|
97
|
+
req.on('error', () => resolve(url))
|
|
98
|
+
req.setTimeout(3000, () => { req.destroy(); resolve(url) })
|
|
99
|
+
req.end()
|
|
100
|
+
})
|
|
101
|
+
}
|
|
102
|
+
|
|
63
103
|
function post(url, body) {
|
|
64
104
|
return new Promise((resolve, reject) => {
|
|
65
105
|
const urlObj = new URL(url)
|
package/src/search/index.js
CHANGED
|
@@ -108,7 +108,7 @@ class SearchEngine {
|
|
|
108
108
|
}
|
|
109
109
|
|
|
110
110
|
// Check cache
|
|
111
|
-
const cacheKey = `deep:${query}`
|
|
111
|
+
const cacheKey = `deep:${opts.mode || 'full'}:${query}`
|
|
112
112
|
const cached = this.cache?.get('search', cacheKey)
|
|
113
113
|
if (cached) return { ...cached, cached: true }
|
|
114
114
|
|
|
@@ -119,31 +119,38 @@ class SearchEngine {
|
|
|
119
119
|
queries = await this.expander.expand(query)
|
|
120
120
|
}
|
|
121
121
|
|
|
122
|
-
// Step 2: Search across all query variants
|
|
122
|
+
// Step 2: Search across all query variants
|
|
123
|
+
// When using Gemini Grounded, also run DDG in parallel for volume
|
|
123
124
|
const resultSets = []
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
125
|
+
if (usesGrounded) {
|
|
126
|
+
// Parallel: Gemini for quality + DDG for volume
|
|
127
|
+
const [groundedResults, ddgResults] = await Promise.all([
|
|
128
|
+
this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(() => []),
|
|
129
|
+
this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(() => [])
|
|
130
|
+
])
|
|
131
|
+
resultSets.push(groundedResults, ddgResults)
|
|
132
|
+
} else {
|
|
133
|
+
for (const q of queries) {
|
|
134
|
+
try {
|
|
135
|
+
const r = await this._rawSearch(q, opts)
|
|
136
|
+
resultSets.push(r)
|
|
137
|
+
} catch (e) {
|
|
138
|
+
resultSets.push([])
|
|
139
|
+
}
|
|
140
|
+
if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
|
|
130
141
|
}
|
|
131
|
-
// Small delay between queries to avoid rate limiting
|
|
132
|
-
if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
|
|
133
142
|
}
|
|
134
143
|
|
|
135
144
|
// Step 3: Merge and deduplicate
|
|
136
|
-
let results =
|
|
137
|
-
? this.expander.mergeResults(resultSets)
|
|
138
|
-
: dedupeResults(resultSets.flat())
|
|
145
|
+
let results = dedupeResults(resultSets.flat())
|
|
139
146
|
|
|
140
|
-
// Step 4: Rerank by relevance
|
|
141
|
-
if (this.reranker && opts.rerank !== false) {
|
|
147
|
+
// Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
|
|
148
|
+
if (this.reranker && opts.rerank !== false && !usesGrounded) {
|
|
142
149
|
results = await this.reranker.rerank(query, results)
|
|
143
150
|
}
|
|
144
151
|
|
|
145
|
-
// Step 5: Parallel scrape top N for full content
|
|
146
|
-
const scrapeCount = opts.scrapeTop ?? this.scrapeTop ?? 5
|
|
152
|
+
// Step 5: Parallel scrape top N for full content (skip in fast mode)
|
|
153
|
+
const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
|
|
147
154
|
if (scrapeCount > 0 && results.length > 0) {
|
|
148
155
|
const urls = results.slice(0, scrapeCount).map(r => r.url)
|
|
149
156
|
const scraped = await scrapeUrls(urls)
|
|
@@ -192,8 +199,9 @@ class SearchEngine {
|
|
|
192
199
|
async _rawSearch(query, opts = {}) {
|
|
193
200
|
let results = []
|
|
194
201
|
const minResults = opts.minResults || 5
|
|
202
|
+
const cascade = opts.engines || this.cascade
|
|
195
203
|
|
|
196
|
-
for (const engineName of this.cascade) {
|
|
204
|
+
for (const engineName of cascade) {
|
|
197
205
|
const engine = ENGINES[engineName]
|
|
198
206
|
if (!engine) continue
|
|
199
207
|
|
package/src/search/scraper.js
CHANGED
|
@@ -35,13 +35,13 @@ async function scrapeUrls(urls, opts = {}) {
|
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
async function scrapeUrl(url, opts = {}) {
|
|
38
|
-
const { timeout = 10000, engine = 'auto' } = opts
|
|
38
|
+
const { timeout = 10000, engine = 'auto', browse } = opts
|
|
39
39
|
|
|
40
40
|
// Try Jina first if available (better markdown output)
|
|
41
41
|
if (engine === 'jina' || engine === 'auto') {
|
|
42
42
|
try {
|
|
43
43
|
const result = await jinaExtract(url)
|
|
44
|
-
if (result.content && result.content.length >
|
|
44
|
+
if (result.content && result.content.length > 200) {
|
|
45
45
|
return result.content
|
|
46
46
|
}
|
|
47
47
|
} catch (e) {
|
|
@@ -49,9 +49,44 @@ async function scrapeUrl(url, opts = {}) {
|
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
51
|
|
|
52
|
-
// Readability fallback
|
|
53
|
-
|
|
54
|
-
|
|
52
|
+
// Readability fallback (HTTP fetch + HTML→markdown)
|
|
53
|
+
try {
|
|
54
|
+
const html = await fetchPage(url, timeout)
|
|
55
|
+
const content = extractMarkdown(html)
|
|
56
|
+
if (content && content.length > 200) {
|
|
57
|
+
return content
|
|
58
|
+
}
|
|
59
|
+
} catch (e) {
|
|
60
|
+
// Fall through to browser
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// Browser fallback for JS-rendered pages or when extraction is too short
|
|
64
|
+
// This is where we beat Tavily — they can't render JS pages
|
|
65
|
+
if (browse !== false) {
|
|
66
|
+
try {
|
|
67
|
+
const { BrowseEngine } = require('../browse')
|
|
68
|
+
const browser = new BrowseEngine()
|
|
69
|
+
const result = await browser.browse(url, {
|
|
70
|
+
timeout,
|
|
71
|
+
extractText: true,
|
|
72
|
+
screenshot: false
|
|
73
|
+
})
|
|
74
|
+
await browser.close()
|
|
75
|
+
if (result.text && result.text.length > 200) {
|
|
76
|
+
return result.text
|
|
77
|
+
}
|
|
78
|
+
} catch (e) {
|
|
79
|
+
// All methods exhausted
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// Return whatever we got, even if short
|
|
84
|
+
try {
|
|
85
|
+
const html = await fetchPage(url, timeout)
|
|
86
|
+
return extractMarkdown(html)
|
|
87
|
+
} catch (e) {
|
|
88
|
+
return ''
|
|
89
|
+
}
|
|
55
90
|
}
|
|
56
91
|
|
|
57
92
|
function fetchPage(url, timeout = 10000, redirects = 3) {
|