spectrawl 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.js +16 -1
- package/src/search/engines/bing.js +123 -0
- package/src/search/engines/ddg.js +81 -35
- package/src/search/index.js +37 -9
- package/src/search/source-ranker.js +138 -0
- package/src/search/summarizer.js +9 -4
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.4",
|
|
3
|
+
"version": "0.3.6",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
package/src/index.js
CHANGED
|
@@ -12,9 +12,24 @@ const { EventEmitter, EVENTS } = require('./events')
|
|
|
12
12
|
const { CookieRefresher } = require('./auth/refresh')
|
|
13
13
|
const { loadConfig } = require('./config')
|
|
14
14
|
|
|
15
|
+
/**
 * Recursively merge `source` over `target` and return a new object.
 *
 * Plain objects are merged key by key; every other value (arrays, primitives,
 * null) from `source` replaces the corresponding value wholesale. Neither
 * argument is mutated.
 *
 * @param {object} target - Base configuration (e.g. defaults).
 * @param {object} source - Overrides to apply on top of `target`.
 * @returns {object} A new merged object.
 */
function deepMergeConfig(target, source) {
  const isPlainObject = value =>
    value !== null && typeof value === 'object' && !Array.isArray(value)

  const base = isPlainObject(target) ? target : {}
  const result = { ...base }

  for (const key of Object.keys(source || {})) {
    // An own "__proto__" key (e.g. from JSON.parse) would replace the
    // prototype of the merged object when assigned, so never copy it.
    if (key === '__proto__') continue

    const value = source[key]
    if (isPlainObject(value)) {
      // Only merge into an existing plain object; a string/array default must
      // be replaced, not spread into index keys.
      const existing = isPlainObject(base[key]) ? base[key] : {}
      result[key] = deepMergeConfig(existing, value)
    } else {
      result[key] = value
    }
  }
  return result
}
|
|
26
|
+
|
|
15
27
|
class Spectrawl {
|
|
16
28
|
constructor(configPath) {
|
|
17
|
-
|
|
29
|
+
// Accept either a file path (string) or a config object
|
|
30
|
+
this.config = (typeof configPath === 'object' && configPath !== null)
|
|
31
|
+
? deepMergeConfig(loadConfig(null), configPath)
|
|
32
|
+
: loadConfig(configPath)
|
|
18
33
|
this.events = new EventEmitter()
|
|
19
34
|
this.cache = new Cache(this.config.cache)
|
|
20
35
|
this.searchEngine = new SearchEngine(this.config.search, this.cache)
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
const { URL } = require('url')
|
|
3
|
+
|
|
4
|
+
/**
 * Bing web search — scrapes Bing HTML results.
 * No API key needed. More reliable from datacenter IPs than DDG.
 * DDG actually uses Bing's index anyway — this goes direct.
 *
 * Best-effort: any failure (network error, timeout, block page) yields an
 * empty array instead of throwing.
 *
 * @param {string} query - Search query.
 * @param {{maxResults?: number}} [config] - Optional settings; `maxResults` defaults to 10.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 */
async function bingSearch(query, config = {}) {
  // Tolerate an explicit null config as well as a missing one.
  const maxResults = (config && config.maxResults) || 10

  try {
    const html = await fetchBing(query)

    // Detect blocks: challenge pages mention these markers or are very short.
    if (html.includes('captcha') || html.includes('unusual traffic') || html.length < 1000) {
      return []
    }

    return parseBingResults(html, maxResults)
  } catch (e) {
    // Stay best-effort, but make the failure visible when debugging.
    if (process.env.SPECTRAWL_DEBUG) {
      console.warn('[bing] search failed:', e && e.message)
    }
    return []
  }
}
|
|
25
|
+
|
|
26
|
+
/**
 * Extract organic results from a Bing results page.
 *
 * @param {string} html - Raw HTML of the results page.
 * @param {number} maxResults - Upper bound on the number of results returned.
 * @returns {Array<{url: string, title: string, snippet: string, engine: string}>}
 */
function parseBingResults(html, maxResults) {
  const results = []

  // True for links that point back at Bing itself. Compares parsed hostnames
  // so that unrelated hosts merely containing "bing.com" (plumbing.com) are kept.
  const isBingInternal = link => {
    try {
      const { hostname, pathname } = new URL(link)
      if (hostname === 'bing.com' || hostname.endsWith('.bing.com')) return true
      const onMicrosoft = hostname === 'microsoft.com' || hostname.endsWith('.microsoft.com')
      return onMicrosoft && pathname.startsWith('/bing')
    } catch {
      // Not an absolute URL: fall back to the plain substring check.
      return link.includes('bing.com') || link.includes('microsoft.com/bing')
    }
  }

  // Result blocks are <li> elements whose class list contains "b_algo";
  // allow extra classes and attributes on the tag.
  const blockRegex = /<li\b[^>]*\bclass="[^"]*\bb_algo\b[^"]*"[^>]*>([\s\S]*?)<\/li>/g
  let block
  while (results.length < maxResults && (block = blockRegex.exec(html)) !== null) {
    const content = block[1]

    // Extract URL and title from <h2><a href="...">title</a></h2>
    const linkMatch = content.match(/<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i)
    if (!linkMatch) continue

    // The href is an HTML attribute value, so "&amp;" must be decoded to get
    // a usable URL.
    const url = linkMatch[1].replace(/&amp;/g, '&')
    const title = stripHtml(linkMatch[2])

    // Skip Bing internal links
    if (isBingInternal(url)) continue

    // Extract snippet from <p> or <div class="b_caption">
    const snippetMatch = content.match(/<p[^>]*>([\s\S]*?)<\/p>/i) ||
      content.match(/<div\s+class="b_caption"[^>]*>([\s\S]*?)<\/div>/i)
    const snippet = snippetMatch ? stripHtml(snippetMatch[1]) : ''

    results.push({ url, title, snippet, engine: 'bing' })
  }

  return results
}
|
|
55
|
+
|
|
56
|
+
/**
 * Fetch the Bing results page for a query.
 *
 * Sends browser-like headers, follows one redirect (via fetchUrl) and gives
 * up after 8 seconds.
 *
 * @param {string} query - Search query (URL-encoded here).
 * @returns {Promise<string>} Response body as text.
 */
function fetchBing(query) {
  return new Promise((resolve, reject) => {
    const path = `/search?q=${encodeURIComponent(query)}&setlang=en&count=15`
    const opts = {
      hostname: 'www.bing.com',
      path,
      method: 'GET',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'identity',
        'DNT': '1'
      }
    }

    const req = https.get(opts, res => {
      // Follow redirects
      if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
        // Drain the redirect body so the socket is released.
        res.resume()
        let target
        try {
          // Resolves absolute, protocol-relative and path-relative locations.
          target = new URL(res.headers.location, 'https://www.bing.com').href
        } catch (e) {
          return reject(e)
        }
        return fetchUrl(target).then(resolve, reject)
      }

      // Decode as UTF-8 across chunk boundaries; concatenating raw Buffers
      // can split a multi-byte character in two.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => resolve(data))
    })
    req.on('error', reject)
    req.setTimeout(8000, () => { req.destroy(); reject(new Error('Bing timeout')) })
  })
}
|
|
90
|
+
|
|
91
|
+
/**
 * Fetch an absolute http(s) URL and resolve with its body as text.
 * Used for the redirect target of a Bing request; does not follow further
 * redirects.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} Response body as text.
 */
function fetchUrl(url) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const client = urlObj.protocol === 'https:' ? https : require('http')
    const req = client.get({
      hostname: urlObj.hostname,
      // Keep an explicit port from the URL; undefined falls back to the protocol default.
      port: urlObj.port || undefined,
      path: urlObj.pathname + urlObj.search,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html'
      }
    }, res => {
      // Decode as UTF-8 across chunk boundaries.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', c => { data += c })
      res.on('error', reject)
      res.on('end', () => resolve(data))
    })
    req.on('error', reject)
    // Same 8s budget as the initial request, so a stalled redirect cannot hang the search.
    req.setTimeout(8000, () => { req.destroy(); reject(new Error('Bing redirect timeout')) })
  })
}
|
|
109
|
+
|
|
110
|
+
/**
 * Reduce an HTML fragment to plain text: drop tags, decode the common
 * character entities, collapse whitespace and trim.
 *
 * @param {string} html - HTML fragment.
 * @returns {string} Plain text.
 */
function stripHtml(html) {
  return html
    .replace(/<[^>]+>/g, '')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&(?:#39|#x27|apos);/gi, "'")
    .replace(/&nbsp;/g, ' ')
    // Decode "&amp;" last so "&amp;lt;" becomes the text "&lt;", not "<".
    .replace(/&amp;/g, '&')
    .replace(/\s+/g, ' ')
    .trim()
}
|
|
122
|
+
|
|
123
|
+
module.exports = { bingSearch }
|
|
@@ -1,31 +1,46 @@
|
|
|
1
1
|
const https = require('https')
|
|
2
|
+
const http = require('http')
|
|
2
3
|
const { URL } = require('url')
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* DuckDuckGo search — free, unlimited, no API key needed.
|
|
6
|
-
* Uses JSON API + HTML fallback
|
|
7
|
+
* Uses JSON API + HTML fallback + lite fallback.
|
|
8
|
+
* Built-in retry with backoff for datacenter IP rate limiting.
|
|
9
|
+
* Optional proxy support for reliable results.
|
|
7
10
|
*/
|
|
8
11
|
async function ddgSearch(query, config = {}) {
  const maxResults = config.maxResults || 10
  const proxy = config.proxy || null

  // Strategies in priority order: JSON API (instant answers), then the HTML
  // endpoint, then the lite endpoint.
  const strategies = [
    () => ddgJsonApi(query, maxResults, proxy),
    () => ddgHtmlSearch(query, maxResults, 'html.duckduckgo.com', proxy),
    () => ddgHtmlSearch(query, maxResults, 'lite.duckduckgo.com', proxy)
  ]

  // Two passes over the strategies, with a randomized 1-2s pause before the second.
  let attempt = 0
  while (attempt < 2) {
    if (attempt > 0) await delay(1000 + Math.random() * 1000)

    for (const runStrategy of strategies) {
      try {
        const found = await runStrategy()
        if (found.length > 0) return found
      } catch (e) { /* fall through */ }
    }

    attempt += 1
  }

  return []
}
|
|
25
40
|
|
|
26
|
-
async function ddgJsonApi(query, maxResults) {
|
|
41
|
+
async function ddgJsonApi(query, maxResults, proxy) {
|
|
27
42
|
const url = `https://api.duckduckgo.com/?q=${encodeURIComponent(query)}&format=json&no_html=1&skip_disambig=1`
|
|
28
|
-
const data = await fetchJson(url)
|
|
43
|
+
const data = await fetchJson(url, proxy)
|
|
29
44
|
|
|
30
45
|
const results = []
|
|
31
46
|
|
|
@@ -82,9 +97,14 @@ async function ddgJsonApi(query, maxResults) {
|
|
|
82
97
|
return results
|
|
83
98
|
}
|
|
84
99
|
|
|
85
|
-
async function ddgHtmlSearch(query, maxResults) {
|
|
86
|
-
const
|
|
87
|
-
const html = await fetchHtml(
|
|
100
|
+
async function ddgHtmlSearch(query, maxResults, hostname, proxy) {
|
|
101
|
+
const path = `/html/?q=${encodeURIComponent(query)}`
|
|
102
|
+
const html = await fetchHtml(`https://${hostname}${path}`, proxy)
|
|
103
|
+
|
|
104
|
+
// Detect CAPTCHA / rate limit
|
|
105
|
+
if (html.includes('g-recaptcha') || html.includes('bot detected') || html.length < 500) {
|
|
106
|
+
return []
|
|
107
|
+
}
|
|
88
108
|
|
|
89
109
|
const results = []
|
|
90
110
|
|
|
@@ -95,11 +115,30 @@ async function ddgHtmlSearch(query, maxResults) {
|
|
|
95
115
|
let match
|
|
96
116
|
while ((match = resultRegex.exec(html)) !== null) {
|
|
97
117
|
const url = decodeUddg(match[1])
|
|
98
|
-
// Filter ads — DDG ads go through duckduckgo.com/y.js
|
|
99
118
|
if (isAd(url)) continue
|
|
100
119
|
links.push({ url, title: stripHtml(match[2]) })
|
|
101
120
|
}
|
|
102
121
|
|
|
122
|
+
// Lite endpoint uses different selectors
|
|
123
|
+
if (links.length === 0) {
|
|
124
|
+
const liteRegex = /<a[^>]+class="result-link"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/g
|
|
125
|
+
while ((match = liteRegex.exec(html)) !== null) {
|
|
126
|
+
const url = decodeUddg(match[1])
|
|
127
|
+
if (isAd(url)) continue
|
|
128
|
+
links.push({ url, title: stripHtml(match[2]) })
|
|
129
|
+
}
|
|
130
|
+
// Even simpler: just grab all non-DDG links from lite results
|
|
131
|
+
if (links.length === 0) {
|
|
132
|
+
const anyLink = /<a[^>]*href="(https?:\/\/(?!duckduckgo)[^"]+)"[^>]*>([\s\S]*?)<\/a>/g
|
|
133
|
+
while ((match = anyLink.exec(html)) !== null) {
|
|
134
|
+
if (results.length >= maxResults) break
|
|
135
|
+
const url = match[1]
|
|
136
|
+
if (isAd(url)) continue
|
|
137
|
+
links.push({ url, title: stripHtml(match[2]) })
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
103
142
|
const snippets = []
|
|
104
143
|
while ((match = snippetRegex.exec(html)) !== null) {
|
|
105
144
|
snippets.push(stripHtml(match[1]))
|
|
@@ -117,18 +156,11 @@ async function ddgHtmlSearch(query, maxResults) {
|
|
|
117
156
|
return results
|
|
118
157
|
}
|
|
119
158
|
|
|
120
|
-
/**
|
|
121
|
-
* Filter out DDG ads.
|
|
122
|
-
*/
|
|
123
159
|
// Returns true for missing URLs and for URLs carrying a DDG ad marker.
function isAd(url) {
  if (!url) return true
  const adMarkers = ['duckduckgo.com/y.js', 'ad_provider=', 'ad_domain=']
  return adMarkers.some(marker => url.includes(marker))
}
|
|
134
166
|
|
|
@@ -140,28 +172,34 @@ function decodeUddg(url) {
|
|
|
140
172
|
return url
|
|
141
173
|
}
|
|
142
174
|
|
|
143
|
-
function
|
|
175
|
+
// Resolve (with undefined) after `ms` milliseconds.
function delay(ms) {
  return new Promise(resolve => {
    setTimeout(resolve, ms)
  })
}
|
|
176
|
+
|
|
177
|
+
/**
 * GET a URL over HTTPS and parse the response body as JSON.
 *
 * @param {string} url - Absolute https URL.
 * @param {*} proxy - Accepted for signature parity with callers.
 *   NOTE(review): not used by this function — requests always go direct.
 * @returns {Promise<*>} Parsed JSON; rejects on network error, an 8s timeout
 *   or an unparseable body.
 */
function fetchJson(url, proxy) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      path: urlObj.pathname + urlObj.search,
      headers: { 'User-Agent': 'Spectrawl/0.3' }
    }

    const req = https.get(opts, res => {
      // Decode as UTF-8 across chunk boundaries; concatenating raw Buffers
      // can split a multi-byte character in two.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => {
        try { resolve(JSON.parse(data)) }
        catch (e) { reject(new Error('Invalid JSON from DDG API')) }
      })
    })
    req.on('error', reject)
    req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG timeout')) })
  })
}
|
|
160
198
|
|
|
161
|
-
function fetchHtml(url) {
|
|
199
|
+
function fetchHtml(url, proxy) {
|
|
162
200
|
return new Promise((resolve, reject) => {
|
|
163
201
|
const urlObj = new URL(url)
|
|
164
|
-
|
|
202
|
+
const opts = {
|
|
165
203
|
hostname: urlObj.hostname,
|
|
166
204
|
path: urlObj.pathname + urlObj.search,
|
|
167
205
|
headers: {
|
|
@@ -169,11 +207,19 @@ function fetchHtml(url) {
|
|
|
169
207
|
'Accept': 'text/html',
|
|
170
208
|
'Accept-Language': 'en-US,en;q=0.9'
|
|
171
209
|
}
|
|
172
|
-
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
const req = https.get(opts, res => {
|
|
213
|
+
// Follow redirects
|
|
214
|
+
if ([301, 302, 303].includes(res.statusCode) && res.headers.location) {
|
|
215
|
+
return fetchHtml(res.headers.location, proxy).then(resolve).catch(reject)
|
|
216
|
+
}
|
|
173
217
|
let data = ''
|
|
174
218
|
res.on('data', chunk => data += chunk)
|
|
175
219
|
res.on('end', () => resolve(data))
|
|
176
|
-
})
|
|
220
|
+
})
|
|
221
|
+
req.on('error', reject)
|
|
222
|
+
req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG HTML timeout')) })
|
|
177
223
|
})
|
|
178
224
|
}
|
|
179
225
|
|
package/src/search/index.js
CHANGED
|
@@ -4,11 +4,13 @@ const { serperSearch } = require('./engines/serper')
|
|
|
4
4
|
const { searxngSearch } = require('./engines/searxng')
|
|
5
5
|
const { googleCseSearch } = require('./engines/google-cse')
|
|
6
6
|
const { jinaSearch } = require('./engines/jina')
|
|
7
|
+
const { bingSearch } = require('./engines/bing')
|
|
7
8
|
const { geminiGroundedSearch } = require('./engines/gemini-grounded')
|
|
8
9
|
const { scrapeUrls } = require('./scraper')
|
|
9
10
|
const { Summarizer } = require('./summarizer')
|
|
10
11
|
const { Reranker } = require('./reranker')
|
|
11
12
|
const { QueryExpander } = require('./query-expander')
|
|
13
|
+
const { SourceRanker } = require('./source-ranker')
|
|
12
14
|
|
|
13
15
|
const ENGINES = {
|
|
14
16
|
searxng: searxngSearch,
|
|
@@ -18,7 +20,8 @@ const ENGINES = {
|
|
|
18
20
|
'google-cse': googleCseSearch,
|
|
19
21
|
jina: jinaSearch,
|
|
20
22
|
'gemini-grounded': geminiGroundedSearch,
|
|
21
|
-
gemini: geminiGroundedSearch
|
|
23
|
+
gemini: geminiGroundedSearch,
|
|
24
|
+
bing: bingSearch
|
|
22
25
|
}
|
|
23
26
|
|
|
24
27
|
class SearchEngine {
|
|
@@ -33,6 +36,7 @@ class SearchEngine {
|
|
|
33
36
|
const geminiKey = config.geminiKey || process.env.GEMINI_API_KEY
|
|
34
37
|
this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
|
|
35
38
|
this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
|
|
39
|
+
this.sourceRanker = new SourceRanker(config.sourceRanker || {})
|
|
36
40
|
}
|
|
37
41
|
|
|
38
42
|
/**
|
|
@@ -90,8 +94,10 @@ class SearchEngine {
|
|
|
90
94
|
|
|
91
95
|
const response = { answer, sources: results, cached: false }
|
|
92
96
|
|
|
93
|
-
//
|
|
94
|
-
|
|
97
|
+
// Only cache if we got results
|
|
98
|
+
if (results.length > 0) {
|
|
99
|
+
this.cache?.set('search', cacheKey, response)
|
|
100
|
+
}
|
|
95
101
|
|
|
96
102
|
return response
|
|
97
103
|
}
|
|
@@ -123,12 +129,23 @@ class SearchEngine {
|
|
|
123
129
|
// When using Gemini Grounded, also run DDG in parallel for volume
|
|
124
130
|
const resultSets = []
|
|
125
131
|
if (usesGrounded) {
|
|
126
|
-
// Parallel
|
|
132
|
+
// Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
|
|
133
|
+
const delay = ms => new Promise(r => setTimeout(r, ms))
|
|
127
134
|
const [groundedResults, ddgResults] = await Promise.all([
|
|
128
|
-
this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(()
|
|
129
|
-
this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(()
|
|
135
|
+
this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(e => { console.warn('Gemini grounded failed:', e.message); return [] }),
|
|
136
|
+
delay(500).then(() => this._rawSearch(query, { ...opts, engines: ['ddg'] })).catch(e => { console.warn('DDG failed:', e.message); return [] })
|
|
130
137
|
])
|
|
138
|
+
if (process.env.SPECTRAWL_DEBUG) {
|
|
139
|
+
console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
|
|
140
|
+
}
|
|
131
141
|
resultSets.push(groundedResults, ddgResults)
|
|
142
|
+
|
|
143
|
+
// If primary failed, retry with a different approach
|
|
144
|
+
if (groundedResults.length === 0 && ddgResults.length === 0) {
|
|
145
|
+
await delay(1000)
|
|
146
|
+
const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
|
|
147
|
+
resultSets.push(retry)
|
|
148
|
+
}
|
|
132
149
|
} else {
|
|
133
150
|
for (const q of queries) {
|
|
134
151
|
try {
|
|
@@ -142,13 +159,21 @@ class SearchEngine {
|
|
|
142
159
|
}
|
|
143
160
|
|
|
144
161
|
// Step 3: Merge and deduplicate
|
|
145
|
-
|
|
162
|
+
const flatResults = resultSets.flat()
|
|
163
|
+
let results = dedupeResults(flatResults)
|
|
164
|
+
if (process.env.SPECTRAWL_DEBUG) {
|
|
165
|
+
console.log('[deepSearch] resultSets lengths:', resultSets.map(s => s.length))
|
|
166
|
+
console.log('[deepSearch] flat:', flatResults.length, '→ deduped:', results.length)
|
|
167
|
+
}
|
|
146
168
|
|
|
147
|
-
// Step
|
|
169
|
+
// Step 4a: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
|
|
148
170
|
if (this.reranker && opts.rerank !== false && !usesGrounded) {
|
|
149
171
|
results = await this.reranker.rerank(query, results)
|
|
150
172
|
}
|
|
151
173
|
|
|
174
|
+
// Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
|
|
175
|
+
results = this.sourceRanker.rank(results)
|
|
176
|
+
|
|
152
177
|
// Step 5: Parallel scrape top N for full content (skip in fast mode)
|
|
153
178
|
const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
|
|
154
179
|
if (scrapeCount > 0 && results.length > 0) {
|
|
@@ -188,7 +213,10 @@ class SearchEngine {
|
|
|
188
213
|
cached: false
|
|
189
214
|
}
|
|
190
215
|
|
|
191
|
-
|
|
216
|
+
// Only cache if we got results — never cache failures
|
|
217
|
+
if (response.sources.length > 0) {
|
|
218
|
+
this.cache?.set('search', cacheKey, response)
|
|
219
|
+
}
|
|
192
220
|
return response
|
|
193
221
|
}
|
|
194
222
|
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
/**
 * Source quality ranker — boost trusted sources, penalize SEO spam.
 * This is something Tavily doesn't have.
 *
 * Users can customize weights per domain or use built-in presets.
 */

// Built-in domain weights (score multipliers). A key applies to that host and
// to all of its subdomains.
const DEFAULT_WEIGHTS = {
  // Tier 1: Primary sources, high trust (1.2x-1.3x boost)
  'github.com': 1.3,
  'stackoverflow.com': 1.3,
  'news.ycombinator.com': 1.3,
  'arxiv.org': 1.3,
  'docs.google.com': 1.2,
  'developer.mozilla.org': 1.3,
  'wikipedia.org': 1.2,
  'en.wikipedia.org': 1.2,

  // Tier 2: Quality community/editorial (1.1x-1.2x boost)
  'reddit.com': 1.15,
  'www.reddit.com': 1.15,
  'dev.to': 1.15,
  'medium.com': 1.1,
  'blog.logrocket.com': 1.15,
  'css-tricks.com': 1.15,
  'smashingmagazine.com': 1.15,
  'web.dev': 1.2,
  'npmjs.com': 1.15,
  'www.npmjs.com': 1.15,
  'pypi.org': 1.15,

  // Tier 3: Known SEO farms / thin content (0.7x-0.85x penalty)
  'w3schools.com': 0.8,
  'www.w3schools.com': 0.8,
  'geeksforgeeks.org': 0.85,
  'www.geeksforgeeks.org': 0.85,
  'tutorialspoint.com': 0.7,
  'www.tutorialspoint.com': 0.7,
  'javatpoint.com': 0.7,
  'www.javatpoint.com': 0.7,
}

// Content-type signals that indicate quality
const QUALITY_SIGNALS = {
  // URL patterns that suggest high quality
  positive: [
    /\/blog\//i,           // Blog posts (usually more detailed)
    /\/docs\//i,           // Documentation
    /\/guide/i,            // Guides
    /\/tutorial/i,         // Tutorials
    /github\.com\/[\w-]+\/[\w-]+$/,  // Repo pages (not search)
    /\/wiki\//i,           // Wiki pages
    /\/research\//i,       // Research
  ],
  // URL patterns that suggest low quality
  negative: [
    /\/tag\//i,            // Tag listing pages
    /\/category\//i,       // Category pages
    /\/page\/\d+/i,        // Pagination
    /\?utm_/i,             // Tracking URLs
    /\/amp\//i,            // AMP pages (usually stripped)
    /\/slideshow/i,        // Slideshow spam
    /\/gallery/i,          // Gallery spam
    /\/listicle/i,         // Listicle spam
  ]
}

// Hostname of an absolute URL, or null when the URL cannot be parsed.
function hostnameOf(rawUrl) {
  try {
    return new URL(rawUrl).hostname
  } catch {
    return null
  }
}

// True when `host` is `domain` itself or one of its subdomains.
function isSameOrSubdomain(host, domain) {
  if (typeof domain !== 'string' || domain === '') return false
  const wanted = domain.toLowerCase()
  return host === wanted || host.endsWith('.' + wanted)
}

// Matching for user-supplied boost/block entries: a domain ("x.com") matches
// that host and its subdomains; a bare keyword without a dot ("pinterest")
// matches a whole hostname label. Substring matching is deliberately avoided
// so that "x.com" does not hit "netflix.com".
function matchesListEntry(host, entry) {
  if (typeof entry !== 'string' || entry === '') return false
  if (isSameOrSubdomain(host, entry)) return true
  const wanted = entry.toLowerCase()
  return !wanted.includes('.') && host.split('.').includes(wanted)
}

class SourceRanker {
  /**
   * @param {object} [config]
   * @param {Object<string, number>} [config.weights] - Per-domain multipliers merged over DEFAULT_WEIGHTS.
   * @param {string[]} [config.boost] - Domains that always get an extra 1.3x.
   * @param {string[]} [config.block] - Domains removed from results.
   */
  constructor(config = {}) {
    this.weights = { ...DEFAULT_WEIGHTS, ...(config.weights || {}) }
    this.boostDomains = config.boost || []    // Always boost these domains
    this.blockDomains = config.block || []     // Always exclude these domains
  }

  /**
   * Apply source quality scoring to search results.
   *
   * Returns a new array of copied result objects (inputs are not mutated),
   * each with an adjusted `score` (capped at 1) and the applied `_multiplier`,
   * sorted by score descending. Results from blocked domains are dropped.
   * Results whose URL cannot be parsed are kept with a multiplier of 1.
   *
   * @param {Array<object>} results - Results with `url` and optional `score`/`confidence`.
   * @returns {Array<object>} Ranked results; empty or missing input is returned as-is.
   */
  rank(results) {
    if (!results || results.length === 0) return results

    // Filter blocked domains
    const allowed = results.filter(r => {
      const host = hostnameOf(r.url)
      return host === null || !this.blockDomains.some(d => matchesListEntry(host, d))
    })

    // Apply quality weights
    const scored = allowed.map(r => {
      const multiplier = this._multiplierFor(r.url)
      // Results without a score start from a neutral 0.5.
      const baseScore = r.score || r.confidence || 0.5
      return { ...r, score: Math.min(1, baseScore * multiplier), _multiplier: multiplier }
    })

    // Sort by adjusted score
    scored.sort((a, b) => (b.score || 0) - (a.score || 0))

    return scored
  }

  /**
   * Combined multiplier for one URL: domain weight, boost list, URL path
   * signals and a small bonus for a recent year in the URL.
   *
   * @param {string} rawUrl
   * @returns {number} Multiplier; 1 when the URL cannot be parsed.
   */
  _multiplierFor(rawUrl) {
    const host = hostnameOf(rawUrl)
    if (host === null) return 1.0

    let multiplier = 1.0

    // Domain weight: the most specific (longest) matching key wins, so an
    // override for "docs.github.com" is not shadowed by "github.com".
    let bestDomain = null
    for (const domain of Object.keys(this.weights)) {
      if (!isSameOrSubdomain(host, domain)) continue
      if (bestDomain === null || domain.length > bestDomain.length) bestDomain = domain
    }
    if (bestDomain !== null) multiplier *= this.weights[bestDomain]

    // Boost domains
    if (this.boostDomains.some(d => matchesListEntry(host, d))) {
      multiplier *= 1.3
    }

    // URL quality signals: at most one positive and one negative adjustment
    if (QUALITY_SIGNALS.positive.some(pattern => pattern.test(rawUrl))) multiplier *= 1.05
    if (QUALITY_SIGNALS.negative.some(pattern => pattern.test(rawUrl))) multiplier *= 0.85

    // Freshness signal: a standalone year 2024-2039 in the URL. The digit
    // guards keep numeric IDs such as "120251" from counting as a year.
    if (/(?<!\d)20(2[4-9]|3\d)(?!\d)/.test(rawUrl)) multiplier *= 1.05

    return multiplier
  }
}
|
|
137
|
+
|
|
138
|
+
module.exports = { SourceRanker, DEFAULT_WEIGHTS }
|
package/src/search/summarizer.js
CHANGED
|
@@ -31,10 +31,15 @@ class Summarizer {
|
|
|
31
31
|
.map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0, 1000)}`)
|
|
32
32
|
.join('\n\n')
|
|
33
33
|
|
|
34
|
-
const prompt = `
|
|
34
|
+
const prompt = `Answer this question directly: "${query}"
|
|
35
35
|
|
|
36
|
-
|
|
37
|
-
|
|
36
|
+
Rules:
|
|
37
|
+
- Give a clear, specific answer. Name things, list tools, state facts.
|
|
38
|
+
- Use [1], [2] etc. to cite sources inline.
|
|
39
|
+
- Never say "based on the provided sources" or "according to search results."
|
|
40
|
+
- Never hedge with "it appears" or "it seems." Be direct.
|
|
41
|
+
- If sources disagree, note it briefly.
|
|
42
|
+
- Keep it concise — 2-4 paragraphs max.
|
|
38
43
|
|
|
39
44
|
Sources:
|
|
40
45
|
${context}
|
|
@@ -77,7 +82,7 @@ Answer:`
|
|
|
77
82
|
const body = JSON.stringify({
|
|
78
83
|
model: this.model,
|
|
79
84
|
messages: [
|
|
80
|
-
{ role: 'system', content: 'You are a
|
|
85
|
+
{ role: 'system', content: 'You are a search engine. Give direct, specific answers with numbered citations. Never hedge or qualify with "based on sources" — just answer the question.' },
|
|
81
86
|
{ role: 'user', content: prompt }
|
|
82
87
|
],
|
|
83
88
|
max_tokens: 500,
|