spectrawl 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/index.js +16 -1
- package/src/search/index.js +34 -8
- package/src/search/source-ranker.js +138 -0
- package/src/search/summarizer.js +9 -4
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.4",
|
|
3
|
+
"version": "0.3.5",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
package/src/index.js
CHANGED
|
@@ -12,9 +12,24 @@ const { EventEmitter, EVENTS } = require('./events')
|
|
|
12
12
|
const { CookieRefresher } = require('./auth/refresh')
|
|
13
13
|
const { loadConfig } = require('./config')
|
|
14
14
|
|
|
15
|
+
/**
 * Recursively merge `source` over `target` and return a new object.
 * Neither argument is mutated.
 *
 * Merge rules, per key of `source`:
 *  - plain object over plain object → merged recursively
 *  - plain object over anything else (string, array, missing) → replaces it
 *  - everything else (primitives, arrays, Dates, class instances) → assigned as-is
 *
 * @param {object} target - Base configuration (e.g. loaded defaults).
 * @param {object} source - Overrides; its values win on conflict.
 * @returns {object} A new merged object.
 */
function deepMergeConfig(target, source) {
  // Only literal-style objects are merged key by key. Dates, class instances
  // and the like must keep their prototype, so they are assigned by reference.
  const isPlainObject = value => {
    if (value === null || typeof value !== 'object') return false
    const proto = Object.getPrototypeOf(value)
    return proto === Object.prototype || proto === null
  }

  const result = { ...target }
  for (const key of Object.keys(source ?? {})) {
    // An own "__proto__" key (possible via JSON.parse) would replace the
    // prototype of `result` on assignment instead of creating a property.
    if (key === '__proto__') continue

    const incoming = source[key]
    if (isPlainObject(incoming)) {
      // Spreading a string/array base would leak index keys into the result,
      // so a non-object base is discarded rather than merged into.
      const existing = target?.[key]
      result[key] = deepMergeConfig(isPlainObject(existing) ? existing : {}, incoming)
    } else {
      result[key] = incoming
    }
  }
  return result
}
|
|
26
|
+
|
|
15
27
|
class Spectrawl {
|
|
16
28
|
constructor(configPath) {
|
|
17
|
-
|
|
29
|
+
// Accept either a file path (string) or a config object
|
|
30
|
+
this.config = (typeof configPath === 'object' && configPath !== null)
|
|
31
|
+
? deepMergeConfig(loadConfig(null), configPath)
|
|
32
|
+
: loadConfig(configPath)
|
|
18
33
|
this.events = new EventEmitter()
|
|
19
34
|
this.cache = new Cache(this.config.cache)
|
|
20
35
|
this.searchEngine = new SearchEngine(this.config.search, this.cache)
|
package/src/search/index.js
CHANGED
|
@@ -9,6 +9,7 @@ const { scrapeUrls } = require('./scraper')
|
|
|
9
9
|
const { Summarizer } = require('./summarizer')
|
|
10
10
|
const { Reranker } = require('./reranker')
|
|
11
11
|
const { QueryExpander } = require('./query-expander')
|
|
12
|
+
const { SourceRanker } = require('./source-ranker')
|
|
12
13
|
|
|
13
14
|
const ENGINES = {
|
|
14
15
|
searxng: searxngSearch,
|
|
@@ -33,6 +34,7 @@ class SearchEngine {
|
|
|
33
34
|
const geminiKey = config.geminiKey || process.env.GEMINI_API_KEY
|
|
34
35
|
this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
|
|
35
36
|
this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
|
|
37
|
+
this.sourceRanker = new SourceRanker(config.sourceRanker || {})
|
|
36
38
|
}
|
|
37
39
|
|
|
38
40
|
/**
|
|
@@ -90,8 +92,10 @@ class SearchEngine {
|
|
|
90
92
|
|
|
91
93
|
const response = { answer, sources: results, cached: false }
|
|
92
94
|
|
|
93
|
-
//
|
|
94
|
-
|
|
95
|
+
// Only cache if we got results
|
|
96
|
+
if (results.length > 0) {
|
|
97
|
+
this.cache?.set('search', cacheKey, response)
|
|
98
|
+
}
|
|
95
99
|
|
|
96
100
|
return response
|
|
97
101
|
}
|
|
@@ -123,12 +127,23 @@ class SearchEngine {
|
|
|
123
127
|
// When using Gemini Grounded, also run DDG in parallel for volume
|
|
124
128
|
const resultSets = []
|
|
125
129
|
if (usesGrounded) {
|
|
126
|
-
// Parallel
|
|
130
|
+
// Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
|
|
131
|
+
const delay = ms => new Promise(r => setTimeout(r, ms))
|
|
127
132
|
const [groundedResults, ddgResults] = await Promise.all([
|
|
128
|
-
this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(()
|
|
129
|
-
this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(()
|
|
133
|
+
this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(e => { console.warn('Gemini grounded failed:', e.message); return [] }),
|
|
134
|
+
delay(500).then(() => this._rawSearch(query, { ...opts, engines: ['ddg'] })).catch(e => { console.warn('DDG failed:', e.message); return [] })
|
|
130
135
|
])
|
|
136
|
+
if (process.env.SPECTRAWL_DEBUG) {
|
|
137
|
+
console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
|
|
138
|
+
}
|
|
131
139
|
resultSets.push(groundedResults, ddgResults)
|
|
140
|
+
|
|
141
|
+
// If primary failed, retry with a different approach
|
|
142
|
+
if (groundedResults.length === 0 && ddgResults.length === 0) {
|
|
143
|
+
await delay(1000)
|
|
144
|
+
const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
|
|
145
|
+
resultSets.push(retry)
|
|
146
|
+
}
|
|
132
147
|
} else {
|
|
133
148
|
for (const q of queries) {
|
|
134
149
|
try {
|
|
@@ -142,13 +157,21 @@ class SearchEngine {
|
|
|
142
157
|
}
|
|
143
158
|
|
|
144
159
|
// Step 3: Merge and deduplicate
|
|
145
|
-
|
|
160
|
+
const flatResults = resultSets.flat()
|
|
161
|
+
let results = dedupeResults(flatResults)
|
|
162
|
+
if (process.env.SPECTRAWL_DEBUG) {
|
|
163
|
+
console.log('[deepSearch] resultSets lengths:', resultSets.map(s => s.length))
|
|
164
|
+
console.log('[deepSearch] flat:', flatResults.length, '→ deduped:', results.length)
|
|
165
|
+
}
|
|
146
166
|
|
|
147
|
-
// Step
|
|
167
|
+
// Step 4a: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
|
|
148
168
|
if (this.reranker && opts.rerank !== false && !usesGrounded) {
|
|
149
169
|
results = await this.reranker.rerank(query, results)
|
|
150
170
|
}
|
|
151
171
|
|
|
172
|
+
// Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
|
|
173
|
+
results = this.sourceRanker.rank(results)
|
|
174
|
+
|
|
152
175
|
// Step 5: Parallel scrape top N for full content (skip in fast mode)
|
|
153
176
|
const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
|
|
154
177
|
if (scrapeCount > 0 && results.length > 0) {
|
|
@@ -188,7 +211,10 @@ class SearchEngine {
|
|
|
188
211
|
cached: false
|
|
189
212
|
}
|
|
190
213
|
|
|
191
|
-
|
|
214
|
+
// Only cache if we got results — never cache failures
|
|
215
|
+
if (response.sources.length > 0) {
|
|
216
|
+
this.cache?.set('search', cacheKey, response)
|
|
217
|
+
}
|
|
192
218
|
return response
|
|
193
219
|
}
|
|
194
220
|
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Source quality ranker — boost trusted sources, penalize SEO spam.
|
|
3
|
+
* This is something Tavily doesn't have.
|
|
4
|
+
*
|
|
5
|
+
* Users can customize weights per domain or use built-in presets.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Built-in per-domain score multipliers. Values above 1 boost a domain,
// values below 1 penalize it. SourceRanker spreads user-supplied
// `config.weights` over this table, so any entry can be overridden.
// The ranker matches a key against the hostname itself and its subdomains,
// so the explicit `www.` / `en.` variants are kept only for direct lookups.
const DEFAULT_WEIGHTS = {
  // Boosted: primary sources, high trust (1.2x–1.3x)
  'github.com': 1.3,
  'stackoverflow.com': 1.3,
  'news.ycombinator.com': 1.3,
  'arxiv.org': 1.3,
  'docs.google.com': 1.2,
  'developer.mozilla.org': 1.3,
  'wikipedia.org': 1.2,
  'en.wikipedia.org': 1.2,

  // Boosted: quality community / editorial content (1.1x–1.2x)
  'reddit.com': 1.15,
  'www.reddit.com': 1.15,
  'dev.to': 1.15,
  'medium.com': 1.1,
  'blog.logrocket.com': 1.15,
  'css-tricks.com': 1.15,
  'smashingmagazine.com': 1.15,
  'web.dev': 1.2,
  'npmjs.com': 1.15,
  'www.npmjs.com': 1.15,
  'pypi.org': 1.15,

  // Penalized: SEO-heavy / thin content (0.7x–0.85x)
  'w3schools.com': 0.8,
  'www.w3schools.com': 0.8,
  'geeksforgeeks.org': 0.85,
  'www.geeksforgeeks.org': 0.85,
  'tutorialspoint.com': 0.7,
  'www.tutorialspoint.com': 0.7,
  'javatpoint.com': 0.7,
  'www.javatpoint.com': 0.7
}
|
|
43
|
+
|
|
44
|
+
// URL-shape heuristics used as weak quality signals. Each pattern is tested
// against the full result URL. The ranker applies at most one positive and
// one negative adjustment per result, so pattern order within a list does
// not change the outcome.
const QUALITY_SIGNALS = {
  // URL patterns that suggest high quality
  positive: [
    /\/blog\//i,                            // Blog posts (usually more detailed)
    /\/docs\//i,                            // Documentation
    /\/guide/i,                             // Guides
    /\/tutorial/i,                          // Tutorials
    // Repo root pages only (not search, issues, blobs). Repo names may
    // contain dots (e.g. next.js) and the URL may carry a trailing slash.
    /github\.com\/[\w-]+\/[\w.-]+\/?$/i,
    /\/wiki\//i,                            // Wiki pages
    /\/research\//i                         // Research
  ],
  // URL patterns that suggest low quality
  negative: [
    /\/tag\//i,                             // Tag listing pages
    /\/category\//i,                        // Category pages
    /\/page\/\d+/i,                         // Pagination
    /[?&]utm_/i,                            // Tracking params, in any query position
    /\/amp\//i,                             // AMP pages (usually stripped)
    /\/slideshow/i,                         // Slideshow spam
    /\/gallery/i,                           // Gallery spam
    /\/listicle/i                           // Listicle spam
  ]
}
|
|
68
|
+
|
|
69
|
+
// A recent year (2024–2039) appearing as a standalone number in the URL.
// The digit guards stop ids such as /item/920245 from counting as "2024".
const RECENT_YEAR_IN_URL = /(?<!\d)20(?:2[4-9]|3\d)(?!\d)/

/**
 * Normalize a boost/block setting to an array so a single string entry
 * (e.g. `block: 'x.com'`) is honored instead of being silently ignored.
 */
function toDomainList(value) {
  if (Array.isArray(value)) return value
  return value == null ? [] : [value]
}

/**
 * Test whether `host` falls under a configured boost/block entry.
 * Entries containing a dot match that domain and its subdomains
 * ('x.com' matches 'x.com' and 'api.x.com', but not 'netflix.com').
 * Bare names without a dot match any single hostname label
 * ('pinterest' matches 'www.pinterest.co.uk').
 */
function hostMatchesDomain(host, entry) {
  if (typeof entry !== 'string') return false
  const domain = entry.trim().toLowerCase().replace(/^\.+/, '')
  if (domain === '') return false
  if (!domain.includes('.')) return host.split('.').includes(domain)
  return host === domain || host.endsWith(`.${domain}`)
}

class SourceRanker {
  /**
   * @param {object} [config]
   * @param {Object<string, number>} [config.weights] - Per-domain multipliers,
   *   merged over DEFAULT_WEIGHTS (user entries win).
   * @param {string[]} [config.boost] - Domains that always get an extra 1.3x.
   * @param {string[]} [config.block] - Domains whose results are dropped.
   */
  constructor(config = {}) {
    this.weights = { ...DEFAULT_WEIGHTS, ...(config.weights || {}) }
    this.boostDomains = config.boost || []
    this.blockDomains = config.block || []
  }

  /**
   * Apply source quality scoring to search results.
   *
   * Drops results from blocked domains, multiplies each remaining result's
   * base score (`score`, else `confidence`, else 0.5) by a quality multiplier,
   * caps the score at 1, and sorts by adjusted score, highest first.
   *
   * The input array and its objects are not mutated: each returned result is
   * a shallow copy with `score` replaced and `_multiplier` added. Results
   * whose URL cannot be parsed are kept with a multiplier of 1.
   *
   * @param {Array<object>} results - Search results carrying a `url` field.
   * @returns {Array<object>} New sorted array; a nullish or empty input is
   *   returned unchanged.
   */
  rank(results) {
    if (!results || results.length === 0) return results

    const blocked = toDomainList(this.blockDomains)
    const boosted = toDomainList(this.boostDomains)
    const weightEntries = Object.entries(this.weights)

    const ranked = []
    for (const r of results) {
      let host = null
      try {
        host = new URL(r.url).hostname
      } catch {
        // Unparseable URL: the result is kept and receives no adjustment
      }

      if (host !== null && blocked.some(d => hostMatchesDomain(host, d))) continue

      let multiplier = 1.0
      if (host !== null) {
        // Domain weight: the most specific (longest) matching key wins, so an
        // override for 'gist.github.com' is not shadowed by 'github.com'.
        let matchedDomain = ''
        let domainWeight = 1
        for (const [domain, weight] of weightEntries) {
          const matches = host === domain || host.endsWith(`.${domain}`)
          if (matches && domain.length > matchedDomain.length) {
            matchedDomain = domain
            domainWeight = weight
          }
        }
        multiplier *= domainWeight

        // Explicitly boosted domains
        if (boosted.some(d => hostMatchesDomain(host, d))) multiplier *= 1.3

        // URL quality signals: at most one positive and one negative adjustment
        const fullUrl = String(r.url)
        if (QUALITY_SIGNALS.positive.some(p => p.test(fullUrl))) multiplier *= 1.05
        if (QUALITY_SIGNALS.negative.some(p => p.test(fullUrl))) multiplier *= 0.85

        // Freshness signal (recent year in URL)
        if (RECENT_YEAR_IN_URL.test(fullUrl)) multiplier *= 1.05
      }

      const baseScore = r.score || r.confidence || 0.5
      ranked.push({ ...r, score: Math.min(1, baseScore * multiplier), _multiplier: multiplier })
    }

    // Sort by adjusted score, highest first
    ranked.sort((a, b) => (b.score || 0) - (a.score || 0))

    return ranked
  }
}
|
|
137
|
+
|
|
138
|
+
module.exports = { SourceRanker, DEFAULT_WEIGHTS }
|
package/src/search/summarizer.js
CHANGED
|
@@ -31,10 +31,15 @@ class Summarizer {
|
|
|
31
31
|
.map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0, 1000)}`)
|
|
32
32
|
.join('\n\n')
|
|
33
33
|
|
|
34
|
-
const prompt = `
|
|
34
|
+
const prompt = `Answer this question directly: "${query}"
|
|
35
35
|
|
|
36
|
-
|
|
37
|
-
|
|
36
|
+
Rules:
|
|
37
|
+
- Give a clear, specific answer. Name things, list tools, state facts.
|
|
38
|
+
- Use [1], [2] etc. to cite sources inline.
|
|
39
|
+
- Never say "based on the provided sources" or "according to search results."
|
|
40
|
+
- Never hedge with "it appears" or "it seems." Be direct.
|
|
41
|
+
- If sources disagree, note it briefly.
|
|
42
|
+
- Keep it concise — 2-4 paragraphs max.
|
|
38
43
|
|
|
39
44
|
Sources:
|
|
40
45
|
${context}
|
|
@@ -77,7 +82,7 @@ Answer:`
|
|
|
77
82
|
const body = JSON.stringify({
|
|
78
83
|
model: this.model,
|
|
79
84
|
messages: [
|
|
80
|
-
{ role: 'system', content: 'You are a
|
|
85
|
+
{ role: 'system', content: 'You are a search engine. Give direct, specific answers with numbered citations. Never hedge or qualify with "based on sources" — just answer the question.' },
|
|
81
86
|
{ role: 'user', content: prompt }
|
|
82
87
|
],
|
|
83
88
|
max_tokens: 500,
|