spectrawl 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.d.ts +4 -1
- package/package.json +1 -1
- package/src/config.js +2 -2
- package/src/search/engines/tavily.js +72 -0
- package/src/search/index.js +34 -15
- package/src/search/scraper.js +5 -5
package/index.d.ts
CHANGED
|
@@ -5,6 +5,7 @@ declare module 'spectrawl' {
|
|
|
5
5
|
scrapeTop?: number
|
|
6
6
|
geminiKey?: string
|
|
7
7
|
'gemini-grounded'?: { apiKey?: string; model?: string }
|
|
8
|
+
tavily?: { apiKey?: string; searchDepth?: string; maxResults?: number }
|
|
8
9
|
llm?: { provider: string; model?: string; apiKey?: string }
|
|
9
10
|
sourceRanker?: {
|
|
10
11
|
weights?: Record<string, number>
|
|
@@ -58,10 +59,12 @@ declare module 'spectrawl' {
|
|
|
58
59
|
}
|
|
59
60
|
|
|
60
61
|
interface DeepSearchOptions {
|
|
61
|
-
mode?: 'fast' | 'full'
|
|
62
|
+
mode?: 'fast' | 'snippets' | 'full'
|
|
62
63
|
scrapeTop?: number
|
|
64
|
+
scrapeTimeout?: number
|
|
63
65
|
expand?: boolean
|
|
64
66
|
rerank?: boolean
|
|
67
|
+
summarize?: boolean
|
|
65
68
|
}
|
|
66
69
|
|
|
67
70
|
interface BrowseResult {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.
|
|
3
|
+
"version": "0.3.14",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
package/src/config.js
CHANGED
|
@@ -4,8 +4,8 @@ const path = require('path')
|
|
|
4
4
|
const DEFAULTS = {
|
|
5
5
|
port: 3900,
|
|
6
6
|
search: {
|
|
7
|
-
cascade: ['gemini-grounded', 'brave', 'ddg'],
|
|
8
|
-
scrapeTop:
|
|
7
|
+
cascade: ['gemini-grounded', 'tavily', 'brave', 'ddg'],
|
|
8
|
+
scrapeTop: 5,
|
|
9
9
|
searxng: { url: 'http://localhost:8888' },
|
|
10
10
|
llm: null // { provider, model, apiKey }
|
|
11
11
|
},
|
|
package/src/search/engines/tavily.js
ADDED
@@ -0,0 +1,72 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Tavily Search API — high-quality search with optional AI answers.
|
|
5
|
+
* Free tier: 1,000 queries/month.
|
|
6
|
+
* Use as fallback after Gemini Grounded's 5K/month free tier.
|
|
7
|
+
*/
|
|
8
|
+
async function tavilySearch(query, config = {}) {
|
|
9
|
+
const apiKey = config.apiKey || process.env.TAVILY_API_KEY
|
|
10
|
+
if (!apiKey) throw new Error('TAVILY_API_KEY required for Tavily search')
|
|
11
|
+
|
|
12
|
+
const body = JSON.stringify({
|
|
13
|
+
query,
|
|
14
|
+
search_depth: config.searchDepth || 'basic',
|
|
15
|
+
include_answer: config.includeAnswer || false,
|
|
16
|
+
include_raw_content: false,
|
|
17
|
+
max_results: config.maxResults || 10,
|
|
18
|
+
...(config.topic && { topic: config.topic }),
|
|
19
|
+
...(config.days && { days: config.days })
|
|
20
|
+
})
|
|
21
|
+
|
|
22
|
+
const data = await post('https://api.tavily.com/search', body, apiKey)
|
|
23
|
+
|
|
24
|
+
if (!data.results) {
|
|
25
|
+
throw new Error(`Tavily search failed: ${JSON.stringify(data).slice(0, 200)}`)
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
const results = data.results.map(r => ({
|
|
29
|
+
title: r.title || '',
|
|
30
|
+
url: r.url || '',
|
|
31
|
+
snippet: r.content || '',
|
|
32
|
+
score: r.score || 0,
|
|
33
|
+
source: 'tavily'
|
|
34
|
+
}))
|
|
35
|
+
|
|
36
|
+
// Attach Tavily's answer if requested
|
|
37
|
+
if (data.answer && results.length > 0) {
|
|
38
|
+
results._tavilyAnswer = data.answer
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
return results
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
function post(url, body, apiKey) {
|
|
45
|
+
return new Promise((resolve, reject) => {
|
|
46
|
+
const urlObj = new URL(url)
|
|
47
|
+
const opts = {
|
|
48
|
+
hostname: urlObj.hostname,
|
|
49
|
+
path: urlObj.pathname,
|
|
50
|
+
method: 'POST',
|
|
51
|
+
headers: {
|
|
52
|
+
'Content-Type': 'application/json',
|
|
53
|
+
'Content-Length': Buffer.byteLength(body),
|
|
54
|
+
'Authorization': `Bearer ${apiKey}`
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
const req = https.request(opts, res => {
|
|
58
|
+
let data = ''
|
|
59
|
+
res.on('data', c => data += c)
|
|
60
|
+
res.on('end', () => {
|
|
61
|
+
try { resolve(JSON.parse(data)) }
|
|
62
|
+
catch (e) { reject(new Error(`Invalid Tavily response: ${data.slice(0, 200)}`)) }
|
|
63
|
+
})
|
|
64
|
+
})
|
|
65
|
+
req.on('error', reject)
|
|
66
|
+
req.setTimeout(10000, () => { req.destroy(); reject(new Error('Tavily search timeout')) })
|
|
67
|
+
req.write(body)
|
|
68
|
+
req.end()
|
|
69
|
+
})
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
module.exports = { tavilySearch }
|
package/src/search/index.js
CHANGED
|
@@ -6,6 +6,7 @@ const { googleCseSearch } = require('./engines/google-cse')
|
|
|
6
6
|
const { jinaSearch } = require('./engines/jina')
|
|
7
7
|
const { bingSearch } = require('./engines/bing')
|
|
8
8
|
const { geminiGroundedSearch } = require('./engines/gemini-grounded')
|
|
9
|
+
const { tavilySearch } = require('./engines/tavily')
|
|
9
10
|
const { scrapeUrls } = require('./scraper')
|
|
10
11
|
const { Summarizer } = require('./summarizer')
|
|
11
12
|
const { Reranker } = require('./reranker')
|
|
@@ -21,7 +22,8 @@ const ENGINES = {
|
|
|
21
22
|
jina: jinaSearch,
|
|
22
23
|
'gemini-grounded': geminiGroundedSearch,
|
|
23
24
|
gemini: geminiGroundedSearch,
|
|
24
|
-
bing: bingSearch
|
|
25
|
+
bing: bingSearch,
|
|
26
|
+
tavily: tavilySearch
|
|
25
27
|
}
|
|
26
28
|
|
|
27
29
|
class SearchEngine {
|
|
@@ -37,6 +39,13 @@ class SearchEngine {
|
|
|
37
39
|
this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
|
|
38
40
|
this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
|
|
39
41
|
this.sourceRanker = new SourceRanker(config.sourceRanker || {})
|
|
42
|
+
|
|
43
|
+
// One-time warning if no Gemini key
|
|
44
|
+
if (!geminiKey && !SearchEngine._keyWarned) {
|
|
45
|
+
SearchEngine._keyWarned = true
|
|
46
|
+
console.warn('\n⚠️ No GEMINI_API_KEY set. Using DDG fallback (limited quality, unreliable from servers).')
|
|
47
|
+
console.warn(' Get a free key (no credit card): https://aistudio.google.com/apikey\n')
|
|
48
|
+
}
|
|
40
49
|
}
|
|
41
50
|
|
|
42
51
|
/**
|
|
@@ -126,23 +135,31 @@ class SearchEngine {
|
|
|
126
135
|
}
|
|
127
136
|
|
|
128
137
|
// Step 2: Search across all query variants
|
|
129
|
-
// When using Gemini Grounded,
|
|
138
|
+
// When using Gemini Grounded, conditionally add DDG for volume
|
|
130
139
|
const resultSets = []
|
|
131
140
|
if (usesGrounded) {
|
|
132
|
-
// Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
|
|
133
141
|
const delay = ms => new Promise(r => setTimeout(r, ms))
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
142
|
+
|
|
143
|
+
// Always run Gemini first
|
|
144
|
+
const groundedResults = await this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] })
|
|
145
|
+
.catch(e => { console.warn('Gemini grounded failed:', e.message); return [] })
|
|
146
|
+
|
|
147
|
+
resultSets.push(groundedResults)
|
|
148
|
+
|
|
149
|
+
// Only run DDG if Gemini returned fewer than 5 results (saves 2-3s)
|
|
150
|
+
if (groundedResults.length < 5) {
|
|
151
|
+
const ddgResults = await this._rawSearch(query, { ...opts, engines: ['ddg'] })
|
|
152
|
+
.catch(e => { console.warn('DDG failed:', e.message); return [] })
|
|
153
|
+
resultSets.push(ddgResults)
|
|
154
|
+
}
|
|
155
|
+
|
|
138
156
|
if (process.env.SPECTRAWL_DEBUG) {
|
|
139
|
-
console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
|
|
157
|
+
console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG skipped:', groundedResults.length >= 5)
|
|
140
158
|
}
|
|
141
|
-
resultSets.push(groundedResults, ddgResults)
|
|
142
159
|
|
|
143
|
-
// If primary failed, retry with full cascade
|
|
144
|
-
if (groundedResults.length === 0
|
|
145
|
-
await delay(
|
|
160
|
+
// If primary failed, retry with full cascade (including tavily if configured)
|
|
161
|
+
if (groundedResults.length === 0) {
|
|
162
|
+
await delay(500)
|
|
146
163
|
const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
|
|
147
164
|
resultSets.push(retry)
|
|
148
165
|
}
|
|
@@ -174,11 +191,13 @@ class SearchEngine {
|
|
|
174
191
|
// Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
|
|
175
192
|
results = this.sourceRanker.rank(results)
|
|
176
193
|
|
|
177
|
-
// Step 5: Parallel scrape top N for full content
|
|
178
|
-
|
|
194
|
+
// Step 5: Parallel scrape top N for full content
|
|
195
|
+
// Skip in fast/snippets mode — just use search snippets (saves 3-8s)
|
|
196
|
+
const skipScrape = opts.mode === 'fast' || opts.mode === 'snippets'
|
|
197
|
+
const scrapeCount = skipScrape ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
|
|
179
198
|
if (scrapeCount > 0 && results.length > 0) {
|
|
180
199
|
const urls = results.slice(0, scrapeCount).map(r => r.url)
|
|
181
|
-
const scraped = await scrapeUrls(urls)
|
|
200
|
+
const scraped = await scrapeUrls(urls, { timeout: opts.scrapeTimeout || 3000 })
|
|
182
201
|
|
|
183
202
|
for (const result of results) {
|
|
184
203
|
const scrapedContent = scraped[result.url]
|
package/src/search/scraper.js
CHANGED
|
@@ -12,15 +12,15 @@ const { jinaExtract } = require('./engines/jina')
|
|
|
12
12
|
*/
|
|
13
13
|
async function scrapeUrls(urls, opts = {}) {
|
|
14
14
|
const results = {}
|
|
15
|
-
const timeout = opts.timeout || 10000
|
|
16
|
-
const concurrent = opts.concurrent ||
|
|
15
|
+
const timeout = opts.timeout || 3000 // 3s hard cutoff per URL (was 10s)
|
|
16
|
+
const concurrent = opts.concurrent || 5
|
|
17
17
|
const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
|
|
18
18
|
|
|
19
19
|
// All URLs in parallel (with per-URL timeout)
|
|
20
20
|
const promises = urls.map(url => {
|
|
21
21
|
const p = scrapeUrl(url, { timeout, engine }).catch(() => null)
|
|
22
|
-
// Hard timeout per URL
|
|
23
|
-
const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout +
|
|
22
|
+
// Hard timeout per URL — kill slow sites fast
|
|
23
|
+
const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 500))
|
|
24
24
|
return Promise.race([p, timer])
|
|
25
25
|
})
|
|
26
26
|
const allResults = await Promise.all(promises)
|
|
@@ -35,7 +35,7 @@ async function scrapeUrls(urls, opts = {}) {
|
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
async function scrapeUrl(url, opts = {}) {
|
|
38
|
-
const { timeout = 10000, engine = 'auto', browse } = opts
|
|
38
|
+
const { timeout = 3000, engine = 'auto', browse } = opts
|
|
39
39
|
|
|
40
40
|
// Try Jina first if available (better markdown output)
|
|
41
41
|
if (engine === 'jina' || engine === 'auto') {
|