spectrawl 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.d.ts +4 -1
- package/package.json +1 -1
- package/src/config.js +2 -2
- package/src/search/engines/tavily.js +72 -0
- package/src/search/index.js +27 -15
- package/src/search/scraper.js +5 -5
package/index.d.ts
CHANGED
|
@@ -5,6 +5,7 @@ declare module 'spectrawl' {
|
|
|
5
5
|
scrapeTop?: number
|
|
6
6
|
geminiKey?: string
|
|
7
7
|
'gemini-grounded'?: { apiKey?: string; model?: string }
|
|
8
|
+
tavily?: { apiKey?: string; searchDepth?: string; maxResults?: number }
|
|
8
9
|
llm?: { provider: string; model?: string; apiKey?: string }
|
|
9
10
|
sourceRanker?: {
|
|
10
11
|
weights?: Record<string, number>
|
|
@@ -58,10 +59,12 @@ declare module 'spectrawl' {
|
|
|
58
59
|
}
|
|
59
60
|
|
|
60
61
|
interface DeepSearchOptions {
|
|
61
|
-
mode?: 'fast' | 'full'
|
|
62
|
+
mode?: 'fast' | 'snippets' | 'full'
|
|
62
63
|
scrapeTop?: number
|
|
64
|
+
scrapeTimeout?: number
|
|
63
65
|
expand?: boolean
|
|
64
66
|
rerank?: boolean
|
|
67
|
+
summarize?: boolean
|
|
65
68
|
}
|
|
66
69
|
|
|
67
70
|
interface BrowseResult {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.13",
|
|
3
|
+
"version": "0.3.15",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
package/src/config.js
CHANGED
|
@@ -4,8 +4,8 @@ const path = require('path')
|
|
|
4
4
|
const DEFAULTS = {
|
|
5
5
|
port: 3900,
|
|
6
6
|
search: {
|
|
7
|
-
cascade: ['gemini-grounded', 'brave', 'ddg'],
|
|
8
|
-
scrapeTop:
|
|
7
|
+
cascade: ['gemini-grounded', 'tavily', 'brave', 'ddg'],
|
|
8
|
+
scrapeTop: 5,
|
|
9
9
|
searxng: { url: 'http://localhost:8888' },
|
|
10
10
|
llm: null // { provider, model, apiKey }
|
|
11
11
|
},
|
package/src/search/engines/tavily.js
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
|
|
3
|
+
/**
 * Tavily Search API — high-quality search with optional AI answers.
 * Free tier: 1,000 queries/month.
 * Use as fallback after Gemini Grounded's 5K/month free tier.
 *
 * @param {string} query - Search query, sent as the `query` field of the request.
 * @param {object} [config] - Engine options; `null`/`undefined` are treated as `{}`.
 * @param {string} [config.apiKey] - API key; falls back to `process.env.TAVILY_API_KEY`.
 * @param {string} [config.searchDepth] - Sent as `search_depth` (defaults to 'basic').
 * @param {*} [config.includeAnswer] - Sent as `include_answer` (defaults to false).
 * @param {number} [config.maxResults] - Sent as `max_results` (defaults to 10).
 * @param {*} [config.topic] - Sent as `topic` only when truthy.
 * @param {*} [config.days] - Sent as `days` only when truthy.
 * @returns {Promise<Array<{title: string, url: string, snippet: string, score: number, source: string}>>}
 *   Normalized results. When the response carries an `answer` and at least one
 *   result, the array additionally has a `_tavilyAnswer` property holding it.
 * @throws {Error} When no API key is available, or when the response does not
 *   contain a `results` array.
 */
async function tavilySearch(query, config = {}) {
  // The default parameter only covers `undefined`; tolerate an explicit null too.
  const cfg = config ?? {}

  const apiKey = cfg.apiKey || process.env.TAVILY_API_KEY
  if (!apiKey) throw new Error('TAVILY_API_KEY required for Tavily search')

  const payload = {
    query,
    search_depth: cfg.searchDepth || 'basic',
    include_answer: cfg.includeAnswer || false,
    include_raw_content: false,
    max_results: cfg.maxResults || 10
  }
  // Optional filters are only sent when the caller supplied them.
  if (cfg.topic) payload.topic = cfg.topic
  if (cfg.days) payload.days = cfg.days

  const data = await post('https://api.tavily.com/search', JSON.stringify(payload), apiKey)

  // Any valid JSON can come back (null, a string, an error object); only an
  // object with a `results` array is a usable answer. Everything else is
  // reported with a truncated dump of what was actually received.
  if (!data || !Array.isArray(data.results)) {
    throw new Error(`Tavily search failed: ${String(JSON.stringify(data)).slice(0, 200)}`)
  }

  const results = data.results.map(r => ({
    title: (r && r.title) || '',
    url: (r && r.url) || '',
    snippet: (r && r.content) || '',
    score: (r && r.score) || 0,
    source: 'tavily'
  }))

  // Attach Tavily's answer if requested
  if (data.answer && results.length > 0) {
    results._tavilyAnswer = data.answer
  }

  return results
}
|
|
43
|
+
|
|
44
|
+
/**
 * POST an already-serialized JSON body over HTTPS with a Bearer token and
 * resolve with the parsed JSON response.
 *
 * The HTTP status code is not inspected here; callers decide what a usable
 * response looks like from the parsed payload.
 *
 * @param {string} url - Absolute URL of the endpoint.
 * @param {string} body - Request body, already JSON-encoded.
 * @param {string} apiKey - Sent as `Authorization: Bearer <apiKey>`.
 * @returns {Promise<*>} The parsed JSON response body.
 * @throws {Error} Rejects on request/response stream errors, after a 10s
 *   socket timeout ('Tavily search timeout'), or when the response body is
 *   not valid JSON ('Invalid Tavily response: …').
 */
function post(url, body, apiKey) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      // Keep the query string; `pathname` alone would silently drop it.
      path: urlObj.pathname + urlObj.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(body),
        'Authorization': `Bearer ${apiKey}`
      }
    }
    // Honour an explicit port in the URL instead of always using the default.
    if (urlObj.port) opts.port = urlObj.port

    const req = https.request(opts, res => {
      // Collect raw chunks and decode once at the end: decoding chunk by chunk
      // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
      const chunks = []
      res.on('data', c => chunks.push(Buffer.isBuffer(c) ? c : Buffer.from(c)))
      res.on('error', reject)
      res.on('end', () => {
        const text = Buffer.concat(chunks).toString('utf8')
        try {
          resolve(JSON.parse(text))
        } catch (e) {
          reject(new Error(`Invalid Tavily response: ${text.slice(0, 200)}`))
        }
      })
    })

    req.on('error', reject)
    req.setTimeout(10000, () => {
      // Reject first so the timeout message wins over the error that
      // destroying the request may emit afterwards.
      reject(new Error('Tavily search timeout'))
      req.destroy()
    })
    req.write(body)
    req.end()
  })
}
|
|
71
|
+
|
|
72
|
+
module.exports = { tavilySearch }
|
package/src/search/index.js
CHANGED
|
@@ -6,6 +6,7 @@ const { googleCseSearch } = require('./engines/google-cse')
|
|
|
6
6
|
const { jinaSearch } = require('./engines/jina')
|
|
7
7
|
const { bingSearch } = require('./engines/bing')
|
|
8
8
|
const { geminiGroundedSearch } = require('./engines/gemini-grounded')
|
|
9
|
+
const { tavilySearch } = require('./engines/tavily')
|
|
9
10
|
const { scrapeUrls } = require('./scraper')
|
|
10
11
|
const { Summarizer } = require('./summarizer')
|
|
11
12
|
const { Reranker } = require('./reranker')
|
|
@@ -21,7 +22,8 @@ const ENGINES = {
|
|
|
21
22
|
jina: jinaSearch,
|
|
22
23
|
'gemini-grounded': geminiGroundedSearch,
|
|
23
24
|
gemini: geminiGroundedSearch,
|
|
24
|
-
bing: bingSearch
|
|
25
|
+
bing: bingSearch,
|
|
26
|
+
tavily: tavilySearch
|
|
25
27
|
}
|
|
26
28
|
|
|
27
29
|
class SearchEngine {
|
|
@@ -133,23 +135,31 @@ class SearchEngine {
|
|
|
133
135
|
}
|
|
134
136
|
|
|
135
137
|
// Step 2: Search across all query variants
|
|
136
|
-
// When using Gemini Grounded,
|
|
138
|
+
// When using Gemini Grounded, conditionally add DDG for volume
|
|
137
139
|
const resultSets = []
|
|
138
140
|
if (usesGrounded) {
|
|
139
|
-
// Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
|
|
140
141
|
const delay = ms => new Promise(r => setTimeout(r, ms))
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
142
|
+
|
|
143
|
+
// Always run Gemini first
|
|
144
|
+
const groundedResults = await this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] })
|
|
145
|
+
.catch(e => { console.warn('Gemini grounded failed:', e.message); return [] })
|
|
146
|
+
|
|
147
|
+
resultSets.push(groundedResults)
|
|
148
|
+
|
|
149
|
+
// Only run DDG if Gemini returned fewer than 5 results (saves 2-3s)
|
|
150
|
+
if (groundedResults.length < 5) {
|
|
151
|
+
const ddgResults = await this._rawSearch(query, { ...opts, engines: ['ddg'] })
|
|
152
|
+
.catch(e => { console.warn('DDG failed:', e.message); return [] })
|
|
153
|
+
resultSets.push(ddgResults)
|
|
154
|
+
}
|
|
155
|
+
|
|
145
156
|
if (process.env.SPECTRAWL_DEBUG) {
|
|
146
|
-
console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG
|
|
157
|
+
console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG skipped:', groundedResults.length >= 5)
|
|
147
158
|
}
|
|
148
|
-
resultSets.push(groundedResults, ddgResults)
|
|
149
159
|
|
|
150
|
-
// If primary failed, retry with
|
|
151
|
-
if (groundedResults.length === 0
|
|
152
|
-
await delay(
|
|
160
|
+
// If primary failed, retry with full cascade (including tavily if configured)
|
|
161
|
+
if (groundedResults.length === 0) {
|
|
162
|
+
await delay(500)
|
|
153
163
|
const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
|
|
154
164
|
resultSets.push(retry)
|
|
155
165
|
}
|
|
@@ -181,11 +191,13 @@ class SearchEngine {
|
|
|
181
191
|
// Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
|
|
182
192
|
results = this.sourceRanker.rank(results)
|
|
183
193
|
|
|
184
|
-
// Step 5: Parallel scrape top N for full content
|
|
185
|
-
|
|
194
|
+
// Step 5: Parallel scrape top N for full content
|
|
195
|
+
// Skip in fast/snippets mode — just use search snippets (saves 3-8s)
|
|
196
|
+
const skipScrape = opts.mode === 'fast' || opts.mode === 'snippets'
|
|
197
|
+
const scrapeCount = skipScrape ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
|
|
186
198
|
if (scrapeCount > 0 && results.length > 0) {
|
|
187
199
|
const urls = results.slice(0, scrapeCount).map(r => r.url)
|
|
188
|
-
const scraped = await scrapeUrls(urls)
|
|
200
|
+
const scraped = await scrapeUrls(urls, { timeout: opts.scrapeTimeout || 3000 })
|
|
189
201
|
|
|
190
202
|
for (const result of results) {
|
|
191
203
|
const scrapedContent = scraped[result.url]
|
package/src/search/scraper.js
CHANGED
|
@@ -12,15 +12,15 @@ const { jinaExtract } = require('./engines/jina')
|
|
|
12
12
|
*/
|
|
13
13
|
async function scrapeUrls(urls, opts = {}) {
|
|
14
14
|
const results = {}
|
|
15
|
-
const timeout = opts.timeout ||
|
|
16
|
-
const concurrent = opts.concurrent ||
|
|
15
|
+
const timeout = opts.timeout || 5000 // 5s per URL — balances speed vs quality
|
|
16
|
+
const concurrent = opts.concurrent || 5
|
|
17
17
|
const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
|
|
18
18
|
|
|
19
19
|
// All URLs in parallel (with per-URL timeout)
|
|
20
20
|
const promises = urls.map(url => {
|
|
21
21
|
const p = scrapeUrl(url, { timeout, engine }).catch(() => null)
|
|
22
|
-
// Hard timeout per URL
|
|
23
|
-
const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout +
|
|
22
|
+
// Hard timeout per URL — kill slow sites fast
|
|
23
|
+
const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 500))
|
|
24
24
|
return Promise.race([p, timer])
|
|
25
25
|
})
|
|
26
26
|
const allResults = await Promise.all(promises)
|
|
@@ -35,7 +35,7 @@ async function scrapeUrls(urls, opts = {}) {
|
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
async function scrapeUrl(url, opts = {}) {
|
|
38
|
-
const { timeout =
|
|
38
|
+
const { timeout = 5000, engine = 'auto', browse } = opts
|
|
39
39
|
|
|
40
40
|
// Try Jina first if available (better markdown output)
|
|
41
41
|
if (engine === 'jina' || engine === 'auto') {
|