spectrawl 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.d.ts CHANGED
@@ -5,6 +5,7 @@ declare module 'spectrawl' {
5
5
  scrapeTop?: number
6
6
  geminiKey?: string
7
7
  'gemini-grounded'?: { apiKey?: string; model?: string }
8
+ tavily?: { apiKey?: string; searchDepth?: string; maxResults?: number }
8
9
  llm?: { provider: string; model?: string; apiKey?: string }
9
10
  sourceRanker?: {
10
11
  weights?: Record<string, number>
@@ -58,10 +59,12 @@ declare module 'spectrawl' {
58
59
  }
59
60
 
60
61
  interface DeepSearchOptions {
61
- mode?: 'fast' | 'full'
62
+ mode?: 'fast' | 'snippets' | 'full'
62
63
  scrapeTop?: number
64
+ scrapeTimeout?: number
63
65
  expand?: boolean
64
66
  rerank?: boolean
67
+ summarize?: boolean
65
68
  }
66
69
 
67
70
  interface BrowseResult {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.13",
3
+ "version": "0.3.15",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/config.js CHANGED
@@ -4,8 +4,8 @@ const path = require('path')
4
4
  const DEFAULTS = {
5
5
  port: 3900,
6
6
  search: {
7
- cascade: ['gemini-grounded', 'brave', 'ddg'],
8
- scrapeTop: 3,
7
+ cascade: ['gemini-grounded', 'tavily', 'brave', 'ddg'],
8
+ scrapeTop: 5,
9
9
  searxng: { url: 'http://localhost:8888' },
10
10
  llm: null // { provider, model, apiKey }
11
11
  },
@@ -0,0 +1,72 @@
1
+ const https = require('https')
2
+
3
/**
 * Tavily Search API — high-quality search with optional AI answers.
 * Free tier: 1,000 queries/month.
 * Use as fallback after Gemini Grounded's 5K/month free tier.
 *
 * @param {string} query - Search query text.
 * @param {object} [config] - Engine configuration.
 * @param {string} [config.apiKey] - Tavily API key; falls back to the TAVILY_API_KEY env var.
 * @param {string} [config.searchDepth='basic'] - Tavily search depth ('basic' or 'advanced').
 * @param {boolean|string} [config.includeAnswer=false] - Request a synthesized answer
 *   (Tavily also accepts 'basic'/'advanced' strings, which `|| false` preserves).
 * @param {number} [config.maxResults=10] - Maximum number of results to request.
 * @param {string} [config.topic] - Optional topic filter (forwarded only when set).
 * @param {number} [config.days] - Optional recency window in days (forwarded only when set).
 * @returns {Promise<Array>} Normalized result objects ({title, url, snippet, score, source});
 *   when Tavily returns an answer it is attached to the array as `_tavilyAnswer`.
 * @throws {Error} When no API key is configured, or the API response lacks a `results` field.
 */
async function tavilySearch(query, config = {}) {
  const apiKey = config.apiKey || process.env.TAVILY_API_KEY
  if (!apiKey) throw new Error('TAVILY_API_KEY required for Tavily search')

  const body = JSON.stringify({
    query,
    search_depth: config.searchDepth || 'basic',
    include_answer: config.includeAnswer || false,
    include_raw_content: false,
    max_results: config.maxResults || 10,
    // Spread-with-guard: omit optional fields entirely rather than sending null
    ...(config.topic && { topic: config.topic }),
    ...(config.days && { days: config.days })
  })

  const data = await post('https://api.tavily.com/search', body, apiKey)

  if (!data.results) {
    // Error payloads come back as JSON too — surface a truncated copy for debugging
    throw new Error(`Tavily search failed: ${JSON.stringify(data).slice(0, 200)}`)
  }

  const results = data.results.map(r => ({
    title: r.title || '',
    url: r.url || '',
    snippet: r.content || '',
    score: r.score || 0,
    source: 'tavily'
  }))

  // Attach Tavily's synthesized answer when present.
  // Fix: the previous guard (`results.length > 0`) silently dropped the answer
  // whenever the result list was empty, even though the caller explicitly
  // requested it — attach it regardless so callers can still surface it.
  if (data.answer) {
    results._tavilyAnswer = data.answer
  }

  return results
}
43
+
44
/**
 * Minimal HTTPS POST helper that returns the parsed JSON response body.
 *
 * @param {string} url - Full https:// endpoint URL.
 * @param {string} body - Pre-serialized JSON request body.
 * @param {string} apiKey - Bearer token placed in the Authorization header.
 * @returns {Promise<object>} Parsed JSON response (any HTTP status — callers
 *   inspect the payload, e.g. tavilySearch checks for a `results` field).
 * @throws {Error} On network error, 10s timeout, or an unparseable response body.
 */
function post(url, body, apiKey) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      // Fix: pathname alone drops any `?query=...` component; include the
      // search string so the helper works for URLs with query parameters.
      path: urlObj.pathname + urlObj.search,
      // Honor a non-default port when the URL specifies one
      ...(urlObj.port && { port: urlObj.port }),
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        // byteLength (not string length) — body may contain multi-byte UTF-8
        'Content-Length': Buffer.byteLength(body),
        'Authorization': `Bearer ${apiKey}`
      }
    }
    const req = https.request(opts, res => {
      let data = ''
      res.on('data', c => data += c)
      res.on('end', () => {
        try { resolve(JSON.parse(data)) }
        catch (e) { reject(new Error(`Invalid Tavily response: ${data.slice(0, 200)}`)) }
      })
    })
    req.on('error', reject)
    // Destroy sockets that hang past 10s so callers are never stuck waiting
    req.setTimeout(10000, () => { req.destroy(); reject(new Error('Tavily search timeout')) })
    req.write(body)
    req.end()
  })
}

module.exports = { tavilySearch }
@@ -6,6 +6,7 @@ const { googleCseSearch } = require('./engines/google-cse')
6
6
  const { jinaSearch } = require('./engines/jina')
7
7
  const { bingSearch } = require('./engines/bing')
8
8
  const { geminiGroundedSearch } = require('./engines/gemini-grounded')
9
+ const { tavilySearch } = require('./engines/tavily')
9
10
  const { scrapeUrls } = require('./scraper')
10
11
  const { Summarizer } = require('./summarizer')
11
12
  const { Reranker } = require('./reranker')
@@ -21,7 +22,8 @@ const ENGINES = {
21
22
  jina: jinaSearch,
22
23
  'gemini-grounded': geminiGroundedSearch,
23
24
  gemini: geminiGroundedSearch,
24
- bing: bingSearch
25
+ bing: bingSearch,
26
+ tavily: tavilySearch
25
27
  }
26
28
 
27
29
  class SearchEngine {
@@ -133,23 +135,31 @@ class SearchEngine {
133
135
  }
134
136
 
135
137
  // Step 2: Search across all query variants
136
- // When using Gemini Grounded, also run DDG in parallel for volume
138
+ // When using Gemini Grounded, conditionally add DDG for volume
137
139
  const resultSets = []
138
140
  if (usesGrounded) {
139
- // Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
140
141
  const delay = ms => new Promise(r => setTimeout(r, ms))
141
- const [groundedResults, ddgResults] = await Promise.all([
142
- this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(e => { console.warn('Gemini grounded failed:', e.message); return [] }),
143
- delay(500).then(() => this._rawSearch(query, { ...opts, engines: ['ddg'] })).catch(e => { console.warn('DDG failed:', e.message); return [] })
144
- ])
142
+
143
+ // Always run Gemini first
144
+ const groundedResults = await this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] })
145
+ .catch(e => { console.warn('Gemini grounded failed:', e.message); return [] })
146
+
147
+ resultSets.push(groundedResults)
148
+
149
+ // Only run DDG if Gemini returned fewer than 5 results (saves 2-3s)
150
+ if (groundedResults.length < 5) {
151
+ const ddgResults = await this._rawSearch(query, { ...opts, engines: ['ddg'] })
152
+ .catch(e => { console.warn('DDG failed:', e.message); return [] })
153
+ resultSets.push(ddgResults)
154
+ }
155
+
145
156
  if (process.env.SPECTRAWL_DEBUG) {
146
- console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
157
+ console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG skipped:', groundedResults.length >= 5)
147
158
  }
148
- resultSets.push(groundedResults, ddgResults)
149
159
 
150
- // If primary failed, retry with a different approach
151
- if (groundedResults.length === 0 && ddgResults.length === 0) {
152
- await delay(1000)
160
+ // If primary failed, retry with full cascade (including tavily if configured)
161
+ if (groundedResults.length === 0) {
162
+ await delay(500)
153
163
  const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
154
164
  resultSets.push(retry)
155
165
  }
@@ -181,11 +191,13 @@ class SearchEngine {
181
191
  // Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
182
192
  results = this.sourceRanker.rank(results)
183
193
 
184
- // Step 5: Parallel scrape top N for full content (skip in fast mode)
185
- const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
194
+ // Step 5: Parallel scrape top N for full content
195
+ // Skip in fast/snippets mode — just use search snippets (saves 3-8s)
196
+ const skipScrape = opts.mode === 'fast' || opts.mode === 'snippets'
197
+ const scrapeCount = skipScrape ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
186
198
  if (scrapeCount > 0 && results.length > 0) {
187
199
  const urls = results.slice(0, scrapeCount).map(r => r.url)
188
- const scraped = await scrapeUrls(urls)
200
+ const scraped = await scrapeUrls(urls, { timeout: opts.scrapeTimeout || 3000 })
189
201
 
190
202
  for (const result of results) {
191
203
  const scrapedContent = scraped[result.url]
@@ -12,15 +12,15 @@ const { jinaExtract } = require('./engines/jina')
12
12
  */
13
13
  async function scrapeUrls(urls, opts = {}) {
14
14
  const results = {}
15
- const timeout = opts.timeout || 10000
16
- const concurrent = opts.concurrent || 3
15
+ const timeout = opts.timeout || 5000 // 5s per URL — balances speed vs quality
16
+ const concurrent = opts.concurrent || 5
17
17
  const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
18
18
 
19
19
  // All URLs in parallel (with per-URL timeout)
20
20
  const promises = urls.map(url => {
21
21
  const p = scrapeUrl(url, { timeout, engine }).catch(() => null)
22
- // Hard timeout per URL
23
- const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 1000))
22
+ // Hard timeout per URL — kill slow sites fast
23
+ const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 500))
24
24
  return Promise.race([p, timer])
25
25
  })
26
26
  const allResults = await Promise.all(promises)
@@ -35,7 +35,7 @@ async function scrapeUrls(urls, opts = {}) {
35
35
  }
36
36
 
37
37
  async function scrapeUrl(url, opts = {}) {
38
- const { timeout = 10000, engine = 'auto', browse } = opts
38
+ const { timeout = 5000, engine = 'auto', browse } = opts
39
39
 
40
40
  // Try Jina first if available (better markdown output)
41
41
  if (engine === 'jina' || engine === 'auto') {