spectrawl 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.d.ts CHANGED
@@ -5,6 +5,7 @@ declare module 'spectrawl' {
5
5
  scrapeTop?: number
6
6
  geminiKey?: string
7
7
  'gemini-grounded'?: { apiKey?: string; model?: string }
8
+ tavily?: { apiKey?: string; searchDepth?: string; maxResults?: number }
8
9
  llm?: { provider: string; model?: string; apiKey?: string }
9
10
  sourceRanker?: {
10
11
  weights?: Record<string, number>
@@ -58,10 +59,12 @@ declare module 'spectrawl' {
58
59
  }
59
60
 
60
61
  interface DeepSearchOptions {
61
- mode?: 'fast' | 'full'
62
+ mode?: 'fast' | 'snippets' | 'full'
62
63
  scrapeTop?: number
64
+ scrapeTimeout?: number
63
65
  expand?: boolean
64
66
  rerank?: boolean
67
+ summarize?: boolean
65
68
  }
66
69
 
67
70
  interface BrowseResult {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.12",
3
+ "version": "0.3.14",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/config.js CHANGED
@@ -4,8 +4,8 @@ const path = require('path')
4
4
  const DEFAULTS = {
5
5
  port: 3900,
6
6
  search: {
7
- cascade: ['gemini-grounded', 'brave', 'ddg'],
8
- scrapeTop: 3,
7
+ cascade: ['gemini-grounded', 'tavily', 'brave', 'ddg'],
8
+ scrapeTop: 5,
9
9
  searxng: { url: 'http://localhost:8888' },
10
10
  llm: null // { provider, model, apiKey }
11
11
  },
@@ -0,0 +1,72 @@
1
+ const https = require('https')
2
+
3
/**
 * Tavily Search API — high-quality search with optional AI answers.
 * Free tier: 1,000 queries/month.
 * Use as fallback after Gemini Grounded's 5K/month free tier.
 *
 * @param {string} query - Search query; must be a non-empty string.
 * @param {object} [config] - Engine options.
 * @param {string} [config.apiKey] - API key; falls back to process.env.TAVILY_API_KEY.
 * @param {string} [config.searchDepth] - Sent as `search_depth` (default 'basic').
 * @param {boolean} [config.includeAnswer] - Sent as `include_answer` (default false).
 * @param {number} [config.maxResults] - Sent as `max_results` (default 10).
 * @param {string} [config.topic] - Sent as `topic` only when set.
 * @param {number} [config.days] - Sent as `days` only when set.
 * @returns {Promise<Array<{title: string, url: string, snippet: string, score: number, source: string}>>}
 *   Normalized results. When the response carries an answer and at least one
 *   result, it is attached to the array as the `_tavilyAnswer` property.
 * @throws {Error} If no API key is available, the query is empty, or the
 *   response does not contain a `results` array.
 */
async function tavilySearch(query, config = {}) {
  // The `= {}` default only covers `undefined`; tolerate an explicit null too.
  const settings = config ?? {}

  const apiKey = settings.apiKey || process.env.TAVILY_API_KEY
  if (!apiKey) throw new Error('TAVILY_API_KEY required for Tavily search')

  // Fail before spending a request (and monthly quota) on a query that
  // cannot produce results.
  if (typeof query !== 'string' || query.trim() === '') {
    throw new Error('Tavily search requires a non-empty query string')
  }

  const body = JSON.stringify({
    query,
    search_depth: settings.searchDepth || 'basic',
    include_answer: settings.includeAnswer || false,
    include_raw_content: false,
    max_results: settings.maxResults || 10,
    // Optional filters are only included when the caller supplied them.
    ...(settings.topic && { topic: settings.topic }),
    ...(settings.days && { days: settings.days })
  })

  const data = await post('https://api.tavily.com/search', body, apiKey)

  // Valid JSON is not necessarily an object (`null`, a string, ...), and
  // `results` must be an array for the mapping below to be safe.
  if (!Array.isArray(data?.results)) {
    throw new Error(`Tavily search failed: ${JSON.stringify(data).slice(0, 200)}`)
  }

  const results = data.results.map(r => ({
    title: r.title || '',
    url: r.url || '',
    snippet: r.content || '',
    score: r.score || 0,
    source: 'tavily'
  }))

  // Attach Tavily's answer if requested. It rides along as a non-index
  // property on the array, so it does not survive JSON serialization or
  // array copies (map/filter/spread).
  if (data.answer && results.length > 0) {
    results._tavilyAnswer = data.answer
  }

  return results
}
43
+
44
/**
 * POST a JSON string to an HTTPS endpoint with Bearer authentication and
 * resolve with the parsed JSON response body.
 *
 * The HTTP status code is not inspected: any body that parses as JSON is
 * resolved, and the caller decides whether it is a usable payload.
 *
 * @param {string} url - Absolute https URL to post to.
 * @param {string} body - Already-serialized JSON request body.
 * @param {string} apiKey - Sent as `Authorization: Bearer <apiKey>`.
 * @returns {Promise<*>} Parsed JSON response.
 * @throws {Error} Rejects on network errors, on a 10s socket timeout, or
 *   when the response body is not valid JSON.
 */
function post(url, body, apiKey) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      // Keep any query string; `pathname` alone would silently drop it.
      path: urlObj.pathname + urlObj.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        // Byte length, not string length: the body may contain multibyte characters.
        'Content-Length': Buffer.byteLength(body),
        'Authorization': `Bearer ${apiKey}`
      }
    }
    const req = https.request(opts, res => {
      // Decode as UTF-8 at the stream level so a multibyte character split
      // across two chunks is not corrupted by per-chunk string coercion.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', c => { data += c })
      // A failure while the body is streaming should reject rather than
      // leave the promise waiting for an 'end' that may never arrive.
      res.on('error', reject)
      res.on('end', () => {
        try { resolve(JSON.parse(data)) }
        catch (e) { reject(new Error(`Invalid Tavily response: ${data.slice(0, 200)}`)) }
      })
    })
    req.on('error', reject)
    req.setTimeout(10000, () => {
      const timeoutError = new Error('Tavily search timeout')
      // Tear the socket down with the timeout as the cause, and reject
      // explicitly so the caller always sees the timeout message.
      req.destroy(timeoutError)
      reject(timeoutError)
    })
    req.write(body)
    req.end()
  })
}
71
+
72
+ module.exports = { tavilySearch }
@@ -6,6 +6,7 @@ const { googleCseSearch } = require('./engines/google-cse')
6
6
  const { jinaSearch } = require('./engines/jina')
7
7
  const { bingSearch } = require('./engines/bing')
8
8
  const { geminiGroundedSearch } = require('./engines/gemini-grounded')
9
+ const { tavilySearch } = require('./engines/tavily')
9
10
  const { scrapeUrls } = require('./scraper')
10
11
  const { Summarizer } = require('./summarizer')
11
12
  const { Reranker } = require('./reranker')
@@ -21,7 +22,8 @@ const ENGINES = {
21
22
  jina: jinaSearch,
22
23
  'gemini-grounded': geminiGroundedSearch,
23
24
  gemini: geminiGroundedSearch,
24
- bing: bingSearch
25
+ bing: bingSearch,
26
+ tavily: tavilySearch
25
27
  }
26
28
 
27
29
  class SearchEngine {
@@ -37,6 +39,13 @@ class SearchEngine {
37
39
  this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
38
40
  this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
39
41
  this.sourceRanker = new SourceRanker(config.sourceRanker || {})
42
+
43
+ // One-time warning if no Gemini key
44
+ if (!geminiKey && !SearchEngine._keyWarned) {
45
+ SearchEngine._keyWarned = true
46
+ console.warn('\n⚠️ No GEMINI_API_KEY set. Using DDG fallback (limited quality, unreliable from servers).')
47
+ console.warn(' Get a free key (no credit card): https://aistudio.google.com/apikey\n')
48
+ }
40
49
  }
41
50
 
42
51
  /**
@@ -126,23 +135,31 @@ class SearchEngine {
126
135
  }
127
136
 
128
137
  // Step 2: Search across all query variants
129
- // When using Gemini Grounded, also run DDG in parallel for volume
138
+ // When using Gemini Grounded, conditionally add DDG for volume
130
139
  const resultSets = []
131
140
  if (usesGrounded) {
132
- // Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
133
141
  const delay = ms => new Promise(r => setTimeout(r, ms))
134
- const [groundedResults, ddgResults] = await Promise.all([
135
- this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(e => { console.warn('Gemini grounded failed:', e.message); return [] }),
136
- delay(500).then(() => this._rawSearch(query, { ...opts, engines: ['ddg'] })).catch(e => { console.warn('DDG failed:', e.message); return [] })
137
- ])
142
+
143
+ // Always run Gemini first
144
+ const groundedResults = await this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] })
145
+ .catch(e => { console.warn('Gemini grounded failed:', e.message); return [] })
146
+
147
+ resultSets.push(groundedResults)
148
+
149
+ // Only run DDG if Gemini returned fewer than 5 results (saves 2-3s)
150
+ if (groundedResults.length < 5) {
151
+ const ddgResults = await this._rawSearch(query, { ...opts, engines: ['ddg'] })
152
+ .catch(e => { console.warn('DDG failed:', e.message); return [] })
153
+ resultSets.push(ddgResults)
154
+ }
155
+
138
156
  if (process.env.SPECTRAWL_DEBUG) {
139
- console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
157
+ console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG skipped:', groundedResults.length >= 5)
140
158
  }
141
- resultSets.push(groundedResults, ddgResults)
142
159
 
143
- // If primary failed, retry with a different approach
144
- if (groundedResults.length === 0 && ddgResults.length === 0) {
145
- await delay(1000)
160
+ // If primary failed, retry with full cascade (including tavily if configured)
161
+ if (groundedResults.length === 0) {
162
+ await delay(500)
146
163
  const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
147
164
  resultSets.push(retry)
148
165
  }
@@ -174,11 +191,13 @@ class SearchEngine {
174
191
  // Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
175
192
  results = this.sourceRanker.rank(results)
176
193
 
177
- // Step 5: Parallel scrape top N for full content (skip in fast mode)
178
- const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
194
+ // Step 5: Parallel scrape top N for full content
195
+ // Skip in fast/snippets mode — just use search snippets (saves 3-8s)
196
+ const skipScrape = opts.mode === 'fast' || opts.mode === 'snippets'
197
+ const scrapeCount = skipScrape ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
179
198
  if (scrapeCount > 0 && results.length > 0) {
180
199
  const urls = results.slice(0, scrapeCount).map(r => r.url)
181
- const scraped = await scrapeUrls(urls)
200
+ const scraped = await scrapeUrls(urls, { timeout: opts.scrapeTimeout || 3000 })
182
201
 
183
202
  for (const result of results) {
184
203
  const scrapedContent = scraped[result.url]
@@ -12,15 +12,15 @@ const { jinaExtract } = require('./engines/jina')
12
12
  */
13
13
  async function scrapeUrls(urls, opts = {}) {
14
14
  const results = {}
15
- const timeout = opts.timeout || 10000
16
- const concurrent = opts.concurrent || 3
15
+ const timeout = opts.timeout || 3000 // 3s hard cutoff per URL (was 10s)
16
+ const concurrent = opts.concurrent || 5
17
17
  const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
18
18
 
19
19
  // All URLs in parallel (with per-URL timeout)
20
20
  const promises = urls.map(url => {
21
21
  const p = scrapeUrl(url, { timeout, engine }).catch(() => null)
22
- // Hard timeout per URL
23
- const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 1000))
22
+ // Hard timeout per URL — kill slow sites fast
23
+ const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 500))
24
24
  return Promise.race([p, timer])
25
25
  })
26
26
  const allResults = await Promise.all(promises)
@@ -35,7 +35,7 @@ async function scrapeUrls(urls, opts = {}) {
35
35
  }
36
36
 
37
37
  async function scrapeUrl(url, opts = {}) {
38
- const { timeout = 10000, engine = 'auto', browse } = opts
38
+ const { timeout = 3000, engine = 'auto', browse } = opts
39
39
 
40
40
  // Try Jina first if available (better markdown output)
41
41
  if (engine === 'jina' || engine === 'auto') {