spectrawl 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.d.ts CHANGED
@@ -5,6 +5,7 @@ declare module 'spectrawl' {
5
5
  scrapeTop?: number
6
6
  geminiKey?: string
7
7
  'gemini-grounded'?: { apiKey?: string; model?: string }
8
+ tavily?: { apiKey?: string; searchDepth?: string; maxResults?: number }
8
9
  llm?: { provider: string; model?: string; apiKey?: string }
9
10
  sourceRanker?: {
10
11
  weights?: Record<string, number>
@@ -58,10 +59,12 @@ declare module 'spectrawl' {
58
59
  }
59
60
 
60
61
  interface DeepSearchOptions {
61
- mode?: 'fast' | 'full'
62
+ mode?: 'fast' | 'snippets' | 'full'
62
63
  scrapeTop?: number
64
+ scrapeTimeout?: number
63
65
  expand?: boolean
64
66
  rerank?: boolean
67
+ summarize?: boolean
65
68
  }
66
69
 
67
70
  interface BrowseResult {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.13",
3
+ "version": "0.3.15",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/config.js CHANGED
@@ -4,8 +4,8 @@ const path = require('path')
4
4
  const DEFAULTS = {
5
5
  port: 3900,
6
6
  search: {
7
- cascade: ['gemini-grounded', 'brave', 'ddg'],
8
- scrapeTop: 3,
7
+ cascade: ['gemini-grounded', 'tavily', 'brave', 'ddg'],
8
+ scrapeTop: 5,
9
9
  searxng: { url: 'http://localhost:8888' },
10
10
  llm: null // { provider, model, apiKey }
11
11
  },
@@ -0,0 +1,72 @@
1
+ const https = require('https')
2
+
3
/**
 * Tavily Search API — high-quality search with optional AI answers.
 * Free tier: 1,000 queries/month.
 * Use as fallback after Gemini Grounded's 5K/month free tier.
 *
 * @param {string} query - Search query text.
 * @param {object} [config] - Engine configuration.
 * @param {string} [config.apiKey] - Tavily API key; falls back to the TAVILY_API_KEY env var.
 * @param {string} [config.searchDepth='basic'] - Tavily search depth ('basic' or 'advanced').
 * @param {boolean|string} [config.includeAnswer=false] - Request a synthesized answer
 *   (Tavily also accepts 'basic'/'advanced' strings, which `|| false` preserves).
 * @param {number} [config.maxResults=10] - Maximum number of results to request.
 * @param {string} [config.topic] - Optional topic filter (forwarded only when set).
 * @param {number} [config.days] - Optional recency window in days (forwarded only when set).
 * @returns {Promise<Array>} Normalized result objects ({title, url, snippet, score, source});
 *   when Tavily returns an answer it is attached to the array as `_tavilyAnswer`.
 * @throws {Error} When no API key is configured, or the API response lacks a `results` field.
 */
async function tavilySearch(query, config = {}) {
  const apiKey = config.apiKey || process.env.TAVILY_API_KEY
  if (!apiKey) throw new Error('TAVILY_API_KEY required for Tavily search')

  const body = JSON.stringify({
    query,
    search_depth: config.searchDepth || 'basic',
    include_answer: config.includeAnswer || false,
    include_raw_content: false,
    max_results: config.maxResults || 10,
    // Spread-with-guard: omit optional fields entirely rather than sending null
    ...(config.topic && { topic: config.topic }),
    ...(config.days && { days: config.days })
  })

  const data = await post('https://api.tavily.com/search', body, apiKey)

  if (!data.results) {
    // Error payloads come back as JSON too — surface a truncated copy for debugging
    throw new Error(`Tavily search failed: ${JSON.stringify(data).slice(0, 200)}`)
  }

  const results = data.results.map(r => ({
    title: r.title || '',
    url: r.url || '',
    snippet: r.content || '',
    score: r.score || 0,
    source: 'tavily'
  }))

  // Attach Tavily's synthesized answer when present.
  // Fix: the previous guard (`results.length > 0`) silently dropped the answer
  // whenever the result list was empty, even though the caller explicitly
  // requested it — attach it regardless so callers can still surface it.
  if (data.answer) {
    results._tavilyAnswer = data.answer
  }

  return results
}
43
+
44
/**
 * Minimal HTTPS POST helper that returns the parsed JSON response body.
 *
 * @param {string} url - Full https:// endpoint URL.
 * @param {string} body - Pre-serialized JSON request body.
 * @param {string} apiKey - Bearer token placed in the Authorization header.
 * @returns {Promise<object>} Parsed JSON response (any HTTP status — callers
 *   inspect the payload, e.g. tavilySearch checks for a `results` field).
 * @throws {Error} On network error, 10s timeout, or an unparseable response body.
 */
function post(url, body, apiKey) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      // Fix: pathname alone drops any `?query=...` component; include the
      // search string so the helper works for URLs with query parameters.
      path: urlObj.pathname + urlObj.search,
      // Honor a non-default port when the URL specifies one
      ...(urlObj.port && { port: urlObj.port }),
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        // byteLength (not string length) — body may contain multi-byte UTF-8
        'Content-Length': Buffer.byteLength(body),
        'Authorization': `Bearer ${apiKey}`
      }
    }
    const req = https.request(opts, res => {
      let data = ''
      res.on('data', c => data += c)
      res.on('end', () => {
        try { resolve(JSON.parse(data)) }
        catch (e) { reject(new Error(`Invalid Tavily response: ${data.slice(0, 200)}`)) }
      })
    })
    req.on('error', reject)
    // Destroy sockets that hang past 10s so callers are never stuck waiting
    req.setTimeout(10000, () => { req.destroy(); reject(new Error('Tavily search timeout')) })
    req.write(body)
    req.end()
  })
}

module.exports = { tavilySearch }
@@ -6,6 +6,7 @@ const { googleCseSearch } = require('./engines/google-cse')
6
6
  const { jinaSearch } = require('./engines/jina')
7
7
  const { bingSearch } = require('./engines/bing')
8
8
  const { geminiGroundedSearch } = require('./engines/gemini-grounded')
9
+ const { tavilySearch } = require('./engines/tavily')
9
10
  const { scrapeUrls } = require('./scraper')
10
11
  const { Summarizer } = require('./summarizer')
11
12
  const { Reranker } = require('./reranker')
@@ -21,7 +22,8 @@ const ENGINES = {
21
22
  jina: jinaSearch,
22
23
  'gemini-grounded': geminiGroundedSearch,
23
24
  gemini: geminiGroundedSearch,
24
- bing: bingSearch
25
+ bing: bingSearch,
26
+ tavily: tavilySearch
25
27
  }
26
28
 
27
29
  class SearchEngine {
@@ -133,23 +135,31 @@ class SearchEngine {
133
135
  }
134
136
 
135
137
  // Step 2: Search across all query variants
136
- // When using Gemini Grounded, also run DDG in parallel for volume
138
+ // When using Gemini Grounded, conditionally add DDG for volume
137
139
  const resultSets = []
138
140
  if (usesGrounded) {
139
- // Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
140
141
  const delay = ms => new Promise(r => setTimeout(r, ms))
141
- const [groundedResults, ddgResults] = await Promise.all([
142
- this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(e => { console.warn('Gemini grounded failed:', e.message); return [] }),
143
- delay(500).then(() => this._rawSearch(query, { ...opts, engines: ['ddg'] })).catch(e => { console.warn('DDG failed:', e.message); return [] })
144
- ])
142
+
143
+ // Always run Gemini first
144
+ const groundedResults = await this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] })
145
+ .catch(e => { console.warn('Gemini grounded failed:', e.message); return [] })
146
+
147
+ resultSets.push(groundedResults)
148
+
149
+ // Only run DDG if Gemini returned fewer than 5 results (saves 2-3s)
150
+ if (groundedResults.length < 5) {
151
+ const ddgResults = await this._rawSearch(query, { ...opts, engines: ['ddg'] })
152
+ .catch(e => { console.warn('DDG failed:', e.message); return [] })
153
+ resultSets.push(ddgResults)
154
+ }
155
+
145
156
  if (process.env.SPECTRAWL_DEBUG) {
146
- console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
157
+ console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG skipped:', groundedResults.length >= 5)
147
158
  }
148
- resultSets.push(groundedResults, ddgResults)
149
159
 
150
- // If primary failed, retry with a different approach
151
- if (groundedResults.length === 0 && ddgResults.length === 0) {
152
- await delay(1000)
160
+ // If primary failed, retry with full cascade (including tavily if configured)
161
+ if (groundedResults.length === 0) {
162
+ await delay(500)
153
163
  const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
154
164
  resultSets.push(retry)
155
165
  }
@@ -181,11 +191,13 @@ class SearchEngine {
181
191
  // Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
182
192
  results = this.sourceRanker.rank(results)
183
193
 
184
- // Step 5: Parallel scrape top N for full content (skip in fast mode)
185
- const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
194
+ // Step 5: Parallel scrape top N for full content
195
+ // Skip in fast/snippets mode — just use search snippets (saves 3-8s)
196
+ const skipScrape = opts.mode === 'fast' || opts.mode === 'snippets'
197
+ const scrapeCount = skipScrape ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
186
198
  if (scrapeCount > 0 && results.length > 0) {
187
199
  const urls = results.slice(0, scrapeCount).map(r => r.url)
188
- const scraped = await scrapeUrls(urls)
200
+ const scraped = await scrapeUrls(urls, { timeout: opts.scrapeTimeout || 3000 })
189
201
 
190
202
  for (const result of results) {
191
203
  const scrapedContent = scraped[result.url]
@@ -12,15 +12,15 @@ const { jinaExtract } = require('./engines/jina')
12
12
  */
13
13
  async function scrapeUrls(urls, opts = {}) {
14
14
  const results = {}
15
- const timeout = opts.timeout || 10000
16
- const concurrent = opts.concurrent || 3
15
+ const timeout = opts.timeout || 5000 // 5s per URL — balances speed vs quality
16
+ const concurrent = opts.concurrent || 5
17
17
  const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
18
18
 
19
19
  // All URLs in parallel (with per-URL timeout)
20
20
  const promises = urls.map(url => {
21
21
  const p = scrapeUrl(url, { timeout, engine }).catch(() => null)
22
- // Hard timeout per URL
23
- const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 1000))
22
+ // Hard timeout per URL — kill slow sites fast
23
+ const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 500))
24
24
  return Promise.race([p, timer])
25
25
  })
26
26
  const allResults = await Promise.all(promises)
@@ -35,7 +35,7 @@ async function scrapeUrls(urls, opts = {}) {
35
35
  }
36
36
 
37
37
  async function scrapeUrl(url, opts = {}) {
38
- const { timeout = 10000, engine = 'auto', browse } = opts
38
+ const { timeout = 5000, engine = 'auto', browse } = opts
39
39
 
40
40
  // Try Jina first if available (better markdown output)
41
41
  if (engine === 'jina' || engine === 'auto') {