npm - spectrawl - Versions diffs - 0.3.4 → 0.3.6 - Mend

spectrawl 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/package.json +1 -1
package/src/index.js +16 -1
package/src/search/engines/bing.js +123 -0
package/src/search/engines/ddg.js +81 -35
package/src/search/index.js +37 -9
package/src/search/source-ranker.js +138 -0
package/src/search/summarizer.js +9 -4

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "spectrawl",
-  "version": "0.3.4",
+  "version": "0.3.6",
   "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
   "main": "src/index.js",
   "types": "index.d.ts",

package/src/index.js CHANGED Viewed

@@ -12,9 +12,24 @@ const { EventEmitter, EVENTS } = require('./events')
 const { CookieRefresher } = require('./auth/refresh')
 const { loadConfig } = require('./config')
+/**
+ * Recursively merge `source` into a copy of `target` and return the copy.
+ * Plain-object values are merged key by key; arrays and every other value
+ * in `source` replace the corresponding value from `target`. Neither input
+ * is mutated.
+ *
+ * @param {Object} target - Base configuration (e.g. the loaded defaults).
+ * @param {Object} source - Overrides applied on top of `target`.
+ * @returns {Object} A new merged object.
+ */
+function deepMergeConfig(target, source) {
+  const isMergeable = v => v !== null && typeof v === 'object' && !Array.isArray(v)
+  // Spreading a string or array base would leak its index keys into the result
+  const base = isMergeable(target) ? target : {}
+  const result = { ...base }
+  if (!isMergeable(source)) return result
+  for (const key of Object.keys(source)) {
+    // A config parsed from JSON can carry an own "__proto__" key; assigning it
+    // would swap the result's prototype instead of setting a property
+    if (key === '__proto__' || key === 'constructor' || key === 'prototype') continue
+    const value = source[key]
+    result[key] = isMergeable(value) ? deepMergeConfig(base[key], value) : value
+  }
+  return result
+}
 class Spectrawl {
   constructor(configPath) {
-    this.config = loadConfig(configPath)
+    // Accept either a file path (string) or a config object
+    this.config = (typeof configPath === 'object' && configPath !== null)
+      ? deepMergeConfig(loadConfig(null), configPath)
+      : loadConfig(configPath)
     this.events = new EventEmitter()
     this.cache = new Cache(this.config.cache)
     this.searchEngine = new SearchEngine(this.config.search, this.cache)

package/src/search/engines/bing.js ADDED Viewed

@@ -0,0 +1,123 @@
+const https = require('https')
+const { URL } = require('url')
+/**
+ * Bing web search — scrapes Bing HTML results.
+ * No API key needed. More reliable from datacenter IPs than DDG.
+ * DDG actually uses Bing's index anyway — this goes direct.
+ *
+ * Best-effort: any fetch or parse failure yields an empty array instead of
+ * throwing.
+ *
+ * @param {string} query - Search terms.
+ * @param {Object} [config]
+ * @param {number} [config.maxResults=10] - Maximum number of results to return.
+ * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
+ */
+async function bingSearch(query, config = {}) {
+  const maxResults = config.maxResults || 10
+  try {
+    const html = await fetchBing(query)
+    // A very short body is an error/block page, not a results page
+    if (html.length < 1000) return []
+    // A page without b_algo result blocks parses to [] on its own, so block
+    // pages need no keyword sniffing. Markers such as "captcha" are not
+    // checked: genuine result pages contain them whenever a title or snippet
+    // mentions the word, and those results would be thrown away.
+    return parseBingResults(html, maxResults)
+  } catch (e) {
+    // Deliberately best-effort; surface the cause only when debugging
+    if (process.env.SPECTRAWL_DEBUG) {
+      console.warn('[bing] search failed:', e.message)
+    }
+    return []
+  }
+}
+/**
+ * Decide whether a result link points back at Bing itself.
+ * Compares the parsed hostname so unrelated hosts that merely contain
+ * "bing.com" (climbing.com, a ?ref=bing.com parameter) are not dropped.
+ */
+function isBingInternalUrl(url) {
+  try {
+    const { hostname, pathname } = new URL(url)
+    if (hostname === 'bing.com' || hostname.endsWith('.bing.com')) return true
+    const isMicrosoft = hostname === 'microsoft.com' || hostname.endsWith('.microsoft.com')
+    return isMicrosoft && pathname.startsWith('/bing')
+  } catch (e) {
+    // Relative or malformed href: keep the plain substring check
+    return url.includes('bing.com') || url.includes('microsoft.com/bing')
+  }
+}
+/**
+ * Extract organic results from a Bing results page.
+ *
+ * @param {string} html - Raw HTML of the results page.
+ * @param {number} maxResults - Stop after this many results.
+ * @returns {Array<{url: string, title: string, snippet: string, engine: string}>}
+ */
+function parseBingResults(html, maxResults) {
+  const results = []
+  // Bing result blocks: <li class="b_algo"> — extra classes/attributes are tolerated
+  const blockRegex = /<li\s+class="b_algo(?:\s[^"]*)?"[^>]*>([\s\S]*?)<\/li>/g
+  let block
+  while ((block = blockRegex.exec(html)) !== null && results.length < maxResults) {
+    const content = block[1]
+    // Extract URL and title from <h2><a href="...">title</a></h2>
+    const linkMatch = content.match(/<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i)
+    if (!linkMatch) continue
+    // href is an HTML attribute, so "&" arrives as "&amp;"; decode it or query strings break
+    const url = linkMatch[1].replace(/&amp;/g, '&')
+    const title = stripHtml(linkMatch[2])
+    // Skip Bing internal links
+    if (isBingInternalUrl(url)) continue
+    // Extract snippet from <p> or <div class="b_caption">
+    const snippetMatch = content.match(/<p[^>]*>([\s\S]*?)<\/p>/i) ||
+                         content.match(/<div\s+class="b_caption"[^>]*>([\s\S]*?)<\/div>/i)
+    const snippet = snippetMatch ? stripHtml(snippetMatch[1]) : ''
+    results.push({ url, title, snippet, engine: 'bing' })
+  }
+  return results
+}
+/**
+ * Fetch the raw HTML of a Bing results page for `query`.
+ * Follows a single redirect via fetchUrl and rejects after 8 seconds
+ * without socket activity.
+ *
+ * @param {string} query - Search terms (URL-encoded here).
+ * @returns {Promise<string>} Response body decoded as UTF-8.
+ */
+function fetchBing(query) {
+  return new Promise((resolve, reject) => {
+    const path = `/search?q=${encodeURIComponent(query)}&setlang=en&count=15`
+    const opts = {
+      hostname: 'www.bing.com',
+      path,
+      method: 'GET',
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.9',
+        'Accept-Encoding': 'identity',
+        'DNT': '1'
+      }
+    }
+    const req = https.get(opts, res => {
+      // Follow redirects (307/308 included — they are valid for GET as well)
+      if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
+        // Discard the redirect body so the socket is released
+        res.resume()
+        let target
+        try {
+          // Location may be absolute, root-relative or path-relative
+          target = new URL(res.headers.location, `https://www.bing.com${path}`).href
+        } catch (e) {
+          return reject(new Error('Bing returned an invalid redirect location'))
+        }
+        return fetchUrl(target).then(resolve, reject)
+      }
+      // Decode in the stream so multi-byte characters split across chunks stay intact
+      res.setEncoding('utf8')
+      let data = ''
+      res.on('data', chunk => { data += chunk })
+      res.on('end', () => resolve(data))
+      res.on('error', reject)
+    })
+    req.on('error', reject)
+    req.setTimeout(8000, () => { req.destroy(); reject(new Error('Bing timeout')) })
+  })
+}
+/**
+ * GET an absolute http(s) URL and resolve with its body as text.
+ * Used for the redirect hop of fetchBing; does not follow further redirects.
+ * Rejects on network errors, on an unparseable URL, or after 8 seconds
+ * without socket activity.
+ *
+ * @param {string} url - Absolute URL to fetch.
+ * @returns {Promise<string>} Response body decoded as UTF-8.
+ */
+function fetchUrl(url) {
+  return new Promise((resolve, reject) => {
+    const urlObj = new URL(url)
+    const client = urlObj.protocol === 'https:' ? https : require('http')
+    const req = client.get({
+      hostname: urlObj.hostname,
+      // Empty string means the protocol default; pass explicit ports through
+      port: urlObj.port || undefined,
+      path: urlObj.pathname + urlObj.search,
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+        'Accept': 'text/html'
+      }
+    }, res => {
+      // Decode in the stream so multi-byte characters split across chunks stay intact
+      res.setEncoding('utf8')
+      let data = ''
+      res.on('data', c => { data += c })
+      res.on('end', () => resolve(data))
+      res.on('error', reject)
+    })
+    req.on('error', reject)
+    // Same budget as fetchBing so a stalled redirect target cannot hang the search
+    req.setTimeout(8000, () => { req.destroy(); reject(new Error('Bing redirect timeout')) })
+  })
+}
+/**
+ * Reduce an HTML fragment to plain text: drop tags, decode the common
+ * named/numeric entities, collapse whitespace runs and trim.
+ *
+ * @param {string} html - HTML fragment.
+ * @returns {string} Plain-text content.
+ */
+function stripHtml(html) {
+  return html
+    .replace(/<[^>]+>/g, '')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&nbsp;/g, ' ')
+    // "&amp;" is decoded last: decoding it first turns "&amp;lt;" into "&lt;"
+    // and the next step would wrongly decode that a second time into "<"
+    .replace(/&amp;/g, '&')
+    .replace(/\s+/g, ' ')
+    .trim()
+}
+module.exports = { bingSearch }

package/src/search/engines/ddg.js CHANGED Viewed

@@ -1,31 +1,46 @@
 const https = require('https')
+const http = require('http')
 const { URL } = require('url')
 /**
  * DuckDuckGo search — free, unlimited, no API key needed.
- * Uses JSON API + HTML fallback. Filters ads automatically.
+ * Uses JSON API + HTML fallback + lite fallback.
+ * Built-in retry with backoff for datacenter IP rate limiting.
+ * Accepts an optional config.proxy and forwards it to the fetch helpers; fetchJson does not yet route the request through it.
  */
 async function ddgSearch(query, config = {}) {
   const maxResults = config.maxResults || 10
-  // Strategy 1: JSON API (instant answers)
-  try {
-    const results = await ddgJsonApi(query, maxResults)
-    if (results.length > 0) return results
-  } catch (e) { /* fall through */ }
-  // Strategy 2: HTML search
-  try {
-    const results = await ddgHtmlSearch(query, maxResults)
-    if (results.length > 0) return results
-  } catch (e) { /* fall through */ }
+  const proxy = config.proxy || null
+  // Try up to 2 times with backoff
+  for (let attempt = 0; attempt < 2; attempt++) {
+    if (attempt > 0) await delay(1000 + Math.random() * 1000)
+    // Strategy 1: JSON API (instant answers — most reliable from datacenter)
+    try {
+      const results = await ddgJsonApi(query, maxResults, proxy)
+      if (results.length > 0) return results
+    } catch (e) { /* fall through */ }
+    // Strategy 2: HTML search (html.duckduckgo.com)
+    try {
+      const results = await ddgHtmlSearch(query, maxResults, 'html.duckduckgo.com', proxy)
+      if (results.length > 0) return results
+    } catch (e) { /* fall through */ }
+    // Strategy 3: Lite search (lite.duckduckgo.com — simpler, less likely to CAPTCHA)
+    try {
+      const results = await ddgHtmlSearch(query, maxResults, 'lite.duckduckgo.com', proxy)
+      if (results.length > 0) return results
+    } catch (e) { /* fall through */ }
+  }
   return []
 }
-async function ddgJsonApi(query, maxResults) {
+async function ddgJsonApi(query, maxResults, proxy) {
   const url = `https://api.duckduckgo.com/?q=${encodeURIComponent(query)}&format=json&no_html=1&skip_disambig=1`
-  const data = await fetchJson(url)
+  const data = await fetchJson(url, proxy)
   const results = []
@@ -82,9 +97,14 @@ async function ddgJsonApi(query, maxResults) {
   return results
 }
-async function ddgHtmlSearch(query, maxResults) {
-  const url = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`
-  const html = await fetchHtml(url)
+async function ddgHtmlSearch(query, maxResults, hostname, proxy) {
+  const path = `/html/?q=${encodeURIComponent(query)}`
+  const html = await fetchHtml(`https://${hostname}${path}`, proxy)
+  // Detect CAPTCHA / rate limit
+  if (html.includes('g-recaptcha') || html.includes('bot detected') || html.length < 500) {
+    return []
+  }
   const results = []
@@ -95,11 +115,30 @@ async function ddgHtmlSearch(query, maxResults) {
   let match
   while ((match = resultRegex.exec(html)) !== null) {
     const url = decodeUddg(match[1])
-    // Filter ads — DDG ads go through duckduckgo.com/y.js
     if (isAd(url)) continue
     links.push({ url, title: stripHtml(match[2]) })
   }
+  // Lite endpoint uses different selectors
+  if (links.length === 0) {
+    const liteRegex = /<a[^>]+class="result-link"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/g
+    while ((match = liteRegex.exec(html)) !== null) {
+      const url = decodeUddg(match[1])
+      if (isAd(url)) continue
+      links.push({ url, title: stripHtml(match[2]) })
+    }
+    // Even simpler: just grab all non-DDG links from lite results
+    if (links.length === 0) {
+      const anyLink = /<a[^>]*href="(https?:\/\/(?!duckduckgo)[^"]+)"[^>]*>([\s\S]*?)<\/a>/g
+      while ((match = anyLink.exec(html)) !== null) {
+        if (results.length >= maxResults) break
+        const url = match[1]
+        if (isAd(url)) continue
+        links.push({ url, title: stripHtml(match[2]) })
+      }
+    }
+  }
   const snippets = []
   while ((match = snippetRegex.exec(html)) !== null) {
     snippets.push(stripHtml(match[1]))
@@ -117,18 +156,11 @@ async function ddgHtmlSearch(query, maxResults) {
   return results
 }
-/**
- * Filter out DDG ads.
- */
 function isAd(url) {
   if (!url) return true
   if (url.includes('duckduckgo.com/y.js')) return true
   if (url.includes('ad_provider=')) return true
   if (url.includes('ad_domain=')) return true
-  if (url.startsWith('//duckduckgo.com/l/?')) {
-    // This is a redirect — might be organic
-    return false
-  }
   return false
 }
@@ -140,28 +172,34 @@ function decodeUddg(url) {
   return url
 }
-function fetchJson(url) {
+/**
+ * Resolve after `ms` milliseconds; used to space out retry attempts.
+ * @param {number} ms - Time to wait in milliseconds.
+ * @returns {Promise<void>}
+ */
+function delay(ms) {
+  return new Promise(resolve => {
+    setTimeout(resolve, ms)
+  })
+}
+function fetchJson(url, proxy) {
   return new Promise((resolve, reject) => {
     const urlObj = new URL(url)
-    https.get({
+    const opts = {
       hostname: urlObj.hostname,
       path: urlObj.pathname + urlObj.search,
-      headers: { 'User-Agent': 'Spectrawl/0.1.0' }
-    }, res => {
+      headers: { 'User-Agent': 'Spectrawl/0.3' }
+    }
+    const req = https.get(opts, res => {
       let data = ''
       res.on('data', chunk => data += chunk)
       res.on('end', () => {
         try { resolve(JSON.parse(data)) }
         catch (e) { reject(new Error('Invalid JSON from DDG API')) }
       })
-    }).on('error', reject)
+    })
+    req.on('error', reject)
+    req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG timeout')) })
   })
 }
-function fetchHtml(url) {
+function fetchHtml(url, proxy) {
   return new Promise((resolve, reject) => {
     const urlObj = new URL(url)
-    https.get({
+    const opts = {
       hostname: urlObj.hostname,
       path: urlObj.pathname + urlObj.search,
       headers: {
@@ -169,11 +207,19 @@ function fetchHtml(url) {
         'Accept': 'text/html',
         'Accept-Language': 'en-US,en;q=0.9'
       }
-    }, res => {
+    }
+    const req = https.get(opts, res => {
+      // Follow redirects
+      if ([301, 302, 303].includes(res.statusCode) && res.headers.location) {
+        return fetchHtml(res.headers.location, proxy).then(resolve).catch(reject)
+      }
       let data = ''
       res.on('data', chunk => data += chunk)
       res.on('end', () => resolve(data))
-    }).on('error', reject)
+    })
+    req.on('error', reject)
+    req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG HTML timeout')) })
   })
 }

package/src/search/index.js CHANGED Viewed

@@ -4,11 +4,13 @@ const { serperSearch } = require('./engines/serper')
 const { searxngSearch } = require('./engines/searxng')
 const { googleCseSearch } = require('./engines/google-cse')
 const { jinaSearch } = require('./engines/jina')
+const { bingSearch } = require('./engines/bing')
 const { geminiGroundedSearch } = require('./engines/gemini-grounded')
 const { scrapeUrls } = require('./scraper')
 const { Summarizer } = require('./summarizer')
 const { Reranker } = require('./reranker')
 const { QueryExpander } = require('./query-expander')
+const { SourceRanker } = require('./source-ranker')
 const ENGINES = {
   searxng: searxngSearch,
@@ -18,7 +20,8 @@ const ENGINES = {
   'google-cse': googleCseSearch,
   jina: jinaSearch,
   'gemini-grounded': geminiGroundedSearch,
-  gemini: geminiGroundedSearch
+  gemini: geminiGroundedSearch,
+  bing: bingSearch
 }
 class SearchEngine {
@@ -33,6 +36,7 @@ class SearchEngine {
     const geminiKey = config.geminiKey || process.env.GEMINI_API_KEY
     this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
     this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
+    this.sourceRanker = new SourceRanker(config.sourceRanker || {})
   }
   /**
@@ -90,8 +94,10 @@ class SearchEngine {
     const response = { answer, sources: results, cached: false }
-    // Cache the result
-    this.cache?.set('search', cacheKey, response)
+    // Only cache if we got results
+    if (results.length > 0) {
+      this.cache?.set('search', cacheKey, response)
+    }
     return response
   }
@@ -123,12 +129,23 @@ class SearchEngine {
     // When using Gemini Grounded, also run DDG in parallel for volume
     const resultSets = []
     if (usesGrounded) {
-      // Parallel: Gemini for quality + DDG for volume
+      // Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
+      const delay = ms => new Promise(r => setTimeout(r, ms))
       const [groundedResults, ddgResults] = await Promise.all([
-        this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(() => []),
-        this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(() => [])
+        this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(e => { console.warn('Gemini grounded failed:', e.message); return [] }),
+        delay(500).then(() => this._rawSearch(query, { ...opts, engines: ['ddg'] })).catch(e => { console.warn('DDG failed:', e.message); return [] })
       ])
+      if (process.env.SPECTRAWL_DEBUG) {
+        console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
+      }
       resultSets.push(groundedResults, ddgResults)
+      // If primary failed, retry with a different approach
+      if (groundedResults.length === 0 && ddgResults.length === 0) {
+        await delay(1000)
+        const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
+        resultSets.push(retry)
+      }
     } else {
       for (const q of queries) {
         try {
@@ -142,13 +159,21 @@ class SearchEngine {
     }
     // Step 3: Merge and deduplicate
-    let results = dedupeResults(resultSets.flat())
+    const flatResults = resultSets.flat()
+    let results = dedupeResults(flatResults)
+    if (process.env.SPECTRAWL_DEBUG) {
+      console.log('[deepSearch] resultSets lengths:', resultSets.map(s => s.length))
+      console.log('[deepSearch] flat:', flatResults.length, '→ deduped:', results.length)
+    }
-    // Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
+    // Step 4a: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
     if (this.reranker && opts.rerank !== false && !usesGrounded) {
       results = await this.reranker.rerank(query, results)
     }
+    // Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
+    results = this.sourceRanker.rank(results)
     // Step 5: Parallel scrape top N for full content (skip in fast mode)
     const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
     if (scrapeCount > 0 && results.length > 0) {
@@ -188,7 +213,10 @@ class SearchEngine {
       cached: false
     }
-    this.cache?.set('search', cacheKey, response)
+    // Only cache if we got results — never cache failures
+    if (response.sources.length > 0) {
+      this.cache?.set('search', cacheKey, response)
+    }
     return response
   }

package/src/search/source-ranker.js ADDED Viewed

@@ -0,0 +1,138 @@
+/**
+ * Source quality ranker — boost trusted sources, penalize SEO spam.
+ * This is something Tavily doesn't have.
+ *
+ * Users can customize weights per domain or use built-in presets.
+ */
+// Built-in per-domain score multipliers (>1 boosts, <1 penalizes); SourceRanker merges user weights over these
+const DEFAULT_WEIGHTS = {
+  // Tier 1: Primary sources, high trust (1.2x–1.3x boost)
+  'github.com': 1.3,
+  'stackoverflow.com': 1.3,
+  'news.ycombinator.com': 1.3,
+  'arxiv.org': 1.3,
+  'docs.google.com': 1.2,
+  'developer.mozilla.org': 1.3,
+  'wikipedia.org': 1.2,
+  'en.wikipedia.org': 1.2,
+  // Tier 2: Quality community/editorial (1.1x–1.2x boost)
+  'reddit.com': 1.15,
+  'www.reddit.com': 1.15,
+  'dev.to': 1.15,
+  'medium.com': 1.1,
+  'blog.logrocket.com': 1.15,
+  'css-tricks.com': 1.15,
+  'smashingmagazine.com': 1.15,
+  'web.dev': 1.2,
+  'npmjs.com': 1.15,
+  'www.npmjs.com': 1.15,
+  'pypi.org': 1.15,
+  // Tier 3: Known SEO farms / thin content (0.7x–0.85x penalty)
+  'w3schools.com': 0.8,
+  'www.w3schools.com': 0.8,
+  'geeksforgeeks.org': 0.85,
+  'www.geeksforgeeks.org': 0.85,
+  'tutorialspoint.com': 0.7,
+  'www.tutorialspoint.com': 0.7,
+  'javatpoint.com': 0.7,
+  'www.javatpoint.com': 0.7,
+}
+// URL-pattern quality signals; SourceRanker.rank tests each list against the full result URL and applies at most one hit per list
+const QUALITY_SIGNALS = {
+  // URL patterns that suggest high quality (first match gives a 1.05x boost)
+  positive: [
+    /\/blog\//i,          // Blog posts (usually more detailed)
+    /\/docs\//i,          // Documentation
+    /\/guide/i,           // Guides (prefix match: also /guides, /guidelines)
+    /\/tutorial/i,        // Tutorials (prefix match: also /tutorials)
+    /github\.com\/[\w-]+\/[\w-]+$/,  // Repo root pages: URL must end at owner/repo (not search)
+    /\/wiki\//i,          // Wiki pages
+    /\/research\//i,      // Research
+  ],
+  // URL patterns that suggest low quality (first match gives a 0.85x penalty)
+  negative: [
+    /\/tag\//i,           // Tag listing pages
+    /\/category\//i,      // Category pages
+    /\/page\/\d+/i,       // Pagination
+    /\?utm_/i,            // Tracking URLs (only when utm_ is the first query parameter)
+    /\/amp\//i,           // AMP pages (usually stripped)
+    /\/slideshow/i,       // Slideshow spam
+    /\/gallery/i,         // Gallery spam
+    /\/listicle/i,        // Listicle spam
+  ]
+}
+/**
+ * Adjusts result scores by source quality and re-sorts them.
+ * Combines per-domain weights, user boost/block lists, URL pattern signals
+ * and a recency hint taken from the URL.
+ */
+class SourceRanker {
+  /**
+   * @param {Object} [config]
+   * @param {Object<string, number>} [config.weights] - Per-domain multipliers merged over DEFAULT_WEIGHTS.
+   * @param {string[]} [config.boost] - Hostname substrings that receive an extra 1.3x boost.
+   * @param {string[]} [config.block] - Hostname substrings whose results are removed.
+   */
+  constructor(config = {}) {
+    this.weights = { ...DEFAULT_WEIGHTS, ...(config.weights || {}) }
+    this.boostDomains = config.boost || []   // Always boost these domains
+    this.blockDomains = config.block || []   // Always exclude these domains
+  }
+  /**
+   * Apply source quality scoring to search results.
+   * Returns a new array of shallow copies ordered by adjusted score (highest
+   * first); the input array and its objects are left untouched. Each copy
+   * carries `score` (capped at 1) and `_multiplier` (the factor applied).
+   * The base score is `score`, else `confidence`, else 0.5.
+   *
+   * @param {Array<Object>} results - Results with at least a `url` field.
+   * @returns {Array<Object>} Filtered, re-scored and sorted results; an empty
+   *   or missing input is returned unchanged.
+   */
+  rank(results) {
+    if (!results || results.length === 0) return results
+    // Filter blocked domains (results with an unparseable URL are kept)
+    const allowed = results.filter(r => {
+      try {
+        const host = new URL(r.url).hostname
+        return !this.blockDomains.some(d => host.includes(d))
+      } catch { return true }
+    })
+    // Apply quality weights
+    const scored = allowed.map(r => {
+      let multiplier = 1.0
+      try {
+        const host = new URL(r.url).hostname
+        // Domain weight: the most specific (longest) matching entry wins, so a
+        // user weight for "docs.example.com" is not shadowed by "example.com"
+        let matched = null
+        for (const domain of Object.keys(this.weights)) {
+          const applies = host === domain || host.endsWith('.' + domain)
+          if (applies && (matched === null || domain.length > matched.length)) {
+            matched = domain
+          }
+        }
+        if (matched !== null) multiplier *= this.weights[matched]
+        // Boost domains
+        if (this.boostDomains.some(d => host.includes(d))) {
+          multiplier *= 1.3
+        }
+        // URL quality signals: at most one positive and one negative adjustment
+        const fullUrl = r.url
+        if (QUALITY_SIGNALS.positive.some(pattern => pattern.test(fullUrl))) multiplier *= 1.05
+        if (QUALITY_SIGNALS.negative.some(pattern => pattern.test(fullUrl))) multiplier *= 0.85
+        // Freshness signal (year 2024–2039 in URL). The lookbehind keeps digits
+        // in the middle of a longer number (e.g. an id like 120245) from
+        // counting as a year, while compact dates such as 20240512 still match.
+        if (/(?<!\d)20(2[4-9]|3\d)/.test(fullUrl)) multiplier *= 1.05  // Recent content boost
+      } catch { /* invalid URL, no adjustment */ }
+      const baseScore = r.score || r.confidence || 0.5
+      return { ...r, score: Math.min(1, baseScore * multiplier), _multiplier: multiplier }
+    })
+    // Sort by adjusted score
+    scored.sort((a, b) => (b.score || 0) - (a.score || 0))
+    return scored
+  }
+}
+module.exports = { SourceRanker, DEFAULT_WEIGHTS }

package/src/search/summarizer.js CHANGED Viewed

@@ -31,10 +31,15 @@ class Summarizer {
       .map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0, 1000)}`)
       .join('\n\n')
-    const prompt = `Based on the following search results, provide a concise answer to the query: "${query}"
+    const prompt = `Answer this question directly: "${query}"
-Include citations as [1], [2], etc. referencing the source numbers below.
-Be direct and factual. If the sources don't contain enough information, say so.
+Rules:
+- Give a clear, specific answer. Name things, list tools, state facts.
+- Use [1], [2] etc. to cite sources inline.
+- Never say "based on the provided sources" or "according to search results."
+- Never hedge with "it appears" or "it seems." Be direct.
+- If sources disagree, note it briefly.
+- Keep it concise — 2-4 paragraphs max.
 Sources:
 ${context}
@@ -77,7 +82,7 @@ Answer:`
     const body = JSON.stringify({
       model: this.model,
       messages: [
-        { role: 'system', content: 'You are a concise search assistant. Answer with citations.' },
+        { role: 'system', content: 'You are a search engine. Give direct, specific answers with numbered citations. Never hedge or qualify with "based on sources" — just answer the question.' },
         { role: 'user', content: prompt }
       ],
       max_tokens: 500,