npm - spectrawl - Versions diffs - 0.3.4 → 0.3.5 - Mend

spectrawl 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/package.json +1 -1
package/src/index.js +16 -1
package/src/search/index.js +34 -8
package/src/search/source-ranker.js +138 -0
package/src/search/summarizer.js +9 -4

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "spectrawl",
-  "version": "0.3.4",
+  "version": "0.3.5",
   "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
   "main": "src/index.js",
   "types": "index.d.ts",

package/src/index.js CHANGED Viewed

@@ -12,9 +12,24 @@ const { EventEmitter, EVENTS } = require('./events')
 const { CookieRefresher } = require('./auth/refresh')
 const { loadConfig } = require('./config')
+function deepMergeConfig(target, source) { // Returns a new object: `target` overlaid with `source`, recursing into nested non-array objects
+  const result = { ...target } // shallow copy, so `target` itself is not mutated
+  for (const key of Object.keys(source)) {
+    if (source[key] && typeof source[key] === 'object' && !Array.isArray(source[key])) { // truthy non-array object: merge key by key. NOTE(review): Date/class instances also land here and come back as plain objects — confirm configs never carry them
+      result[key] = deepMergeConfig(target[key] || {}, source[key]) // a missing or falsy target value is treated as {}
+    } else {
+      result[key] = source[key] // primitives, null and arrays replace the target value wholesale
+    }
+  }
+  return result
+}
 class Spectrawl {
   constructor(configPath) {
-    this.config = loadConfig(configPath)
+    // Accept either a file path (string) or a config object
+    this.config = (typeof configPath === 'object' && configPath !== null)
+      ? deepMergeConfig(loadConfig(null), configPath)
+      : loadConfig(configPath)
     this.events = new EventEmitter()
     this.cache = new Cache(this.config.cache)
     this.searchEngine = new SearchEngine(this.config.search, this.cache)

package/src/search/index.js CHANGED Viewed

@@ -9,6 +9,7 @@ const { scrapeUrls } = require('./scraper')
 const { Summarizer } = require('./summarizer')
 const { Reranker } = require('./reranker')
 const { QueryExpander } = require('./query-expander')
+const { SourceRanker } = require('./source-ranker')
 const ENGINES = {
   searxng: searxngSearch,
@@ -33,6 +34,7 @@ class SearchEngine {
     const geminiKey = config.geminiKey || process.env.GEMINI_API_KEY
     this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
     this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
+    this.sourceRanker = new SourceRanker(config.sourceRanker || {})
   }
   /**
@@ -90,8 +92,10 @@ class SearchEngine {
     const response = { answer, sources: results, cached: false }
-    // Cache the result
-    this.cache?.set('search', cacheKey, response)
+    // Only cache if we got results
+    if (results.length > 0) {
+      this.cache?.set('search', cacheKey, response)
+    }
     return response
   }
@@ -123,12 +127,23 @@ class SearchEngine {
     // When using Gemini Grounded, also run DDG in parallel for volume
     const resultSets = []
     if (usesGrounded) {
-      // Parallel: Gemini for quality + DDG for volume
+      // Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
+      const delay = ms => new Promise(r => setTimeout(r, ms))
       const [groundedResults, ddgResults] = await Promise.all([
-        this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(() => []),
-        this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(() => [])
+        this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(e => { console.warn('Gemini grounded failed:', e.message); return [] }),
+        delay(500).then(() => this._rawSearch(query, { ...opts, engines: ['ddg'] })).catch(e => { console.warn('DDG failed:', e.message); return [] })
       ])
+      if (process.env.SPECTRAWL_DEBUG) {
+        console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
+      }
       resultSets.push(groundedResults, ddgResults)
+      // If primary failed, retry with a different approach
+      if (groundedResults.length === 0 && ddgResults.length === 0) {
+        await delay(1000)
+        const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
+        resultSets.push(retry)
+      }
     } else {
       for (const q of queries) {
         try {
@@ -142,13 +157,21 @@ class SearchEngine {
     }
     // Step 3: Merge and deduplicate
-    let results = dedupeResults(resultSets.flat())
+    const flatResults = resultSets.flat()
+    let results = dedupeResults(flatResults)
+    if (process.env.SPECTRAWL_DEBUG) {
+      console.log('[deepSearch] resultSets lengths:', resultSets.map(s => s.length))
+      console.log('[deepSearch] flat:', flatResults.length, '→ deduped:', results.length)
+    }
-    // Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
+    // Step 4a: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
     if (this.reranker && opts.rerank !== false && !usesGrounded) {
       results = await this.reranker.rerank(query, results)
     }
+    // Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
+    results = this.sourceRanker.rank(results)
     // Step 5: Parallel scrape top N for full content (skip in fast mode)
     const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
     if (scrapeCount > 0 && results.length > 0) {
@@ -188,7 +211,10 @@ class SearchEngine {
       cached: false
     }
-    this.cache?.set('search', cacheKey, response)
+    // Only cache if we got results — never cache failures
+    if (response.sources.length > 0) {
+      this.cache?.set('search', cacheKey, response)
+    }
     return response
   }

package/src/search/source-ranker.js ADDED Viewed

@@ -0,0 +1,138 @@
+/**
+ * Source quality ranker — boost trusted sources, penalize SEO spam.
+ * This is something Tavily doesn't have.
+ *
+ * Users can customize weights per domain or use built-in presets.
+ */
+// Built-in domain quality tiers: hostname -> score multiplier (above 1 boosts, below 1 penalizes)
+const DEFAULT_WEIGHTS = {
+  // Tier 1: Primary sources, high trust (1.2x-1.3x boost)
+  'github.com': 1.3,
+  'stackoverflow.com': 1.3,
+  'news.ycombinator.com': 1.3,
+  'arxiv.org': 1.3,
+  'docs.google.com': 1.2,
+  'developer.mozilla.org': 1.3,
+  'wikipedia.org': 1.2,
+  'en.wikipedia.org': 1.2, // NOTE(review): SourceRanker.rank() stops at the first matching entry, and 'wikipedia.org' above already matches this host by suffix, so this entry (like the www.* entries below) is never the one applied
+  // Tier 2: Quality community/editorial (1.1x-1.2x boost)
+  'reddit.com': 1.15,
+  'www.reddit.com': 1.15,
+  'dev.to': 1.15,
+  'medium.com': 1.1,
+  'blog.logrocket.com': 1.15,
+  'css-tricks.com': 1.15,
+  'smashingmagazine.com': 1.15,
+  'web.dev': 1.2,
+  'npmjs.com': 1.15,
+  'www.npmjs.com': 1.15,
+  'pypi.org': 1.15,
+  // Tier 3: Known SEO farms / thin content (0.7x-0.85x penalty)
+  'w3schools.com': 0.8,
+  'www.w3schools.com': 0.8,
+  'geeksforgeeks.org': 0.85,
+  'www.geeksforgeeks.org': 0.85,
+  'tutorialspoint.com': 0.7,
+  'www.tutorialspoint.com': 0.7,
+  'javatpoint.com': 0.7,
+  'www.javatpoint.com': 0.7,
+}
+// URL pattern heuristics, tested against the raw result URL; SourceRanker.rank() applies at most one positive (x1.05) and one negative (x0.85) adjustment per result
+const QUALITY_SIGNALS = {
+  // URL patterns that suggest high quality
+  positive: [
+    /\/blog\//i,          // Blog posts (usually more detailed)
+    /\/docs\//i,          // Documentation
+    /\/guide/i,           // Guides
+    /\/tutorial/i,        // Tutorials
+    /github\.com\/[\w-]+\/[\w-]+$/,  // Repo root pages (not search); case-sensitive and end-anchored, so a trailing slash or query string does not match
+    /\/wiki\//i,          // Wiki pages
+    /\/research\//i,      // Research
+  ],
+  // URL patterns that suggest low quality
+  negative: [
+    /\/tag\//i,           // Tag listing pages
+    /\/category\//i,      // Category pages
+    /\/page\/\d+/i,       // Pagination
+    /\?utm_/i,            // Tracking URLs (only when utm_* is the first query parameter; "&utm_" is not matched)
+    /\/amp\//i,           // AMP pages (usually stripped)
+    /\/slideshow/i,       // Slideshow spam
+    /\/gallery/i,         // Gallery spam
+    /\/listicle/i,        // Listicle spam
+  ]
+}
+class SourceRanker { // Re-scores search results using hostname weights, boost/block lists and URL pattern heuristics
+  constructor(config = {}) { // reads config.weights (hostname -> multiplier), config.boost and config.block (arrays of hostname fragments)
+    this.weights = { ...DEFAULT_WEIGHTS, ...(config.weights || {}) } // user-supplied weights override the built-in ones
+    this.boostDomains = config.boost || []   // Hosts containing any of these substrings get an extra x1.3
+    this.blockDomains = config.block || []   // Hosts containing any of these substrings are dropped from the output
+  }
+  /**
+   * Apply source quality scoring to search results.
+   * Returns a new array of shallow-copied results sorted by adjusted score, highest first;
+   * the input array and its objects are not modified. A null, undefined or empty input
+   * is returned unchanged.
+   *
+   * Each returned item carries `score` = min(1, base * multiplier) and `_multiplier`,
+   * where base is `r.score || r.confidence || 0.5`.
+   *
+   * @param {Object[]} results - items with a `url` and optionally `score` / `confidence`
+   * @returns {Object[]} filtered, re-scored and sorted copies of the results
+   */
+  rank(results) {
+    if (!results || results.length === 0) return results
+    // Filter blocked domains
+    let filtered = results.filter(r => {
+      try {
+        const host = new URL(r.url).hostname
+        return !this.blockDomains.some(d => host.includes(d)) // NOTE(review): substring match — blocking 't.co' would also drop 'reddit.com'; confirm this is intended
+      } catch { return true } // results whose URL cannot be parsed are kept
+    })
+    // Apply quality weights
+    filtered = filtered.map(r => {
+      let multiplier = 1.0
+      try {
+        const url = new URL(r.url)
+        const host = url.hostname
+        // Domain weight: the first entry that matches (exact host or subdomain suffix) wins. NOTE(review): a parent-domain entry listed earlier shadows a more specific subdomain entry, including user overrides — confirm intended
+        for (const [domain, weight] of Object.entries(this.weights)) {
+          if (host === domain || host.endsWith('.' + domain)) {
+            multiplier *= weight
+            break
+          }
+        }
+        // Boost domains (substring match on the hostname, stacks with the domain weight)
+        if (this.boostDomains.some(d => host.includes(d))) {
+          multiplier *= 1.3
+        }
+        // URL quality signals: at most one positive and one negative adjustment each
+        const fullUrl = r.url
+        for (const pattern of QUALITY_SIGNALS.positive) {
+          if (pattern.test(fullUrl)) { multiplier *= 1.05; break }
+        }
+        for (const pattern of QUALITY_SIGNALS.negative) {
+          if (pattern.test(fullUrl)) { multiplier *= 0.85; break }
+        }
+        // Freshness signal: the digits 2024-2039 anywhere in the URL (no boundary check, so longer numeric IDs can match too)
+        const yearMatch = fullUrl.match(/20(2[4-9]|3\d)/)
+        if (yearMatch) multiplier *= 1.05  // Recent content boost
+      } catch { /* invalid URL, no adjustment */ }
+      const baseScore = r.score || r.confidence || 0.5 // a score of 0 is falsy, so it falls through to confidence and then 0.5
+      return { ...r, score: Math.min(1, baseScore * multiplier), _multiplier: multiplier } // capped at 1, so several boosted results can tie at exactly 1
+    })
+    // Sort by adjusted score, highest first
+    filtered.sort((a, b) => (b.score || 0) - (a.score || 0))
+    return filtered
+  }
+}
+module.exports = { SourceRanker, DEFAULT_WEIGHTS }

package/src/search/summarizer.js CHANGED Viewed

@@ -31,10 +31,15 @@ class Summarizer {
       .map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0, 1000)}`)
       .join('\n\n')
-    const prompt = `Based on the following search results, provide a concise answer to the query: "${query}"
+    const prompt = `Answer this question directly: "${query}"
-Include citations as [1], [2], etc. referencing the source numbers below.
-Be direct and factual. If the sources don't contain enough information, say so.
+Rules:
+- Give a clear, specific answer. Name things, list tools, state facts.
+- Use [1], [2] etc. to cite sources inline.
+- Never say "based on the provided sources" or "according to search results."
+- Never hedge with "it appears" or "it seems." Be direct.
+- If sources disagree, note it briefly.
+- Keep it concise — 2-4 paragraphs max.
 Sources:
 ${context}
@@ -77,7 +82,7 @@ Answer:`
     const body = JSON.stringify({
       model: this.model,
       messages: [
-        { role: 'system', content: 'You are a concise search assistant. Answer with citations.' },
+        { role: 'system', content: 'You are a search engine. Give direct, specific answers with numbered citations. Never hedge or qualify with "based on sources" — just answer the question.' },
         { role: 'user', content: prompt }
       ],
       max_tokens: 500,