npm - spectrawl - Versions diffs - 0.3.1 → 0.3.3 - Mend

spectrawl 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json +1 -1
package/src/search/engines/gemini-grounded.js +47 -7
package/src/search/index.js +26 -18
package/src/search/scraper.js +40 -5

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "spectrawl",
-  "version": "0.3.1",
+  "version": "0.3.3",
   "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
   "main": "src/index.js",
   "types": "index.d.ts",

package/src/search/engines/gemini-grounded.js CHANGED Viewed

@@ -37,19 +37,34 @@ async function geminiGroundedSearch(query, config = {}) {
   const chunks = grounding?.groundingChunks || []
   const answer = candidate?.content?.parts?.map(p => p.text).filter(Boolean).join('\n') || ''
-  // Convert grounding chunks to standard search result format
-  const results = chunks.map((chunk, i) => ({
+  // Resolve redirect URLs to actual URLs (parallel, with timeout)
+  const rawResults = chunks.map((chunk, i) => ({
     title: chunk.web?.title || `Result ${i + 1}`,
-    url: chunk.web?.uri || '',
-    snippet: '', // Gemini doesn't give per-result snippets
+    redirectUrl: chunk.web?.uri || '',
+    snippet: '',
     source: 'gemini-grounded'
-  })).filter(r => r.url)
+  })).filter(r => r.redirectUrl)
-  // Also try to extract URLs from grounding support
+  // Follow redirects to get real URLs
+  const resolved = await Promise.all(
+    rawResults.map(r => resolveRedirect(r.redirectUrl).catch(() => r.redirectUrl))
+  )
+  const results = rawResults.map((r, i) => ({
+    ...r,
+    url: resolved[i] || r.redirectUrl
+  }))
+  // Add confidence scores from grounding supports
   const supports = grounding?.groundingSupports || []
   for (const support of supports) {
     const indices = support.groundingChunkIndices || []
-    // Already captured above
+    const scores = support.confidenceScores || []
+    indices.forEach((idx, j) => {
+      if (results[idx] && scores[j]) {
+        results[idx].confidence = Math.max(results[idx].confidence || 0, scores[j])
+      }
+    })
   }
   // Attach the AI answer as metadata
@@ -60,6 +75,31 @@ async function geminiGroundedSearch(query, config = {}) {
   return results
 }
+/**
+ * Follow a redirect URL to get the actual destination.
+ *
+ * Sends a single HEAD request to `url` and inspects the response. Only one
+ * hop is examined: the returned Location value is not itself followed.
+ *
+ * @param {string} url - URL to probe; it is parsed with `new URL(url)`.
+ * @returns {Promise<string>} Resolves with the `Location` header when the
+ *   status is 301/302/303/307/308 and the header is present; otherwise
+ *   resolves with the original `url`. Request errors and the 3000 ms timeout
+ *   also resolve with `url`. `reject` is never called explicitly, so the
+ *   promise only rejects if something throws synchronously inside the
+ *   executor (e.g. `new URL(url)` on an unparseable string).
+ *
+ * NOTE(review): `urlObj.port` is not passed to `client.request`, so a URL
+ *   with an explicit port presumably goes to the protocol's default port;
+ *   confirm against the Node.js http.request defaults.
+ * NOTE(review): the `Location` value is returned as-is; a relative Location
+ *   would not be resolved against `url`. Confirm whether callers expect an
+ *   absolute URL.
+ */
+function resolveRedirect(url) {
+  return new Promise((resolve, reject) => {
+    const urlObj = new URL(url)
+    const client = urlObj.protocol === 'https:' ? https : require('http') // `https` is defined outside this function (not visible in this hunk)
+    const req = client.request({
+      hostname: urlObj.hostname,
+      path: urlObj.pathname + urlObj.search,
+      method: 'HEAD', // only the status code and headers are read below
+      headers: { 'User-Agent': 'Spectrawl/0.3' }
+    }, res => {
+      if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
+        resolve(res.headers.location)
+      } else {
+        resolve(url)
+      }
+    })
+    req.on('error', () => resolve(url)) // best effort: request errors fall back to the input URL
+    req.setTimeout(3000, () => { req.destroy(); resolve(url) }) // on timeout, abort and fall back to the input URL
+    req.end()
+  })
+}
 function post(url, body) {
   return new Promise((resolve, reject) => {
     const urlObj = new URL(url)

package/src/search/index.js CHANGED Viewed

@@ -108,7 +108,7 @@ class SearchEngine {
     }
     // Check cache
-    const cacheKey = `deep:${query}:${JSON.stringify(opts)}`
+    const cacheKey = `deep:${opts.mode || 'full'}:${query}`
     const cached = this.cache?.get('search', cacheKey)
     if (cached) return { ...cached, cached: true }
@@ -119,31 +119,38 @@ class SearchEngine {
       queries = await this.expander.expand(query)
     }
-    // Step 2: Search across all query variants (with stagger to avoid rate limits)
+    // Step 2: Search across all query variants
+    // When using Gemini Grounded, also run DDG in parallel for volume
     const resultSets = []
-    for (const q of queries) {
-      try {
-        const r = await this._rawSearch(q, opts)
-        resultSets.push(r)
-      } catch (e) {
-        resultSets.push([])
+    if (usesGrounded) {
+      // Parallel: Gemini for quality + DDG for volume
+      const [groundedResults, ddgResults] = await Promise.all([
+        this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(() => []),
+        this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(() => [])
+      ])
+      resultSets.push(groundedResults, ddgResults)
+    } else {
+      for (const q of queries) {
+        try {
+          const r = await this._rawSearch(q, opts)
+          resultSets.push(r)
+        } catch (e) {
+          resultSets.push([])
+        }
+        if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
       }
-      // Small delay between queries to avoid rate limiting
-      if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
     }
     // Step 3: Merge and deduplicate
-    let results = this.expander
-      ? this.expander.mergeResults(resultSets)
-      : dedupeResults(resultSets.flat())
+    let results = dedupeResults(resultSets.flat())
-    // Step 4: Rerank by relevance
-    if (this.reranker && opts.rerank !== false) {
+    // Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
+    if (this.reranker && opts.rerank !== false && !usesGrounded) {
       results = await this.reranker.rerank(query, results)
     }
-    // Step 5: Parallel scrape top N for full content
-    const scrapeCount = opts.scrapeTop ?? this.scrapeTop ?? 5
+    // Step 5: Parallel scrape top N for full content (skip in fast mode)
+    const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
     if (scrapeCount > 0 && results.length > 0) {
       const urls = results.slice(0, scrapeCount).map(r => r.url)
       const scraped = await scrapeUrls(urls)
@@ -192,8 +199,9 @@ class SearchEngine {
   async _rawSearch(query, opts = {}) {
     let results = []
     const minResults = opts.minResults || 5
+    const cascade = opts.engines || this.cascade
-    for (const engineName of this.cascade) {
+    for (const engineName of cascade) {
       const engine = ENGINES[engineName]
       if (!engine) continue

package/src/search/scraper.js CHANGED Viewed

@@ -35,13 +35,13 @@ async function scrapeUrls(urls, opts = {}) {
 }
 async function scrapeUrl(url, opts = {}) {
-  const { timeout = 10000, engine = 'auto' } = opts
+  const { timeout = 10000, engine = 'auto', browse } = opts
   // Try Jina first if available (better markdown output)
   if (engine === 'jina' || engine === 'auto') {
     try {
       const result = await jinaExtract(url)
-      if (result.content && result.content.length > 100) {
+      if (result.content && result.content.length > 200) {
         return result.content
       }
     } catch (e) {
@@ -49,9 +49,44 @@ async function scrapeUrl(url, opts = {}) {
     }
   }
-  // Readability fallback
-  const html = await fetchPage(url, timeout)
-  return extractMarkdown(html)
+  // Readability fallback (HTTP fetch + HTML→markdown)
+  try {
+    const html = await fetchPage(url, timeout)
+    const content = extractMarkdown(html)
+    if (content && content.length > 200) {
+      return content
+    }
+  } catch (e) {
+    // Fall through to browser
+  }
+  // Browser fallback for JS-rendered pages or when extraction is too short
+  // This is where we beat Tavily — they can't render JS pages
+  if (browse !== false) {
+    try {
+      const { BrowseEngine } = require('../browse')
+      const browser = new BrowseEngine()
+      const result = await browser.browse(url, {
+        timeout,
+        extractText: true,
+        screenshot: false
+      })
+      await browser.close()
+      if (result.text && result.text.length > 200) {
+        return result.text
+      }
+    } catch (e) {
+      // All methods exhausted
+    }
+  }
+  // Return whatever we got, even if short
+  try {
+    const html = await fetchPage(url, timeout)
+    return extractMarkdown(html)
+  } catch (e) {
+    return ''
+  }
 }
 function fetchPage(url, timeout = 10000, redirects = 3) {