spectrawl 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/search/index.js +2 -2
- package/src/search/scraper.js +40 -5
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.2",
|
|
3
|
+
"version": "0.3.3",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
package/src/search/index.js
CHANGED
|
@@ -144,8 +144,8 @@ class SearchEngine {
|
|
|
144
144
|
// Step 3: Merge and deduplicate
|
|
145
145
|
let results = dedupeResults(resultSets.flat())
|
|
146
146
|
|
|
147
|
-
// Step 4: Rerank by relevance
|
|
148
|
-
if (this.reranker && opts.rerank !== false) {
|
|
147
|
+
// Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
|
|
148
|
+
if (this.reranker && opts.rerank !== false && !usesGrounded) {
|
|
149
149
|
results = await this.reranker.rerank(query, results)
|
|
150
150
|
}
|
|
151
151
|
|
package/src/search/scraper.js
CHANGED
|
@@ -35,13 +35,13 @@ async function scrapeUrls(urls, opts = {}) {
|
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
async function scrapeUrl(url, opts = {}) {
|
|
38
|
-
const { timeout = 10000, engine = 'auto' } = opts
|
|
38
|
+
const { timeout = 10000, engine = 'auto', browse } = opts
|
|
39
39
|
|
|
40
40
|
// Try Jina first if available (better markdown output)
|
|
41
41
|
if (engine === 'jina' || engine === 'auto') {
|
|
42
42
|
try {
|
|
43
43
|
const result = await jinaExtract(url)
|
|
44
|
-
if (result.content && result.content.length >
|
|
44
|
+
if (result.content && result.content.length > 200) {
|
|
45
45
|
return result.content
|
|
46
46
|
}
|
|
47
47
|
} catch (e) {
|
|
@@ -49,9 +49,44 @@ async function scrapeUrl(url, opts = {}) {
|
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
51
|
|
|
52
|
-
// Readability fallback
|
|
53
|
-
|
|
54
|
-
|
|
52
|
+
// Readability fallback (HTTP fetch + HTML→markdown)
|
|
53
|
+
try {
|
|
54
|
+
const html = await fetchPage(url, timeout)
|
|
55
|
+
const content = extractMarkdown(html)
|
|
56
|
+
if (content && content.length > 200) {
|
|
57
|
+
return content
|
|
58
|
+
}
|
|
59
|
+
} catch (e) {
|
|
60
|
+
// Fall through to browser
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// Browser fallback for JS-rendered pages or when extraction is too short
|
|
64
|
+
// This is where we beat Tavily — they can't render JS pages
|
|
65
|
+
if (browse !== false) {
|
|
66
|
+
try {
|
|
67
|
+
const { BrowseEngine } = require('../browse')
|
|
68
|
+
const browser = new BrowseEngine()
|
|
69
|
+
const result = await browser.browse(url, {
|
|
70
|
+
timeout,
|
|
71
|
+
extractText: true,
|
|
72
|
+
screenshot: false
|
|
73
|
+
})
|
|
74
|
+
await browser.close()
|
|
75
|
+
if (result.text && result.text.length > 200) {
|
|
76
|
+
return result.text
|
|
77
|
+
}
|
|
78
|
+
} catch (e) {
|
|
79
|
+
// All methods exhausted
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// Return whatever we got, even if short
|
|
84
|
+
try {
|
|
85
|
+
const html = await fetchPage(url, timeout)
|
|
86
|
+
return extractMarkdown(html)
|
|
87
|
+
} catch (e) {
|
|
88
|
+
return ''
|
|
89
|
+
}
|
|
55
90
|
}
|
|
56
91
|
|
|
57
92
|
function fetchPage(url, timeout = 10000, redirects = 3) {
|