spectrawl 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.2",
3
+ "version": "0.3.3",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -144,8 +144,8 @@ class SearchEngine {
144
144
  // Step 3: Merge and deduplicate
145
145
  let results = dedupeResults(resultSets.flat())
146
146
 
147
- // Step 4: Rerank by relevance
148
- if (this.reranker && opts.rerank !== false) {
147
+ // Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
148
+ if (this.reranker && opts.rerank !== false && !usesGrounded) {
149
149
  results = await this.reranker.rerank(query, results)
150
150
  }
151
151
 
@@ -35,13 +35,13 @@ async function scrapeUrls(urls, opts = {}) {
35
35
  }
36
36
 
37
37
  async function scrapeUrl(url, opts = {}) {
38
- const { timeout = 10000, engine = 'auto' } = opts
38
+ const { timeout = 10000, engine = 'auto', browse } = opts
39
39
 
40
40
  // Try Jina first if available (better markdown output)
41
41
  if (engine === 'jina' || engine === 'auto') {
42
42
  try {
43
43
  const result = await jinaExtract(url)
44
- if (result.content && result.content.length > 100) {
44
+ if (result.content && result.content.length > 200) {
45
45
  return result.content
46
46
  }
47
47
  } catch (e) {
@@ -49,9 +49,44 @@ async function scrapeUrl(url, opts = {}) {
49
49
  }
50
50
  }
51
51
 
52
- // Readability fallback
53
- const html = await fetchPage(url, timeout)
54
- return extractMarkdown(html)
52
+ // Readability fallback (HTTP fetch + HTML→markdown)
53
+ try {
54
+ const html = await fetchPage(url, timeout)
55
+ const content = extractMarkdown(html)
56
+ if (content && content.length > 200) {
57
+ return content
58
+ }
59
+ } catch (e) {
60
+ // Fall through to browser
61
+ }
62
+
63
+ // Browser fallback for JS-rendered pages or when extraction is too short
64
+ // This is where we beat Tavily — they can't render JS pages
65
+ if (browse !== false) {
66
+ try {
67
+ const { BrowseEngine } = require('../browse')
68
+ const browser = new BrowseEngine()
69
+ const result = await browser.browse(url, {
70
+ timeout,
71
+ extractText: true,
72
+ screenshot: false
73
+ })
74
+ await browser.close()
75
+ if (result.text && result.text.length > 200) {
76
+ return result.text
77
+ }
78
+ } catch (e) {
79
+ // All methods exhausted
80
+ }
81
+ }
82
+
83
+ // Return whatever we got, even if short
84
+ try {
85
+ const html = await fetchPage(url, timeout)
86
+ return extractMarkdown(html)
87
+ } catch (e) {
88
+ return ''
89
+ }
55
90
  }
56
91
 
57
92
  function fetchPage(url, timeout = 10000, redirects = 3) {