spectrawl 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.1",
3
+ "version": "0.3.3",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -37,19 +37,34 @@ async function geminiGroundedSearch(query, config = {}) {
37
37
  const chunks = grounding?.groundingChunks || []
38
38
  const answer = candidate?.content?.parts?.map(p => p.text).filter(Boolean).join('\n') || ''
39
39
 
40
- // Convert grounding chunks to standard search result format
41
- const results = chunks.map((chunk, i) => ({
40
+ // Resolve redirect URLs to actual URLs (parallel, with timeout)
41
+ const rawResults = chunks.map((chunk, i) => ({
42
42
  title: chunk.web?.title || `Result ${i + 1}`,
43
- url: chunk.web?.uri || '',
44
- snippet: '', // Gemini doesn't give per-result snippets
43
+ redirectUrl: chunk.web?.uri || '',
44
+ snippet: '',
45
45
  source: 'gemini-grounded'
46
- })).filter(r => r.url)
46
+ })).filter(r => r.redirectUrl)
47
47
 
48
- // Also try to extract URLs from grounding support
48
+ // Follow redirects to get real URLs
49
+ const resolved = await Promise.all(
50
+ rawResults.map(r => resolveRedirect(r.redirectUrl).catch(() => r.redirectUrl))
51
+ )
52
+
53
+ const results = rawResults.map((r, i) => ({
54
+ ...r,
55
+ url: resolved[i] || r.redirectUrl
56
+ }))
57
+
58
+ // Add confidence scores from grounding supports
49
59
  const supports = grounding?.groundingSupports || []
50
60
  for (const support of supports) {
51
61
  const indices = support.groundingChunkIndices || []
52
- // Already captured above
62
+ const scores = support.confidenceScores || []
63
+ indices.forEach((idx, j) => {
64
+ if (results[idx] && scores[j]) {
65
+ results[idx].confidence = Math.max(results[idx].confidence || 0, scores[j])
66
+ }
67
+ })
53
68
  }
54
69
 
55
70
  // Attach the AI answer as metadata
@@ -60,6 +75,31 @@ async function geminiGroundedSearch(query, config = {}) {
60
75
  return results
61
76
  }
62
77
 
78
+ /**
79
+ * Follow a redirect URL to get the actual destination.
80
+ */
81
+ function resolveRedirect(url) {
82
+ return new Promise((resolve, reject) => {
83
+ const urlObj = new URL(url)
84
+ const client = urlObj.protocol === 'https:' ? https : require('http')
85
+ const req = client.request({
86
+ hostname: urlObj.hostname,
87
+ path: urlObj.pathname + urlObj.search,
88
+ method: 'HEAD',
89
+ headers: { 'User-Agent': 'Spectrawl/0.3' }
90
+ }, res => {
91
+ if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
92
+ resolve(res.headers.location)
93
+ } else {
94
+ resolve(url)
95
+ }
96
+ })
97
+ req.on('error', () => resolve(url))
98
+ req.setTimeout(3000, () => { req.destroy(); resolve(url) })
99
+ req.end()
100
+ })
101
+ }
102
+
63
103
  function post(url, body) {
64
104
  return new Promise((resolve, reject) => {
65
105
  const urlObj = new URL(url)
@@ -108,7 +108,7 @@ class SearchEngine {
108
108
  }
109
109
 
110
110
  // Check cache
111
- const cacheKey = `deep:${query}:${JSON.stringify(opts)}`
111
+ const cacheKey = `deep:${opts.mode || 'full'}:${query}`
112
112
  const cached = this.cache?.get('search', cacheKey)
113
113
  if (cached) return { ...cached, cached: true }
114
114
 
@@ -119,31 +119,38 @@ class SearchEngine {
119
119
  queries = await this.expander.expand(query)
120
120
  }
121
121
 
122
- // Step 2: Search across all query variants (with stagger to avoid rate limits)
122
+ // Step 2: Search across all query variants
123
+ // When using Gemini Grounded, also run DDG in parallel for volume
123
124
  const resultSets = []
124
- for (const q of queries) {
125
- try {
126
- const r = await this._rawSearch(q, opts)
127
- resultSets.push(r)
128
- } catch (e) {
129
- resultSets.push([])
125
+ if (usesGrounded) {
126
+ // Parallel: Gemini for quality + DDG for volume
127
+ const [groundedResults, ddgResults] = await Promise.all([
128
+ this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(() => []),
129
+ this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(() => [])
130
+ ])
131
+ resultSets.push(groundedResults, ddgResults)
132
+ } else {
133
+ for (const q of queries) {
134
+ try {
135
+ const r = await this._rawSearch(q, opts)
136
+ resultSets.push(r)
137
+ } catch (e) {
138
+ resultSets.push([])
139
+ }
140
+ if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
130
141
  }
131
- // Small delay between queries to avoid rate limiting
132
- if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
133
142
  }
134
143
 
135
144
  // Step 3: Merge and deduplicate
136
- let results = this.expander
137
- ? this.expander.mergeResults(resultSets)
138
- : dedupeResults(resultSets.flat())
145
+ let results = dedupeResults(resultSets.flat())
139
146
 
140
- // Step 4: Rerank by relevance
141
- if (this.reranker && opts.rerank !== false) {
147
+ // Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
148
+ if (this.reranker && opts.rerank !== false && !usesGrounded) {
142
149
  results = await this.reranker.rerank(query, results)
143
150
  }
144
151
 
145
- // Step 5: Parallel scrape top N for full content
146
- const scrapeCount = opts.scrapeTop ?? this.scrapeTop ?? 5
152
+ // Step 5: Parallel scrape top N for full content (skip in fast mode)
153
+ const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
147
154
  if (scrapeCount > 0 && results.length > 0) {
148
155
  const urls = results.slice(0, scrapeCount).map(r => r.url)
149
156
  const scraped = await scrapeUrls(urls)
@@ -192,8 +199,9 @@ class SearchEngine {
192
199
  async _rawSearch(query, opts = {}) {
193
200
  let results = []
194
201
  const minResults = opts.minResults || 5
202
+ const cascade = opts.engines || this.cascade
195
203
 
196
- for (const engineName of this.cascade) {
204
+ for (const engineName of cascade) {
197
205
  const engine = ENGINES[engineName]
198
206
  if (!engine) continue
199
207
 
@@ -35,13 +35,13 @@ async function scrapeUrls(urls, opts = {}) {
35
35
  }
36
36
 
37
37
  async function scrapeUrl(url, opts = {}) {
38
- const { timeout = 10000, engine = 'auto' } = opts
38
+ const { timeout = 10000, engine = 'auto', browse } = opts
39
39
 
40
40
  // Try Jina first if available (better markdown output)
41
41
  if (engine === 'jina' || engine === 'auto') {
42
42
  try {
43
43
  const result = await jinaExtract(url)
44
- if (result.content && result.content.length > 100) {
44
+ if (result.content && result.content.length > 200) {
45
45
  return result.content
46
46
  }
47
47
  } catch (e) {
@@ -49,9 +49,44 @@ async function scrapeUrl(url, opts = {}) {
49
49
  }
50
50
  }
51
51
 
52
- // Readability fallback
53
- const html = await fetchPage(url, timeout)
54
- return extractMarkdown(html)
52
+ // Readability fallback (HTTP fetch + HTML→markdown)
53
+ try {
54
+ const html = await fetchPage(url, timeout)
55
+ const content = extractMarkdown(html)
56
+ if (content && content.length > 200) {
57
+ return content
58
+ }
59
+ } catch (e) {
60
+ // Fall through to browser
61
+ }
62
+
63
+ // Browser fallback for JS-rendered pages or when extraction is too short
64
+ // This is where we beat Tavily — they can't render JS pages
65
+ if (browse !== false) {
66
+ try {
67
+ const { BrowseEngine } = require('../browse')
68
+ const browser = new BrowseEngine()
69
+ const result = await browser.browse(url, {
70
+ timeout,
71
+ extractText: true,
72
+ screenshot: false
73
+ })
74
+ await browser.close()
75
+ if (result.text && result.text.length > 200) {
76
+ return result.text
77
+ }
78
+ } catch (e) {
79
+ // All methods exhausted
80
+ }
81
+ }
82
+
83
+ // Return whatever we got, even if short
84
+ try {
85
+ const html = await fetchPage(url, timeout)
86
+ return extractMarkdown(html)
87
+ } catch (e) {
88
+ return ''
89
+ }
55
90
  }
56
91
 
57
92
  function fetchPage(url, timeout = 10000, redirects = 3) {