spectrawl 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.0",
3
+ "version": "0.3.2",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -0,0 +1,130 @@
1
+ const https = require('https')
2
+
3
+ /**
4
+ * Gemini Grounded Search — uses Google's Gemini API with built-in Google Search.
5
+ * Free tier: 1,500 req/day for Flash.
6
+ * Returns both an AI answer AND the search results it found.
7
+ *
8
+ * This is basically free Google search + AI summarization in one call.
9
+ */
10
+ async function geminiGroundedSearch(query, config = {}) {
11
+ const apiKey = config.apiKey || process.env.GEMINI_API_KEY
12
+ if (!apiKey) throw new Error('GEMINI_API_KEY required for grounded search')
13
+
14
+ const model = config.model || 'gemini-2.0-flash'
15
+ const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${apiKey}`
16
+
17
+ const body = JSON.stringify({
18
+ contents: [{
19
+ parts: [{ text: `Search the web and provide relevant results for: ${query}` }]
20
+ }],
21
+ tools: [{ google_search: {} }],
22
+ generationConfig: {
23
+ temperature: 0.1,
24
+ maxOutputTokens: 1000
25
+ }
26
+ })
27
+
28
+ const data = await post(url, body)
29
+
30
+ if (data.error) {
31
+ throw new Error(`Gemini grounded search: ${data.error.message}`)
32
+ }
33
+
34
+ // Extract grounding metadata (search results)
35
+ const candidate = data.candidates?.[0]
36
+ const grounding = candidate?.groundingMetadata
37
+ const chunks = grounding?.groundingChunks || []
38
+ const answer = candidate?.content?.parts?.map(p => p.text).filter(Boolean).join('\n') || ''
39
+
40
+ // Resolve redirect URLs to actual URLs (parallel, with timeout)
41
+ const rawResults = chunks.map((chunk, i) => ({
42
+ title: chunk.web?.title || `Result ${i + 1}`,
43
+ redirectUrl: chunk.web?.uri || '',
44
+ snippet: '',
45
+ source: 'gemini-grounded'
46
+ })).filter(r => r.redirectUrl)
47
+
48
+ // Follow redirects to get real URLs
49
+ const resolved = await Promise.all(
50
+ rawResults.map(r => resolveRedirect(r.redirectUrl).catch(() => r.redirectUrl))
51
+ )
52
+
53
+ const results = rawResults.map((r, i) => ({
54
+ ...r,
55
+ url: resolved[i] || r.redirectUrl
56
+ }))
57
+
58
+ // Add confidence scores from grounding supports
59
+ const supports = grounding?.groundingSupports || []
60
+ for (const support of supports) {
61
+ const indices = support.groundingChunkIndices || []
62
+ const scores = support.confidenceScores || []
63
+ indices.forEach((idx, j) => {
64
+ if (results[idx] && scores[j]) {
65
+ results[idx].confidence = Math.max(results[idx].confidence || 0, scores[j])
66
+ }
67
+ })
68
+ }
69
+
70
+ // Attach the AI answer as metadata
71
+ if (results.length > 0) {
72
+ results._groundedAnswer = answer
73
+ }
74
+
75
+ return results
76
+ }
77
+
78
/**
 * Follow one redirect hop of a URL to get the actual destination.
 *
 * Issues a HEAD request; if the response is a redirect (301/302/303/307/308)
 * with a Location header, resolves with that location made absolute against
 * the original URL. On any error, timeout, or malformed input the original
 * URL is returned — this promise is designed never to reject.
 *
 * @param {string} url - URL to probe.
 * @returns {Promise<string>} Destination URL, or the input URL on failure.
 */
function resolveRedirect(url) {
  return new Promise(resolve => {
    let urlObj
    try {
      urlObj = new URL(url)
    } catch {
      // BUGFIX: a malformed URL used to throw inside the executor and reject;
      // fall back to the raw value instead, matching the never-rejects contract.
      resolve(url)
      return
    }
    const client = urlObj.protocol === 'https:' ? https : require('http')
    const req = client.request({
      hostname: urlObj.hostname,
      // BUGFIX: preserve a non-default port; it was previously dropped.
      port: urlObj.port || undefined,
      path: urlObj.pathname + urlObj.search,
      method: 'HEAD',
      headers: { 'User-Agent': 'Spectrawl/0.3' }
    }, res => {
      res.resume() // drain the (empty) HEAD body so the socket is released
      if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
        // BUGFIX: Location may be relative — resolve it against the request URL.
        try {
          resolve(new URL(res.headers.location, url).href)
        } catch {
          resolve(url)
        }
      } else {
        resolve(url)
      }
    })
    req.on('error', () => resolve(url))
    req.setTimeout(3000, () => { req.destroy(); resolve(url) })
    req.end()
  })
}
102
+
103
/**
 * POST a pre-serialized JSON body to an HTTPS endpoint and parse the JSON reply.
 *
 * @param {string} url - Fully-qualified HTTPS endpoint (query string included).
 * @param {string} body - Serialized JSON request body.
 * @returns {Promise<Object>} Parsed response JSON.
 * @throws {Error} On network failure, a 15s idle timeout, or a non-JSON response.
 */
function post(url, body) {
  return new Promise((resolve, reject) => {
    const target = new URL(url)
    const req = https.request({
      hostname: target.hostname,
      path: target.pathname + target.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(body)
      }
    }, res => {
      const chunks = []
      res.on('data', chunk => chunks.push(chunk))
      res.on('end', () => {
        const raw = chunks.join('')
        try {
          resolve(JSON.parse(raw))
        } catch {
          // Truncate the payload so the error stays readable in logs.
          reject(new Error(`Invalid Gemini response: ${raw.slice(0, 200)}`))
        }
      })
    })
    req.on('error', reject)
    req.setTimeout(15000, () => { req.destroy(); reject(new Error('Gemini grounded search timeout')) })
    req.write(body)
    req.end()
  })
}
129
+
130
+ module.exports = { geminiGroundedSearch }
@@ -4,6 +4,7 @@ const { serperSearch } = require('./engines/serper')
4
4
  const { searxngSearch } = require('./engines/searxng')
5
5
  const { googleCseSearch } = require('./engines/google-cse')
6
6
  const { jinaSearch } = require('./engines/jina')
7
+ const { geminiGroundedSearch } = require('./engines/gemini-grounded')
7
8
  const { scrapeUrls } = require('./scraper')
8
9
  const { Summarizer } = require('./summarizer')
9
10
  const { Reranker } = require('./reranker')
@@ -15,7 +16,9 @@ const ENGINES = {
15
16
  brave: braveSearch,
16
17
  serper: serperSearch,
17
18
  'google-cse': googleCseSearch,
18
- jina: jinaSearch
19
+ jina: jinaSearch,
20
+ 'gemini-grounded': geminiGroundedSearch,
21
+ gemini: geminiGroundedSearch
19
22
  }
20
23
 
21
24
  class SearchEngine {
@@ -105,41 +108,49 @@ class SearchEngine {
105
108
  }
106
109
 
107
110
  // Check cache
108
- const cacheKey = `deep:${query}:${JSON.stringify(opts)}`
111
+ const cacheKey = `deep:${opts.mode || 'full'}:${query}`
109
112
  const cached = this.cache?.get('search', cacheKey)
110
113
  if (cached) return { ...cached, cached: true }
111
114
 
112
- // Step 1: Query expansion
115
+ // Step 1: Query expansion (skip if using Gemini grounded — it searches Google natively)
113
116
  let queries = [query]
114
- if (this.expander && opts.expand !== false) {
117
+ const usesGrounded = this.cascade.includes('gemini-grounded') || this.cascade.includes('gemini')
118
+ if (this.expander && opts.expand !== false && !usesGrounded) {
115
119
  queries = await this.expander.expand(query)
116
120
  }
117
121
 
118
- // Step 2: Search across all query variants (with stagger to avoid rate limits)
122
+ // Step 2: Search across all query variants
123
+ // When using Gemini Grounded, also run DDG in parallel for volume
119
124
  const resultSets = []
120
- for (const q of queries) {
121
- try {
122
- const r = await this._rawSearch(q, opts)
123
- resultSets.push(r)
124
- } catch (e) {
125
- resultSets.push([])
125
+ if (usesGrounded) {
126
+ // Parallel: Gemini for quality + DDG for volume
127
+ const [groundedResults, ddgResults] = await Promise.all([
128
+ this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(() => []),
129
+ this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(() => [])
130
+ ])
131
+ resultSets.push(groundedResults, ddgResults)
132
+ } else {
133
+ for (const q of queries) {
134
+ try {
135
+ const r = await this._rawSearch(q, opts)
136
+ resultSets.push(r)
137
+ } catch (e) {
138
+ resultSets.push([])
139
+ }
140
+ if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
126
141
  }
127
- // Small delay between queries to avoid rate limiting
128
- if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
129
142
  }
130
143
 
131
144
  // Step 3: Merge and deduplicate
132
- let results = this.expander
133
- ? this.expander.mergeResults(resultSets)
134
- : dedupeResults(resultSets.flat())
145
+ let results = dedupeResults(resultSets.flat())
135
146
 
136
147
  // Step 4: Rerank by relevance
137
148
  if (this.reranker && opts.rerank !== false) {
138
149
  results = await this.reranker.rerank(query, results)
139
150
  }
140
151
 
141
- // Step 5: Parallel scrape top N for full content
142
- const scrapeCount = opts.scrapeTop ?? this.scrapeTop ?? 5
152
+ // Step 5: Parallel scrape top N for full content (skip in fast mode)
153
+ const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
143
154
  if (scrapeCount > 0 && results.length > 0) {
144
155
  const urls = results.slice(0, scrapeCount).map(r => r.url)
145
156
  const scraped = await scrapeUrls(urls)
@@ -188,8 +199,9 @@ class SearchEngine {
188
199
  async _rawSearch(query, opts = {}) {
189
200
  let results = []
190
201
  const minResults = opts.minResults || 5
202
+ const cascade = opts.engines || this.cascade
191
203
 
192
- for (const engineName of this.cascade) {
204
+ for (const engineName of cascade) {
193
205
  const engine = ENGINES[engineName]
194
206
  if (!engine) continue
195
207
 
@@ -16,17 +16,20 @@ async function scrapeUrls(urls, opts = {}) {
16
16
  const concurrent = opts.concurrent || 3
17
17
  const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
18
18
 
19
- for (let i = 0; i < urls.length; i += concurrent) {
20
- const batch = urls.slice(i, i + concurrent)
21
- const promises = batch.map(url => scrapeUrl(url, { timeout, engine }).catch(() => null))
22
- const batchResults = await Promise.all(promises)
23
-
24
- batch.forEach((url, idx) => {
25
- if (batchResults[idx]) {
26
- results[url] = batchResults[idx]
27
- }
28
- })
29
- }
19
+ // All URLs in parallel (with per-URL timeout)
20
+ const promises = urls.map(url => {
21
+ const p = scrapeUrl(url, { timeout, engine }).catch(() => null)
22
+ // Hard timeout per URL
23
+ const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 1000))
24
+ return Promise.race([p, timer])
25
+ })
26
+ const allResults = await Promise.all(promises)
27
+
28
+ urls.forEach((url, idx) => {
29
+ if (allResults[idx]) {
30
+ results[url] = allResults[idx]
31
+ }
32
+ })
30
33
 
31
34
  return results
32
35
  }