spectrawl 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.2.1",
3
+ "version": "0.3.0",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/index.js CHANGED
@@ -34,6 +34,18 @@ class Spectrawl {
34
34
  return this.searchEngine.search(query, opts)
35
35
  }
36
36
 
37
+ /**
38
+ * Deep search — Tavily-equivalent "advanced" mode.
39
+ * Query expansion → parallel search → rerank → scrape → AI answer with citations.
40
+ * Requires GEMINI_API_KEY (free tier) or configured LLM.
41
+ * @param {string} query - Search query
42
+ * @param {object} opts - { scrapeTop, expand, rerank }
43
+ * @returns {Promise<{answer, sources[], queries[], cached}>}
44
+ */
45
+ async deepSearch(query, opts = {}) {
46
+ return this.searchEngine.deepSearch(query, opts)
47
+ }
48
+
37
49
  /**
38
50
  * Browse a URL with stealth and optional auth.
39
51
  * @param {string} url - URL to browse
@@ -6,6 +6,8 @@ const { googleCseSearch } = require('./engines/google-cse')
6
6
  const { jinaSearch } = require('./engines/jina')
7
7
  const { scrapeUrls } = require('./scraper')
8
8
  const { Summarizer } = require('./summarizer')
9
+ const { Reranker } = require('./reranker')
10
+ const { QueryExpander } = require('./query-expander')
9
11
 
10
12
  const ENGINES = {
11
13
  searxng: searxngSearch,
@@ -23,6 +25,11 @@ class SearchEngine {
23
25
  this.cascade = config.cascade || ['ddg', 'brave', 'serper']
24
26
  this.scrapeTop = config.scrapeTop || 3
25
27
  this.summarizer = config.llm ? new Summarizer(config.llm) : null
28
+
29
+ // Gemini-powered features (free tier)
30
+ const geminiKey = config.geminiKey || process.env.GEMINI_API_KEY
31
+ this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
32
+ this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
26
33
  }
27
34
 
28
35
  /**
@@ -86,6 +93,118 @@ class SearchEngine {
86
93
  return response
87
94
  }
88
95
 
96
+ /**
97
+ * Deep search — Tavily-equivalent "advanced" mode.
98
+ * Query expansion → parallel search → merge/dedup → rerank → scrape top N → summarize with citations.
99
+ *
100
+ * Returns: { answer, sources: [{title, url, content, score}], cached }
101
+ */
102
+ async deepSearch(query, opts = {}) {
103
+ if (!query || !query.trim()) {
104
+ throw new Error('Search query is required')
105
+ }
106
+
107
+ // Check cache
108
+ const cacheKey = `deep:${query}:${JSON.stringify(opts)}`
109
+ const cached = this.cache?.get('search', cacheKey)
110
+ if (cached) return { ...cached, cached: true }
111
+
112
+ // Step 1: Query expansion
113
+ let queries = [query]
114
+ if (this.expander && opts.expand !== false) {
115
+ queries = await this.expander.expand(query)
116
+ }
117
+
118
+ // Step 2: Search across all query variants (with stagger to avoid rate limits)
119
+ const resultSets = []
120
+ for (const q of queries) {
121
+ try {
122
+ const r = await this._rawSearch(q, opts)
123
+ resultSets.push(r)
124
+ } catch (e) {
125
+ resultSets.push([])
126
+ }
127
+ // Small delay between queries to avoid rate limiting
128
+ if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
129
+ }
130
+
131
+ // Step 3: Merge and deduplicate
132
+ let results = this.expander
133
+ ? this.expander.mergeResults(resultSets)
134
+ : dedupeResults(resultSets.flat())
135
+
136
+ // Step 4: Rerank by relevance
137
+ if (this.reranker && opts.rerank !== false) {
138
+ results = await this.reranker.rerank(query, results)
139
+ }
140
+
141
+ // Step 5: Parallel scrape top N for full content
142
+ const scrapeCount = opts.scrapeTop ?? this.scrapeTop ?? 5
143
+ if (scrapeCount > 0 && results.length > 0) {
144
+ const urls = results.slice(0, scrapeCount).map(r => r.url)
145
+ const scraped = await scrapeUrls(urls)
146
+
147
+ for (const result of results) {
148
+ const scrapedContent = scraped[result.url]
149
+ if (scrapedContent) {
150
+ result.fullContent = scrapedContent
151
+ }
152
+ }
153
+ }
154
+
155
+ // Step 6: Summarize with citations
156
+ let answer = null
157
+ const summarizer = this.summarizer || (this.reranker ? new Summarizer({
158
+ provider: 'gemini',
159
+ model: 'gemini-2.0-flash',
160
+ apiKey: process.env.GEMINI_API_KEY
161
+ }) : null)
162
+
163
+ if (summarizer) {
164
+ answer = await summarizer.summarize(query, results)
165
+ }
166
+
167
+ const response = {
168
+ answer,
169
+ sources: results.map(r => ({
170
+ title: r.title,
171
+ url: r.url,
172
+ snippet: r.snippet,
173
+ content: r.fullContent?.slice(0, 2000) || r.snippet || '',
174
+ score: r.score || null
175
+ })),
176
+ queries, // show which queries were used
177
+ cached: false
178
+ }
179
+
180
+ this.cache?.set('search', cacheKey, response)
181
+ return response
182
+ }
183
+
184
+ /**
185
+ * Raw search without reranking or summarization.
186
+ * Used internally by deepSearch for parallel query variants.
187
+ */
188
+ async _rawSearch(query, opts = {}) {
189
+ let results = []
190
+ const minResults = opts.minResults || 5
191
+
192
+ for (const engineName of this.cascade) {
193
+ const engine = ENGINES[engineName]
194
+ if (!engine) continue
195
+
196
+ try {
197
+ const engineResults = await engine(query, this.config[engineName] || {})
198
+ results = dedupeResults([...results, ...engineResults])
199
+ if (results.length >= minResults) break
200
+ } catch (err) {
201
+ continue
202
+ }
203
+ }
204
+
205
+ return results
206
+ }
207
+
89
208
  async _summarize(query, results) {
90
209
  if (!this.summarizer) return null
91
210
  return this.summarizer.summarize(query, results)
@@ -0,0 +1,122 @@
1
+ const https = require('https')
2
+
3
/**
 * Query expansion — generates variant queries to catch what one search misses.
 * "best CRM" → ["top CRM software 2026", "CRM comparison startups", "best CRM for small business"]
 * Merges and deduplicates results across all variants.
 */
class QueryExpander {
  /**
   * @param {object} [config]
   * @param {string} [config.provider] - 'gemini' (default); any other value uses the OpenAI chat endpoint.
   * @param {string} [config.model] - Model name, default 'gemini-2.0-flash'.
   * @param {string} [config.apiKey] - API key; falls back to the GEMINI_API_KEY env var.
   * @param {number} [config.variants] - Maximum number of variants to request and keep, default 3.
   */
  constructor(config = {}) {
    this.provider = config.provider || 'gemini'
    this.model = config.model || 'gemini-2.0-flash'
    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
    this.variants = config.variants || 3
  }

  /**
   * Expand a query into multiple search variants.
   * Never rejects: on any failure the original query is returned alone.
   *
   * @param {string} query
   * @returns {Promise<string[]>} The original query first, then up to `this.variants`
   *   distinct, non-empty string variants.
   */
  async expand(query) {
    if (!this.apiKey) return [query]

    const prompt = `Generate ${this.variants} alternative search queries for: "${query}"

Requirements:
- Each should find different but relevant results
- Include synonyms, related terms, different phrasings
- One should be more specific, one broader, one from a different angle

Respond with ONLY a JSON array of strings. No explanation.
Example: ["alternative query 1", "alternative query 2", "alternative query 3"]`

    try {
      const text = await this._call(prompt)

      // Keep only usable variants: the model may return non-strings, blanks,
      // or a repeat of the original query, none of which are worth a search.
      const seen = new Set([String(query).trim().toLowerCase()])
      const variants = []
      for (const candidate of this._parseVariants(text)) {
        if (typeof candidate !== 'string') continue
        const cleaned = candidate.trim()
        const key = cleaned.toLowerCase()
        if (!cleaned || seen.has(key)) continue
        seen.add(key)
        variants.push(cleaned)
        if (variants.length >= this.variants) break
      }

      return [query, ...variants]
    } catch (err) {
      console.warn('Query expansion failed:', err.message)
      return [query]
    }
  }

  /**
   * Pull a JSON array out of free-form model output.
   *
   * @param {string} text - Raw model reply (may include code fences or prose).
   * @returns {Array} Parsed array, or [] when none can be parsed.
   */
  _parseVariants(text) {
    if (typeof text !== 'string') return []

    // Greedy first: spans the first "[" to the last "]", so a variant that itself
    // contains brackets ("foo [bar]") is not cut short. Non-greedy second: still
    // recovers the array when the reply has extra bracketed text after it.
    for (const pattern of [/\[[\s\S]*\]/, /\[[\s\S]*?\]/]) {
      const match = text.match(pattern)
      if (!match) continue
      try {
        const parsed = JSON.parse(match[0])
        if (Array.isArray(parsed)) return parsed
      } catch (err) {
        // Not valid JSON under this pattern; try the next one.
      }
    }
    return []
  }

  /**
   * Merge and deduplicate results from multiple queries.
   * URLs are compared case-insensitively; the highest-scored version of each URL
   * is kept (the first one seen wins ties). Non-array result sets and entries
   * without a string url are ignored.
   *
   * @param {Array<Array<object>>} resultSets
   * @returns {object[]}
   */
  mergeResults(resultSets) {
    const seen = new Map() // lowercased url → result

    for (const results of resultSets || []) {
      if (!Array.isArray(results)) continue
      for (const r of results) {
        const url = typeof r?.url === 'string' ? r.url.toLowerCase() : ''
        if (!url) continue

        const existing = seen.get(url)
        if (!existing || (r.score || 0) > (existing.score || 0)) {
          seen.set(url, r)
        }
      }
    }

    return Array.from(seen.values())
  }

  /**
   * Send the prompt to the configured provider and return the raw text reply.
   * Resolves to '[]' when the response carries no text.
   *
   * @param {string} prompt
   * @returns {Promise<string>}
   */
  async _call(prompt) {
    if (this.provider === 'gemini') {
      const model = this.model || 'gemini-2.0-flash'
      const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
      const body = JSON.stringify({
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { temperature: 0.7, maxOutputTokens: 200 }
      })
      const data = await postJson(url, body)
      return data.candidates?.[0]?.content?.parts?.[0]?.text || '[]'
    }

    // Any non-gemini provider is sent to the OpenAI chat completions endpoint
    const url = 'https://api.openai.com/v1/chat/completions'
    const body = JSON.stringify({
      model: this.model,
      messages: [{ role: 'user', content: prompt }],
      max_tokens: 200,
      temperature: 0.7
    })
    const data = await postJson(url, body, { 'Authorization': `Bearer ${this.apiKey}` })
    return data.choices?.[0]?.message?.content || '[]'
  }
}
93
+
94
/**
 * POST a JSON string over HTTPS and resolve with the parsed JSON response body.
 *
 * @param {string} url - Absolute https URL (query string and port are preserved).
 * @param {string} body - Already-serialized JSON request body.
 * @param {object} [extraHeaders] - Extra request headers; override the defaults.
 * @returns {Promise<object>} Parsed JSON response.
 * Rejects with an Error on network failure, a 15 s timeout, an HTTP status >= 400,
 * or a response body that is not valid JSON.
 */
function postJson(url, body, extraHeaders = {}) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      port: urlObj.port || undefined, // undefined → default HTTPS port
      path: urlObj.pathname + urlObj.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(body),
        ...extraHeaders
      }
    }
    const req = https.request(opts, res => {
      // Collect raw bytes and decode once at the end: decoding chunk-by-chunk
      // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
      const chunks = []
      res.on('data', c => chunks.push(Buffer.isBuffer(c) ? c : Buffer.from(c)))
      res.on('error', reject)
      res.on('end', () => {
        const data = Buffer.concat(chunks).toString('utf8')
        if (res.statusCode >= 400) {
          // Surface API errors (bad key, quota, ...) instead of parsing them as success
          reject(new Error(`HTTP ${res.statusCode}: ${data.slice(0, 200)}`))
          return
        }
        try { resolve(JSON.parse(data)) }
        catch (e) { reject(new Error(`Invalid response: ${data.slice(0, 200)}`)) }
      })
    })
    req.on('error', reject)
    // destroy(err) emits 'error' with this error, so the handler above rejects once
    req.setTimeout(15000, () => { req.destroy(new Error('Expander timeout')) })
    req.write(body)
    req.end()
  })
}
121
+
122
+ module.exports = { QueryExpander }
@@ -0,0 +1,114 @@
1
+ const https = require('https')
2
+
3
/**
 * AI result reranker — scores search results by relevance.
 * Uses Gemini Flash by default (free, fast).
 * This is Tavily's secret sauce: AI-scored relevance, not raw search order.
 */
class Reranker {
  /**
   * @param {object} [config]
   * @param {string} [config.provider] - 'gemini' (default), 'minimax', or anything else for OpenAI.
   * @param {string} [config.model] - Model name, default 'gemini-2.0-flash'.
   * @param {string} [config.apiKey] - API key; falls back to the GEMINI_API_KEY env var.
   */
  constructor(config = {}) {
    this.provider = config.provider || 'gemini'
    this.model = config.model || 'gemini-2.0-flash'
    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
  }

  /**
   * Rerank results by relevance to query.
   * Only the first 20 results are scored; any beyond that are appended after
   * them with score 0. Never rejects: when there is no API key, fewer than two
   * results, a failed call, or no usable score list, the input is returned as-is.
   *
   * @param {string} query
   * @param {object[]} results - Items with title and snippet/content.
   * @returns {Promise<object[]>} Copies sorted by score (highest first) with a score field added.
   */
  async rerank(query, results) {
    if (!this.apiKey || !Array.isArray(results) || results.length <= 1) return results

    const limit = 20 // Max 20 results to rerank
    const batch = results.slice(0, limit)

    const prompt = `Score each search result's relevance to the query on a scale of 0.0 to 1.0.

Query: "${query}"

Results:
${batch.map((r, i) => `[${i}] ${r.title}\n${(r.snippet || r.content || '').slice(0, 200)}`).join('\n\n')}

Respond with ONLY a JSON array of scores, one per result. Example: [0.95, 0.72, 0.31]
No explanation, just the array.`

    try {
      const text = await this._call(prompt)
      const scores = this._extractScores(text, batch.length)

      if (!scores) return results

      // Attach scores and sort
      const scored = batch.map((r, i) => ({ ...r, score: scores[i] || 0 }))
      scored.sort((a, b) => b.score - a.score)

      // Append any results beyond the batch limit
      if (results.length > limit) {
        scored.push(...results.slice(limit).map(r => ({ ...r, score: 0 })))
      }

      return scored
    } catch (err) {
      console.warn('Reranking failed, using original order:', err.message)
      return results
    }
  }

  /**
   * Find the score list in a model reply.
   * The prompt labels results as "[0]", "[1]", ...; a model that echoes those
   * labels produces bracketed numbers that are NOT the score list, so every
   * bracketed number list is tried and the one with the right length wins.
   *
   * @param {string} text - Raw model reply.
   * @param {number} expected - Number of scores required.
   * @returns {number[]|null} The scores, or null when no list of that length parses.
   */
  _extractScores(text, expected) {
    if (typeof text !== 'string') return null

    for (const candidate of text.match(/\[[\d.,\s]+\]/g) || []) {
      try {
        const parsed = JSON.parse(candidate)
        if (Array.isArray(parsed) && parsed.length === expected) return parsed
      } catch (err) {
        // Looks like a number list but is not valid JSON (e.g. "[1..2]"); keep looking.
      }
    }
    return null
  }

  /**
   * Send the prompt to the configured provider and return the raw text reply.
   * Resolves to '[]' when the response carries no text.
   *
   * @param {string} prompt
   * @returns {Promise<string>}
   */
  async _call(prompt) {
    if (this.provider === 'gemini') {
      const model = this.model || 'gemini-2.0-flash'
      const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
      const body = JSON.stringify({
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { temperature: 0, maxOutputTokens: 200 }
      })
      const data = await postJson(url, body)
      return data.candidates?.[0]?.content?.parts?.[0]?.text || '[]'
    }

    // Fallback: OpenAI-compatible
    const url = this.provider === 'minimax'
      ? 'https://api.minimax.chat/v1/text/chatcompletion_v2'
      : 'https://api.openai.com/v1/chat/completions'

    const body = JSON.stringify({
      model: this.model,
      messages: [{ role: 'user', content: prompt }],
      max_tokens: 200,
      temperature: 0
    })
    const data = await postJson(url, body, {
      'Authorization': `Bearer ${this.apiKey}`
    })
    return data.choices?.[0]?.message?.content || '[]'
  }
}
85
+
86
/**
 * POST a JSON string over HTTPS and resolve with the parsed JSON response body.
 *
 * @param {string} url - Absolute https URL (query string and port are preserved).
 * @param {string} body - Already-serialized JSON request body.
 * @param {object} [extraHeaders] - Extra request headers; override the defaults.
 * @returns {Promise<object>} Parsed JSON response.
 * Rejects with an Error on network failure, a 15 s timeout, an HTTP status >= 400,
 * or a response body that is not valid JSON.
 */
function postJson(url, body, extraHeaders = {}) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      port: urlObj.port || undefined, // undefined → default HTTPS port
      path: urlObj.pathname + urlObj.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(body),
        ...extraHeaders
      }
    }
    const req = https.request(opts, res => {
      // Collect raw bytes and decode once at the end: decoding chunk-by-chunk
      // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
      const chunks = []
      res.on('data', c => chunks.push(Buffer.isBuffer(c) ? c : Buffer.from(c)))
      res.on('error', reject)
      res.on('end', () => {
        const data = Buffer.concat(chunks).toString('utf8')
        if (res.statusCode >= 400) {
          // Surface API errors (bad key, quota, ...) instead of parsing them as success
          reject(new Error(`HTTP ${res.statusCode}: ${data.slice(0, 200)}`))
          return
        }
        try { resolve(JSON.parse(data)) }
        catch (e) { reject(new Error(`Invalid response: ${data.slice(0, 200)}`)) }
      })
    })
    req.on('error', reject)
    // destroy(err) emits 'error' with this error, so the handler above rejects once
    req.setTimeout(15000, () => { req.destroy(new Error('Reranker timeout')) })
    req.write(body)
    req.end()
  })
}
113
+
114
+ module.exports = { Reranker }
@@ -17,7 +17,8 @@ class Summarizer {
17
17
  openai: 'OPENAI_API_KEY',
18
18
  anthropic: 'ANTHROPIC_API_KEY',
19
19
  minimax: 'MINIMAX_API_KEY',
20
- xai: 'XAI_API_KEY'
20
+ xai: 'XAI_API_KEY',
21
+ gemini: 'GEMINI_API_KEY'
21
22
  }
22
23
  return keys[this.provider] || 'OPENAI_API_KEY'
23
24
  }
@@ -56,6 +57,8 @@ Answer:`
56
57
  return this._openaiCompatible(prompt)
57
58
  case 'anthropic':
58
59
  return this._anthropic(prompt)
60
+ case 'gemini':
61
+ return this._gemini(prompt)
59
62
  case 'ollama':
60
63
  return this._ollama(prompt)
61
64
  default:
@@ -106,6 +109,18 @@ Answer:`
106
109
  return data.content?.[0]?.text || null
107
110
  }
108
111
 
112
+ async _gemini(prompt) {
113
+ const model = this.model || 'gemini-2.0-flash'
114
+ const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
115
+ const body = JSON.stringify({
116
+ contents: [{ parts: [{ text: prompt }] }],
117
+ generationConfig: { temperature: 0.3, maxOutputTokens: 500 }
118
+ })
119
+
120
+ const data = await postJson(url, body, { 'Content-Type': 'application/json' })
121
+ return data.candidates?.[0]?.content?.parts?.[0]?.text || null
122
+ }
123
+
109
124
  async _ollama(prompt) {
110
125
  const url = this.baseUrl || 'http://localhost:11434/api/generate'
111
126
  const body = JSON.stringify({