spectrawl 0.3.5 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.5",
3
+ "version": "0.3.6",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -0,0 +1,123 @@
1
+ const https = require('https')
2
+ const { URL } = require('url')
3
+
4
/**
 * Bing web search backed by the public HTML results page (no API key).
 *
 * Best-effort by design: a failed fetch, a parse error, or a page that looks
 * like a block/CAPTCHA page all produce an empty array instead of throwing.
 *
 * @param {string} query - Search terms.
 * @param {{maxResults?: number}} [config] - Options; `maxResults` falls back
 *   to 10 when it is missing or falsy.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 *   Parsed results, or `[]` when nothing usable came back.
 */
async function bingSearch(query, config = {}) {
  const limit = config.maxResults || 10
  const blockMarkers = ['captcha', 'unusual traffic']

  try {
    const page = await fetchBing(query)

    // Heuristic block detection: known marker text, or a suspiciously small page
    const looksBlocked =
      blockMarkers.some(marker => page.includes(marker)) || page.length < 1000

    return looksBlocked ? [] : parseBingResults(page, limit)
  } catch (err) {
    // Deliberately swallowed so one failing engine never breaks the caller
    return []
  }
}
25
+
26
/**
 * Extract organic results from a Bing HTML results page.
 *
 * Each result is expected in an `<li>` whose class list contains `b_algo`,
 * with the link in `<h2><a href="...">title</a></h2>` and the snippet in the
 * first `<p>` (or, failing that, a `<div class="b_caption">`).
 *
 * @param {string} html - Raw results page markup.
 * @param {number} [maxResults=10] - Upper bound on returned results.
 * @returns {Array<{url: string, title: string, snippet: string, engine: string}>}
 *   Results in page order; empty when `html` is empty or not a string.
 */
function parseBingResults(html, maxResults = 10) {
  const results = []
  if (typeof html !== 'string' || html === '') return results

  // Accept extra attributes and additional class names on the <li>, not only
  // the exact string `<li class="b_algo">`.
  const blockRegex = /<li\b[^>]*\sclass="(?:[^"]*\s)?b_algo(?:\s[^"]*)?"[^>]*>([\s\S]*?)<\/li>/g
  let block
  while (results.length < maxResults && (block = blockRegex.exec(html)) !== null) {
    const content = block[1]

    // Extract URL and title from <h2><a href="...">title</a></h2>
    const linkMatch = content.match(/<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i)
    if (!linkMatch) continue

    // Attribute values are HTML-escaped; restore `&` so query strings survive
    const url = linkMatch[1].replace(/&amp;/g, '&')
    if (isBingInternalUrl(url)) continue

    const title = stripHtml(linkMatch[2])

    // `<p\b` keeps <pre>, <path>, ... from being mistaken for the snippet
    const snippetMatch = content.match(/<p\b[^>]*>([\s\S]*?)<\/p>/i) ||
      content.match(/<div\s+class="b_caption"[^>]*>([\s\S]*?)<\/div>/i)
    const snippet = snippetMatch ? stripHtml(snippetMatch[1]) : ''

    results.push({ url, title, snippet, engine: 'bing' })
  }

  return results
}

/**
 * True when `url` should be dropped: it is not an absolute URL, or it points
 * at Bing itself / Microsoft's Bing pages. Compares the parsed host rather
 * than searching the whole string, so unrelated sites whose URL merely
 * contains "bing.com" are kept.
 */
function isBingInternalUrl(url) {
  let parsed
  try {
    parsed = new URL(url)
  } catch (e) {
    return true
  }

  const host = parsed.hostname.toLowerCase()
  if (host === 'bing.com' || host.endsWith('.bing.com')) return true

  const isMicrosoft = host === 'microsoft.com' || host.endsWith('.microsoft.com')
  return isMicrosoft && parsed.pathname.startsWith('/bing')
}
55
+
56
/**
 * Download the Bing HTML results page for `query`.
 *
 * Sends browser-like headers and asks for an uncompressed body
 * (`Accept-Encoding: identity`) so the response can be read as plain text.
 * A single redirect hop is followed by handing the target to `fetchUrl`.
 *
 * @param {string} query - Search terms; URL-encoded into the `q` parameter.
 * @returns {Promise<string>} Response body as text.
 *   Rejects on network errors, an unparseable redirect target, or after
 *   8 seconds of socket inactivity ("Bing timeout").
 */
function fetchBing(query) {
  return new Promise((resolve, reject) => {
    const origin = 'https://www.bing.com'
    const path = `/search?q=${encodeURIComponent(query)}&setlang=en&count=15`
    const opts = {
      hostname: 'www.bing.com',
      path,
      method: 'GET',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'identity',
        'DNT': '1'
      }
    }

    const req = https.get(opts, res => {
      const { statusCode, headers } = res

      // Follow redirects (including 307/308, which keep the GET method)
      if ([301, 302, 303, 307, 308].includes(statusCode) && headers.location) {
        // Drain the unused body so the socket is released
        res.resume()

        let target
        try {
          // Resolves absolute, root-relative and protocol-relative locations
          target = new URL(headers.location, origin).href
        } catch (e) {
          return reject(new Error(`Bing sent an invalid redirect: ${headers.location}`))
        }
        return fetchUrl(target).then(resolve, reject)
      }

      // Decode as a stream so multi-byte characters split across chunks
      // are not corrupted by per-chunk Buffer-to-string conversion
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('error', reject)
      res.on('end', () => resolve(data))
    })
    req.on('error', reject)
    // destroy(err) surfaces the timeout through the 'error' handler above
    req.setTimeout(8000, () => req.destroy(new Error('Bing timeout')))
  })
}
90
+
91
/**
 * GET an absolute http(s) URL and return the body as text.
 * Used for the redirect hop in `fetchBing`; it does not follow further
 * redirects itself.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} Response body as text.
 *   Rejects if `url` cannot be parsed, on network errors, or after 8 seconds
 *   of socket inactivity.
 */
function fetchUrl(url) {
  return new Promise((resolve, reject) => {
    // A malformed URL throws here, which rejects the promise
    const urlObj = new URL(url)
    const client = urlObj.protocol === 'https:' ? https : require('http')

    const req = client.get({
      hostname: urlObj.hostname,
      // Honour an explicit port; undefined lets the client use its default
      port: urlObj.port || undefined,
      path: urlObj.pathname + urlObj.search,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html'
      }
    }, res => {
      // Stream-decode so multi-byte characters split across chunks stay intact
      res.setEncoding('utf8')
      let data = ''
      res.on('data', c => { data += c })
      res.on('error', reject)
      res.on('end', () => resolve(data))
    })
    req.on('error', reject)
    // Without a timeout a stalled redirect target would hang the search forever
    req.setTimeout(8000, () => req.destroy(new Error('Bing redirect timeout')))
  })
}
109
+
110
/**
 * Convert an HTML fragment to plain text: remove tags, decode character
 * references, collapse whitespace runs to a single space and trim.
 *
 * Entities are decoded in a single pass so already-escaped text is not
 * decoded twice (`&amp;lt;` becomes `&lt;`, not `<`). Named entities handled:
 * amp, lt, gt, quot, apos, nbsp. Decimal and hex numeric references are
 * decoded too; anything unrecognised is left as-is.
 *
 * @param {string} html - Markup fragment; null/undefined is treated as empty.
 * @returns {string} Plain text.
 */
function stripHtml(html) {
  if (html == null) return ''

  const named = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: ' ' }

  return String(html)
    .replace(/<[^>]+>/g, '')
    .replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-z]+);/g, (entity, body) => {
      if (body[0] !== '#') {
        return Object.prototype.hasOwnProperty.call(named, body) ? named[body] : entity
      }
      const isHex = body[1] === 'x' || body[1] === 'X'
      const code = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10)
      // Out-of-range references are left untouched rather than throwing
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity
    })
    .replace(/\s+/g, ' ')
    .trim()
}
122
+
123
+ module.exports = { bingSearch }
@@ -1,31 +1,46 @@
1
1
  const https = require('https')
2
+ const http = require('http')
2
3
  const { URL } = require('url')
3
4
 
4
5
  /**
5
6
  * DuckDuckGo search — free, unlimited, no API key needed.
6
- * Uses JSON API + HTML fallback. Filters ads automatically.
7
+ * Uses JSON API + HTML fallback + lite fallback.
8
+ * Built-in retry with backoff for datacenter IP rate limiting.
9
+ * Optional proxy support for reliable results.
7
10
  */
8
11
  async function ddgSearch(query, config = {}) {
9
12
  const maxResults = config.maxResults || 10
10
-
11
- // Strategy 1: JSON API (instant answers)
12
- try {
13
- const results = await ddgJsonApi(query, maxResults)
14
- if (results.length > 0) return results
15
- } catch (e) { /* fall through */ }
16
-
17
- // Strategy 2: HTML search
18
- try {
19
- const results = await ddgHtmlSearch(query, maxResults)
20
- if (results.length > 0) return results
21
- } catch (e) { /* fall through */ }
13
+ const proxy = config.proxy || null
14
+
15
+ // Try up to 2 times with backoff
16
+ for (let attempt = 0; attempt < 2; attempt++) {
17
+ if (attempt > 0) await delay(1000 + Math.random() * 1000)
18
+
19
+ // Strategy 1: JSON API (instant answers — most reliable from datacenter)
20
+ try {
21
+ const results = await ddgJsonApi(query, maxResults, proxy)
22
+ if (results.length > 0) return results
23
+ } catch (e) { /* fall through */ }
24
+
25
+ // Strategy 2: HTML search (html.duckduckgo.com)
26
+ try {
27
+ const results = await ddgHtmlSearch(query, maxResults, 'html.duckduckgo.com', proxy)
28
+ if (results.length > 0) return results
29
+ } catch (e) { /* fall through */ }
30
+
31
+ // Strategy 3: Lite search (lite.duckduckgo.com — simpler, less likely to CAPTCHA)
32
+ try {
33
+ const results = await ddgHtmlSearch(query, maxResults, 'lite.duckduckgo.com', proxy)
34
+ if (results.length > 0) return results
35
+ } catch (e) { /* fall through */ }
36
+ }
22
37
 
23
38
  return []
24
39
  }
25
40
 
26
- async function ddgJsonApi(query, maxResults) {
41
+ async function ddgJsonApi(query, maxResults, proxy) {
27
42
  const url = `https://api.duckduckgo.com/?q=${encodeURIComponent(query)}&format=json&no_html=1&skip_disambig=1`
28
- const data = await fetchJson(url)
43
+ const data = await fetchJson(url, proxy)
29
44
 
30
45
  const results = []
31
46
 
@@ -82,9 +97,14 @@ async function ddgJsonApi(query, maxResults) {
82
97
  return results
83
98
  }
84
99
 
85
- async function ddgHtmlSearch(query, maxResults) {
86
- const url = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`
87
- const html = await fetchHtml(url)
100
+ async function ddgHtmlSearch(query, maxResults, hostname, proxy) {
101
+ const path = `/html/?q=${encodeURIComponent(query)}`
102
+ const html = await fetchHtml(`https://${hostname}${path}`, proxy)
103
+
104
+ // Detect CAPTCHA / rate limit
105
+ if (html.includes('g-recaptcha') || html.includes('bot detected') || html.length < 500) {
106
+ return []
107
+ }
88
108
 
89
109
  const results = []
90
110
 
@@ -95,11 +115,30 @@ async function ddgHtmlSearch(query, maxResults) {
95
115
  let match
96
116
  while ((match = resultRegex.exec(html)) !== null) {
97
117
  const url = decodeUddg(match[1])
98
- // Filter ads — DDG ads go through duckduckgo.com/y.js
99
118
  if (isAd(url)) continue
100
119
  links.push({ url, title: stripHtml(match[2]) })
101
120
  }
102
121
 
122
+ // Lite endpoint uses different selectors
123
+ if (links.length === 0) {
124
+ const liteRegex = /<a[^>]+class="result-link"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/g
125
+ while ((match = liteRegex.exec(html)) !== null) {
126
+ const url = decodeUddg(match[1])
127
+ if (isAd(url)) continue
128
+ links.push({ url, title: stripHtml(match[2]) })
129
+ }
130
+ // Even simpler: just grab all non-DDG links from lite results
131
+ if (links.length === 0) {
132
+ const anyLink = /<a[^>]*href="(https?:\/\/(?!duckduckgo)[^"]+)"[^>]*>([\s\S]*?)<\/a>/g
133
+ while ((match = anyLink.exec(html)) !== null) {
134
+ if (results.length >= maxResults) break
135
+ const url = match[1]
136
+ if (isAd(url)) continue
137
+ links.push({ url, title: stripHtml(match[2]) })
138
+ }
139
+ }
140
+ }
141
+
103
142
  const snippets = []
104
143
  while ((match = snippetRegex.exec(html)) !== null) {
105
144
  snippets.push(stripHtml(match[1]))
@@ -117,18 +156,11 @@ async function ddgHtmlSearch(query, maxResults) {
117
156
  return results
118
157
  }
119
158
 
120
- /**
121
- * Filter out DDG ads.
122
- */
123
159
  function isAd(url) {
124
160
  if (!url) return true
125
161
  if (url.includes('duckduckgo.com/y.js')) return true
126
162
  if (url.includes('ad_provider=')) return true
127
163
  if (url.includes('ad_domain=')) return true
128
- if (url.startsWith('//duckduckgo.com/l/?')) {
129
- // This is a redirect — might be organic
130
- return false
131
- }
132
164
  return false
133
165
  }
134
166
 
@@ -140,28 +172,34 @@ function decodeUddg(url) {
140
172
  return url
141
173
  }
142
174
 
143
- function fetchJson(url) {
175
+ function delay(ms) { return new Promise(r => setTimeout(r, ms)) }
176
+
177
+ function fetchJson(url, proxy) {
144
178
  return new Promise((resolve, reject) => {
145
179
  const urlObj = new URL(url)
146
- https.get({
180
+ const opts = {
147
181
  hostname: urlObj.hostname,
148
182
  path: urlObj.pathname + urlObj.search,
149
- headers: { 'User-Agent': 'Spectrawl/0.1.0' }
150
- }, res => {
183
+ headers: { 'User-Agent': 'Spectrawl/0.3' }
184
+ }
185
+
186
+ const req = https.get(opts, res => {
151
187
  let data = ''
152
188
  res.on('data', chunk => data += chunk)
153
189
  res.on('end', () => {
154
190
  try { resolve(JSON.parse(data)) }
155
191
  catch (e) { reject(new Error('Invalid JSON from DDG API')) }
156
192
  })
157
- }).on('error', reject)
193
+ })
194
+ req.on('error', reject)
195
+ req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG timeout')) })
158
196
  })
159
197
  }
160
198
 
161
- function fetchHtml(url) {
199
+ function fetchHtml(url, proxy) {
162
200
  return new Promise((resolve, reject) => {
163
201
  const urlObj = new URL(url)
164
- https.get({
202
+ const opts = {
165
203
  hostname: urlObj.hostname,
166
204
  path: urlObj.pathname + urlObj.search,
167
205
  headers: {
@@ -169,11 +207,19 @@ function fetchHtml(url) {
169
207
  'Accept': 'text/html',
170
208
  'Accept-Language': 'en-US,en;q=0.9'
171
209
  }
172
- }, res => {
210
+ }
211
+
212
+ const req = https.get(opts, res => {
213
+ // Follow redirects
214
+ if ([301, 302, 303].includes(res.statusCode) && res.headers.location) {
215
+ return fetchHtml(res.headers.location, proxy).then(resolve).catch(reject)
216
+ }
173
217
  let data = ''
174
218
  res.on('data', chunk => data += chunk)
175
219
  res.on('end', () => resolve(data))
176
- }).on('error', reject)
220
+ })
221
+ req.on('error', reject)
222
+ req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG HTML timeout')) })
177
223
  })
178
224
  }
179
225
 
@@ -4,6 +4,7 @@ const { serperSearch } = require('./engines/serper')
4
4
  const { searxngSearch } = require('./engines/searxng')
5
5
  const { googleCseSearch } = require('./engines/google-cse')
6
6
  const { jinaSearch } = require('./engines/jina')
7
+ const { bingSearch } = require('./engines/bing')
7
8
  const { geminiGroundedSearch } = require('./engines/gemini-grounded')
8
9
  const { scrapeUrls } = require('./scraper')
9
10
  const { Summarizer } = require('./summarizer')
@@ -19,7 +20,8 @@ const ENGINES = {
19
20
  'google-cse': googleCseSearch,
20
21
  jina: jinaSearch,
21
22
  'gemini-grounded': geminiGroundedSearch,
22
- gemini: geminiGroundedSearch
23
+ gemini: geminiGroundedSearch,
24
+ bing: bingSearch
23
25
  }
24
26
 
25
27
  class SearchEngine {