spectrawl 0.3.5 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/search/engines/bing.js +123 -0
- package/src/search/engines/ddg.js +81 -35
- package/src/search/index.js +3 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.5",
|
|
3
|
+
"version": "0.3.6",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
const { URL } = require('url')
|
|
3
|
+
|
|
4
|
+
/**
 * Bing web search — scrapes Bing HTML results.
 * No API key needed. More reliable from datacenter IPs than DDG.
 * DDG actually uses Bing's index anyway — this goes direct.
 *
 * Best-effort: any network or parsing failure yields an empty array rather
 * than a thrown error.
 *
 * @param {string} query - Search terms; URL-encoded by the fetch helper.
 * @param {{maxResults?: number}} [config] - Options. `maxResults` defaults to 10
 *   (a falsy value such as 0 also falls back to 10).
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 *   Parsed results, or `[]` when nothing could be extracted.
 */
async function bingSearch(query, config = {}) {
  const maxResults = config.maxResults || 10

  try {
    const html = await fetchBing(query)

    // Do not sniff the page for words like "captcha": a legitimate results
    // page for a query about CAPTCHAs contains that word and would be thrown
    // away. Parse instead and return whatever organic results are present.
    // NOTE(review): assumes block / CAPTCHA interstitials contain no result
    // items and therefore parse to [] — confirm against a real block page.
    return parseBingResults(html, maxResults)
  } catch (e) {
    // Deliberate best-effort: callers treat "no results" and "engine failed" alike.
    return []
  }
}
|
|
25
|
+
|
|
26
|
+
/**
 * Decide whether a result link points back at Bing itself.
 * Relative or otherwise unparseable hrefs are treated as internal, since an
 * organic result always carries an absolute URL.
 */
function isBingInternalUrl(url) {
  let parsed
  try {
    parsed = new URL(url)
  } catch (e) {
    return true
  }
  const host = parsed.hostname.toLowerCase()
  // Compare on the hostname, not the whole string, so that unrelated sites
  // such as "plumbing.com" or "?ref=bing.com" are not discarded.
  if (host === 'bing.com' || host.endsWith('.bing.com')) return true
  const isMicrosoft = host === 'microsoft.com' || host.endsWith('.microsoft.com')
  return isMicrosoft && parsed.pathname.toLowerCase().startsWith('/bing')
}

/**
 * Extract organic results from a Bing results page.
 *
 * @param {string} html - Raw HTML of the results page.
 * @param {number} maxResults - Upper bound on the number of results returned.
 * @returns {Array<{url: string, title: string, snippet: string, engine: string}>}
 *   Results in page order; empty when no result blocks are found.
 */
function parseBingResults(html, maxResults) {
  const results = []

  // Bing result blocks: <li class="b_algo">. Extra classes or attributes on
  // the <li> are tolerated so a markup tweak does not zero out the results.
  const blockRegex = /<li\b[^>]*\bclass="[^"]*\bb_algo\b[^"]*"[^>]*>([\s\S]*?)<\/li>/g
  let block
  while (results.length < maxResults && (block = blockRegex.exec(html)) !== null) {
    const content = block[1]

    // Extract URL and title from <h2><a href="...">title</a></h2>
    const linkMatch = content.match(/<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i)
    if (!linkMatch) continue

    // Attribute values are HTML-escaped; restore "&" so query strings survive.
    const url = linkMatch[1].replace(/&amp;/g, '&')
    const title = stripHtml(linkMatch[2])

    // Skip Bing internal links
    if (isBingInternalUrl(url)) continue

    // Extract snippet from <p> or <div class="b_caption">
    const snippetMatch = content.match(/<p[^>]*>([\s\S]*?)<\/p>/i) ||
      content.match(/<div\s+class="b_caption"[^>]*>([\s\S]*?)<\/div>/i)
    const snippet = snippetMatch ? stripHtml(snippetMatch[1]) : ''

    results.push({ url, title, snippet, engine: 'bing' })
  }

  return results
}
|
|
55
|
+
|
|
56
|
+
/**
 * Fetch the Bing results page for a query over HTTPS.
 *
 * Sends browser-like headers and asks for an uncompressed body
 * (`Accept-Encoding: identity`) so the response can be read as plain text.
 * A single redirect hop is followed via `fetchUrl`.
 *
 * @param {string} query - Search terms; URL-encoded here.
 * @returns {Promise<string>} Resolves with the response body as text.
 *   Rejects on socket errors, an invalid redirect target, or after 8 s
 *   without socket activity ('Bing timeout').
 */
function fetchBing(query) {
  return new Promise((resolve, reject) => {
    const path = `/search?q=${encodeURIComponent(query)}&setlang=en&count=15`
    const opts = {
      hostname: 'www.bing.com',
      path,
      method: 'GET',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'identity',
        'DNT': '1'
      }
    }

    const req = https.get(opts, res => {
      // Follow redirects (307/308 are safe to replay because this is a GET).
      if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
        // Drain the redirect body so the socket is released instead of leaking.
        res.resume()
        let target
        try {
          // Resolves absolute, root-relative and protocol-relative locations alike.
          target = new URL(res.headers.location, 'https://www.bing.com').href
        } catch (e) {
          return reject(e)
        }
        return fetchUrl(target).then(resolve).catch(reject)
      }

      // Decode as a stream: concatenating raw Buffer chunks would corrupt any
      // multi-byte UTF-8 character that straddles a chunk boundary.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('end', () => resolve(data))
      res.on('error', reject)
    })
    req.on('error', reject)
    req.setTimeout(8000, () => { req.destroy(); reject(new Error('Bing timeout')) })
  })
}
|
|
90
|
+
|
|
91
|
+
/**
 * GET an absolute http(s) URL and return its body as text.
 * Used to follow a redirect issued by Bing; it does not follow further
 * redirects itself.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} Resolves with the response body. Rejects when the
 *   URL cannot be parsed, on socket errors, or after 8 s without socket
 *   activity ('Bing redirect timeout').
 */
function fetchUrl(url) {
  return new Promise((resolve, reject) => {
    // A parse failure throws inside the executor and surfaces as a rejection.
    const urlObj = new URL(url)
    const client = urlObj.protocol === 'https:' ? https : require('http')
    const req = client.get({
      hostname: urlObj.hostname,
      // Keep an explicit port from the URL; '' means "protocol default".
      port: urlObj.port || undefined,
      path: urlObj.pathname + urlObj.search,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html'
      }
    }, res => {
      // Stream-decode so multi-byte characters split across chunks stay intact.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', c => { data += c })
      res.on('end', () => resolve(data))
      res.on('error', reject)
    })
    req.on('error', reject)
    // Without a timeout a stalled redirect target would hang the search forever.
    req.setTimeout(8000, () => { req.destroy(); reject(new Error('Bing redirect timeout')) })
  })
}
|
|
109
|
+
|
|
110
|
+
/**
 * Reduce an HTML fragment to plain text: drop tags, decode the common
 * character entities, collapse whitespace runs and trim.
 *
 * @param {string} html - HTML fragment (e.g. a title or snippet).
 * @returns {string} Plain-text rendering of the fragment.
 */
function stripHtml(html) {
  return html
    // Tags go first, so a decoded "<" below is never mistaken for markup.
    .replace(/<[^>]+>/g, '')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&(?:#0*39|#x0*27|apos);/gi, "'")
    .replace(/&(?:nbsp|#160|#xa0);/gi, ' ')
    // "&amp;" is decoded last so "&amp;lt;" becomes "&lt;" rather than "<".
    .replace(/&amp;/g, '&')
    .replace(/\s+/g, ' ')
    .trim()
}
|
|
122
|
+
|
|
123
|
+
module.exports = { bingSearch }
|
|
@@ -1,31 +1,46 @@
|
|
|
1
1
|
const https = require('https')
|
|
2
|
+
const http = require('http')
|
|
2
3
|
const { URL } = require('url')
|
|
3
4
|
|
|
4
5
|
/**
|
|
5
6
|
* DuckDuckGo search — free, unlimited, no API key needed.
|
|
6
|
-
* Uses JSON API + HTML fallback
|
|
7
|
+
* Uses JSON API + HTML fallback + lite fallback.
|
|
8
|
+
* Built-in retry with backoff for datacenter IP rate limiting.
|
|
9
|
+
* Optional proxy support for reliable results.
|
|
7
10
|
*/
|
|
8
11
|
async function ddgSearch(query, config = {}) {
  // Entry point for DuckDuckGo: walks an ordered list of lookup strategies and
  // returns the first non-empty result set, or [] when every attempt comes up empty.
  const limit = config.maxResults || 10
  const proxyUrl = config.proxy || null

  // Ordered from first choice to last resort.
  const strategies = [
    // Strategy 1: JSON API (instant answers)
    () => ddgJsonApi(query, limit, proxyUrl),
    // Strategy 2: HTML search (html.duckduckgo.com)
    () => ddgHtmlSearch(query, limit, 'html.duckduckgo.com', proxyUrl),
    // Strategy 3: Lite search (lite.duckduckgo.com)
    () => ddgHtmlSearch(query, limit, 'lite.duckduckgo.com', proxyUrl)
  ]

  const totalRounds = 2
  let round = 0
  while (round < totalRounds) {
    // Back off 1–2 s (randomised) before the second pass.
    if (round !== 0) await delay(1000 + Math.random() * 1000)

    for (const runStrategy of strategies) {
      try {
        const hits = await runStrategy()
        if (hits.length > 0) return hits
      } catch (e) {
        // A failing strategy is not fatal; move on to the next one.
      }
    }
    round += 1
  }

  return []
}
|
|
25
40
|
|
|
26
|
-
async function ddgJsonApi(query, maxResults) {
|
|
41
|
+
async function ddgJsonApi(query, maxResults, proxy) {
|
|
27
42
|
const url = `https://api.duckduckgo.com/?q=${encodeURIComponent(query)}&format=json&no_html=1&skip_disambig=1`
|
|
28
|
-
const data = await fetchJson(url)
|
|
43
|
+
const data = await fetchJson(url, proxy)
|
|
29
44
|
|
|
30
45
|
const results = []
|
|
31
46
|
|
|
@@ -82,9 +97,14 @@ async function ddgJsonApi(query, maxResults) {
|
|
|
82
97
|
return results
|
|
83
98
|
}
|
|
84
99
|
|
|
85
|
-
async function ddgHtmlSearch(query, maxResults) {
|
|
86
|
-
const
|
|
87
|
-
const html = await fetchHtml(
|
|
100
|
+
async function ddgHtmlSearch(query, maxResults, hostname, proxy) {
|
|
101
|
+
const path = `/html/?q=${encodeURIComponent(query)}`
|
|
102
|
+
const html = await fetchHtml(`https://${hostname}${path}`, proxy)
|
|
103
|
+
|
|
104
|
+
// Detect CAPTCHA / rate limit
|
|
105
|
+
if (html.includes('g-recaptcha') || html.includes('bot detected') || html.length < 500) {
|
|
106
|
+
return []
|
|
107
|
+
}
|
|
88
108
|
|
|
89
109
|
const results = []
|
|
90
110
|
|
|
@@ -95,11 +115,30 @@ async function ddgHtmlSearch(query, maxResults) {
|
|
|
95
115
|
let match
|
|
96
116
|
while ((match = resultRegex.exec(html)) !== null) {
|
|
97
117
|
const url = decodeUddg(match[1])
|
|
98
|
-
// Filter ads — DDG ads go through duckduckgo.com/y.js
|
|
99
118
|
if (isAd(url)) continue
|
|
100
119
|
links.push({ url, title: stripHtml(match[2]) })
|
|
101
120
|
}
|
|
102
121
|
|
|
122
|
+
// Lite endpoint uses different selectors
|
|
123
|
+
if (links.length === 0) {
|
|
124
|
+
const liteRegex = /<a[^>]+class="result-link"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/g
|
|
125
|
+
while ((match = liteRegex.exec(html)) !== null) {
|
|
126
|
+
const url = decodeUddg(match[1])
|
|
127
|
+
if (isAd(url)) continue
|
|
128
|
+
links.push({ url, title: stripHtml(match[2]) })
|
|
129
|
+
}
|
|
130
|
+
// Even simpler: just grab all non-DDG links from lite results
|
|
131
|
+
if (links.length === 0) {
|
|
132
|
+
const anyLink = /<a[^>]*href="(https?:\/\/(?!duckduckgo)[^"]+)"[^>]*>([\s\S]*?)<\/a>/g
|
|
133
|
+
while ((match = anyLink.exec(html)) !== null) {
|
|
134
|
+
if (results.length >= maxResults) break
|
|
135
|
+
const url = match[1]
|
|
136
|
+
if (isAd(url)) continue
|
|
137
|
+
links.push({ url, title: stripHtml(match[2]) })
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
103
142
|
const snippets = []
|
|
104
143
|
while ((match = snippetRegex.exec(html)) !== null) {
|
|
105
144
|
snippets.push(stripHtml(match[1]))
|
|
@@ -117,18 +156,11 @@ async function ddgHtmlSearch(query, maxResults) {
|
|
|
117
156
|
return results
|
|
118
157
|
}
|
|
119
158
|
|
|
120
|
-
/**
|
|
121
|
-
* Filter out DDG ads.
|
|
122
|
-
*/
|
|
123
159
|
function isAd(url) {
  // A missing or empty URL is treated as an ad so the caller skips it.
  if (!url) return true

  // Substrings checked against the link; any hit marks it as an ad.
  const adMarkers = ['duckduckgo.com/y.js', 'ad_provider=', 'ad_domain=']
  return adMarkers.some(marker => url.includes(marker))
}
|
|
134
166
|
|
|
@@ -140,28 +172,34 @@ function decodeUddg(url) {
|
|
|
140
172
|
return url
|
|
141
173
|
}
|
|
142
174
|
|
|
143
|
-
function fetchJson(url) {
|
|
175
|
+
// Promise-based sleep: settles (with no value) once `ms` milliseconds have elapsed.
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms)
  })
}
|
|
176
|
+
|
|
177
|
+
/**
 * GET a URL over HTTPS and parse the response body as JSON.
 *
 * @param {string} url - Absolute https URL.
 * @param {*} [proxy] - Accepted for signature parity with the callers.
 *   NOTE(review): this function does not use it — the request always goes
 *   direct; confirm whether proxying is meant to be wired in here.
 * @returns {Promise<*>} Resolves with the parsed JSON value. Rejects when the
 *   URL cannot be parsed, on socket errors, on an unparseable body
 *   ('Invalid JSON from DDG API'), or after 8 s without socket activity
 *   ('DDG timeout').
 */
function fetchJson(url, proxy) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      path: urlObj.pathname + urlObj.search,
      headers: { 'User-Agent': 'Spectrawl/0.3' }
    }

    const req = https.get(opts, res => {
      // Stream-decode so multi-byte characters split across chunks stay intact
      // (raw Buffer concatenation would corrupt them before JSON.parse).
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('end', () => {
        try {
          resolve(JSON.parse(data))
        } catch (e) {
          // Keep the parser's error as the cause for easier debugging.
          reject(new Error('Invalid JSON from DDG API', { cause: e }))
        }
      })
      res.on('error', reject)
    })
    req.on('error', reject)
    req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG timeout')) })
  })
}
|
|
160
198
|
|
|
161
|
-
function fetchHtml(url) {
|
|
199
|
+
function fetchHtml(url, proxy) {
|
|
162
200
|
return new Promise((resolve, reject) => {
|
|
163
201
|
const urlObj = new URL(url)
|
|
164
|
-
|
|
202
|
+
const opts = {
|
|
165
203
|
hostname: urlObj.hostname,
|
|
166
204
|
path: urlObj.pathname + urlObj.search,
|
|
167
205
|
headers: {
|
|
@@ -169,11 +207,19 @@ function fetchHtml(url) {
|
|
|
169
207
|
'Accept': 'text/html',
|
|
170
208
|
'Accept-Language': 'en-US,en;q=0.9'
|
|
171
209
|
}
|
|
172
|
-
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
const req = https.get(opts, res => {
|
|
213
|
+
// Follow redirects
|
|
214
|
+
if ([301, 302, 303].includes(res.statusCode) && res.headers.location) {
|
|
215
|
+
return fetchHtml(res.headers.location, proxy).then(resolve).catch(reject)
|
|
216
|
+
}
|
|
173
217
|
let data = ''
|
|
174
218
|
res.on('data', chunk => data += chunk)
|
|
175
219
|
res.on('end', () => resolve(data))
|
|
176
|
-
})
|
|
220
|
+
})
|
|
221
|
+
req.on('error', reject)
|
|
222
|
+
req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG HTML timeout')) })
|
|
177
223
|
})
|
|
178
224
|
}
|
|
179
225
|
|
package/src/search/index.js
CHANGED
|
@@ -4,6 +4,7 @@ const { serperSearch } = require('./engines/serper')
|
|
|
4
4
|
const { searxngSearch } = require('./engines/searxng')
|
|
5
5
|
const { googleCseSearch } = require('./engines/google-cse')
|
|
6
6
|
const { jinaSearch } = require('./engines/jina')
|
|
7
|
+
const { bingSearch } = require('./engines/bing')
|
|
7
8
|
const { geminiGroundedSearch } = require('./engines/gemini-grounded')
|
|
8
9
|
const { scrapeUrls } = require('./scraper')
|
|
9
10
|
const { Summarizer } = require('./summarizer')
|
|
@@ -19,7 +20,8 @@ const ENGINES = {
|
|
|
19
20
|
'google-cse': googleCseSearch,
|
|
20
21
|
jina: jinaSearch,
|
|
21
22
|
'gemini-grounded': geminiGroundedSearch,
|
|
22
|
-
gemini: geminiGroundedSearch
|
|
23
|
+
gemini: geminiGroundedSearch,
|
|
24
|
+
bing: bingSearch
|
|
23
25
|
}
|
|
24
26
|
|
|
25
27
|
class SearchEngine {
|