spectrawl 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/browse/index.js +219 -1
package/package.json
CHANGED
package/src/browse/index.js
CHANGED
|
@@ -45,8 +45,42 @@ class BrowseEngine {
|
|
|
45
45
|
return this._browseRemoteCamoufox(url, opts)
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
+
// Site-specific pre-routing: use known-working alternatives before trying direct browse
|
|
49
|
+
const siteOverride = this._getSiteOverride(url)
|
|
50
|
+
if (siteOverride && !opts._skipOverride) {
|
|
51
|
+
try {
|
|
52
|
+
const result = await siteOverride(url, opts)
|
|
53
|
+
if (result && !result.blocked && (result.content || '').length > 50) {
|
|
54
|
+
return result // Override succeeded with content
|
|
55
|
+
}
|
|
56
|
+
if (result && result.blocked) {
|
|
57
|
+
return result // Override confirmed site is blocked — return with actionable message
|
|
58
|
+
}
|
|
59
|
+
// Override returned empty/null — fall through to normal browse
|
|
60
|
+
} catch (e) {
|
|
61
|
+
// Override failed — fall through
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
48
65
|
try {
|
|
49
|
-
|
|
66
|
+
const result = await this._browsePlaywright(url, opts)
|
|
67
|
+
|
|
68
|
+
// Post-browse content quality check
|
|
69
|
+
if (result && result.blocked) {
|
|
70
|
+
console.log(`[browse] Blocked on ${url}: ${result.blockType} — ${result.blockDetail}`)
|
|
71
|
+
// Try site override as fallback
|
|
72
|
+
if (siteOverride) {
|
|
73
|
+
try {
|
|
74
|
+
const fallback = await siteOverride(url, { ...opts, _skipOverride: true })
|
|
75
|
+
if (fallback && !fallback.blocked && (fallback.content || '').length > 50) {
|
|
76
|
+
fallback._fallback = true
|
|
77
|
+
return fallback
|
|
78
|
+
}
|
|
79
|
+
} catch (e) { /* fallback failed too */ }
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
return result
|
|
50
84
|
} catch (err) {
|
|
51
85
|
// If blocked and remote Camoufox available, try that
|
|
52
86
|
if (this._isBlocked(err) && this.remoteCamoufox) {
|
|
@@ -64,6 +98,149 @@ class BrowseEngine {
|
|
|
64
98
|
}
|
|
65
99
|
}
|
|
66
100
|
|
|
101
|
+
/**
|
|
102
|
+
* Get site-specific override for sites that block datacenter IPs.
|
|
103
|
+
* Returns a function that fetches content via alternative methods.
|
|
104
|
+
*/
|
|
105
|
+
_getSiteOverride(url) {
|
|
106
|
+
// Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
|
|
107
|
+
// Fallback: PullPush API (free Reddit archive, no auth, no IP block)
|
|
108
|
+
if (url.includes('reddit.com')) {
|
|
109
|
+
return async (originalUrl, opts) => {
|
|
110
|
+
try {
|
|
111
|
+
const parsed = new URL(originalUrl)
|
|
112
|
+
const pathParts = parsed.pathname.split('/').filter(Boolean)
|
|
113
|
+
|
|
114
|
+
// Extract subreddit and post ID from URL
|
|
115
|
+
let subreddit = null, postId = null, isComments = false
|
|
116
|
+
for (let i = 0; i < pathParts.length; i++) {
|
|
117
|
+
if (pathParts[i] === 'r' && pathParts[i + 1]) subreddit = pathParts[i + 1]
|
|
118
|
+
if (pathParts[i] === 'comments' && pathParts[i + 1]) { postId = pathParts[i + 1]; isComments = true }
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
const h = require('https')
|
|
122
|
+
const fetchJson = (apiUrl) => new Promise((resolve) => {
|
|
123
|
+
const req = h.get(apiUrl, {
|
|
124
|
+
headers: { 'User-Agent': 'Spectrawl/0.6.1' },
|
|
125
|
+
timeout: 10000
|
|
126
|
+
}, res => {
|
|
127
|
+
if (res.statusCode !== 200) return resolve(null)
|
|
128
|
+
let data = ''
|
|
129
|
+
res.on('data', c => data += c)
|
|
130
|
+
res.on('end', () => { try { resolve(JSON.parse(data)) } catch { resolve(null) } })
|
|
131
|
+
})
|
|
132
|
+
req.on('error', () => resolve(null))
|
|
133
|
+
req.setTimeout(10000, () => { req.destroy(); resolve(null) })
|
|
134
|
+
})
|
|
135
|
+
|
|
136
|
+
let content = ''
|
|
137
|
+
|
|
138
|
+
if (postId) {
|
|
139
|
+
// Specific thread: get post + comments
|
|
140
|
+
const postData = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?ids=${postId}`)
|
|
141
|
+
const comments = await fetchJson(`https://api.pullpush.io/reddit/search/comment/?link_id=${postId}&size=25&sort=score&sort_type=desc`)
|
|
142
|
+
|
|
143
|
+
if (postData?.data?.[0]) {
|
|
144
|
+
const post = postData.data[0]
|
|
145
|
+
content = `# ${post.title}\n\nby u/${post.author} in r/${post.subreddit} | ${post.score} points | ${post.num_comments} comments\n\n${post.selftext || post.url || ''}\n\n---\n\n## Comments\n\n`
|
|
146
|
+
if (comments?.data) {
|
|
147
|
+
for (const c of comments.data) {
|
|
148
|
+
content += `**u/${c.author}** (${c.score} pts):\n${c.body}\n\n`
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
} else if (subreddit) {
|
|
153
|
+
// Subreddit listing
|
|
154
|
+
const sort = parsed.pathname.includes('/top') ? 'score' : 'created_utc'
|
|
155
|
+
const order = sort === 'score' ? 'desc' : 'desc'
|
|
156
|
+
const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?subreddit=${subreddit}&size=25&sort=${sort}&sort_type=${order}`)
|
|
157
|
+
|
|
158
|
+
if (data?.data) {
|
|
159
|
+
content = `# r/${subreddit}\n\n`
|
|
160
|
+
for (const post of data.data) {
|
|
161
|
+
content += `- **${post.title}** (${post.score} pts, ${post.num_comments} comments) by u/${post.author}\n ${post.url || ''}\n ${(post.selftext || '').slice(0, 200)}\n\n`
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
} else {
|
|
165
|
+
// Generic Reddit URL — try search
|
|
166
|
+
const query = parsed.searchParams.get('q') || pathParts.join(' ')
|
|
167
|
+
if (query) {
|
|
168
|
+
const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?q=${encodeURIComponent(query)}&size=15&sort=score&sort_type=desc`)
|
|
169
|
+
if (data?.data) {
|
|
170
|
+
content = `# Reddit search: ${query}\n\n`
|
|
171
|
+
for (const post of data.data) {
|
|
172
|
+
content += `- **${post.title}** in r/${post.subreddit} (${post.score} pts) by u/${post.author}\n ${(post.selftext || '').slice(0, 200)}\n\n`
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
if (content && content.length > 50) {
|
|
179
|
+
return {
|
|
180
|
+
content,
|
|
181
|
+
url: originalUrl,
|
|
182
|
+
title: subreddit ? `r/${subreddit}` : 'Reddit',
|
|
183
|
+
statusCode: 200,
|
|
184
|
+
cached: false,
|
|
185
|
+
engine: 'pullpush-api',
|
|
186
|
+
blocked: false
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
} catch (e) { /* fall through */ }
|
|
190
|
+
|
|
191
|
+
return {
|
|
192
|
+
content: '',
|
|
193
|
+
url: originalUrl,
|
|
194
|
+
title: 'Reddit',
|
|
195
|
+
statusCode: 403,
|
|
196
|
+
cached: false,
|
|
197
|
+
engine: 'blocked',
|
|
198
|
+
blocked: true,
|
|
199
|
+
blockType: 'reddit',
|
|
200
|
+
blockDetail: 'Reddit blocked and PullPush API unavailable. Use /search with a Reddit-related query to get cached content, or configure a residential proxy.'
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
// Amazon: try Jina Reader
|
|
206
|
+
if (url.includes('amazon.com') || url.includes('amazon.co')) {
|
|
207
|
+
return async (originalUrl, opts) => {
|
|
208
|
+
try {
|
|
209
|
+
const jinaUrl = `https://r.jina.ai/${originalUrl}`
|
|
210
|
+
const h = require('https')
|
|
211
|
+
const content = await new Promise((resolve, reject) => {
|
|
212
|
+
const req = h.get(jinaUrl, {
|
|
213
|
+
headers: { 'Accept': 'text/plain', 'User-Agent': 'Spectrawl/1.0' },
|
|
214
|
+
timeout: 10000
|
|
215
|
+
}, res => {
|
|
216
|
+
if (res.statusCode !== 200) return resolve(null)
|
|
217
|
+
let data = ''
|
|
218
|
+
res.on('data', c => data += c)
|
|
219
|
+
res.on('end', () => resolve(data))
|
|
220
|
+
})
|
|
221
|
+
req.on('error', () => resolve(null))
|
|
222
|
+
req.setTimeout(10000, () => { req.destroy(); resolve(null) })
|
|
223
|
+
})
|
|
224
|
+
|
|
225
|
+
if (content && content.length > 100) {
|
|
226
|
+
return {
|
|
227
|
+
content,
|
|
228
|
+
url: originalUrl,
|
|
229
|
+
title: 'Amazon (via Jina Reader)',
|
|
230
|
+
statusCode: 200,
|
|
231
|
+
cached: false,
|
|
232
|
+
engine: 'jina-reader',
|
|
233
|
+
blocked: false
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
} catch (e) { /* fall through */ }
|
|
237
|
+
return null
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
return null
|
|
242
|
+
}
|
|
243
|
+
|
|
67
244
|
/**
|
|
68
245
|
* Launch Playwright with the best available browser.
|
|
69
246
|
* Priority: Camoufox binary > stealth Chromium > vanilla Chromium
|
|
@@ -402,6 +579,37 @@ function detectBlockPage(content, title, html, url) {
|
|
|
402
579
|
return { type: 'recaptcha', detail: 'reCAPTCHA challenge page' }
|
|
403
580
|
}
|
|
404
581
|
|
|
582
|
+
// Reddit network block
|
|
583
|
+
if (text.includes('been blocked by network security') ||
|
|
584
|
+
text.includes('log in to your reddit account') && text.includes('blocked') ||
|
|
585
|
+
text.includes('whoa there, pardner') ||
|
|
586
|
+
text.includes('your request has been blocked') && url?.includes('reddit')) {
|
|
587
|
+
return { type: 'reddit', detail: 'Reddit network-level IP block (datacenter IP detected)' }
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
// Amazon bot detection / CAPTCHA wall
|
|
591
|
+
if ((text.includes('continue shopping') && text.length < 300) ||
|
|
592
|
+
text.includes('sorry, we just need to make sure you') ||
|
|
593
|
+
text.includes('enter the characters you see below') ||
|
|
594
|
+
(text.includes('robot') && text.includes('sorry') && url?.includes('amazon')) ||
|
|
595
|
+
(titleLower.includes('robot check') && url?.includes('amazon'))) {
|
|
596
|
+
return { type: 'amazon', detail: 'Amazon CAPTCHA/bot detection wall' }
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
// LinkedIn auth wall / cookie consent wall
|
|
600
|
+
if ((text.includes('sign in') && text.includes('linkedin') && text.length < 1000) ||
|
|
601
|
+
(text.includes('join now') && text.includes('linkedin') && text.length < 1000) ||
|
|
602
|
+
(text.includes('essential and non-essential cookies') && url?.includes('linkedin'))) {
|
|
603
|
+
return { type: 'linkedin', detail: 'LinkedIn authentication or cookie consent wall' }
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
// Google / YouTube consent
|
|
607
|
+
if (text.includes('before you continue to google') ||
|
|
608
|
+
text.includes('before you continue to youtube') ||
|
|
609
|
+
(titleLower.includes('consent') && (url?.includes('google') || url?.includes('youtube')))) {
|
|
610
|
+
return { type: 'google-consent', detail: 'Google/YouTube consent page' }
|
|
611
|
+
}
|
|
612
|
+
|
|
405
613
|
// Generic bot detection signals
|
|
406
614
|
if (text.length < 200 && (
|
|
407
615
|
text.includes('access denied') || text.includes('403 forbidden') ||
|
|
@@ -410,6 +618,16 @@ function detectBlockPage(content, title, html, url) {
|
|
|
410
618
|
return { type: 'generic', detail: 'Generic bot detection or access denied page' }
|
|
411
619
|
}
|
|
412
620
|
|
|
621
|
+
// Content quality heuristic — suspiciously short content from sites that should have more
|
|
622
|
+
if (text.length < 100 && text.length > 0 && url) {
|
|
623
|
+
const knownLargeSites = ['reddit.com', 'amazon.com', 'linkedin.com', 'facebook.com',
|
|
624
|
+
'twitter.com', 'x.com', 'instagram.com', 'g2.com', 'yelp.com',
|
|
625
|
+
'glassdoor.com', 'indeed.com', 'zillow.com', 'ebay.com']
|
|
626
|
+
if (knownLargeSites.some(s => url.includes(s))) {
|
|
627
|
+
return { type: 'suspected-block', detail: `Suspiciously short content (${text.length} chars) from ${url} — likely blocked or gated` }
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
|
|
413
631
|
return null
|
|
414
632
|
}
|
|
415
633
|
|