spectrawl 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/browse/index.js +219 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.6.0",
3
+ "version": "0.6.2",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -45,8 +45,42 @@ class BrowseEngine {
45
45
  return this._browseRemoteCamoufox(url, opts)
46
46
  }
47
47
 
48
+ // Site-specific pre-routing: use known-working alternatives before trying direct browse
49
+ const siteOverride = this._getSiteOverride(url)
50
+ if (siteOverride && !opts._skipOverride) {
51
+ try {
52
+ const result = await siteOverride(url, opts)
53
+ if (result && !result.blocked && (result.content || '').length > 50) {
54
+ return result // Override succeeded with content
55
+ }
56
+ if (result && result.blocked) {
57
+ return result // Override confirmed site is blocked — return with actionable message
58
+ }
59
+ // Override returned empty/null — fall through to normal browse
60
+ } catch (e) {
61
+ // Override failed — fall through
62
+ }
63
+ }
64
+
48
65
  try {
49
- return await this._browsePlaywright(url, opts)
66
+ const result = await this._browsePlaywright(url, opts)
67
+
68
+ // Post-browse content quality check
69
+ if (result && result.blocked) {
70
+ console.log(`[browse] Blocked on ${url}: ${result.blockType} — ${result.blockDetail}`)
71
+ // Try site override as fallback
72
+ if (siteOverride) {
73
+ try {
74
+ const fallback = await siteOverride(url, { ...opts, _skipOverride: true })
75
+ if (fallback && !fallback.blocked && (fallback.content || '').length > 50) {
76
+ fallback._fallback = true
77
+ return fallback
78
+ }
79
+ } catch (e) { /* fallback failed too */ }
80
+ }
81
+ }
82
+
83
+ return result
50
84
  } catch (err) {
51
85
  // If blocked and remote Camoufox available, try that
52
86
  if (this._isBlocked(err) && this.remoteCamoufox) {
@@ -64,6 +98,149 @@ class BrowseEngine {
64
98
  }
65
99
  }
66
100
 
101
+ /**
102
+ * Get site-specific override for sites that block datacenter IPs.
103
+ * Returns a function that fetches content via alternative methods.
104
+ */
105
+ _getSiteOverride(url) {
106
+ // Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
107
+ // Fallback: PullPush API (free Reddit archive, no auth, no IP block)
108
+ if (url.includes('reddit.com')) {
109
+ return async (originalUrl, opts) => {
110
+ try {
111
+ const parsed = new URL(originalUrl)
112
+ const pathParts = parsed.pathname.split('/').filter(Boolean)
113
+
114
+ // Extract subreddit and post ID from URL
115
+ let subreddit = null, postId = null, isComments = false
116
+ for (let i = 0; i < pathParts.length; i++) {
117
+ if (pathParts[i] === 'r' && pathParts[i + 1]) subreddit = pathParts[i + 1]
118
+ if (pathParts[i] === 'comments' && pathParts[i + 1]) { postId = pathParts[i + 1]; isComments = true }
119
+ }
120
+
121
+ const h = require('https')
122
+ const fetchJson = (apiUrl) => new Promise((resolve) => {
123
+ const req = h.get(apiUrl, {
124
+ headers: { 'User-Agent': 'Spectrawl/0.6.1' },
125
+ timeout: 10000
126
+ }, res => {
127
+ if (res.statusCode !== 200) return resolve(null)
128
+ let data = ''
129
+ res.on('data', c => data += c)
130
+ res.on('end', () => { try { resolve(JSON.parse(data)) } catch { resolve(null) } })
131
+ })
132
+ req.on('error', () => resolve(null))
133
+ req.setTimeout(10000, () => { req.destroy(); resolve(null) })
134
+ })
135
+
136
+ let content = ''
137
+
138
+ if (postId) {
139
+ // Specific thread: get post + comments
140
+ const postData = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?ids=${postId}`)
141
+ const comments = await fetchJson(`https://api.pullpush.io/reddit/search/comment/?link_id=${postId}&size=25&sort=score&sort_type=desc`)
142
+
143
+ if (postData?.data?.[0]) {
144
+ const post = postData.data[0]
145
+ content = `# ${post.title}\n\nby u/${post.author} in r/${post.subreddit} | ${post.score} points | ${post.num_comments} comments\n\n${post.selftext || post.url || ''}\n\n---\n\n## Comments\n\n`
146
+ if (comments?.data) {
147
+ for (const c of comments.data) {
148
+ content += `**u/${c.author}** (${c.score} pts):\n${c.body}\n\n`
149
+ }
150
+ }
151
+ }
152
+ } else if (subreddit) {
153
+ // Subreddit listing
154
+ const sort = parsed.pathname.includes('/top') ? 'score' : 'created_utc'
155
+ const order = sort === 'score' ? 'desc' : 'desc'
156
+ const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?subreddit=${subreddit}&size=25&sort=${sort}&sort_type=${order}`)
157
+
158
+ if (data?.data) {
159
+ content = `# r/${subreddit}\n\n`
160
+ for (const post of data.data) {
161
+ content += `- **${post.title}** (${post.score} pts, ${post.num_comments} comments) by u/${post.author}\n ${post.url || ''}\n ${(post.selftext || '').slice(0, 200)}\n\n`
162
+ }
163
+ }
164
+ } else {
165
+ // Generic Reddit URL — try search
166
+ const query = parsed.searchParams.get('q') || pathParts.join(' ')
167
+ if (query) {
168
+ const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?q=${encodeURIComponent(query)}&size=15&sort=score&sort_type=desc`)
169
+ if (data?.data) {
170
+ content = `# Reddit search: ${query}\n\n`
171
+ for (const post of data.data) {
172
+ content += `- **${post.title}** in r/${post.subreddit} (${post.score} pts) by u/${post.author}\n ${(post.selftext || '').slice(0, 200)}\n\n`
173
+ }
174
+ }
175
+ }
176
+ }
177
+
178
+ if (content && content.length > 50) {
179
+ return {
180
+ content,
181
+ url: originalUrl,
182
+ title: subreddit ? `r/${subreddit}` : 'Reddit',
183
+ statusCode: 200,
184
+ cached: false,
185
+ engine: 'pullpush-api',
186
+ blocked: false
187
+ }
188
+ }
189
+ } catch (e) { /* fall through */ }
190
+
191
+ return {
192
+ content: '',
193
+ url: originalUrl,
194
+ title: 'Reddit',
195
+ statusCode: 403,
196
+ cached: false,
197
+ engine: 'blocked',
198
+ blocked: true,
199
+ blockType: 'reddit',
200
+ blockDetail: 'Reddit blocked and PullPush API unavailable. Use /search with a Reddit-related query to get cached content, or configure a residential proxy.'
201
+ }
202
+ }
203
+ }
204
+
205
+ // Amazon: try Jina Reader
206
+ if (url.includes('amazon.com') || url.includes('amazon.co')) {
207
+ return async (originalUrl, opts) => {
208
+ try {
209
+ const jinaUrl = `https://r.jina.ai/${originalUrl}`
210
+ const h = require('https')
211
+ const content = await new Promise((resolve, reject) => {
212
+ const req = h.get(jinaUrl, {
213
+ headers: { 'Accept': 'text/plain', 'User-Agent': 'Spectrawl/1.0' },
214
+ timeout: 10000
215
+ }, res => {
216
+ if (res.statusCode !== 200) return resolve(null)
217
+ let data = ''
218
+ res.on('data', c => data += c)
219
+ res.on('end', () => resolve(data))
220
+ })
221
+ req.on('error', () => resolve(null))
222
+ req.setTimeout(10000, () => { req.destroy(); resolve(null) })
223
+ })
224
+
225
+ if (content && content.length > 100) {
226
+ return {
227
+ content,
228
+ url: originalUrl,
229
+ title: 'Amazon (via Jina Reader)',
230
+ statusCode: 200,
231
+ cached: false,
232
+ engine: 'jina-reader',
233
+ blocked: false
234
+ }
235
+ }
236
+ } catch (e) { /* fall through */ }
237
+ return null
238
+ }
239
+ }
240
+
241
+ return null
242
+ }
243
+
67
244
  /**
68
245
  * Launch Playwright with the best available browser.
69
246
  * Priority: Camoufox binary > stealth Chromium > vanilla Chromium
@@ -402,6 +579,37 @@ function detectBlockPage(content, title, html, url) {
402
579
  return { type: 'recaptcha', detail: 'reCAPTCHA challenge page' }
403
580
  }
404
581
 
582
+ // Reddit network block
583
+ if (text.includes('been blocked by network security') ||
584
+ text.includes('log in to your reddit account') && text.includes('blocked') ||
585
+ text.includes('whoa there, pardner') ||
586
+ text.includes('your request has been blocked') && url?.includes('reddit')) {
587
+ return { type: 'reddit', detail: 'Reddit network-level IP block (datacenter IP detected)' }
588
+ }
589
+
590
+ // Amazon bot detection / CAPTCHA wall
591
+ if ((text.includes('continue shopping') && text.length < 300) ||
592
+ text.includes('sorry, we just need to make sure you') ||
593
+ text.includes('enter the characters you see below') ||
594
+ (text.includes('robot') && text.includes('sorry') && url?.includes('amazon')) ||
595
+ (titleLower.includes('robot check') && url?.includes('amazon'))) {
596
+ return { type: 'amazon', detail: 'Amazon CAPTCHA/bot detection wall' }
597
+ }
598
+
599
+ // LinkedIn auth wall / cookie consent wall
600
+ if ((text.includes('sign in') && text.includes('linkedin') && text.length < 1000) ||
601
+ (text.includes('join now') && text.includes('linkedin') && text.length < 1000) ||
602
+ (text.includes('essential and non-essential cookies') && url?.includes('linkedin'))) {
603
+ return { type: 'linkedin', detail: 'LinkedIn authentication or cookie consent wall' }
604
+ }
605
+
606
+ // Google / YouTube consent
607
+ if (text.includes('before you continue to google') ||
608
+ text.includes('before you continue to youtube') ||
609
+ (titleLower.includes('consent') && (url?.includes('google') || url?.includes('youtube')))) {
610
+ return { type: 'google-consent', detail: 'Google/YouTube consent page' }
611
+ }
612
+
405
613
  // Generic bot detection signals
406
614
  if (text.length < 200 && (
407
615
  text.includes('access denied') || text.includes('403 forbidden') ||
@@ -410,6 +618,16 @@ function detectBlockPage(content, title, html, url) {
410
618
  return { type: 'generic', detail: 'Generic bot detection or access denied page' }
411
619
  }
412
620
 
621
+ // Content quality heuristic — suspiciously short content from sites that should have more
622
+ if (text.length < 100 && text.length > 0 && url) {
623
+ const knownLargeSites = ['reddit.com', 'amazon.com', 'linkedin.com', 'facebook.com',
624
+ 'twitter.com', 'x.com', 'instagram.com', 'g2.com', 'yelp.com',
625
+ 'glassdoor.com', 'indeed.com', 'zillow.com', 'ebay.com']
626
+ if (knownLargeSites.some(s => url.includes(s))) {
627
+ return { type: 'suspected-block', detail: `Suspiciously short content (${text.length} chars) from ${url} — likely blocked or gated` }
628
+ }
629
+ }
630
+
413
631
  return null
414
632
  }
415
633