spectrawl 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/browse/index.js +171 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.6.0",
3
+ "version": "0.6.1",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -45,8 +45,42 @@ class BrowseEngine {
45
45
  return this._browseRemoteCamoufox(url, opts)
46
46
  }
47
47
 
48
+ // Site-specific pre-routing: use known-working alternatives before trying direct browse
49
+ const siteOverride = this._getSiteOverride(url)
50
+ if (siteOverride && !opts._skipOverride) {
51
+ try {
52
+ const result = await siteOverride(url, opts)
53
+ if (result && !result.blocked && (result.content || '').length > 50) {
54
+ return result // Override succeeded with content
55
+ }
56
+ if (result && result.blocked) {
57
+ return result // Override confirmed site is blocked — return with actionable message
58
+ }
59
+ // Override returned empty/null — fall through to normal browse
60
+ } catch (e) {
61
+ // Override failed — fall through
62
+ }
63
+ }
64
+
48
65
  try {
49
- return await this._browsePlaywright(url, opts)
66
+ const result = await this._browsePlaywright(url, opts)
67
+
68
+ // Post-browse content quality check
69
+ if (result && result.blocked) {
70
+ console.log(`[browse] Blocked on ${url}: ${result.blockType} — ${result.blockDetail}`)
71
+ // Try site override as fallback
72
+ if (siteOverride) {
73
+ try {
74
+ const fallback = await siteOverride(url, { ...opts, _skipOverride: true })
75
+ if (fallback && !fallback.blocked && (fallback.content || '').length > 50) {
76
+ fallback._fallback = true
77
+ return fallback
78
+ }
79
+ } catch (e) { /* fallback failed too */ }
80
+ }
81
+ }
82
+
83
+ return result
50
84
  } catch (err) {
51
85
  // If blocked and remote Camoufox available, try that
52
86
  if (this._isBlocked(err) && this.remoteCamoufox) {
@@ -64,6 +98,101 @@ class BrowseEngine {
64
98
  }
65
99
  }
66
100
 
101
+ /**
102
+ * Get site-specific override for sites that block datacenter IPs.
103
+ * Returns a function that fetches content via alternative methods.
104
+ */
105
+ _getSiteOverride(url) {
106
+ // Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
107
+ // Fallback: return block info with actionable message + try Jina
108
+ if (url.includes('reddit.com')) {
109
+ return async (originalUrl, opts) => {
110
+ // Try Jina Reader first (sometimes works)
111
+ try {
112
+ const jinaUrl = `https://r.jina.ai/${originalUrl}`
113
+ const h = require('https')
114
+ const content = await new Promise((resolve, reject) => {
115
+ const req = h.get(jinaUrl, {
116
+ headers: { 'Accept': 'text/plain', 'User-Agent': 'Spectrawl/1.0' },
117
+ timeout: 10000
118
+ }, res => {
119
+ if (res.statusCode !== 200) return resolve(null)
120
+ let data = ''
121
+ res.on('data', c => data += c)
122
+ res.on('end', () => resolve(data))
123
+ })
124
+ req.on('error', () => resolve(null))
125
+ req.setTimeout(10000, () => { req.destroy(); resolve(null) })
126
+ })
127
+
128
+ if (content && content.length > 200 && !content.includes('blocked by network')) {
129
+ return {
130
+ content,
131
+ url: originalUrl,
132
+ title: 'Reddit (via Jina Reader)',
133
+ statusCode: 200,
134
+ cached: false,
135
+ engine: 'jina-reader',
136
+ blocked: false
137
+ }
138
+ }
139
+ } catch (e) { /* try next */ }
140
+
141
+ // All direct methods fail from datacenter IPs
142
+ // Return explicit block with guidance
143
+ return {
144
+ content: '',
145
+ url: originalUrl,
146
+ title: 'Reddit',
147
+ statusCode: 403,
148
+ cached: false,
149
+ engine: 'blocked',
150
+ blocked: true,
151
+ blockType: 'reddit',
152
+ blockDetail: 'Reddit blocks all datacenter IPs. Use /search with a Reddit-related query to get cached Reddit content via Google, or configure a residential proxy.'
153
+ }
154
+ }
155
+ }
156
+
157
+ // Amazon: try Jina Reader
158
+ if (url.includes('amazon.com') || url.includes('amazon.co')) {
159
+ return async (originalUrl, opts) => {
160
+ try {
161
+ const jinaUrl = `https://r.jina.ai/${originalUrl}`
162
+ const h = require('https')
163
+ const content = await new Promise((resolve, reject) => {
164
+ const req = h.get(jinaUrl, {
165
+ headers: { 'Accept': 'text/plain', 'User-Agent': 'Spectrawl/1.0' },
166
+ timeout: 10000
167
+ }, res => {
168
+ if (res.statusCode !== 200) return resolve(null)
169
+ let data = ''
170
+ res.on('data', c => data += c)
171
+ res.on('end', () => resolve(data))
172
+ })
173
+ req.on('error', () => resolve(null))
174
+ req.setTimeout(10000, () => { req.destroy(); resolve(null) })
175
+ })
176
+
177
+ if (content && content.length > 100) {
178
+ return {
179
+ content,
180
+ url: originalUrl,
181
+ title: 'Amazon (via Jina Reader)',
182
+ statusCode: 200,
183
+ cached: false,
184
+ engine: 'jina-reader',
185
+ blocked: false
186
+ }
187
+ }
188
+ } catch (e) { /* fall through */ }
189
+ return null
190
+ }
191
+ }
192
+
193
+ return null
194
+ }
195
+
67
196
  /**
68
197
  * Launch Playwright with the best available browser.
69
198
  * Priority: Camoufox binary > stealth Chromium > vanilla Chromium
@@ -402,6 +531,37 @@ function detectBlockPage(content, title, html, url) {
402
531
  return { type: 'recaptcha', detail: 'reCAPTCHA challenge page' }
403
532
  }
404
533
 
534
+ // Reddit network block
535
+ if (text.includes('been blocked by network security') ||
536
+ text.includes('log in to your reddit account') && text.includes('blocked') ||
537
+ text.includes('whoa there, pardner') ||
538
+ text.includes('your request has been blocked') && url?.includes('reddit')) {
539
+ return { type: 'reddit', detail: 'Reddit network-level IP block (datacenter IP detected)' }
540
+ }
541
+
542
+ // Amazon bot detection / CAPTCHA wall
543
+ if ((text.includes('continue shopping') && text.length < 300) ||
544
+ text.includes('sorry, we just need to make sure you') ||
545
+ text.includes('enter the characters you see below') ||
546
+ (text.includes('robot') && text.includes('sorry') && url?.includes('amazon')) ||
547
+ (titleLower.includes('robot check') && url?.includes('amazon'))) {
548
+ return { type: 'amazon', detail: 'Amazon CAPTCHA/bot detection wall' }
549
+ }
550
+
551
+ // LinkedIn auth wall / cookie consent wall
552
+ if ((text.includes('sign in') && text.includes('linkedin') && text.length < 1000) ||
553
+ (text.includes('join now') && text.includes('linkedin') && text.length < 1000) ||
554
+ (text.includes('essential and non-essential cookies') && url?.includes('linkedin'))) {
555
+ return { type: 'linkedin', detail: 'LinkedIn authentication or cookie consent wall' }
556
+ }
557
+
558
+ // Google / YouTube consent
559
+ if (text.includes('before you continue to google') ||
560
+ text.includes('before you continue to youtube') ||
561
+ (titleLower.includes('consent') && (url?.includes('google') || url?.includes('youtube')))) {
562
+ return { type: 'google-consent', detail: 'Google/YouTube consent page' }
563
+ }
564
+
405
565
  // Generic bot detection signals
406
566
  if (text.length < 200 && (
407
567
  text.includes('access denied') || text.includes('403 forbidden') ||
@@ -410,6 +570,16 @@ function detectBlockPage(content, title, html, url) {
410
570
  return { type: 'generic', detail: 'Generic bot detection or access denied page' }
411
571
  }
412
572
 
573
+ // Content quality heuristic — suspiciously short content from sites that should have more
574
+ if (text.length < 100 && text.length > 0 && url) {
575
+ const knownLargeSites = ['reddit.com', 'amazon.com', 'linkedin.com', 'facebook.com',
576
+ 'twitter.com', 'x.com', 'instagram.com', 'g2.com', 'yelp.com',
577
+ 'glassdoor.com', 'indeed.com', 'zillow.com', 'ebay.com']
578
+ if (knownLargeSites.some(s => url.includes(s))) {
579
+ return { type: 'suspected-block', detail: `Suspiciously short content (${text.length} chars) from ${url} — likely blocked or gated` }
580
+ }
581
+ }
582
+
413
583
  return null
414
584
  }
415
585