spectrawl 0.6.2 → 0.6.3
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/browse/index.js +68 -1
- package/src/index.js +7 -0
package/package.json
CHANGED
package/src/browse/index.js
CHANGED
|
@@ -202,6 +202,54 @@ class BrowseEngine {
|
|
|
202
202
|
}
|
|
203
203
|
}
|
|
204
204
|
|
|
205
|
+
// LinkedIn: use stored cookies + proxy to browse authenticated
|
|
206
|
+
if (url.includes('linkedin.com')) {
|
|
207
|
+
return async (originalUrl, opts) => {
|
|
208
|
+
// Cookies are auto-injected by parent Spectrawl.browse() from auth DB
|
|
209
|
+
const cookies = opts._cookies
|
|
210
|
+
|
|
211
|
+
if (!cookies || cookies.length === 0) {
|
|
212
|
+
return {
|
|
213
|
+
content: '',
|
|
214
|
+
url: originalUrl,
|
|
215
|
+
title: 'LinkedIn',
|
|
216
|
+
statusCode: 401,
|
|
217
|
+
cached: false,
|
|
218
|
+
engine: 'blocked',
|
|
219
|
+
blocked: true,
|
|
220
|
+
blockType: 'linkedin',
|
|
221
|
+
blockDetail: 'LinkedIn requires authentication. Add cookies: spectrawl login linkedin --account yourname'
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
try {
|
|
226
|
+
// Browse with cookies via Camoufox (needs residential proxy to avoid IP mismatch)
|
|
227
|
+
const browseResult = await this.browse(originalUrl, {
|
|
228
|
+
...opts,
|
|
229
|
+
_skipOverride: true,
|
|
230
|
+
stealth: true,
|
|
231
|
+
camoufox: true
|
|
232
|
+
})
|
|
233
|
+
|
|
234
|
+
if (browseResult && !browseResult.blocked && (browseResult.content || '').length > 200) {
|
|
235
|
+
return { ...browseResult, engine: 'linkedin-authenticated' }
|
|
236
|
+
}
|
|
237
|
+
} catch (e) { /* redirect loop or block — expected without proxy */ }
|
|
238
|
+
|
|
239
|
+
return {
|
|
240
|
+
content: '',
|
|
241
|
+
url: originalUrl,
|
|
242
|
+
title: 'LinkedIn',
|
|
243
|
+
statusCode: 999,
|
|
244
|
+
cached: false,
|
|
245
|
+
engine: 'blocked',
|
|
246
|
+
blocked: true,
|
|
247
|
+
blockType: 'linkedin',
|
|
248
|
+
blockDetail: 'LinkedIn cookies valid but rejected from this IP (datacenter). Configure a residential proxy: spectrawl config set proxy.upstreams "[{\\"url\\":\\"http://user:pass@host:port\\"}]"'
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
205
253
|
// Amazon: try Jina Reader
|
|
206
254
|
if (url.includes('amazon.com') || url.includes('amazon.co')) {
|
|
207
255
|
return async (originalUrl, opts) => {
|
|
@@ -333,7 +381,26 @@ class BrowseEngine {
|
|
|
333
381
|
|
|
334
382
|
try {
|
|
335
383
|
if (opts._cookies) {
|
|
336
|
-
|
|
384
|
+
// Sanitize cookies for Playwright compatibility
|
|
385
|
+
const playwrightCookies = opts._cookies.map(c => {
|
|
386
|
+
const clean = { ...c }
|
|
387
|
+
if (!clean.sameSite || !['Strict', 'Lax', 'None'].includes(clean.sameSite)) {
|
|
388
|
+
clean.sameSite = 'Lax'
|
|
389
|
+
}
|
|
390
|
+
if (clean.domain && clean.domain.startsWith('.')) {
|
|
391
|
+
clean.domain = clean.domain.slice(1)
|
|
392
|
+
}
|
|
393
|
+
delete clean.hostOnly
|
|
394
|
+
delete clean.session
|
|
395
|
+
delete clean.storeId
|
|
396
|
+
delete clean.id
|
|
397
|
+
if (clean.expirationDate && !clean.expires) {
|
|
398
|
+
clean.expires = clean.expirationDate
|
|
399
|
+
delete clean.expirationDate
|
|
400
|
+
}
|
|
401
|
+
return clean
|
|
402
|
+
})
|
|
403
|
+
await context.addCookies(playwrightCookies)
|
|
337
404
|
}
|
|
338
405
|
|
|
339
406
|
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
|
package/src/index.js
CHANGED
|
@@ -78,6 +78,13 @@ class Spectrawl {
|
|
|
78
78
|
const cookies = await this.auth.getCookies(opts.auth)
|
|
79
79
|
opts._cookies = cookies
|
|
80
80
|
}
|
|
81
|
+
// Auto-inject stored cookies for sites that require auth (LinkedIn, etc.)
|
|
82
|
+
if (!opts._cookies && !opts.auth && url.includes('linkedin.com')) {
|
|
83
|
+
try {
|
|
84
|
+
const cookies = await this.auth.getCookies('linkedin')
|
|
85
|
+
if (cookies && cookies.length > 0) opts._cookies = cookies
|
|
86
|
+
} catch (e) { /* no stored cookies, proceed without */ }
|
|
87
|
+
}
|
|
81
88
|
return this.browseEngine.browse(url, opts)
|
|
82
89
|
}
|
|
83
90
|
|