spectrawl 0.6.2 → 0.6.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.6.2",
3
+ "version": "0.6.3",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -202,6 +202,54 @@ class BrowseEngine {
202
202
  }
203
203
  }
204
204
 
205
+ // LinkedIn: use stored cookies + proxy to browse authenticated
206
+ if (url.includes('linkedin.com')) {
207
+ return async (originalUrl, opts) => {
208
+ // Cookies are auto-injected by parent Spectrawl.browse() from auth DB
209
+ const cookies = opts._cookies
210
+
211
+ if (!cookies || cookies.length === 0) {
212
+ return {
213
+ content: '',
214
+ url: originalUrl,
215
+ title: 'LinkedIn',
216
+ statusCode: 401,
217
+ cached: false,
218
+ engine: 'blocked',
219
+ blocked: true,
220
+ blockType: 'linkedin',
221
+ blockDetail: 'LinkedIn requires authentication. Add cookies: spectrawl login linkedin --account yourname'
222
+ }
223
+ }
224
+
225
+ try {
226
+ // Browse with cookies via Camoufox (needs residential proxy to avoid IP mismatch)
227
+ const browseResult = await this.browse(originalUrl, {
228
+ ...opts,
229
+ _skipOverride: true,
230
+ stealth: true,
231
+ camoufox: true
232
+ })
233
+
234
+ if (browseResult && !browseResult.blocked && (browseResult.content || '').length > 200) {
235
+ return { ...browseResult, engine: 'linkedin-authenticated' }
236
+ }
237
+ } catch (e) { /* redirect loop or block — expected without proxy */ }
238
+
239
+ return {
240
+ content: '',
241
+ url: originalUrl,
242
+ title: 'LinkedIn',
243
+ statusCode: 999,
244
+ cached: false,
245
+ engine: 'blocked',
246
+ blocked: true,
247
+ blockType: 'linkedin',
248
+ blockDetail: 'LinkedIn cookies valid but rejected from this IP (datacenter). Configure a residential proxy: spectrawl config set proxy.upstreams "[{\\"url\\":\\"http://user:pass@host:port\\"}]"'
249
+ }
250
+ }
251
+ }
252
+
205
253
  // Amazon: try Jina Reader
206
254
  if (url.includes('amazon.com') || url.includes('amazon.co')) {
207
255
  return async (originalUrl, opts) => {
@@ -333,7 +381,26 @@ class BrowseEngine {
333
381
 
334
382
  try {
335
383
  if (opts._cookies) {
336
- await context.addCookies(opts._cookies)
384
+ // Sanitize cookies for Playwright compatibility
385
+ const playwrightCookies = opts._cookies.map(c => {
386
+ const clean = { ...c }
387
+ if (!clean.sameSite || !['Strict', 'Lax', 'None'].includes(clean.sameSite)) {
388
+ clean.sameSite = 'Lax'
389
+ }
390
+ if (clean.domain && clean.domain.startsWith('.')) {
391
+ clean.domain = clean.domain.slice(1)
392
+ }
393
+ delete clean.hostOnly
394
+ delete clean.session
395
+ delete clean.storeId
396
+ delete clean.id
397
+ if (clean.expirationDate && !clean.expires) {
398
+ clean.expires = clean.expirationDate
399
+ delete clean.expirationDate
400
+ }
401
+ return clean
402
+ })
403
+ await context.addCookies(playwrightCookies)
337
404
  }
338
405
 
339
406
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
package/src/index.js CHANGED
@@ -78,6 +78,13 @@ class Spectrawl {
78
78
  const cookies = await this.auth.getCookies(opts.auth)
79
79
  opts._cookies = cookies
80
80
  }
81
+ // Auto-inject stored cookies for sites that require auth (LinkedIn, etc.)
82
+ if (!opts._cookies && !opts.auth && url.includes('linkedin.com')) {
83
+ try {
84
+ const cookies = await this.auth.getCookies('linkedin')
85
+ if (cookies && cookies.length > 0) opts._cookies = cookies
86
+ } catch (e) { /* no stored cookies, proceed without */ }
87
+ }
81
88
  return this.browseEngine.browse(url, opts)
82
89
  }
83
90