spectrawl 0.6.2 → 0.6.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.6.2",
3
+ "version": "0.6.4",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -103,6 +103,73 @@ class BrowseEngine {
103
103
  * Returns a function that fetches content via alternative methods.
104
104
  */
105
105
  _getSiteOverride(url) {
106
+ // X/Twitter: articles and posts can't be browsed without login
107
+ // Fallback: xAI Responses API with x_search tool (reads X posts natively)
108
+ if ((url.includes('x.com/') || url.includes('twitter.com/')) && url.includes('/status/')) {
109
+ return async (originalUrl, opts) => {
110
+ const xaiKey = process.env.XAI_API_KEY
111
+ if (!xaiKey) return null // no key, fall through to normal browse
112
+
113
+ try {
114
+ const https = require('https')
115
+ const body = JSON.stringify({
116
+ model: 'grok-4-1-fast-non-reasoning',
117
+ input: [{ role: 'user', content: `Return the FULL exact text of this X post and all replies/thread if it's a thread. Include the author's name and handle. No commentary, no analysis, just the raw content:\n\n${originalUrl}` }],
118
+ tools: [{ type: 'x_search' }]
119
+ })
120
+
121
+ const content = await new Promise((resolve, reject) => {
122
+ const req = https.request({
123
+ hostname: 'api.x.ai',
124
+ path: '/v1/responses',
125
+ method: 'POST',
126
+ headers: {
127
+ 'Content-Type': 'application/json',
128
+ 'Authorization': `Bearer ${xaiKey}`,
129
+ 'Content-Length': Buffer.byteLength(body)
130
+ },
131
+ timeout: 30000
132
+ }, res => {
133
+ let data = ''
134
+ res.on('data', c => data += c)
135
+ res.on('end', () => {
136
+ try {
137
+ const json = JSON.parse(data)
138
+ if (json.error) return resolve(null)
139
+ const output = json.output || []
140
+ for (const o of output) {
141
+ if (o.type === 'message') {
142
+ for (const c of (o.content || [])) {
143
+ if (c.text && c.text.length > 20) return resolve(c.text)
144
+ }
145
+ }
146
+ }
147
+ resolve(null)
148
+ } catch { resolve(null) }
149
+ })
150
+ })
151
+ req.on('error', () => resolve(null))
152
+ req.setTimeout(30000, () => { req.destroy(); resolve(null) })
153
+ req.write(body)
154
+ req.end()
155
+ })
156
+
157
+ if (content && content.length > 20) {
158
+ return {
159
+ content,
160
+ url: originalUrl,
161
+ title: 'X Post (via xAI)',
162
+ statusCode: 200,
163
+ cached: false,
164
+ engine: 'xai-x-search',
165
+ blocked: false
166
+ }
167
+ }
168
+ } catch (e) { /* fall through */ }
169
+ return null // fall through to normal browse
170
+ }
171
+ }
172
+
106
173
  // Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
107
174
  // Fallback: PullPush API (free Reddit archive, no auth, no IP block)
108
175
  if (url.includes('reddit.com')) {
@@ -202,6 +269,54 @@ class BrowseEngine {
202
269
  }
203
270
  }
204
271
 
272
+ // LinkedIn: use stored cookies + proxy to browse authenticated
273
+ if (url.includes('linkedin.com')) {
274
+ return async (originalUrl, opts) => {
275
+ // Cookies are auto-injected by parent Spectrawl.browse() from auth DB
276
+ const cookies = opts._cookies
277
+
278
+ if (!cookies || cookies.length === 0) {
279
+ return {
280
+ content: '',
281
+ url: originalUrl,
282
+ title: 'LinkedIn',
283
+ statusCode: 401,
284
+ cached: false,
285
+ engine: 'blocked',
286
+ blocked: true,
287
+ blockType: 'linkedin',
288
+ blockDetail: 'LinkedIn requires authentication. Add cookies: spectrawl login linkedin --account yourname'
289
+ }
290
+ }
291
+
292
+ try {
293
+ // Browse with cookies via Camoufox (needs residential proxy to avoid IP mismatch)
294
+ const browseResult = await this.browse(originalUrl, {
295
+ ...opts,
296
+ _skipOverride: true,
297
+ stealth: true,
298
+ camoufox: true
299
+ })
300
+
301
+ if (browseResult && !browseResult.blocked && (browseResult.content || '').length > 200) {
302
+ return { ...browseResult, engine: 'linkedin-authenticated' }
303
+ }
304
+ } catch (e) { /* redirect loop or block — expected without proxy */ }
305
+
306
+ return {
307
+ content: '',
308
+ url: originalUrl,
309
+ title: 'LinkedIn',
310
+ statusCode: 999,
311
+ cached: false,
312
+ engine: 'blocked',
313
+ blocked: true,
314
+ blockType: 'linkedin',
315
+ blockDetail: 'LinkedIn cookies valid but rejected from this IP (datacenter). Configure a residential proxy: spectrawl config set proxy.upstreams "[{\\"url\\":\\"http://user:pass@host:port\\"}]"'
316
+ }
317
+ }
318
+ }
319
+
205
320
  // Amazon: try Jina Reader
206
321
  if (url.includes('amazon.com') || url.includes('amazon.co')) {
207
322
  return async (originalUrl, opts) => {
@@ -333,7 +448,26 @@ class BrowseEngine {
333
448
 
334
449
  try {
335
450
  if (opts._cookies) {
336
- await context.addCookies(opts._cookies)
451
+ // Sanitize cookies for Playwright compatibility
452
+ const playwrightCookies = opts._cookies.map(c => {
453
+ const clean = { ...c }
454
+ if (!clean.sameSite || !['Strict', 'Lax', 'None'].includes(clean.sameSite)) {
455
+ clean.sameSite = 'Lax'
456
+ }
457
+ if (clean.domain && clean.domain.startsWith('.')) {
458
+ clean.domain = clean.domain.slice(1)
459
+ }
460
+ delete clean.hostOnly
461
+ delete clean.session
462
+ delete clean.storeId
463
+ delete clean.id
464
+ if (clean.expirationDate && !clean.expires) {
465
+ clean.expires = clean.expirationDate
466
+ delete clean.expirationDate
467
+ }
468
+ return clean
469
+ })
470
+ await context.addCookies(playwrightCookies)
337
471
  }
338
472
 
339
473
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
package/src/index.js CHANGED
@@ -78,6 +78,13 @@ class Spectrawl {
78
78
  const cookies = await this.auth.getCookies(opts.auth)
79
79
  opts._cookies = cookies
80
80
  }
81
+ // Auto-inject stored cookies for sites that require auth (LinkedIn, etc.)
82
+ if (!opts._cookies && !opts.auth && url.includes('linkedin.com')) {
83
+ try {
84
+ const cookies = await this.auth.getCookies('linkedin')
85
+ if (cookies && cookies.length > 0) opts._cookies = cookies
86
+ } catch (e) { /* no stored cookies, proceed without */ }
87
+ }
81
88
  return this.browseEngine.browse(url, opts)
82
89
  }
83
90