spectrawl 0.6.2 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/browse/index.js +135 -1
- package/src/index.js +7 -0
package/package.json
CHANGED
package/src/browse/index.js
CHANGED
|
@@ -103,6 +103,73 @@ class BrowseEngine {
|
|
|
103
103
|
* Returns a function that fetches content via alternative methods.
|
|
104
104
|
*/
|
|
105
105
|
_getSiteOverride(url) {
|
|
106
|
+
// X/Twitter: articles and posts can't be browsed without login
|
|
107
|
+
// Fallback: xAI Responses API with x_search tool (reads X posts natively)
|
|
108
|
+
if ((url.includes('x.com/') || url.includes('twitter.com/')) && url.includes('/status/')) {
|
|
109
|
+
return async (originalUrl, opts) => {
|
|
110
|
+
const xaiKey = process.env.XAI_API_KEY
|
|
111
|
+
if (!xaiKey) return null // no key, fall through to normal browse
|
|
112
|
+
|
|
113
|
+
try {
|
|
114
|
+
const https = require('https')
|
|
115
|
+
const body = JSON.stringify({
|
|
116
|
+
model: 'grok-4-1-fast-non-reasoning',
|
|
117
|
+
input: [{ role: 'user', content: `Return the FULL exact text of this X post and all replies/thread if it's a thread. Include the author's name and handle. No commentary, no analysis, just the raw content:\n\n${originalUrl}` }],
|
|
118
|
+
tools: [{ type: 'x_search' }]
|
|
119
|
+
})
|
|
120
|
+
|
|
121
|
+
const content = await new Promise((resolve, reject) => {
|
|
122
|
+
const req = https.request({
|
|
123
|
+
hostname: 'api.x.ai',
|
|
124
|
+
path: '/v1/responses',
|
|
125
|
+
method: 'POST',
|
|
126
|
+
headers: {
|
|
127
|
+
'Content-Type': 'application/json',
|
|
128
|
+
'Authorization': `Bearer ${xaiKey}`,
|
|
129
|
+
'Content-Length': Buffer.byteLength(body)
|
|
130
|
+
},
|
|
131
|
+
timeout: 30000
|
|
132
|
+
}, res => {
|
|
133
|
+
let data = ''
|
|
134
|
+
res.on('data', c => data += c)
|
|
135
|
+
res.on('end', () => {
|
|
136
|
+
try {
|
|
137
|
+
const json = JSON.parse(data)
|
|
138
|
+
if (json.error) return resolve(null)
|
|
139
|
+
const output = json.output || []
|
|
140
|
+
for (const o of output) {
|
|
141
|
+
if (o.type === 'message') {
|
|
142
|
+
for (const c of (o.content || [])) {
|
|
143
|
+
if (c.text && c.text.length > 20) return resolve(c.text)
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
resolve(null)
|
|
148
|
+
} catch { resolve(null) }
|
|
149
|
+
})
|
|
150
|
+
})
|
|
151
|
+
req.on('error', () => resolve(null))
|
|
152
|
+
req.setTimeout(30000, () => { req.destroy(); resolve(null) })
|
|
153
|
+
req.write(body)
|
|
154
|
+
req.end()
|
|
155
|
+
})
|
|
156
|
+
|
|
157
|
+
if (content && content.length > 20) {
|
|
158
|
+
return {
|
|
159
|
+
content,
|
|
160
|
+
url: originalUrl,
|
|
161
|
+
title: 'X Post (via xAI)',
|
|
162
|
+
statusCode: 200,
|
|
163
|
+
cached: false,
|
|
164
|
+
engine: 'xai-x-search',
|
|
165
|
+
blocked: false
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
} catch (e) { /* fall through */ }
|
|
169
|
+
return null // fall through to normal browse
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
|
|
106
173
|
// Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
|
|
107
174
|
// Fallback: PullPush API (free Reddit archive, no auth, no IP block)
|
|
108
175
|
if (url.includes('reddit.com')) {
|
|
@@ -202,6 +269,54 @@ class BrowseEngine {
|
|
|
202
269
|
}
|
|
203
270
|
}
|
|
204
271
|
|
|
272
|
+
// LinkedIn: use stored cookies + proxy to browse authenticated
|
|
273
|
+
if (url.includes('linkedin.com')) {
|
|
274
|
+
return async (originalUrl, opts) => {
|
|
275
|
+
// Cookies are auto-injected by parent Spectrawl.browse() from auth DB
|
|
276
|
+
const cookies = opts._cookies
|
|
277
|
+
|
|
278
|
+
if (!cookies || cookies.length === 0) {
|
|
279
|
+
return {
|
|
280
|
+
content: '',
|
|
281
|
+
url: originalUrl,
|
|
282
|
+
title: 'LinkedIn',
|
|
283
|
+
statusCode: 401,
|
|
284
|
+
cached: false,
|
|
285
|
+
engine: 'blocked',
|
|
286
|
+
blocked: true,
|
|
287
|
+
blockType: 'linkedin',
|
|
288
|
+
blockDetail: 'LinkedIn requires authentication. Add cookies: spectrawl login linkedin --account yourname'
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
try {
|
|
293
|
+
// Browse with cookies via Camoufox (needs residential proxy to avoid IP mismatch)
|
|
294
|
+
const browseResult = await this.browse(originalUrl, {
|
|
295
|
+
...opts,
|
|
296
|
+
_skipOverride: true,
|
|
297
|
+
stealth: true,
|
|
298
|
+
camoufox: true
|
|
299
|
+
})
|
|
300
|
+
|
|
301
|
+
if (browseResult && !browseResult.blocked && (browseResult.content || '').length > 200) {
|
|
302
|
+
return { ...browseResult, engine: 'linkedin-authenticated' }
|
|
303
|
+
}
|
|
304
|
+
} catch (e) { /* redirect loop or block — expected without proxy */ }
|
|
305
|
+
|
|
306
|
+
return {
|
|
307
|
+
content: '',
|
|
308
|
+
url: originalUrl,
|
|
309
|
+
title: 'LinkedIn',
|
|
310
|
+
statusCode: 999,
|
|
311
|
+
cached: false,
|
|
312
|
+
engine: 'blocked',
|
|
313
|
+
blocked: true,
|
|
314
|
+
blockType: 'linkedin',
|
|
315
|
+
blockDetail: 'LinkedIn cookies valid but rejected from this IP (datacenter). Configure a residential proxy: spectrawl config set proxy.upstreams "[{\\"url\\":\\"http://user:pass@host:port\\"}]"'
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
|
|
205
320
|
// Amazon: try Jina Reader
|
|
206
321
|
if (url.includes('amazon.com') || url.includes('amazon.co')) {
|
|
207
322
|
return async (originalUrl, opts) => {
|
|
@@ -333,7 +448,26 @@ class BrowseEngine {
|
|
|
333
448
|
|
|
334
449
|
try {
|
|
335
450
|
if (opts._cookies) {
|
|
336
|
-
|
|
451
|
+
// Sanitize cookies for Playwright compatibility.
// The extra fields handled here (hostOnly, session, storeId, id,
// expirationDate) look like those of browser-extension cookie exports —
// presumably how the stored cookies were captured; verify against the auth store.
const sameSiteByName = new Map([
  ['strict', 'Strict'],
  ['lax', 'Lax'],
  ['none', 'None'],
  // chrome.cookies spells "None" as "no_restriction"
  ['no_restriction', 'None']
])
const playwrightCookies = opts._cookies.map(c => {
  // Copy everything except the export-only fields; the caller's cookie
  // objects are left untouched.
  const { hostOnly, session, storeId, id, expirationDate, ...clean } = c

  // Normalize sameSite case-insensitively. Unknown or missing values
  // (e.g. "unspecified") fall back to 'Lax', as before; recognised values in a
  // different spelling are no longer downgraded to 'Lax'.
  const sameSiteName = typeof clean.sameSite === 'string' ? clean.sameSite.toLowerCase() : ''
  clean.sameSite = sameSiteByName.get(sameSiteName) || 'Lax'

  // NOTE(review): Playwright documents a leading dot as the way to make a
  // cookie apply to subdomains, so stripping it may narrow the cookie to the
  // bare host. Behavior kept as-is — confirm this is intended.
  if (clean.domain && clean.domain.startsWith('.')) {
    clean.domain = clean.domain.slice(1)
  }

  // Carry the expiry over to Playwright's `expires` field when it is not
  // already set; `expirationDate` itself is always dropped.
  if (expirationDate && !clean.expires) {
    clean.expires = expirationDate
  }
  return clean
})
await context.addCookies(playwrightCookies)
|
|
337
471
|
}
|
|
338
472
|
|
|
339
473
|
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
|
package/src/index.js
CHANGED
|
@@ -78,6 +78,13 @@ class Spectrawl {
|
|
|
78
78
|
const cookies = await this.auth.getCookies(opts.auth)
|
|
79
79
|
opts._cookies = cookies
|
|
80
80
|
}
|
|
81
|
+
// Sites that need a login (currently LinkedIn): when the caller supplied
// neither cookies nor an auth account, try the cookies stored under the
// 'linkedin' account and attach them to the browse options.
const wantsStoredLinkedinCookies = !(opts._cookies || opts.auth) && url.includes('linkedin.com')
if (wantsStoredLinkedinCookies) {
  try {
    const stored = await this.auth.getCookies('linkedin')
    const hasStored = Boolean(stored) && stored.length > 0
    if (hasStored) {
      opts._cookies = stored
    }
  } catch (e) {
    // Best effort: no stored cookies, proceed without.
  }
}
|
|
81
88
|
return this.browseEngine.browse(url, opts)
|
|
82
89
|
}
|
|
83
90
|
|