spectrawl 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.6.1",
3
+ "version": "0.6.3",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -104,42 +104,90 @@ class BrowseEngine {
104
104
  */
105
105
  _getSiteOverride(url) {
106
106
  // Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
107
- // Fallback: return block info with actionable message + try Jina
107
+ // Fallback: PullPush API (free Reddit archive, no auth, no IP block)
108
108
  if (url.includes('reddit.com')) {
109
109
  return async (originalUrl, opts) => {
110
- // Try Jina Reader first (sometimes works)
111
110
  try {
112
- const jinaUrl = `https://r.jina.ai/${originalUrl}`
111
+ const parsed = new URL(originalUrl)
112
+ const pathParts = parsed.pathname.split('/').filter(Boolean)
113
+
114
+ // Extract subreddit and post ID from URL
115
+ let subreddit = null, postId = null, isComments = false
116
+ for (let i = 0; i < pathParts.length; i++) {
117
+ if (pathParts[i] === 'r' && pathParts[i + 1]) subreddit = pathParts[i + 1]
118
+ if (pathParts[i] === 'comments' && pathParts[i + 1]) { postId = pathParts[i + 1]; isComments = true }
119
+ }
120
+
113
121
  const h = require('https')
114
- const content = await new Promise((resolve, reject) => {
115
- const req = h.get(jinaUrl, {
116
- headers: { 'Accept': 'text/plain', 'User-Agent': 'Spectrawl/1.0' },
122
+ const fetchJson = (apiUrl) => new Promise((resolve) => {
123
+ const req = h.get(apiUrl, {
124
+ headers: { 'User-Agent': 'Spectrawl/0.6.1' },
117
125
  timeout: 10000
118
126
  }, res => {
119
127
  if (res.statusCode !== 200) return resolve(null)
120
128
  let data = ''
121
129
  res.on('data', c => data += c)
122
- res.on('end', () => resolve(data))
130
+ res.on('end', () => { try { resolve(JSON.parse(data)) } catch { resolve(null) } })
123
131
  })
124
132
  req.on('error', () => resolve(null))
125
133
  req.setTimeout(10000, () => { req.destroy(); resolve(null) })
126
134
  })
127
135
 
128
- if (content && content.length > 200 && !content.includes('blocked by network')) {
136
+ let content = ''
137
+
138
+ if (postId) {
139
+ // Specific thread: get post + comments
140
+ const postData = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?ids=${postId}`)
141
+ const comments = await fetchJson(`https://api.pullpush.io/reddit/search/comment/?link_id=${postId}&size=25&sort=score&sort_type=desc`)
142
+
143
+ if (postData?.data?.[0]) {
144
+ const post = postData.data[0]
145
+ content = `# ${post.title}\n\nby u/${post.author} in r/${post.subreddit} | ${post.score} points | ${post.num_comments} comments\n\n${post.selftext || post.url || ''}\n\n---\n\n## Comments\n\n`
146
+ if (comments?.data) {
147
+ for (const c of comments.data) {
148
+ content += `**u/${c.author}** (${c.score} pts):\n${c.body}\n\n`
149
+ }
150
+ }
151
+ }
152
+ } else if (subreddit) {
153
+ // Subreddit listing
154
+ const sort = parsed.pathname.includes('/top') ? 'score' : 'created_utc'
155
+ const order = sort === 'score' ? 'desc' : 'desc'
156
+ const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?subreddit=${subreddit}&size=25&sort=${sort}&sort_type=${order}`)
157
+
158
+ if (data?.data) {
159
+ content = `# r/${subreddit}\n\n`
160
+ for (const post of data.data) {
161
+ content += `- **${post.title}** (${post.score} pts, ${post.num_comments} comments) by u/${post.author}\n ${post.url || ''}\n ${(post.selftext || '').slice(0, 200)}\n\n`
162
+ }
163
+ }
164
+ } else {
165
+ // Generic Reddit URL — try search
166
+ const query = parsed.searchParams.get('q') || pathParts.join(' ')
167
+ if (query) {
168
+ const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?q=${encodeURIComponent(query)}&size=15&sort=score&sort_type=desc`)
169
+ if (data?.data) {
170
+ content = `# Reddit search: ${query}\n\n`
171
+ for (const post of data.data) {
172
+ content += `- **${post.title}** in r/${post.subreddit} (${post.score} pts) by u/${post.author}\n ${(post.selftext || '').slice(0, 200)}\n\n`
173
+ }
174
+ }
175
+ }
176
+ }
177
+
178
+ if (content && content.length > 50) {
129
179
  return {
130
180
  content,
131
181
  url: originalUrl,
132
- title: 'Reddit (via Jina Reader)',
182
+ title: subreddit ? `r/${subreddit}` : 'Reddit',
133
183
  statusCode: 200,
134
184
  cached: false,
135
- engine: 'jina-reader',
185
+ engine: 'pullpush-api',
136
186
  blocked: false
137
187
  }
138
188
  }
139
- } catch (e) { /* try next */ }
189
+ } catch (e) { /* fall through */ }
140
190
 
141
- // All direct methods fail from datacenter IPs
142
- // Return explicit block with guidance
143
191
  return {
144
192
  content: '',
145
193
  url: originalUrl,
@@ -149,7 +197,55 @@ class BrowseEngine {
149
197
  engine: 'blocked',
150
198
  blocked: true,
151
199
  blockType: 'reddit',
152
- blockDetail: 'Reddit blocks all datacenter IPs. Use /search with a Reddit-related query to get cached Reddit content via Google, or configure a residential proxy.'
200
+ blockDetail: 'Reddit blocked and PullPush API unavailable. Use /search with a Reddit-related query to get cached content, or configure a residential proxy.'
201
+ }
202
+ }
203
+ }
204
+
205
+ // LinkedIn: use stored cookies + proxy to browse authenticated
206
+ if (url.includes('linkedin.com')) {
207
+ return async (originalUrl, opts) => {
208
+ // Cookies are auto-injected by parent Spectrawl.browse() from auth DB
209
+ const cookies = opts._cookies
210
+
211
+ if (!cookies || cookies.length === 0) {
212
+ return {
213
+ content: '',
214
+ url: originalUrl,
215
+ title: 'LinkedIn',
216
+ statusCode: 401,
217
+ cached: false,
218
+ engine: 'blocked',
219
+ blocked: true,
220
+ blockType: 'linkedin',
221
+ blockDetail: 'LinkedIn requires authentication. Add cookies: spectrawl login linkedin --account yourname'
222
+ }
223
+ }
224
+
225
+ try {
226
+ // Browse with cookies via Camoufox (needs residential proxy to avoid IP mismatch)
227
+ const browseResult = await this.browse(originalUrl, {
228
+ ...opts,
229
+ _skipOverride: true,
230
+ stealth: true,
231
+ camoufox: true
232
+ })
233
+
234
+ if (browseResult && !browseResult.blocked && (browseResult.content || '').length > 200) {
235
+ return { ...browseResult, engine: 'linkedin-authenticated' }
236
+ }
237
+ } catch (e) { /* redirect loop or block — expected without proxy */ }
238
+
239
+ return {
240
+ content: '',
241
+ url: originalUrl,
242
+ title: 'LinkedIn',
243
+ statusCode: 999,
244
+ cached: false,
245
+ engine: 'blocked',
246
+ blocked: true,
247
+ blockType: 'linkedin',
248
+ blockDetail: 'LinkedIn cookies valid but rejected from this IP (datacenter). Configure a residential proxy: spectrawl config set proxy.upstreams "[{\\"url\\":\\"http://user:pass@host:port\\"}]"'
153
249
  }
154
250
  }
155
251
  }
@@ -285,7 +381,26 @@ class BrowseEngine {
285
381
 
286
382
  try {
287
383
  if (opts._cookies) {
288
- await context.addCookies(opts._cookies)
384
+ // Sanitize cookies for Playwright compatibility
385
+ const playwrightCookies = opts._cookies.map(c => {
386
+ const clean = { ...c }
387
+ if (!clean.sameSite || !['Strict', 'Lax', 'None'].includes(clean.sameSite)) {
388
+ clean.sameSite = 'Lax'
389
+ }
390
+ if (clean.domain && clean.domain.startsWith('.')) {
391
+ clean.domain = clean.domain.slice(1)
392
+ }
393
+ delete clean.hostOnly
394
+ delete clean.session
395
+ delete clean.storeId
396
+ delete clean.id
397
+ if (clean.expirationDate && !clean.expires) {
398
+ clean.expires = clean.expirationDate
399
+ delete clean.expirationDate
400
+ }
401
+ return clean
402
+ })
403
+ await context.addCookies(playwrightCookies)
289
404
  }
290
405
 
291
406
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
package/src/index.js CHANGED
@@ -78,6 +78,13 @@ class Spectrawl {
78
78
  const cookies = await this.auth.getCookies(opts.auth)
79
79
  opts._cookies = cookies
80
80
  }
81
+ // Auto-inject stored cookies for sites that require auth (LinkedIn, etc.)
82
+ if (!opts._cookies && !opts.auth && url.includes('linkedin.com')) {
83
+ try {
84
+ const cookies = await this.auth.getCookies('linkedin')
85
+ if (cookies && cookies.length > 0) opts._cookies = cookies
86
+ } catch (e) { /* no stored cookies, proceed without */ }
87
+ }
81
88
  return this.browseEngine.browse(url, opts)
82
89
  }
83
90