spectrawl 0.6.1 → 0.6.2

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/browse/index.js +62 -14
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.6.1",
3
+ "version": "0.6.2",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -104,42 +104,90 @@ class BrowseEngine {
104
104
  */
105
105
  _getSiteOverride(url) {
106
106
  // Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
107
- // Fallback: return block info with actionable message + try Jina
107
+ // Fallback: PullPush API (free Reddit archive, no auth, no IP block)
108
108
  if (url.includes('reddit.com')) {
109
109
  return async (originalUrl, opts) => {
110
- // Try Jina Reader first (sometimes works)
111
110
  try {
112
- const jinaUrl = `https://r.jina.ai/${originalUrl}`
111
+ const parsed = new URL(originalUrl)
112
+ const pathParts = parsed.pathname.split('/').filter(Boolean)
113
+
114
+ // Extract subreddit and post ID from URL
115
+ let subreddit = null, postId = null, isComments = false
116
+ for (let i = 0; i < pathParts.length; i++) {
117
+ if (pathParts[i] === 'r' && pathParts[i + 1]) subreddit = pathParts[i + 1]
118
+ if (pathParts[i] === 'comments' && pathParts[i + 1]) { postId = pathParts[i + 1]; isComments = true }
119
+ }
120
+
113
121
  const h = require('https')
114
- const content = await new Promise((resolve, reject) => {
115
- const req = h.get(jinaUrl, {
116
- headers: { 'Accept': 'text/plain', 'User-Agent': 'Spectrawl/1.0' },
122
+ const fetchJson = (apiUrl) => new Promise((resolve) => {
123
+ const req = h.get(apiUrl, {
124
+ headers: { 'User-Agent': 'Spectrawl/0.6.1' },
117
125
  timeout: 10000
118
126
  }, res => {
119
127
  if (res.statusCode !== 200) return resolve(null)
120
128
  let data = ''
121
129
  res.on('data', c => data += c)
122
- res.on('end', () => resolve(data))
130
+ res.on('end', () => { try { resolve(JSON.parse(data)) } catch { resolve(null) } })
123
131
  })
124
132
  req.on('error', () => resolve(null))
125
133
  req.setTimeout(10000, () => { req.destroy(); resolve(null) })
126
134
  })
127
135
 
128
- if (content && content.length > 200 && !content.includes('blocked by network')) {
136
+ let content = ''
137
+
138
+ if (postId) {
139
+ // Specific thread: get post + comments
140
+ const postData = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?ids=${postId}`)
141
+ const comments = await fetchJson(`https://api.pullpush.io/reddit/search/comment/?link_id=${postId}&size=25&sort=score&sort_type=desc`)
142
+
143
+ if (postData?.data?.[0]) {
144
+ const post = postData.data[0]
145
+ content = `# ${post.title}\n\nby u/${post.author} in r/${post.subreddit} | ${post.score} points | ${post.num_comments} comments\n\n${post.selftext || post.url || ''}\n\n---\n\n## Comments\n\n`
146
+ if (comments?.data) {
147
+ for (const c of comments.data) {
148
+ content += `**u/${c.author}** (${c.score} pts):\n${c.body}\n\n`
149
+ }
150
+ }
151
+ }
152
+ } else if (subreddit) {
153
+ // Subreddit listing
154
+ const sort = parsed.pathname.includes('/top') ? 'score' : 'created_utc'
155
+ const order = sort === 'score' ? 'desc' : 'desc'
156
+ const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?subreddit=${subreddit}&size=25&sort=${sort}&sort_type=${order}`)
157
+
158
+ if (data?.data) {
159
+ content = `# r/${subreddit}\n\n`
160
+ for (const post of data.data) {
161
+ content += `- **${post.title}** (${post.score} pts, ${post.num_comments} comments) by u/${post.author}\n ${post.url || ''}\n ${(post.selftext || '').slice(0, 200)}\n\n`
162
+ }
163
+ }
164
+ } else {
165
+ // Generic Reddit URL — try search
166
+ const query = parsed.searchParams.get('q') || pathParts.join(' ')
167
+ if (query) {
168
+ const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?q=${encodeURIComponent(query)}&size=15&sort=score&sort_type=desc`)
169
+ if (data?.data) {
170
+ content = `# Reddit search: ${query}\n\n`
171
+ for (const post of data.data) {
172
+ content += `- **${post.title}** in r/${post.subreddit} (${post.score} pts) by u/${post.author}\n ${(post.selftext || '').slice(0, 200)}\n\n`
173
+ }
174
+ }
175
+ }
176
+ }
177
+
178
+ if (content && content.length > 50) {
129
179
  return {
130
180
  content,
131
181
  url: originalUrl,
132
- title: 'Reddit (via Jina Reader)',
182
+ title: subreddit ? `r/${subreddit}` : 'Reddit',
133
183
  statusCode: 200,
134
184
  cached: false,
135
- engine: 'jina-reader',
185
+ engine: 'pullpush-api',
136
186
  blocked: false
137
187
  }
138
188
  }
139
- } catch (e) { /* try next */ }
189
+ } catch (e) { /* fall through */ }
140
190
 
141
- // All direct methods fail from datacenter IPs
142
- // Return explicit block with guidance
143
191
  return {
144
192
  content: '',
145
193
  url: originalUrl,
@@ -149,7 +197,7 @@ class BrowseEngine {
149
197
  engine: 'blocked',
150
198
  blocked: true,
151
199
  blockType: 'reddit',
152
- blockDetail: 'Reddit blocks all datacenter IPs. Use /search with a Reddit-related query to get cached Reddit content via Google, or configure a residential proxy.'
200
+ blockDetail: 'Reddit blocked and PullPush API unavailable. Use /search with a Reddit-related query to get cached content, or configure a residential proxy.'
153
201
  }
154
202
  }
155
203
  }