spectrawl 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/browse/index.js +62 -14
package/package.json
CHANGED
package/src/browse/index.js
CHANGED
|
@@ -104,42 +104,90 @@ class BrowseEngine {
|
|
|
104
104
|
*/
|
|
105
105
|
_getSiteOverride(url) {
|
|
106
106
|
// Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
|
|
107
|
-
// Fallback:
|
|
107
|
+
// Fallback: PullPush API (free Reddit archive, no auth, no IP block)
|
|
108
108
|
if (url.includes('reddit.com')) {
|
|
109
109
|
return async (originalUrl, opts) => {
|
|
110
|
-
// Try Jina Reader first (sometimes works)
|
|
111
110
|
try {
|
|
112
|
-
const
|
|
111
|
+
const parsed = new URL(originalUrl)
|
|
112
|
+
const pathParts = parsed.pathname.split('/').filter(Boolean)
|
|
113
|
+
|
|
114
|
+
// Extract subreddit and post ID from URL
|
|
115
|
+
let subreddit = null, postId = null, isComments = false
|
|
116
|
+
for (let i = 0; i < pathParts.length; i++) {
|
|
117
|
+
if (pathParts[i] === 'r' && pathParts[i + 1]) subreddit = pathParts[i + 1]
|
|
118
|
+
if (pathParts[i] === 'comments' && pathParts[i + 1]) { postId = pathParts[i + 1]; isComments = true }
|
|
119
|
+
}
|
|
120
|
+
|
|
113
121
|
const h = require('https')
|
|
114
|
-
const
|
|
115
|
-
const req = h.get(
|
|
116
|
-
headers: { '
|
|
122
|
+
const fetchJson = (apiUrl) => new Promise((resolve) => {
|
|
123
|
+
const req = h.get(apiUrl, {
|
|
124
|
+
headers: { 'User-Agent': 'Spectrawl/0.6.1' },
|
|
117
125
|
timeout: 10000
|
|
118
126
|
}, res => {
|
|
119
127
|
if (res.statusCode !== 200) return resolve(null)
|
|
120
128
|
let data = ''
|
|
121
129
|
res.on('data', c => data += c)
|
|
122
|
-
res.on('end', () => resolve(data))
|
|
130
|
+
res.on('end', () => { try { resolve(JSON.parse(data)) } catch { resolve(null) } })
|
|
123
131
|
})
|
|
124
132
|
req.on('error', () => resolve(null))
|
|
125
133
|
req.setTimeout(10000, () => { req.destroy(); resolve(null) })
|
|
126
134
|
})
|
|
127
135
|
|
|
128
|
-
|
|
136
|
+
let content = ''
|
|
137
|
+
|
|
138
|
+
if (postId) {
|
|
139
|
+
// Specific thread: get post + comments
|
|
140
|
+
const postData = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?ids=${postId}`)
|
|
141
|
+
const comments = await fetchJson(`https://api.pullpush.io/reddit/search/comment/?link_id=${postId}&size=25&sort=score&sort_type=desc`)
|
|
142
|
+
|
|
143
|
+
if (postData?.data?.[0]) {
|
|
144
|
+
const post = postData.data[0]
|
|
145
|
+
content = `# ${post.title}\n\nby u/${post.author} in r/${post.subreddit} | ${post.score} points | ${post.num_comments} comments\n\n${post.selftext || post.url || ''}\n\n---\n\n## Comments\n\n`
|
|
146
|
+
if (comments?.data) {
|
|
147
|
+
for (const c of comments.data) {
|
|
148
|
+
content += `**u/${c.author}** (${c.score} pts):\n${c.body}\n\n`
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
} else if (subreddit) {
|
|
153
|
+
// Subreddit listing
|
|
154
|
+
const sort = parsed.pathname.includes('/top') ? 'score' : 'created_utc'
|
|
155
|
+
const order = sort === 'score' ? 'desc' : 'desc'
|
|
156
|
+
const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?subreddit=${subreddit}&size=25&sort=${sort}&sort_type=${order}`)
|
|
157
|
+
|
|
158
|
+
if (data?.data) {
|
|
159
|
+
content = `# r/${subreddit}\n\n`
|
|
160
|
+
for (const post of data.data) {
|
|
161
|
+
content += `- **${post.title}** (${post.score} pts, ${post.num_comments} comments) by u/${post.author}\n ${post.url || ''}\n ${(post.selftext || '').slice(0, 200)}\n\n`
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
} else {
|
|
165
|
+
// Generic Reddit URL — try search
|
|
166
|
+
const query = parsed.searchParams.get('q') || pathParts.join(' ')
|
|
167
|
+
if (query) {
|
|
168
|
+
const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?q=${encodeURIComponent(query)}&size=15&sort=score&sort_type=desc`)
|
|
169
|
+
if (data?.data) {
|
|
170
|
+
content = `# Reddit search: ${query}\n\n`
|
|
171
|
+
for (const post of data.data) {
|
|
172
|
+
content += `- **${post.title}** in r/${post.subreddit} (${post.score} pts) by u/${post.author}\n ${(post.selftext || '').slice(0, 200)}\n\n`
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
if (content && content.length > 50) {
|
|
129
179
|
return {
|
|
130
180
|
content,
|
|
131
181
|
url: originalUrl,
|
|
132
|
-
title:
|
|
182
|
+
title: subreddit ? `r/${subreddit}` : 'Reddit',
|
|
133
183
|
statusCode: 200,
|
|
134
184
|
cached: false,
|
|
135
|
-
engine: '
|
|
185
|
+
engine: 'pullpush-api',
|
|
136
186
|
blocked: false
|
|
137
187
|
}
|
|
138
188
|
}
|
|
139
|
-
} catch (e) { /*
|
|
189
|
+
} catch (e) { /* fall through */ }
|
|
140
190
|
|
|
141
|
-
// All direct methods fail from datacenter IPs
|
|
142
|
-
// Return explicit block with guidance
|
|
143
191
|
return {
|
|
144
192
|
content: '',
|
|
145
193
|
url: originalUrl,
|
|
@@ -149,7 +197,7 @@ class BrowseEngine {
|
|
|
149
197
|
engine: 'blocked',
|
|
150
198
|
blocked: true,
|
|
151
199
|
blockType: 'reddit',
|
|
152
|
-
blockDetail: 'Reddit
|
|
200
|
+
blockDetail: 'Reddit blocked and PullPush API unavailable. Use /search with a Reddit-related query to get cached content, or configure a residential proxy.'
|
|
153
201
|
}
|
|
154
202
|
}
|
|
155
203
|
}
|