spectrawl 0.6.3 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/browse/index.js +67 -0
package/package.json
CHANGED
package/src/browse/index.js
CHANGED
|
@@ -103,6 +103,73 @@ class BrowseEngine {
|
|
|
103
103
|
* Returns a function that fetches content via alternative methods.
|
|
104
104
|
*/
|
|
105
105
|
_getSiteOverride(url) {
|
|
106
|
+
// X/Twitter: articles and posts can't be browsed without login
|
|
107
|
+
// Fallback: xAI Responses API with x_search tool (reads X posts natively)
|
|
108
|
+
if ((url.includes('x.com/') || url.includes('twitter.com/')) && url.includes('/status/')) {
|
|
109
|
+
return async (originalUrl, opts) => {
|
|
110
|
+
const xaiKey = process.env.XAI_API_KEY
|
|
111
|
+
if (!xaiKey) return null // no key, fall through to normal browse
|
|
112
|
+
|
|
113
|
+
try {
|
|
114
|
+
const https = require('https')
|
|
115
|
+
const body = JSON.stringify({
|
|
116
|
+
model: 'grok-4-1-fast-non-reasoning',
|
|
117
|
+
input: [{ role: 'user', content: `Return the FULL exact text of this X post and all replies/thread if it's a thread. Include the author's name and handle. No commentary, no analysis, just the raw content:\n\n${originalUrl}` }],
|
|
118
|
+
tools: [{ type: 'x_search' }]
|
|
119
|
+
})
|
|
120
|
+
|
|
121
|
+
const content = await new Promise((resolve, reject) => {
|
|
122
|
+
const req = https.request({
|
|
123
|
+
hostname: 'api.x.ai',
|
|
124
|
+
path: '/v1/responses',
|
|
125
|
+
method: 'POST',
|
|
126
|
+
headers: {
|
|
127
|
+
'Content-Type': 'application/json',
|
|
128
|
+
'Authorization': `Bearer ${xaiKey}`,
|
|
129
|
+
'Content-Length': Buffer.byteLength(body)
|
|
130
|
+
},
|
|
131
|
+
timeout: 30000
|
|
132
|
+
}, res => {
|
|
133
|
+
let data = ''
|
|
134
|
+
res.on('data', c => data += c)
|
|
135
|
+
res.on('end', () => {
|
|
136
|
+
try {
|
|
137
|
+
const json = JSON.parse(data)
|
|
138
|
+
if (json.error) return resolve(null)
|
|
139
|
+
const output = json.output || []
|
|
140
|
+
for (const o of output) {
|
|
141
|
+
if (o.type === 'message') {
|
|
142
|
+
for (const c of (o.content || [])) {
|
|
143
|
+
if (c.text && c.text.length > 20) return resolve(c.text)
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
resolve(null)
|
|
148
|
+
} catch { resolve(null) }
|
|
149
|
+
})
|
|
150
|
+
})
|
|
151
|
+
req.on('error', () => resolve(null))
|
|
152
|
+
req.setTimeout(30000, () => { req.destroy(); resolve(null) })
|
|
153
|
+
req.write(body)
|
|
154
|
+
req.end()
|
|
155
|
+
})
|
|
156
|
+
|
|
157
|
+
if (content && content.length > 20) {
|
|
158
|
+
return {
|
|
159
|
+
content,
|
|
160
|
+
url: originalUrl,
|
|
161
|
+
title: 'X Post (via xAI)',
|
|
162
|
+
statusCode: 200,
|
|
163
|
+
cached: false,
|
|
164
|
+
engine: 'xai-x-search',
|
|
165
|
+
blocked: false
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
} catch (e) { /* fall through */ }
|
|
169
|
+
return null // fall through to normal browse
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
|
|
106
173
|
// Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
|
|
107
174
|
// Fallback: PullPush API (free Reddit archive, no auth, no IP block)
|
|
108
175
|
if (url.includes('reddit.com')) {
|