npm - spectrawl - Versions diffs - 0.6.2 → 0.6.4 - Mend

spectrawl 0.6.2 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "spectrawl",
-  "version": "0.6.2",
+  "version": "0.6.4",
   "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
   "main": "src/index.js",
   "types": "index.d.ts",

package/src/browse/index.js CHANGED

@@ -103,6 +103,73 @@ class BrowseEngine {
    * Returns a function that fetches content via alternative methods.
    */
   _getSiteOverride(url) {
+    // X/Twitter: articles and posts can't be browsed without login
+    // Fallback: xAI Responses API with x_search tool (reads X posts natively)
+    if ((url.includes('x.com/') || url.includes('twitter.com/')) && url.includes('/status/')) {
+      return async (originalUrl, opts) => {
+        const xaiKey = process.env.XAI_API_KEY
+        if (!xaiKey) return null // no key, fall through to normal browse
+        try {
+          const https = require('https')
+          const body = JSON.stringify({
+            model: 'grok-4-1-fast-non-reasoning',
+            input: [{ role: 'user', content: `Return the FULL exact text of this X post and all replies/thread if it's a thread. Include the author's name and handle. No commentary, no analysis, just the raw content:\n\n${originalUrl}` }],
+            tools: [{ type: 'x_search' }]
+          })
+          const content = await new Promise((resolve, reject) => {
+            const req = https.request({
+              hostname: 'api.x.ai',
+              path: '/v1/responses',
+              method: 'POST',
+              headers: {
+                'Content-Type': 'application/json',
+                'Authorization': `Bearer ${xaiKey}`,
+                'Content-Length': Buffer.byteLength(body)
+              },
+              timeout: 30000
+            }, res => {
+              let data = ''
+              res.on('data', c => data += c)
+              res.on('end', () => {
+                try {
+                  const json = JSON.parse(data)
+                  if (json.error) return resolve(null)
+                  const output = json.output || []
+                  for (const o of output) {
+                    if (o.type === 'message') {
+                      for (const c of (o.content || [])) {
+                        if (c.text && c.text.length > 20) return resolve(c.text)
+                      }
+                    }
+                  }
+                  resolve(null)
+                } catch { resolve(null) }
+              })
+            })
+            req.on('error', () => resolve(null))
+            req.setTimeout(30000, () => { req.destroy(); resolve(null) })
+            req.write(body)
+            req.end()
+          })
+          if (content && content.length > 20) {
+            return {
+              content,
+              url: originalUrl,
+              title: 'X Post (via xAI)',
+              statusCode: 200,
+              cached: false,
+              engine: 'xai-x-search',
+              blocked: false
+            }
+          }
+        } catch (e) { /* fall through */ }
+        return null // fall through to normal browse
+      }
+    }
     // Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
     // Fallback: PullPush API (free Reddit archive, no auth, no IP block)
     if (url.includes('reddit.com')) {
@@ -202,6 +269,54 @@ class BrowseEngine {
       }
     }
+    // LinkedIn: use stored cookies + proxy to browse authenticated
+    if (url.includes('linkedin.com')) {
+      return async (originalUrl, opts) => {
+        // Cookies are auto-injected by parent Spectrawl.browse() from auth DB
+        const cookies = opts._cookies
+        if (!cookies || cookies.length === 0) {
+          return {
+            content: '',
+            url: originalUrl,
+            title: 'LinkedIn',
+            statusCode: 401,
+            cached: false,
+            engine: 'blocked',
+            blocked: true,
+            blockType: 'linkedin',
+            blockDetail: 'LinkedIn requires authentication. Add cookies: spectrawl login linkedin --account yourname'
+          }
+        }
+        try {
+          // Browse with cookies via Camoufox (needs residential proxy to avoid IP mismatch)
+          const browseResult = await this.browse(originalUrl, {
+            ...opts,
+            _skipOverride: true,
+            stealth: true,
+            camoufox: true
+          })
+          if (browseResult && !browseResult.blocked && (browseResult.content || '').length > 200) {
+            return { ...browseResult, engine: 'linkedin-authenticated' }
+          }
+        } catch (e) { /* redirect loop or block — expected without proxy */ }
+        return {
+          content: '',
+          url: originalUrl,
+          title: 'LinkedIn',
+          statusCode: 999,
+          cached: false,
+          engine: 'blocked',
+          blocked: true,
+          blockType: 'linkedin',
+          blockDetail: 'LinkedIn cookies valid but rejected from this IP (datacenter). Configure a residential proxy: spectrawl config set proxy.upstreams "[{\\"url\\":\\"http://user:pass@host:port\\"}]"'
+        }
+      }
+    }
     // Amazon: try Jina Reader
     if (url.includes('amazon.com') || url.includes('amazon.co')) {
       return async (originalUrl, opts) => {
@@ -333,7 +448,26 @@ class BrowseEngine {
     try {
       if (opts._cookies) {
-        await context.addCookies(opts._cookies)
+        // Sanitize cookies for Playwright compatibility
+        const playwrightCookies = opts._cookies.map(c => {
+          const clean = { ...c }
+          if (!clean.sameSite || !['Strict', 'Lax', 'None'].includes(clean.sameSite)) {
+            clean.sameSite = 'Lax'
+          }
+          if (clean.domain && clean.domain.startsWith('.')) {
+            clean.domain = clean.domain.slice(1)
+          }
+          delete clean.hostOnly
+          delete clean.session
+          delete clean.storeId
+          delete clean.id
+          if (clean.expirationDate && !clean.expires) {
+            clean.expires = clean.expirationDate
+            delete clean.expirationDate
+          }
+          return clean
+        })
+        await context.addCookies(playwrightCookies)
       }
       await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })

package/src/index.js CHANGED

@@ -78,6 +78,13 @@ class Spectrawl {
       const cookies = await this.auth.getCookies(opts.auth)
       opts._cookies = cookies
     }
+    // Auto-inject stored cookies for sites that require auth (LinkedIn, etc.)
+    if (!opts._cookies && !opts.auth && url.includes('linkedin.com')) {
+      try {
+        const cookies = await this.auth.getCookies('linkedin')
+        if (cookies && cookies.length > 0) opts._cookies = cookies
+      } catch (e) { /* no stored cookies, proceed without */ }
+    }
     return this.browseEngine.browse(url, opts)
   }