npm - spectrawl - Versions diffs - 0.6.1 → 0.6.3 - Mend

spectrawl 0.6.1 → 0.6.3

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "spectrawl",
-  "version": "0.6.1",
+  "version": "0.6.3",
   "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
   "main": "src/index.js",
   "types": "index.d.ts",

package/src/browse/index.js CHANGED Viewed

@@ -104,42 +104,90 @@ class BrowseEngine {
    */
   _getSiteOverride(url) {
     // Reddit: datacenter IPs are fully blocked (browse, JSON, RSS all fail)
-    // Fallback: return block info with actionable message + try Jina
+    // Fallback: PullPush API (free Reddit archive, no auth, no IP block)
     if (url.includes('reddit.com')) {
       return async (originalUrl, opts) => {
-        // Try Jina Reader first (sometimes works)
         try {
-          const jinaUrl = `https://r.jina.ai/${originalUrl}`
+          const parsed = new URL(originalUrl)
+          const pathParts = parsed.pathname.split('/').filter(Boolean)
+          // Extract subreddit and post ID from URL
+          let subreddit = null, postId = null, isComments = false
+          for (let i = 0; i < pathParts.length; i++) {
+            if (pathParts[i] === 'r' && pathParts[i + 1]) subreddit = pathParts[i + 1]
+            if (pathParts[i] === 'comments' && pathParts[i + 1]) { postId = pathParts[i + 1]; isComments = true }
+          }
           const h = require('https')
-          const content = await new Promise((resolve, reject) => {
-            const req = h.get(jinaUrl, {
-              headers: { 'Accept': 'text/plain', 'User-Agent': 'Spectrawl/1.0' },
+          const fetchJson = (apiUrl) => new Promise((resolve) => {
+            const req = h.get(apiUrl, {
+              headers: { 'User-Agent': 'Spectrawl/0.6.1' },
               timeout: 10000
             }, res => {
               if (res.statusCode !== 200) return resolve(null)
               let data = ''
               res.on('data', c => data += c)
-              res.on('end', () => resolve(data))
+              res.on('end', () => { try { resolve(JSON.parse(data)) } catch { resolve(null) } })
             })
             req.on('error', () => resolve(null))
             req.setTimeout(10000, () => { req.destroy(); resolve(null) })
           })
-          if (content && content.length > 200 && !content.includes('blocked by network')) {
+          let content = ''
+          if (postId) {
+            // Specific thread: get post + comments
+            const postData = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?ids=${postId}`)
+            const comments = await fetchJson(`https://api.pullpush.io/reddit/search/comment/?link_id=${postId}&size=25&sort=score&sort_type=desc`)
+            if (postData?.data?.[0]) {
+              const post = postData.data[0]
+              content = `# ${post.title}\n\nby u/${post.author} in r/${post.subreddit} | ${post.score} points | ${post.num_comments} comments\n\n${post.selftext || post.url || ''}\n\n---\n\n## Comments\n\n`
+              if (comments?.data) {
+                for (const c of comments.data) {
+                  content += `**u/${c.author}** (${c.score} pts):\n${c.body}\n\n`
+                }
+              }
+            }
+          } else if (subreddit) {
+            // Subreddit listing
+            const sort = parsed.pathname.includes('/top') ? 'score' : 'created_utc'
+            const order = sort === 'score' ? 'desc' : 'desc'
+            const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?subreddit=${subreddit}&size=25&sort=${sort}&sort_type=${order}`)
+            if (data?.data) {
+              content = `# r/${subreddit}\n\n`
+              for (const post of data.data) {
+                content += `- **${post.title}** (${post.score} pts, ${post.num_comments} comments) by u/${post.author}\n  ${post.url || ''}\n  ${(post.selftext || '').slice(0, 200)}\n\n`
+              }
+            }
+          } else {
+            // Generic Reddit URL — try search
+            const query = parsed.searchParams.get('q') || pathParts.join(' ')
+            if (query) {
+              const data = await fetchJson(`https://api.pullpush.io/reddit/search/submission/?q=${encodeURIComponent(query)}&size=15&sort=score&sort_type=desc`)
+              if (data?.data) {
+                content = `# Reddit search: ${query}\n\n`
+                for (const post of data.data) {
+                  content += `- **${post.title}** in r/${post.subreddit} (${post.score} pts) by u/${post.author}\n  ${(post.selftext || '').slice(0, 200)}\n\n`
+                }
+              }
+            }
+          }
+          if (content && content.length > 50) {
             return {
               content,
               url: originalUrl,
-              title: 'Reddit (via Jina Reader)',
+              title: subreddit ? `r/${subreddit}` : 'Reddit',
               statusCode: 200,
               cached: false,
-              engine: 'jina-reader',
+              engine: 'pullpush-api',
               blocked: false
             }
           }
-        } catch (e) { /* try next */ }
+        } catch (e) { /* fall through */ }
-        // All direct methods fail from datacenter IPs
-        // Return explicit block with guidance
         return {
           content: '',
           url: originalUrl,
@@ -149,7 +197,55 @@ class BrowseEngine {
           engine: 'blocked',
           blocked: true,
           blockType: 'reddit',
-          blockDetail: 'Reddit blocks all datacenter IPs. Use /search with a Reddit-related query to get cached Reddit content via Google, or configure a residential proxy.'
+          blockDetail: 'Reddit blocked and PullPush API unavailable. Use /search with a Reddit-related query to get cached content, or configure a residential proxy.'
+        }
+      }
+    }
+    // LinkedIn: use stored cookies + proxy to browse authenticated
+    if (url.includes('linkedin.com')) {
+      return async (originalUrl, opts) => {
+        // Cookies are auto-injected by parent Spectrawl.browse() from auth DB
+        const cookies = opts._cookies
+        if (!cookies || cookies.length === 0) {
+          return {
+            content: '',
+            url: originalUrl,
+            title: 'LinkedIn',
+            statusCode: 401,
+            cached: false,
+            engine: 'blocked',
+            blocked: true,
+            blockType: 'linkedin',
+            blockDetail: 'LinkedIn requires authentication. Add cookies: spectrawl login linkedin --account yourname'
+          }
+        }
+        try {
+          // Browse with cookies via Camoufox (needs residential proxy to avoid IP mismatch)
+          const browseResult = await this.browse(originalUrl, {
+            ...opts,
+            _skipOverride: true,
+            stealth: true,
+            camoufox: true
+          })
+          if (browseResult && !browseResult.blocked && (browseResult.content || '').length > 200) {
+            return { ...browseResult, engine: 'linkedin-authenticated' }
+          }
+        } catch (e) { /* redirect loop or block — expected without proxy */ }
+        return {
+          content: '',
+          url: originalUrl,
+          title: 'LinkedIn',
+          statusCode: 999,
+          cached: false,
+          engine: 'blocked',
+          blocked: true,
+          blockType: 'linkedin',
+          blockDetail: 'LinkedIn cookies valid but rejected from this IP (datacenter). Configure a residential proxy: spectrawl config set proxy.upstreams "[{\\"url\\":\\"http://user:pass@host:port\\"}]"'
         }
       }
     }
@@ -285,7 +381,26 @@ class BrowseEngine {
     try {
       if (opts._cookies) {
-        await context.addCookies(opts._cookies)
+        // Sanitize cookies for Playwright compatibility
+        const playwrightCookies = opts._cookies.map(c => {
+          const clean = { ...c }
+          if (!clean.sameSite || !['Strict', 'Lax', 'None'].includes(clean.sameSite)) {
+            clean.sameSite = 'Lax'
+          }
+          if (clean.domain && clean.domain.startsWith('.')) {
+            clean.domain = clean.domain.slice(1)
+          }
+          delete clean.hostOnly
+          delete clean.session
+          delete clean.storeId
+          delete clean.id
+          if (clean.expirationDate && !clean.expires) {
+            clean.expires = clean.expirationDate
+            delete clean.expirationDate
+          }
+          return clean
+        })
+        await context.addCookies(playwrightCookies)
       }
       await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })

package/src/index.js CHANGED Viewed

@@ -78,6 +78,13 @@ class Spectrawl {
       const cookies = await this.auth.getCookies(opts.auth)
       opts._cookies = cookies
     }
+    // Auto-inject stored cookies for sites that require auth (LinkedIn, etc.)
+    if (!opts._cookies && !opts.auth && url.includes('linkedin.com')) {
+      try {
+        const cookies = await this.auth.getCookies('linkedin')
+        if (cookies && cookies.length > 0) opts._cookies = cookies
+      } catch (e) { /* no stored cookies, proceed without */ }
+    }
     return this.browseEngine.browse(url, opts)
   }