npm - spectrawl - Versions diffs - 0.4.0 → 0.4.2 - Mend

spectrawl 0.4.0 → 0.4.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "spectrawl",
-  "version": "0.4.0",
+  "version": "0.4.2",
   "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
   "main": "src/index.js",
   "types": "index.d.ts",

package/src/crawl.js CHANGED Viewed

@@ -1,23 +1,38 @@
 /**
- * Spectrawl Crawl Engine
- * Recursively crawls a website using Jina Reader (free) with Playwright fallback.
- * Designed for AI agents: returns clean markdown, not raw HTML.
+ * Spectrawl Crawl Engine v2
+ * Multi-page website crawler using our own browse engine (Camoufox).
+ * No external dependencies (no Jina, no Cloudflare).
+ * Supports sync + async (job-based) modes.
+ * Auto-detects system RAM and parallelizes crawling accordingly.
  */
-const https = require('https')
-const http = require('http')
+const crypto = require('crypto')
+const os = require('os')
+// ~250MB per browser tab (Camoufox average)
+const MB_PER_TAB = 250
+// Reserve this much RAM for OS + other processes
+const RESERVED_MB = 1500
 const DEFAULT_OPTS = {
-  depth: 1,
+  depth: 2,
   maxPages: 50,
-  format: 'markdown',   // markdown | html | json
-  delay: 300,           // ms between requests
-  stealth: false,
-  scope: 'domain',      // domain | prefix | any
-  timeout: 15000,
+  format: 'markdown',
+  delay: 300,            // ms between batch launches
+  stealth: true,
+  scope: 'domain',
+  timeout: 30000,
+  concurrency: 'auto',   // 'auto' | number — auto-detect from RAM
   includeLinks: true,
+  includePatterns: [],
+  excludePatterns: [],
+  merge: false,
   skipPatterns: [
-    /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css)$/i,
+    /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
+    /\/_next\//,
+    /\/static\//,
+    /\/assets\//,
+    /mintcdn\.com/,
     /#/,
     /^mailto:/,
     /^tel:/,
@@ -25,6 +40,24 @@ const DEFAULT_OPTS = {
   ]
 }
+// In-memory job store for async crawls
+const jobs = new Map()
+/**
+ * Calculate max safe concurrency based on available system RAM.
+ */
+function detectConcurrency() {
+  const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
+  const freeMB = Math.floor(os.freemem() / 1024 / 1024)
+  // Use the lower of: (free RAM) or (total - reserved)
+  const availableMB = Math.min(freeMB, totalMB - RESERVED_MB)
+  const maxTabs = Math.max(1, Math.floor(availableMB / MB_PER_TAB))
+  // Cap at 10 — diminishing returns and politeness
+  const concurrency = Math.min(maxTabs, 10)
+  console.log(`[crawl] RAM: ${totalMB}MB total, ${freeMB}MB free → ${concurrency} concurrent tabs`)
+  return concurrency
+}
 class CrawlEngine {
   constructor(browseEngine, cache) {
     this.browseEngine = browseEngine
@@ -33,14 +66,20 @@ class CrawlEngine {
   /**
    * Crawl a website starting from a URL.
-   * @param {string} startUrl - Starting URL
-   * @param {object} opts - Crawl options
-   * @param {object} cookies - Optional auth cookies
+   * Automatically parallelizes based on available RAM.
    */
   async crawl(startUrl, opts = {}, cookies = null) {
-    const config = { ...DEFAULT_OPTS, ...opts }
+    const cleanOpts = Object.fromEntries(
+      Object.entries(opts).filter(([_, v]) => v !== undefined)
+    )
+    const config = { ...DEFAULT_OPTS, ...cleanOpts }
     const startTime = Date.now()
+    // Determine concurrency
+    const concurrency = config.concurrency === 'auto'
+      ? detectConcurrency()
+      : Math.max(1, Math.min(config.concurrency, 10))
     const startParsed = new URL(startUrl)
     const baseDomain = startParsed.hostname
     const basePrefix = startUrl.replace(/\/$/, '')
@@ -49,21 +88,14 @@ class CrawlEngine {
     const queue = [{ url: startUrl, depth: 0 }]
     const pages = []
     const failed = []
+    let activeCount = 0
-    while (queue.length > 0 && pages.length < config.maxPages) {
-      const { url, depth } = queue.shift()
-      const normalized = normalizeUrl(url)
-      if (visited.has(normalized)) continue
-      visited.add(normalized)
-      // Scope check
-      if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
-      // Skip pattern check
-      if (config.skipPatterns.some(p => p.test(url))) continue
+    // Process queue with concurrency control
+    const processUrl = async (item) => {
+      const { url, depth } = item
       try {
         const page = await this._fetchPage(url, config, cookies)
-        if (!page) { failed.push({ url, error: 'empty' }); continue }
+        if (!page) { failed.push({ url, error: 'empty' }); return }
         const links = page.links || []
         pages.push({
@@ -80,134 +112,236 @@ class CrawlEngine {
             const absLink = resolveUrl(link, url)
             if (!absLink) continue
             const normLink = normalizeUrl(absLink)
-            if (!visited.has(normLink)) {
-              queue.push({ url: absLink, depth: depth + 1 })
-            }
+            if (visited.has(normLink)) continue
+            // Pre-filter before queueing
+            if (!this._inScope(absLink, baseDomain, basePrefix, config.scope)) continue
+            if (config.skipPatterns.some(p => p.test(absLink))) continue
+            if (!this._matchesFilters(absLink, config.includePatterns, config.excludePatterns)) continue
+            visited.add(normLink)
+            queue.push({ url: absLink, depth: depth + 1 })
           }
         }
+      } catch (e) {
+        failed.push({ url, error: e.message })
+      }
+    }
+    // Seed the first URL
+    visited.add(normalizeUrl(startUrl))
+    // BFS with parallel workers
+    while (queue.length > 0 || activeCount > 0) {
+      // Launch up to `concurrency` parallel fetches
+      const batch = []
+      while (queue.length > 0 && batch.length < concurrency && (pages.length + activeCount + batch.length) < config.maxPages) {
+        batch.push(queue.shift())
+      }
+      if (batch.length === 0 && activeCount === 0) break
+      if (batch.length > 0) {
+        activeCount += batch.length
+        const results = await Promise.allSettled(
+          batch.map(item => processUrl(item))
+        )
+        activeCount -= batch.length
+        // Small delay between batches to be polite
         if (queue.length > 0 && config.delay > 0) {
           await sleep(config.delay)
         }
-      } catch (e) {
-        failed.push({ url, error: e.message })
       }
+      // Stop if we've hit maxPages
+      if (pages.length >= config.maxPages) break
     }
-    return {
+    const duration = Date.now() - startTime
+    const result = {
       startUrl,
       pages,
       stats: {
         total: visited.size,
         crawled: pages.length,
         failed: failed.length,
-        duration: Date.now() - startTime
+        concurrency,
+        duration,
+        pagesPerSecond: pages.length > 0 ? +(pages.length / (duration / 1000)).toFixed(2) : 0
       },
       failed: failed.length > 0 ? failed : undefined
     }
+    if (config.merge) {
+      result.merged = pages.map(p => {
+        return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
+      }).join('\n\n---\n\n')
+    }
+    return result
   }
-  async _fetchPage(url, config, cookies) {
-    // Try Jina Reader first (free, fast, clean markdown)
-    try {
-      const jinaUrl = `https://r.jina.ai/${url}`
-      const content = await fetchText(jinaUrl, {
-        'Accept': 'text/markdown',
-        'X-Return-Format': config.format === 'html' ? 'html' : 'markdown',
-        'X-With-Links-Summary': 'true',
-        'X-Timeout': '10'
+  /**
+   * Start an async crawl job. Returns job ID immediately.
+   */
+  startJob(startUrl, opts = {}, cookies = null) {
+    const jobId = crypto.randomUUID()
+    const job = {
+      id: jobId,
+      startUrl,
+      status: 'running',
+      started: Date.now(),
+      finished: 0,
+      total: 0,
+      pages: [],
+      failed: [],
+      error: null
+    }
+    jobs.set(jobId, job)
+    this.crawl(startUrl, opts, cookies)
+      .then(result => {
+        job.status = 'completed'
+        job.pages = result.pages
+        job.failed = result.failed || []
+        job.finished = result.stats.crawled
+        job.total = result.stats.total
+        job.duration = result.stats.duration
+        job.concurrency = result.stats.concurrency
+        job.pagesPerSecond = result.stats.pagesPerSecond
+      })
+      .catch(err => {
+        job.status = 'errored'
+        job.error = err.message
       })
-      if (content && content.length > 100) {
-        return parseJinaResponse(content, url)
-      }
-    } catch (e) {
-      // fall through to Playwright
+    return { jobId, status: 'running' }
+  }
+  /**
+   * Get job status/results.
+   */
+  getJob(jobId) {
+    const job = jobs.get(jobId)
+    if (!job) return null
+    return {
+      id: job.id,
+      startUrl: job.startUrl,
+      status: job.status,
+      started: job.started,
+      finished: job.finished,
+      total: job.total,
+      pageCount: job.pages.length,
+      concurrency: job.concurrency,
+      pagesPerSecond: job.pagesPerSecond,
+      error: job.error,
+      pages: job.status === 'completed' ? job.pages : undefined,
+      failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
+      duration: job.duration
     }
+  }
-    // Playwright fallback (stealth mode)
+  /**
+   * List all jobs.
+   */
+  listJobs() {
+    return Array.from(jobs.values()).map(j => ({
+      id: j.id,
+      startUrl: j.startUrl,
+      status: j.status,
+      pageCount: j.pages.length,
+      started: j.started
+    }))
+  }
+  /**
+   * Get system info for crawl capacity estimation.
+   */
+  static getCapacity() {
+    const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
+    const freeMB = Math.floor(os.freemem() / 1024 / 1024)
+    const concurrency = detectConcurrency()
+    // Estimate: each page takes ~4s with stealth delays
+    const pagesPerMinute = concurrency * 15  // ~4s per page
+    return {
+      totalRamMB: totalMB,
+      freeRamMB: freeMB,
+      maxConcurrency: concurrency,
+      estimatedPagesPerMinute: pagesPerMinute,
+      estimate100pages: `~${Math.ceil(100 / pagesPerMinute)} min`,
+      estimate1000pages: `~${Math.ceil(1000 / pagesPerMinute)} min`
+    }
+  }
+  async _fetchPage(url, config, cookies) {
     try {
       const result = await this.browseEngine.browse(url, {
         stealth: config.stealth,
         _cookies: cookies,
-        timeout: config.timeout
+        timeout: config.timeout,
+        html: true,
+        noCache: true
       })
       if (result?.content) {
+        const linkSource = result.html || result.content
         return {
           title: result.title || '',
           content: result.content,
-          links: extractLinks(result.html || result.content, url)
+          links: extractLinks(linkSource, url)
         }
       }
     } catch (e) {
       throw new Error(`Failed to fetch ${url}: ${e.message}`)
     }
     return null
   }
   _inScope(url, baseDomain, basePrefix, scope) {
     try {
       const parsed = new URL(url)
-      if (scope === 'domain') return parsed.hostname === baseDomain
+      if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
       if (scope === 'prefix') return url.startsWith(basePrefix)
-      return true // 'any'
+      return true
     } catch {
       return false
     }
   }
-}
-function parseJinaResponse(content, sourceUrl) {
-  // Jina returns markdown with a header block
-  const lines = content.split('\n')
-  let title = ''
-  const links = []
-  const contentLines = []
-  let inLinksSummary = false
-  for (const line of lines) {
-    if (line.startsWith('Title:')) {
-      title = line.replace('Title:', '').trim()
-    } else if (line.startsWith('Links/Buttons:') || line.includes('## Links')) {
-      inLinksSummary = true
-    } else if (inLinksSummary) {
-      // Extract markdown links [text](url)
-      const matches = line.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
-      for (const m of matches) links.push(m[2])
-    } else {
-      contentLines.push(line)
+  _matchesFilters(url, includePatterns, excludePatterns) {
+    if (excludePatterns && excludePatterns.length > 0) {
+      for (const pattern of excludePatterns) {
+        if (wildcardMatch(url, pattern)) return false
+      }
     }
+    if (includePatterns && includePatterns.length > 0) {
+      return includePatterns.some(pattern => wildcardMatch(url, pattern))
+    }
+    return true
   }
+}
-  // Also extract inline links from content
-  const inlineMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
-  for (const m of inlineMatches) {
-    if (!links.includes(m[2])) links.push(m[2])
-  }
-  return {
-    title: title || extractTitleFromMarkdown(contentLines.join('\n')),
-    content: contentLines.join('\n').trim(),
-    links: [...new Set(links)]
-  }
+function wildcardMatch(str, pattern) {
+  const regex = pattern
+    .replace(/[.+^${}()|[\]\\]/g, '\\$&')
+    .replace(/\*\*/g, '{{GLOBSTAR}}')
+    .replace(/\*/g, '[^/]*')
+    .replace(/\{\{GLOBSTAR\}\}/g, '.*')
+  return new RegExp('^' + regex + '$').test(str)
 }
-function extractLinks(html, baseUrl) {
+function extractLinks(content, baseUrl) {
   const links = []
-  const matches = html.matchAll(/href=["']([^"']+)["']/gi)
-  for (const m of matches) {
+  const hrefMatches = content.matchAll(/href=["']([^"']+)["']/gi)
+  for (const m of hrefMatches) {
     const resolved = resolveUrl(m[1], baseUrl)
     if (resolved && !links.includes(resolved)) links.push(resolved)
   }
+  const mdMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
+  for (const m of mdMatches) {
+    if (!links.includes(m[2])) links.push(m[2])
+  }
   return links
 }
-function extractTitleFromMarkdown(content) {
-  const match = content.match(/^#\s+(.+)/m)
-  return match ? match[1].trim() : ''
-}
 function resolveUrl(url, base) {
   try {
     if (url.startsWith('http')) return url
@@ -221,27 +355,16 @@ function normalizeUrl(url) {
   try {
     const u = new URL(url)
     u.hash = ''
-    return u.href.replace(/\/$/, '')
+    let href = u.href
+    if (href.endsWith('/') && u.pathname !== '/') {
+      href = href.slice(0, -1)
+    }
+    return href
   } catch {
     return url
   }
 }
-function fetchText(url, headers = {}) {
-  return new Promise((resolve, reject) => {
-    const mod = url.startsWith('https') ? https : http
-    const req = mod.request(url, { headers: { 'User-Agent': 'Spectrawl/1.0', ...headers } }, res => {
-      if (res.statusCode >= 400) { reject(new Error(`HTTP ${res.statusCode}`)); return }
-      let d = ''
-      res.on('data', c => d += c)
-      res.on('end', () => resolve(d))
-    })
-    req.setTimeout(15000, () => { req.destroy(); reject(new Error('timeout')) })
-    req.on('error', reject)
-    req.end()
-  })
-}
 function sleep(ms) {
   return new Promise(r => setTimeout(r, ms))
 }

package/src/index.js CHANGED Viewed

@@ -92,6 +92,27 @@ class Spectrawl {
     return this.crawlEngine.crawl(url, opts, cookies)
   }
+  /**
+   * Start an async crawl job. Returns job ID immediately.
+   */
+  startCrawlJob(url, opts = {}) {
+    return this.crawlEngine.startJob(url, opts)
+  }
+  /**
+   * Get crawl job status/results.
+   */
+  getCrawlJob(jobId) {
+    return this.crawlEngine.getJob(jobId)
+  }
+  /**
+   * List all crawl jobs.
+   */
+  listCrawlJobs() {
+    return this.crawlEngine.listJobs()
+  }
   /**
    * Perform an authenticated action on a platform.
    * @param {string} platform - Platform name (x, reddit, devto, etc.)

package/src/server.js CHANGED Viewed

@@ -54,12 +54,40 @@ const server = http.createServer(async (req, res) => {
     if (req.method === 'POST' && path === '/crawl') {
       const body = await readBody(req)
-      const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth } = body
+      const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
+              includePatterns, excludePatterns, merge, async: asyncMode, concurrency } = body
       if (!targetUrl) return error(res, 400, 'url is required')
-      const result = await spectrawl.crawl(targetUrl, { depth, maxPages, format, delay, stealth, scope, auth })
+      const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency }
+      if (asyncMode) {
+        // Async mode: return job ID immediately
+        const job = spectrawl.startCrawlJob(targetUrl, opts)
+        return json(res, job)
+      }
+      const result = await spectrawl.crawl(targetUrl, opts)
       return json(res, result)
     }
+    if (req.method === 'GET' && path === '/crawl/jobs') {
+      const jobList = spectrawl.listCrawlJobs()
+      return json(res, { jobs: jobList })
+    }
+    if (req.method === 'GET' && path === '/crawl/capacity') {
+      const { CrawlEngine } = require('./crawl')
+      return json(res, CrawlEngine.getCapacity())
+    }
+    if (req.method === 'GET' && path.startsWith('/crawl/')) {
+      const jobId = path.split('/crawl/')[1]
+      if (!jobId) return error(res, 400, 'job ID is required')
+      const job = spectrawl.getCrawlJob(jobId)
+      if (!job) return error(res, 404, 'job not found')
+      return json(res, job)
+    }
     if (req.method === 'POST' && path === '/act') {
       const body = await readBody(req)
       const { platform, action, ...params } = body