spectrawl 0.3.22 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  The unified web layer for AI agents. Search, browse, authenticate, and act on platforms — one package, self-hosted.
4
4
 
5
- **5,000 free searches/month** via Gemini Grounded Search. Full page scraping, stealth browsing, 19 platform adapters.
5
+ **5,000 free searches/month** via Gemini Grounded Search. Full site crawling, stealth browsing, 19 platform adapters.
6
6
 
7
7
  ## What It Does
8
8
 
@@ -57,6 +57,7 @@ Different tools for different needs.
57
57
  | Stealth browsing | No | Yes (Camoufox + Playwright) |
58
58
  | Platform posting | No | 19 adapters |
59
59
  | Auth management | No | Cookie store + auto-refresh |
60
+ | Site crawling | No | ✅ Free (Camoufox + Playwright) |
60
61
  | Cached repeats | No | <1ms |
61
62
 
62
63
  **Tavily** is fast and simple — great for agents that need quick answers. **Spectrawl** returns richer data and does more (browse, auth, post) — but it's slower. Choose based on your use case.
@@ -109,6 +110,41 @@ console.log(page.screenshot) // PNG buffer (if requested)
109
110
 
110
111
  Auto-fallback: if Jina and readability return too little content (<200 chars), Spectrawl renders the page with Playwright and extracts from the rendered DOM. Tavily can't do this — they fail on JS-heavy pages.
111
112
 
113
+ ## Crawl
114
+
115
+ Give your agent the ability to read an entire website in one call. Free, no API costs.
116
+
117
+ Uses Spectrawl's own browse engine (Camoufox) with stealth support for JS-heavy sites — no external crawl service required.
118
+
119
+ ```js
120
+ // Crawl a docs site — returns clean markdown for every page
121
+ const result = await web.crawl('https://docs.example.com', {
122
+ depth: 2, // how many levels deep (default: 2)
123
+ maxPages: 50, // max pages to crawl (default: 50)
124
+ format: 'markdown', // markdown | html | json
125
+ delay: 300, // ms between requests (be polite)
126
+ stealth: false, // use Camoufox for anti-detect
127
+ auth: 'account' // use stored cookies (crawl behind logins)
128
+ })
129
+
130
+ result.pages // [{ url, title, content, links, depth }]
131
+ result.stats // { total, crawled, failed, duration }
132
+ ```
133
+
134
+ **vs Cloudflare's /crawl:**
135
+ - ✅ Free (self-hosted, no per-request cost)
136
+ - ✅ Crawls sites that block Cloudflare IPs
137
+ - ✅ Auth-aware — crawl behind login walls with stored cookies
138
+ - ✅ Stealth mode — bypasses bot detection
139
+ - ✅ Works for AI agents (50-200 pages, not millions)
140
+
141
+ **HTTP API:**
142
+ ```bash
143
+ curl -X POST http://localhost:3900/crawl \
144
+ -H "Content-Type: application/json" \
145
+ -d '{ "url": "https://docs.example.com", "depth": 2, "maxPages": 50 }'
146
+ ```
147
+
112
148
  ## Auth
113
149
 
114
150
  Persistent cookie storage (SQLite), multi-account management, automatic expiry detection.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.22",
3
+ "version": "0.4.1",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/crawl.js ADDED
@@ -0,0 +1,311 @@
1
+ /**
2
+ * Spectrawl Crawl Engine v2
3
+ * Multi-page website crawler using our own browse engine (Camoufox).
4
+ * No external dependencies (no Jina, no Cloudflare).
5
+ * Supports sync + async (job-based) modes.
6
+ */
7
+
8
+ const crypto = require('crypto')
9
+
10
// Crawl defaults. Per-call opts override these; `undefined` values in opts
// are filtered out before merging, so they never clobber a default.
const DEFAULT_OPTS = {
  depth: 2,            // link-follow depth from the start URL
  maxPages: 50,        // hard cap on pages actually crawled
  format: 'markdown',  // markdown | html | json
  delay: 500,          // politeness delay between requests (ms)
  stealth: true,       // stealth browsing enabled by default
  scope: 'domain',     // domain | prefix | any
  timeout: 30000,      // per-page fetch timeout (ms)
  includeLinks: true,  // attach extracted links to each page record
  includePatterns: [], // wildcard allow-list (empty = allow everything)
  excludePatterns: [], // wildcard deny-list (takes priority over includes)
  merge: false,        // also emit all pages merged into one document
  skipPatterns: [
    // Static assets and binary downloads.
    /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
    // Framework/CDN asset paths.
    /\/_next\//,
    /\/static\//,
    /\/assets\//,
    /mintcdn\.com/,
    // Fragment links and non-navigable schemes.
    /#/,
    /^mailto:/,
    /^tel:/,
    /^javascript:/,
  ]
}

// In-memory store of async crawl jobs, keyed by job ID.
// Lives for the lifetime of the process.
const jobs = new Map()
37
+
38
class CrawlEngine {
  /**
   * @param {object} browseEngine - engine exposing browse(url, opts); each call
   *   must resolve to { content, title?, html? } for the fetched page
   * @param {object} cache - shared cache instance (stored; not used by crawl itself)
   */
  constructor(browseEngine, cache) {
    this.browseEngine = browseEngine
    this.cache = cache
  }

  /**
   * Crawl a website breadth-first starting from a URL (synchronous — waits
   * for completion).
   *
   * @param {string} startUrl - absolute start URL (throws if unparseable)
   * @param {object} opts - partial overrides of DEFAULT_OPTS
   * @param {*} cookies - opaque cookie payload forwarded to the browse engine
   * @returns {Promise<{startUrl: string, pages: object[], stats: object,
   *   failed?: object[], merged?: string}>}
   */
  async crawl(startUrl, opts = {}, cookies = null) {
    // Filter out undefined values from opts to avoid overriding defaults.
    const cleanOpts = Object.fromEntries(
      Object.entries(opts).filter(([, v]) => v !== undefined)
    )
    const config = { ...DEFAULT_OPTS, ...cleanOpts }
    const startTime = Date.now()

    const startParsed = new URL(startUrl) // fail fast on an invalid start URL
    const baseDomain = startParsed.hostname
    const basePrefix = startUrl.replace(/\/$/, '')

    const visited = new Set()  // normalized URLs already dequeued
    // Normalized URLs ever queued. Prevents the same URL being pushed onto
    // the queue many times (previously only deduped at dequeue, so the queue
    // could balloon with duplicates on link-heavy sites).
    const enqueued = new Set([normalizeUrl(startUrl)])
    const queue = [{ url: startUrl, depth: 0 }]
    const pages = []
    const failed = []

    while (queue.length > 0 && pages.length < config.maxPages) {
      const { url, depth } = queue.shift()
      const normalized = normalizeUrl(url)
      if (visited.has(normalized)) continue
      visited.add(normalized)

      // Filters run on the raw (un-normalized) URL, i.e. the link as found.
      if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
      if (config.skipPatterns.some(p => p.test(url))) continue
      if (!this._matchesFilters(url, config.includePatterns, config.excludePatterns)) continue

      try {
        const page = await this._fetchPage(url, config, cookies)
        if (!page) { failed.push({ url, error: 'empty' }); continue }

        const links = page.links || []
        pages.push({
          url,
          title: page.title || '',
          content: page.content || '',
          links: config.includeLinks ? links : undefined,
          depth
        })

        // Enqueue child links one level deeper, skipping anything already
        // visited or already queued (Set lookups, O(1) each).
        if (depth < config.depth) {
          for (const link of links) {
            const absLink = resolveUrl(link, url)
            if (!absLink) continue
            const normLink = normalizeUrl(absLink)
            if (!visited.has(normLink) && !enqueued.has(normLink)) {
              enqueued.add(normLink)
              queue.push({ url: absLink, depth: depth + 1 })
            }
          }
        }

        // Politeness delay between requests (skipped after the last page).
        if (queue.length > 0 && config.delay > 0) {
          await sleep(config.delay)
        }
      } catch (e) {
        failed.push({ url, error: e.message })
      }
    }

    const result = {
      startUrl,
      pages,
      stats: {
        total: visited.size,   // every dequeued URL, including filtered ones
        crawled: pages.length,
        failed: failed.length,
        duration: Date.now() - startTime
      },
      failed: failed.length > 0 ? failed : undefined
    }

    // Merge mode: combine all pages into a single markdown document.
    if (config.merge) {
      result.merged = pages.map(p => {
        return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
      }).join('\n\n---\n\n')
    }

    return result
  }

  /**
   * Start an async crawl job. Returns { jobId, status } immediately while
   * the crawl runs in the background and mutates the job record on settle.
   * NOTE(review): completed jobs stay in the module-level `jobs` map for the
   * life of the process — consider TTL eviction for long-running servers.
   */
  startJob(startUrl, opts = {}, cookies = null) {
    const jobId = crypto.randomUUID()
    const job = {
      id: jobId,
      startUrl,
      status: 'running',
      started: Date.now(),
      finished: 0,
      total: 0,
      pages: [],
      failed: [],
      error: null
    }
    jobs.set(jobId, job)

    // Fire-and-forget: the rejection handler below keeps this from becoming
    // an unhandled promise rejection.
    this.crawl(startUrl, opts, cookies)
      .then(result => {
        job.status = 'completed'
        job.pages = result.pages
        job.failed = result.failed || []
        job.finished = result.stats.crawled
        job.total = result.stats.total
        job.duration = result.stats.duration
      })
      .catch(err => {
        job.status = 'errored'
        job.error = err.message
      })

    return { jobId, status: 'running' }
  }

  /**
   * Get job status/results. Pages and failures are only included once the
   * job has completed; `duration` is undefined while still running.
   * @param {string} jobId
   * @returns {object|null} null when the job ID is unknown
   */
  getJob(jobId) {
    const job = jobs.get(jobId)
    if (!job) return null
    return {
      id: job.id,
      startUrl: job.startUrl,
      status: job.status,
      started: job.started,
      finished: job.finished,
      total: job.total,
      pageCount: job.pages.length,
      error: job.error,
      // Only include pages if completed
      pages: job.status === 'completed' ? job.pages : undefined,
      failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
      duration: job.duration
    }
  }

  /**
   * List a summary (no page bodies) of all jobs in the store.
   */
  listJobs() {
    return Array.from(jobs.values()).map(j => ({
      id: j.id,
      startUrl: j.startUrl,
      status: j.status,
      pageCount: j.pages.length,
      started: j.started
    }))
  }

  /**
   * Fetch a single page through the browse engine (no external services).
   * Requests raw HTML for link extraction and bypasses the cache so crawls
   * always see fresh content.
   * @returns {Promise<{title, content, links}|null>} null when the engine
   *   returned no content
   * @throws {Error} wrapping any browse-engine failure with the URL
   */
  async _fetchPage(url, config, cookies) {
    try {
      const result = await this.browseEngine.browse(url, {
        stealth: config.stealth,
        _cookies: cookies,
        timeout: config.timeout,
        html: true,    // request raw HTML for link extraction
        noCache: true  // always fetch fresh for crawling
      })
      if (result?.content) {
        // Prefer HTML for link extraction; fall back to markdown content.
        const linkSource = result.html || result.content
        return {
          title: result.title || '',
          content: result.content,
          links: extractLinks(linkSource, url)
        }
      }
    } catch (e) {
      throw new Error(`Failed to fetch ${url}: ${e.message}`)
    }

    return null
  }

  /**
   * Scope check. 'domain' allows the start hostname and its subdomains;
   * 'prefix' requires the URL to extend the start URL; 'any' allows all.
   * Unparseable URLs are always out of scope.
   */
  _inScope(url, baseDomain, basePrefix, scope) {
    try {
      const parsed = new URL(url)
      if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
      if (scope === 'prefix') return url.startsWith(basePrefix)
      return true // 'any'
    } catch {
      return false
    }
  }

  /**
   * Apply wildcard include/exclude filters. Exclude patterns take priority;
   * if include patterns exist the URL must match at least one of them.
   */
  _matchesFilters(url, includePatterns, excludePatterns) {
    if (excludePatterns && excludePatterns.length > 0) {
      for (const pattern of excludePatterns) {
        if (wildcardMatch(url, pattern)) return false
      }
    }
    if (includePatterns && includePatterns.length > 0) {
      return includePatterns.some(pattern => wildcardMatch(url, pattern))
    }
    return true
  }
}
254
+
255
/**
 * Wildcard matching: `*` matches any run of characters except `/`,
 * `**` matches everything including `/`. The entire string must match.
 *
 * Fix: `?` is now escaped along with the other regex metacharacters.
 * Previously a pattern like `https://x.com/page?id=*` compiled `?` as an
 * optional quantifier, so it also matched `https://x.com/pagid=…`-style
 * strings — a real problem since URL filters commonly contain query strings.
 *
 * @param {string} str - string to test (typically a URL)
 * @param {string} pattern - wildcard pattern
 * @returns {boolean}
 */
function wildcardMatch(str, pattern) {
  const regex = pattern
    .replace(/[.+?^${}()|[\]\\]/g, '\\$&') // escape regex chars (incl. `?`)
    .replace(/\*\*/g, '{{GLOBSTAR}}')
    .replace(/\*/g, '[^/]*')
    .replace(/\{\{GLOBSTAR\}\}/g, '.*')
  return new RegExp('^' + regex + '$').test(str)
}
266
+
267
/**
 * Extract absolute link URLs from page content.
 * Scans HTML `href="…"` attributes (resolving relative hrefs against
 * `baseUrl`) and absolute markdown links. Order is preserved and duplicates
 * are removed via a Set — the previous `Array.includes` per link was
 * accidental O(n²) on link-heavy pages.
 *
 * @param {string} content - HTML or markdown
 * @param {string} baseUrl - base URL for resolving relative hrefs
 * @returns {string[]} absolute URLs, first-seen order, no duplicates
 */
function extractLinks(content, baseUrl) {
  const seen = new Set()
  const links = []
  const add = (u) => {
    if (u && !seen.has(u)) {
      seen.add(u)
      links.push(u)
    }
  }
  // Extract from href attributes (HTML).
  for (const m of content.matchAll(/href=["']([^"']+)["']/gi)) {
    add(resolveUrl(m[1], baseUrl))
  }
  // Extract from markdown links (absolute http(s) targets only).
  for (const m of content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)) {
    add(m[2])
  }
  return links
}

/**
 * Resolve `url` against `base`. Absolute http(s) URLs pass through as-is;
 * anything unparseable yields null.
 * @param {string} url
 * @param {string} base
 * @returns {string|null}
 */
function resolveUrl(url, base) {
  try {
    if (url.startsWith('http')) return url
    return new URL(url, base).href
  } catch {
    return null
  }
}
291
+
292
/**
 * Canonical form of a URL for visited-set bookkeeping: drops the fragment
 * and a trailing slash (except on the root path). Input that cannot be
 * parsed as a URL is returned untouched.
 * @param {string} url
 * @returns {string}
 */
function normalizeUrl(url) {
  try {
    const parsed = new URL(url)
    parsed.hash = ''
    const { href, pathname } = parsed
    const stripSlash = href.endsWith('/') && pathname !== '/'
    return stripSlash ? href.slice(0, -1) : href
  } catch {
    return url
  }
}
306
+
307
/** Resolve after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
310
+
311
+ module.exports = { CrawlEngine }
package/src/index.js CHANGED
@@ -7,6 +7,7 @@ const { SearchEngine } = require('./search')
7
7
  const { BrowseEngine } = require('./browse')
8
8
  const { AuthManager } = require('./auth')
9
9
  const { ActEngine } = require('./act')
10
+ const { CrawlEngine } = require('./crawl')
10
11
  const { Cache } = require('./cache')
11
12
  const { EventEmitter, EVENTS } = require('./events')
12
13
  const { CookieRefresher } = require('./auth/refresh')
@@ -36,6 +37,7 @@ class Spectrawl {
36
37
  this.browseEngine = new BrowseEngine(this.config.browse, this.cache)
37
38
  this.auth = new AuthManager(this.config.auth)
38
39
  this.actEngine = new ActEngine(this.config, this.auth, this.browseEngine)
40
+ this.crawlEngine = new CrawlEngine(this.browseEngine, this.cache)
39
41
  this.refresher = new CookieRefresher(this.auth, this.events, this.config.auth)
40
42
  }
41
43
 
@@ -75,6 +77,42 @@ class Spectrawl {
75
77
  return this.browseEngine.browse(url, opts)
76
78
  }
77
79
 
80
+ /**
81
+ * Crawl a website recursively. Returns clean markdown for every page.
82
+ * Uses Jina Reader (free) with Playwright stealth fallback.
83
+ * @param {string} url - Starting URL
84
+ * @param {object} opts - { depth, maxPages, format, delay, stealth, scope, auth }
85
+ * @returns {Promise<{pages[], stats, failed?}>}
86
+ */
87
+ async crawl(url, opts = {}) {
88
+ let cookies = null
89
+ if (opts.auth) {
90
+ cookies = await this.auth.getCookies(opts.auth)
91
+ }
92
+ return this.crawlEngine.crawl(url, opts, cookies)
93
+ }
94
+
95
+ /**
96
+ * Start an async crawl job. Returns job ID immediately.
97
+ */
98
+ startCrawlJob(url, opts = {}) {
99
+ return this.crawlEngine.startJob(url, opts)
100
+ }
101
+
102
+ /**
103
+ * Get crawl job status/results.
104
+ */
105
+ getCrawlJob(jobId) {
106
+ return this.crawlEngine.getJob(jobId)
107
+ }
108
+
109
+ /**
110
+ * List all crawl jobs.
111
+ */
112
+ listCrawlJobs() {
113
+ return this.crawlEngine.listJobs()
114
+ }
115
+
78
116
  /**
79
117
  * Perform an authenticated action on a platform.
80
118
  * @param {string} platform - Platform name (x, reddit, devto, etc.)
package/src/server.js CHANGED
@@ -52,6 +52,37 @@ const server = http.createServer(async (req, res) => {
52
52
  return json(res, result)
53
53
  }
54
54
 
55
+ if (req.method === 'POST' && path === '/crawl') {
56
+ const body = await readBody(req)
57
+ const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
58
+ includePatterns, excludePatterns, merge, async: asyncMode } = body
59
+ if (!targetUrl) return error(res, 400, 'url is required')
60
+
61
+ const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge }
62
+
63
+ if (asyncMode) {
64
+ // Async mode: return job ID immediately
65
+ const job = spectrawl.startCrawlJob(targetUrl, opts)
66
+ return json(res, job)
67
+ }
68
+
69
+ const result = await spectrawl.crawl(targetUrl, opts)
70
+ return json(res, result)
71
+ }
72
+
73
+ if (req.method === 'GET' && path.startsWith('/crawl/')) {
74
+ const jobId = path.split('/crawl/')[1]
75
+ if (!jobId) return error(res, 400, 'job ID is required')
76
+ const job = spectrawl.getCrawlJob(jobId)
77
+ if (!job) return error(res, 404, 'job not found')
78
+ return json(res, job)
79
+ }
80
+
81
+ if (req.method === 'GET' && path === '/crawl/jobs') {
82
+ const jobList = spectrawl.listCrawlJobs()
83
+ return json(res, { jobs: jobList })
84
+ }
85
+
55
86
  if (req.method === 'POST' && path === '/act') {
56
87
  const body = await readBody(req)
57
88
  const { platform, action, ...params } = body