spectrawl 0.3.22 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -1
- package/package.json +1 -1
- package/src/crawl.js +249 -0
- package/src/index.js +17 -0
- package/src/server.js +8 -0
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
The unified web layer for AI agents. Search, browse, authenticate, and act on platforms — one package, self-hosted.
|
|
4
4
|
|
|
5
|
-
**5,000 free searches/month** via Gemini Grounded Search. Full
|
|
5
|
+
**5,000 free searches/month** via Gemini Grounded Search. Full site crawling, stealth browsing, 19 platform adapters.
|
|
6
6
|
|
|
7
7
|
## What It Does
|
|
8
8
|
|
|
@@ -57,6 +57,7 @@ Different tools for different needs.
|
|
|
57
57
|
| Stealth browsing | No | Yes (Camoufox + Playwright) |
|
|
58
58
|
| Platform posting | No | 19 adapters |
|
|
59
59
|
| Auth management | No | Cookie store + auto-refresh |
|
|
60
|
+
| Site crawling | No | ✅ Free (Jina + Playwright) |
|
|
60
61
|
| Cached repeats | No | <1ms |
|
|
61
62
|
|
|
62
63
|
**Tavily** is fast and simple — great for agents that need quick answers. **Spectrawl** returns richer data and does more (browse, auth, post) — but it's slower. Choose based on your use case.
|
|
@@ -109,6 +110,41 @@ console.log(page.screenshot) // PNG buffer (if requested)
|
|
|
109
110
|
|
|
110
111
|
Auto-fallback: if Jina and readability return too little content (<200 chars), Spectrawl renders the page with Playwright and extracts from the rendered DOM. Tavily can't do this — it fails on JS-heavy pages.
|
|
111
112
|
|
|
113
|
+
## Crawl
|
|
114
|
+
|
|
115
|
+
Give your agent the ability to read an entire website in one call. Free, no API costs.
|
|
116
|
+
|
|
117
|
+
Uses [Jina Reader](https://jina.ai/reader) (free, unlimited) with Playwright stealth fallback for JS-heavy sites.
|
|
118
|
+
|
|
119
|
+
```js
|
|
120
|
+
// Crawl a docs site — returns clean markdown for every page
|
|
121
|
+
const result = await web.crawl('https://docs.example.com', {
|
|
122
|
+
depth: 2, // how many levels deep (default: 1)
|
|
123
|
+
maxPages: 50, // max pages to crawl (default: 50)
|
|
124
|
+
format: 'markdown', // markdown | html | json
|
|
125
|
+
delay: 300, // ms between requests (be polite)
|
|
126
|
+
stealth: false, // use Camoufox for anti-detect
|
|
127
|
+
auth: 'account' // use stored cookies (crawl behind logins)
|
|
128
|
+
})
|
|
129
|
+
|
|
130
|
+
result.pages // [{ url, title, content, links, depth }]
|
|
131
|
+
result.stats // { total, crawled, failed, duration }
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
**vs Cloudflare's /crawl:**
|
|
135
|
+
- ✅ Free (self-hosted, no per-request cost)
|
|
136
|
+
- ✅ Crawls sites that block Cloudflare IPs
|
|
137
|
+
- ✅ Auth-aware — crawl behind login walls with stored cookies
|
|
138
|
+
- ✅ Stealth mode — bypasses bot detection
|
|
139
|
+
- ✅ Works for AI agents (50-200 pages, not millions)
|
|
140
|
+
|
|
141
|
+
**HTTP API:**
|
|
142
|
+
```bash
|
|
143
|
+
curl -X POST http://localhost:3900/crawl \
|
|
144
|
+
-H "Content-Type: application/json" \
|
|
145
|
+
-d '{ "url": "https://docs.example.com", "depth": 2, "maxPages": 50 }'
|
|
146
|
+
```
|
|
147
|
+
|
|
112
148
|
## Auth
|
|
113
149
|
|
|
114
150
|
Persistent cookie storage (SQLite), multi-account management, automatic expiry detection.
|
package/package.json
CHANGED
package/src/crawl.js
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Spectrawl Crawl Engine
|
|
3
|
+
* Recursively crawls a website using Jina Reader (free) with Playwright fallback.
|
|
4
|
+
* Designed for AI agents: returns clean markdown, not raw HTML.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
const https = require('https')
|
|
8
|
+
const http = require('http')
|
|
9
|
+
|
|
10
|
+
// Baseline crawl configuration; per-call opts are shallow-merged over
// these in CrawlEngine.crawl() (caller keys win).
const DEFAULT_OPTS = {
  depth: 1, // how many link levels to follow below the start URL
  maxPages: 50, // hard cap on pages collected per crawl
  format: 'markdown', // markdown | html | json
  delay: 300, // ms between requests
  stealth: false, // forwarded to the browse fallback for anti-detect rendering
  scope: 'domain', // domain | prefix | any
  timeout: 15000, // per-page timeout (ms) for the browse fallback
  includeLinks: true, // attach each page's outgoing links to its result entry
  // URLs matching any of these patterns are never fetched.
  skipPatterns: [
    // static/binary assets — nothing crawlable in them
    /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css)$/i,
    /#/, // fragment-only / in-page anchor links
    /^mailto:/,
    /^tel:/,
    /^javascript:/,
  ]
}
|
|
27
|
+
|
|
28
|
+
/**
 * Breadth-first site crawler.
 *
 * Per-page fetch strategy (see _fetchPage): Jina Reader first (free,
 * returns clean markdown), then the injected browse engine (Playwright)
 * as a fallback for JS-heavy pages.
 */
class CrawlEngine {
  /**
   * @param {object} browseEngine - Engine exposing browse(url, opts); used as the rendering fallback.
   * @param {object} cache - Shared cache instance (not used directly here; kept for parity with sibling engines).
   */
  constructor(browseEngine, cache) {
    this.browseEngine = browseEngine
    this.cache = cache
  }

  /**
   * Crawl a website starting from a URL, breadth-first, bounded by
   * config.depth and config.maxPages.
   * @param {string} startUrl - Starting URL
   * @param {object} opts - Crawl options (merged over DEFAULT_OPTS)
   * @param {object} cookies - Optional auth cookies forwarded to the browse fallback
   * @returns {Promise<{startUrl: string, pages: Array, stats: object, failed?: Array}>}
   */
  async crawl(startUrl, opts = {}, cookies = null) {
    const config = { ...DEFAULT_OPTS, ...opts }
    const startTime = Date.now()

    const startParsed = new URL(startUrl)
    const baseDomain = startParsed.hostname
    const basePrefix = startUrl.replace(/\/$/, '')

    const visited = new Set()
    const queue = [{ url: startUrl, depth: 0 }]
    const pages = []
    const failed = []

    while (queue.length > 0 && pages.length < config.maxPages) {
      const { url, depth } = queue.shift()
      const normalized = normalizeUrl(url)
      if (visited.has(normalized)) continue
      visited.add(normalized)

      // Scope check
      if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
      // Skip-pattern check runs against the NORMALIZED URL (fragment
      // stripped). Testing the raw URL made the default /#/ pattern
      // discard every link that carried an in-page anchor, silently
      // dropping real pages like /docs/page#section; it also let
      // asset links like /img.png#x slip past the extension pattern.
      if (config.skipPatterns.some(p => p.test(normalized))) continue

      try {
        const page = await this._fetchPage(url, config, cookies)
        if (!page) { failed.push({ url, error: 'empty' }); continue }

        const links = page.links || []
        pages.push({
          url,
          title: page.title || '',
          content: page.content || '',
          links: config.includeLinks ? links : undefined,
          depth
        })

        // Enqueue child links for the next level
        if (depth < config.depth) {
          for (const link of links) {
            const absLink = resolveUrl(link, url)
            if (!absLink) continue
            const normLink = normalizeUrl(absLink)
            if (!visited.has(normLink)) {
              queue.push({ url: absLink, depth: depth + 1 })
            }
          }
        }

        // Politeness delay between fetches (skipped after the last page)
        if (queue.length > 0 && config.delay > 0) {
          await sleep(config.delay)
        }
      } catch (e) {
        failed.push({ url, error: e.message })
      }
    }

    return {
      startUrl,
      pages,
      stats: {
        total: visited.size,
        crawled: pages.length,
        failed: failed.length,
        duration: Date.now() - startTime
      },
      failed: failed.length > 0 ? failed : undefined
    }
  }

  /**
   * Fetch a single page: Jina Reader first, browse-engine fallback.
   * Returns { title, content, links } or null when both paths yield nothing.
   */
  async _fetchPage(url, config, cookies) {
    // Try Jina Reader first (free, fast, clean markdown)
    try {
      const jinaUrl = `https://r.jina.ai/${url}`
      const content = await fetchText(jinaUrl, {
        'Accept': 'text/markdown',
        'X-Return-Format': config.format === 'html' ? 'html' : 'markdown',
        'X-With-Links-Summary': 'true',
        'X-Timeout': '10'
      })

      // <100 chars usually means a bot wall or empty shell — fall through
      if (content && content.length > 100) {
        return parseJinaResponse(content, url)
      }
    } catch (e) {
      // fall through to Playwright
    }

    // Playwright fallback (stealth mode)
    try {
      const result = await this.browseEngine.browse(url, {
        stealth: config.stealth,
        _cookies: cookies,
        timeout: config.timeout
      })
      if (result?.content) {
        return {
          title: result.title || '',
          content: result.content,
          links: extractLinks(result.html || result.content, url)
        }
      }
    } catch (e) {
      // Keep the underlying failure attached for debugging
      throw new Error(`Failed to fetch ${url}: ${e.message}`, { cause: e })
    }

    return null
  }

  /**
   * Whether url falls inside the crawl scope.
   * 'domain' — same hostname as the start URL;
   * 'prefix' — string-prefix match on the start URL (sans trailing slash);
   * 'any'    — everything. Unparseable URLs are always out of scope.
   */
  _inScope(url, baseDomain, basePrefix, scope) {
    try {
      const parsed = new URL(url)
      if (scope === 'domain') return parsed.hostname === baseDomain
      if (scope === 'prefix') return url.startsWith(basePrefix)
      return true // 'any'
    } catch {
      return false
    }
  }
}
|
|
160
|
+
|
|
161
|
+
/**
 * Parse a Jina Reader response (markdown with a metadata header and an
 * optional links summary) into { title, content, links }.
 * Links preserve first-seen order: summary links first, then inline
 * markdown links found anywhere in the response.
 * (sourceUrl is kept for interface parity; not consulted here.)
 */
function parseJinaResponse(content, sourceUrl) {
  let pageTitle = ''
  let inLinksSection = false
  const bodyLines = []
  const linkSet = new Set()

  for (const raw of content.split('\n')) {
    if (raw.startsWith('Title:')) {
      pageTitle = raw.replace('Title:', '').trim()
      continue
    }
    if (raw.startsWith('Links/Buttons:') || raw.includes('## Links')) {
      inLinksSection = true
      continue
    }
    if (inLinksSection) {
      // Collect [text](url) targets from the links summary
      for (const m of raw.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)) {
        linkSet.add(m[2])
      }
    } else {
      bodyLines.push(raw)
    }
  }

  // Sweep the whole response for inline markdown links as well
  for (const m of content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)) {
    linkSet.add(m[2])
  }

  const body = bodyLines.join('\n')
  return {
    title: pageTitle || extractTitleFromMarkdown(body),
    content: body.trim(),
    links: [...linkSet]
  }
}
|
|
195
|
+
|
|
196
|
+
/**
 * Extract absolute link targets from href attributes in an HTML string.
 * Relative hrefs are resolved against baseUrl; unresolvable ones are
 * dropped. Returns unique links in first-seen order.
 */
function extractLinks(html, baseUrl) {
  // Set gives O(1) dedupe; the previous Array.includes scan inside the
  // match loop was O(n²) on link-dense pages.
  const seen = new Set()
  for (const m of html.matchAll(/href=["']([^"']+)["']/gi)) {
    const resolved = resolveUrl(m[1], baseUrl)
    if (resolved) seen.add(resolved)
  }
  return [...seen]
}
|
|
205
|
+
|
|
206
|
+
/**
 * Pull the first ATX h1 ("# Heading") out of a markdown string.
 * Returns '' when no h1 is present.
 */
function extractTitleFromMarkdown(content) {
  const h1 = /^#\s+(.+)/m.exec(content)
  if (h1 === null) return ''
  return h1[1].trim()
}
|
|
210
|
+
|
|
211
|
+
/**
 * Resolve a possibly-relative URL against a base.
 * Absolute http(s) URLs pass through unchanged; anything else is resolved
 * with the WHATWG URL parser. Returns null for unresolvable input.
 */
function resolveUrl(url, base) {
  try {
    // Match a real scheme, not any string that merely starts with "http":
    // the old startsWith('http') check passed relative paths such as
    // "httpdocs/x" through unresolved, producing broken queue entries.
    if (/^https?:\/\//i.test(url)) return url
    return new URL(url, base).href
  } catch {
    return null
  }
}
|
|
219
|
+
|
|
220
|
+
/**
 * Canonicalize a URL for visited-set comparison: drop the fragment and
 * any trailing slash. Unparseable input is returned untouched.
 */
function normalizeUrl(url) {
  try {
    const parsed = new URL(url)
    parsed.hash = ''
    const href = parsed.href
    return href.endsWith('/') ? href.slice(0, -1) : href
  } catch {
    return url
  }
}
|
|
229
|
+
|
|
230
|
+
/**
 * GET a URL and resolve with its body as a UTF-8 string.
 * Follows up to 5 redirects (3xx previously resolved with the — usually
 * empty — redirect body); rejects on HTTP >= 400, socket errors, or
 * timeout.
 * @param {string} url
 * @param {object} headers - Extra request headers (merged over the default UA)
 * @param {number} timeoutMs - Per-request timeout in ms (default 15000, matching the old hard-coded value)
 * @param {number} redirectsLeft - Internal: remaining redirect hops
 */
function fetchText(url, headers = {}, timeoutMs = 15000, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    const mod = url.startsWith('https') ? https : http
    const req = mod.request(url, { headers: { 'User-Agent': 'Spectrawl/1.0', ...headers } }, res => {
      if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        res.resume() // drain so the socket is released
        if (redirectsLeft <= 0) { reject(new Error('too many redirects')); return }
        try {
          const next = new URL(res.headers.location, url).href
          resolve(fetchText(next, headers, timeoutMs, redirectsLeft - 1))
        } catch {
          reject(new Error(`bad redirect: ${res.headers.location}`))
        }
        return
      }
      if (res.statusCode >= 400) {
        res.resume() // drain unread data so the connection can close
        reject(new Error(`HTTP ${res.statusCode}`))
        return
      }
      res.setEncoding('utf8')
      let body = ''
      res.on('data', chunk => { body += chunk })
      res.on('end', () => resolve(body))
    })
    req.setTimeout(timeoutMs, () => { req.destroy(); reject(new Error('timeout')) })
    req.on('error', reject)
    req.end()
  })
}
|
|
244
|
+
|
|
245
|
+
/**
 * Promise-based delay helper.
 * @param {number} ms - Milliseconds to wait before the promise resolves.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms)
  })
}
|
|
248
|
+
|
|
249
|
+
// Public API: only the engine class; the helper functions above stay module-private.
module.exports = { CrawlEngine }
|
package/src/index.js
CHANGED
|
@@ -7,6 +7,7 @@ const { SearchEngine } = require('./search')
|
|
|
7
7
|
const { BrowseEngine } = require('./browse')
|
|
8
8
|
const { AuthManager } = require('./auth')
|
|
9
9
|
const { ActEngine } = require('./act')
|
|
10
|
+
const { CrawlEngine } = require('./crawl')
|
|
10
11
|
const { Cache } = require('./cache')
|
|
11
12
|
const { EventEmitter, EVENTS } = require('./events')
|
|
12
13
|
const { CookieRefresher } = require('./auth/refresh')
|
|
@@ -36,6 +37,7 @@ class Spectrawl {
|
|
|
36
37
|
this.browseEngine = new BrowseEngine(this.config.browse, this.cache)
|
|
37
38
|
this.auth = new AuthManager(this.config.auth)
|
|
38
39
|
this.actEngine = new ActEngine(this.config, this.auth, this.browseEngine)
|
|
40
|
+
this.crawlEngine = new CrawlEngine(this.browseEngine, this.cache)
|
|
39
41
|
this.refresher = new CookieRefresher(this.auth, this.events, this.config.auth)
|
|
40
42
|
}
|
|
41
43
|
|
|
@@ -75,6 +77,21 @@ class Spectrawl {
|
|
|
75
77
|
return this.browseEngine.browse(url, opts)
|
|
76
78
|
}
|
|
77
79
|
|
|
80
|
+
/**
|
|
81
|
+
* Crawl a website recursively. Returns clean markdown for every page.
|
|
82
|
+
* Uses Jina Reader (free) with Playwright stealth fallback.
|
|
83
|
+
* @param {string} url - Starting URL
|
|
84
|
+
* @param {object} opts - { depth, maxPages, format, delay, stealth, scope, auth }
|
|
85
|
+
* @returns {Promise<{pages[], stats, failed?}>}
|
|
86
|
+
*/
|
|
87
|
+
async crawl(url, opts = {}) {
|
|
88
|
+
let cookies = null
|
|
89
|
+
if (opts.auth) {
|
|
90
|
+
cookies = await this.auth.getCookies(opts.auth)
|
|
91
|
+
}
|
|
92
|
+
return this.crawlEngine.crawl(url, opts, cookies)
|
|
93
|
+
}
|
|
94
|
+
|
|
78
95
|
/**
|
|
79
96
|
* Perform an authenticated action on a platform.
|
|
80
97
|
* @param {string} platform - Platform name (x, reddit, devto, etc.)
|
package/src/server.js
CHANGED
|
@@ -52,6 +52,14 @@ const server = http.createServer(async (req, res) => {
|
|
|
52
52
|
return json(res, result)
|
|
53
53
|
}
|
|
54
54
|
|
|
55
|
+
  // POST /crawl — recursively crawl a site.
  // Body: { url (required), depth, maxPages, format, delay, stealth, scope, auth }.
  // Only known option keys are forwarded, so extra body fields are ignored.
  if (req.method === 'POST' && path === '/crawl') {
    const body = await readBody(req)
    const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth } = body
    if (!targetUrl) return error(res, 400, 'url is required')
    const result = await spectrawl.crawl(targetUrl, { depth, maxPages, format, delay, stealth, scope, auth })
    return json(res, result)
  }
|
|
62
|
+
|
|
55
63
|
if (req.method === 'POST' && path === '/act') {
|
|
56
64
|
const body = await readBody(req)
|
|
57
65
|
const { platform, action, ...params } = body
|