npm - spectrawl - Versions diffs - 0.4.0 → 0.4.1 - Mend

spectrawl 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "spectrawl",
-  "version": "0.4.0",
+  "version": "0.4.1",
   "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
   "main": "src/index.js",
   "types": "index.d.ts",

package/src/crawl.js CHANGED Viewed

@@ -1,23 +1,30 @@
 /**
- * Spectrawl Crawl Engine
- * Recursively crawls a website using Jina Reader (free) with Playwright fallback.
- * Designed for AI agents: returns clean markdown, not raw HTML.
+ * Spectrawl Crawl Engine v2
+ * Multi-page website crawler using our own browse engine (Camoufox).
+ * No external dependencies (no Jina, no Cloudflare).
+ * Supports sync + async (job-based) modes.
  */
-const https = require('https')
-const http = require('http')
+const crypto = require('crypto')
 const DEFAULT_OPTS = {
-  depth: 1,
+  depth: 2,
   maxPages: 50,
   format: 'markdown',   // markdown | html | json
-  delay: 300,           // ms between requests
-  stealth: false,
+  delay: 500,           // ms between requests
+  stealth: true,        // use stealth browsing by default
   scope: 'domain',      // domain | prefix | any
-  timeout: 15000,
+  timeout: 30000,
   includeLinks: true,
+  includePatterns: [],   // wildcard patterns to include
+  excludePatterns: [],   // wildcard patterns to exclude
+  merge: false,          // merge all pages into single result
   skipPatterns: [
-    /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css)$/i,
+    /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
+    /\/_next\//,
+    /\/static\//,
+    /\/assets\//,
+    /mintcdn\.com/,
     /#/,
     /^mailto:/,
     /^tel:/,
@@ -25,6 +32,9 @@ const DEFAULT_OPTS = {
   ]
 }
+// In-memory job store for async crawls
+const jobs = new Map()
 class CrawlEngine {
   constructor(browseEngine, cache) {
     this.browseEngine = browseEngine
@@ -32,13 +42,14 @@ class CrawlEngine {
   }
   /**
-   * Crawl a website starting from a URL.
-   * @param {string} startUrl - Starting URL
-   * @param {object} opts - Crawl options
-   * @param {object} cookies - Optional auth cookies
+   * Crawl a website starting from a URL. The returned promise resolves only after the whole crawl has finished (blocking mode).
    */
   async crawl(startUrl, opts = {}, cookies = null) {
-    const config = { ...DEFAULT_OPTS, ...opts }
+    // Filter out undefined values from opts to avoid overriding defaults
+    const cleanOpts = Object.fromEntries(
+      Object.entries(opts).filter(([_, v]) => v !== undefined)
+    )
+    const config = { ...DEFAULT_OPTS, ...cleanOpts }
     const startTime = Date.now()
     const startParsed = new URL(startUrl)
@@ -60,6 +71,8 @@ class CrawlEngine {
       if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
       // Skip pattern check
       if (config.skipPatterns.some(p => p.test(url))) continue
+      // Include/exclude pattern check
+      if (!this._matchesFilters(url, config.includePatterns, config.excludePatterns)) continue
       try {
         const page = await this._fetchPage(url, config, cookies)
@@ -94,7 +107,7 @@ class CrawlEngine {
       }
     }
-    return {
+    const result = {
       startUrl,
       pages,
       stats: {
@@ -105,38 +118,105 @@ class CrawlEngine {
       },
       failed: failed.length > 0 ? failed : undefined
     }
+    // Merge mode: combine all pages into single content
+    if (config.merge) {
+      result.merged = pages.map(p => {
+        return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
+      }).join('\n\n---\n\n')
+    }
+    return result
   }
-  async _fetchPage(url, config, cookies) {
-    // Try Jina Reader first (free, fast, clean markdown)
-    try {
-      const jinaUrl = `https://r.jina.ai/${url}`
-      const content = await fetchText(jinaUrl, {
-        'Accept': 'text/markdown',
-        'X-Return-Format': config.format === 'html' ? 'html' : 'markdown',
-        'X-With-Links-Summary': 'true',
-        'X-Timeout': '10'
+  /**
+   * Start an async crawl job. Returns job ID immediately.
+   *
+   * Stores a job record in the module-level `jobs` map, then calls
+   * `this.crawl()` without awaiting it. The record is updated in place when
+   * the crawl promise settles: status `completed` plus pages/stats on
+   * success, status `errored` plus `err.message` on rejection.
+   *
+   * NOTE(review): nothing visible in this diff removes entries from `jobs`,
+   * so finished jobs appear to stay in memory — confirm whether eviction
+   * exists elsewhere.
+   *
+   * @param {string} startUrl - URL the crawl starts from
+   * @param {object} [opts] - crawl options, passed through to `crawl()`
+   * @param {object|null} [cookies] - passed through to `crawl()`
+   * @returns {{jobId: string, status: string}} the new job ID and the initial status `'running'`
+   */
+  startJob(startUrl, opts = {}, cookies = null) {
+    const jobId = crypto.randomUUID()
+    const job = {
+      id: jobId,
+      startUrl,
+      status: 'running',
+      started: Date.now(),
+      finished: 0,
+      total: 0,
+      pages: [],
+      failed: [],
+      error: null
+    }
+    jobs.set(jobId, job)
+    // Run crawl in background
+    this.crawl(startUrl, opts, cookies)
+      .then(result => {
+        job.status = 'completed'
+        job.pages = result.pages
+        job.failed = result.failed || []
+        job.finished = result.stats.crawled
+        job.total = result.stats.total
+        job.duration = result.stats.duration
+      })
+      .catch(err => {
+        job.status = 'errored'
+        job.error = err.message
       })
-      if (content && content.length > 100) {
-        return parseJinaResponse(content, url)
-      }
-    } catch (e) {
-      // fall through to Playwright
+    return { jobId, status: 'running' }
+  }
+  /**
+   * Get job status/results.
+   *
+   * Looks the job up in the module-level `jobs` map and returns a fresh
+   * summary object. `pages` and `failed` are only populated once the job
+   * status is `completed`; `failed` is additionally left undefined when the
+   * list is empty. `duration` is undefined until the crawl has completed.
+   *
+   * @param {string} jobId - ID returned by `startJob()`
+   * @returns {object|null} job summary, or null when no job has this ID
+   */
+  getJob(jobId) {
+    const job = jobs.get(jobId)
+    if (!job) return null
+    return {
+      id: job.id,
+      startUrl: job.startUrl,
+      status: job.status,
+      started: job.started,
+      finished: job.finished,
+      total: job.total,
+      pageCount: job.pages.length,
+      error: job.error,
+      // Only include pages if completed
+      pages: job.status === 'completed' ? job.pages : undefined,
+      failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
+      duration: job.duration
     }
+  }
+  /**
+   * List all jobs.
+   *
+   * Returns one lightweight summary per entry in the module-level `jobs`
+   * map (no page content, only the page count).
+   *
+   * @returns {Array<{id: string, startUrl: string, status: string, pageCount: number, started: number}>}
+   */
+  listJobs() {
+    return Array.from(jobs.values()).map(j => ({
+      id: j.id,
+      startUrl: j.startUrl,
+      status: j.status,
+      pageCount: j.pages.length,
+      started: j.started
+    }))
+  }
-    // Playwright fallback (stealth mode)
+  async _fetchPage(url, config, cookies) {
+    // Use our own browse engine (Camoufox) — no external dependencies
     try {
       const result = await this.browseEngine.browse(url, {
         stealth: config.stealth,
         _cookies: cookies,
-        timeout: config.timeout
+        timeout: config.timeout,
+        html: true,    // request raw HTML for link extraction
+        noCache: true  // always fetch fresh for crawling
       })
       if (result?.content) {
+        // Extract links from HTML if available, otherwise from markdown content
+        const linkSource = result.html || result.content
         return {
           title: result.title || '',
           content: result.content,
-          links: extractLinks(result.html || result.content, url)
+          links: extractLinks(linkSource, url)
         }
       }
     } catch (e) {
@@ -149,65 +229,57 @@ class CrawlEngine {
   _inScope(url, baseDomain, basePrefix, scope) {
+    // Returns true when `url` is allowed under `scope`; URLs that fail to parse are out of scope.
     try {
       const parsed = new URL(url)
-      if (scope === 'domain') return parsed.hostname === baseDomain
+      // 'domain' accepts the base hostname itself and any hostname ending in '.' + baseDomain
+      if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
+      // 'prefix' is a plain string-prefix test on the full URL
       if (scope === 'prefix') return url.startsWith(basePrefix)
       return true // 'any'
     } catch {
       return false
     }
   }
-}
-function parseJinaResponse(content, sourceUrl) {
-  // Jina returns markdown with a header block
-  const lines = content.split('\n')
-  let title = ''
-  const links = []
-  const contentLines = []
-  let inLinksSummary = false
-  for (const line of lines) {
-    if (line.startsWith('Title:')) {
-      title = line.replace('Title:', '').trim()
-    } else if (line.startsWith('Links/Buttons:') || line.includes('## Links')) {
-      inLinksSummary = true
-    } else if (inLinksSummary) {
-      // Extract markdown links [text](url)
-      const matches = line.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
-      for (const m of matches) links.push(m[2])
-    } else {
-      contentLines.push(line)
+  /**
+   * Apply the include/exclude wildcard filters to a URL.
+   *
+   * A URL matching any exclude pattern is rejected. Otherwise, when include
+   * patterns are given, it must match at least one of them; with no include
+   * patterns every non-excluded URL passes. Missing or empty pattern lists
+   * are treated as "no filter".
+   *
+   * @param {string} url - URL to test
+   * @param {string[]} includePatterns - wildcard patterns (see wildcardMatch)
+   * @param {string[]} excludePatterns - wildcard patterns (see wildcardMatch)
+   * @returns {boolean} true when the URL should be crawled
+   */
+  _matchesFilters(url, includePatterns, excludePatterns) {
+    // Exclude takes priority
+    if (excludePatterns && excludePatterns.length > 0) {
+      for (const pattern of excludePatterns) {
+        if (wildcardMatch(url, pattern)) return false
+      }
     }
+    // If include patterns specified, URL must match at least one
+    if (includePatterns && includePatterns.length > 0) {
+      return includePatterns.some(pattern => wildcardMatch(url, pattern))
+    }
+    return true
   }
+}
-  // Also extract inline links from content
-  const inlineMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
-  for (const m of inlineMatches) {
-    if (!links.includes(m[2])) links.push(m[2])
-  }
-  return {
-    title: title || extractTitleFromMarkdown(contentLines.join('\n')),
-    content: contentLines.join('\n').trim(),
-    links: [...new Set(links)]
-  }
+/**
+ * Wildcard matching: * matches anything except /, ** matches everything including /
+ *
+ * The pattern is converted to an anchored regular expression: every regex
+ * metacharacter other than `*` is escaped so it matches literally, then
+ * `**` becomes `.*` and a single `*` becomes `[^/]*`.
+ *
+ * @param {string} str - string to test (typically a full URL)
+ * @param {string} pattern - wildcard pattern
+ * @returns {boolean} true when the whole string matches the pattern
+ */
+function wildcardMatch(str, pattern) {
+  const regex = pattern
+    // `?` must be escaped too, otherwise a query-string pattern such as
+    // `/search?q=*` turns the preceding character into an optional one
+    .replace(/[.+?^${}()|[\]\\]/g, '\\$&')  // escape regex chars
+    // placeholder keeps `**` from being rewritten by the single-`*` rule below
+    .replace(/\*\*/g, '{{GLOBSTAR}}')
+    .replace(/\*/g, '[^/]*')
+    .replace(/\{\{GLOBSTAR\}\}/g, '.*')
+  return new RegExp('^' + regex + '$').test(str)
 }
+/**
+ * Collect link targets from page content.
+ *
+ * Two passes over the same text: values of `href="..."` attributes, each
+ * passed through `resolveUrl()` against `baseUrl`, and then absolute
+ * http(s) targets of markdown links `[text](url)`, which are added as-is.
+ * Duplicates are dropped; order of first appearance is kept.
+ *
+ * @param {string} content - HTML or markdown text
+ * @param {string} baseUrl - base used to resolve href values
+ * @returns {string[]} unique link URLs
+ */
-function extractLinks(html, baseUrl) {
+function extractLinks(content, baseUrl) {
   const links = []
-  const matches = html.matchAll(/href=["']([^"']+)["']/gi)
-  for (const m of matches) {
+  // Extract from href attributes (HTML)
+  const hrefMatches = content.matchAll(/href=["']([^"']+)["']/gi)
+  for (const m of hrefMatches) {
     const resolved = resolveUrl(m[1], baseUrl)
     if (resolved && !links.includes(resolved)) links.push(resolved)
   }
+  // Extract from markdown links
+  const mdMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
+  for (const m of mdMatches) {
+    if (!links.includes(m[2])) links.push(m[2])
+  }
   return links
 }
-function extractTitleFromMarkdown(content) {
-  const match = content.match(/^#\s+(.+)/m)
-  return match ? match[1].trim() : ''
-}
 function resolveUrl(url, base) {
   try {
     if (url.startsWith('http')) return url
@@ -221,27 +293,17 @@ function normalizeUrl(url) {
   try {
     const u = new URL(url)
     u.hash = ''
-    return u.href.replace(/\/$/, '')
+    // Remove trailing slash for consistency
+    let href = u.href
+    if (href.endsWith('/') && u.pathname !== '/') {
+      href = href.slice(0, -1)
+    }
+    return href
   } catch {
     return url
   }
 }
-function fetchText(url, headers = {}) {
-  return new Promise((resolve, reject) => {
-    const mod = url.startsWith('https') ? https : http
-    const req = mod.request(url, { headers: { 'User-Agent': 'Spectrawl/1.0', ...headers } }, res => {
-      if (res.statusCode >= 400) { reject(new Error(`HTTP ${res.statusCode}`)); return }
-      let d = ''
-      res.on('data', c => d += c)
-      res.on('end', () => resolve(d))
-    })
-    req.setTimeout(15000, () => { req.destroy(); reject(new Error('timeout')) })
-    req.on('error', reject)
-    req.end()
-  })
-}
 function sleep(ms) {
+  // Returns a promise that resolves after `ms` milliseconds.
   return new Promise(r => setTimeout(r, ms))
 }

package/src/index.js CHANGED Viewed

@@ -92,6 +92,27 @@ class Spectrawl {
     return this.crawlEngine.crawl(url, opts, cookies)
   }
+  /**
+   * Start an async crawl job. Returns job ID immediately.
+   *
+   * Delegates to `this.crawlEngine.startJob(url, opts)`.
+   *
+   * NOTE(review): unlike the `crawl()` call just above, no cookies argument
+   * is forwarded here, so the engine's `cookies` parameter keeps its
+   * default — confirm whether authenticated async crawls are intended.
+   *
+   * @param {string} url - URL the crawl starts from
+   * @param {object} [opts] - crawl options, passed through unchanged
+   * @returns {object} whatever `crawlEngine.startJob()` returns
+   */
+  startCrawlJob(url, opts = {}) {
+    return this.crawlEngine.startJob(url, opts)
+  }
+  /**
+   * Get crawl job status/results.
+   *
+   * Delegates to `this.crawlEngine.getJob(jobId)`.
+   *
+   * @param {string} jobId - job ID to look up
+   * @returns {object|null} whatever `crawlEngine.getJob()` returns
+   */
+  getCrawlJob(jobId) {
+    return this.crawlEngine.getJob(jobId)
+  }
+  /**
+   * List all crawl jobs.
+   *
+   * Delegates to `this.crawlEngine.listJobs()`.
+   *
+   * @returns {Array<object>} whatever `crawlEngine.listJobs()` returns
+   */
+  listCrawlJobs() {
+    return this.crawlEngine.listJobs()
+  }
   /**
    * Perform an authenticated action on a platform.
    * @param {string} platform - Platform name (x, reddit, devto, etc.)

package/src/server.js CHANGED Viewed

@@ -54,12 +54,35 @@ const server = http.createServer(async (req, res) => {
     if (req.method === 'POST' && path === '/crawl') {
       const body = await readBody(req)
-      const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth } = body
+      const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
+              includePatterns, excludePatterns, merge, async: asyncMode } = body
       if (!targetUrl) return error(res, 400, 'url is required')
-      const result = await spectrawl.crawl(targetUrl, { depth, maxPages, format, delay, stealth, scope, auth })
+      const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge }
+      if (asyncMode) {
+        // Async mode: return job ID immediately
+        const job = spectrawl.startCrawlJob(targetUrl, opts)
+        return json(res, job)
+      }
+      const result = await spectrawl.crawl(targetUrl, opts)
       return json(res, result)
     }
+    // List all crawl jobs. This exact-path route must be checked before the
+    // `/crawl/<jobId>` prefix route below: `/crawl/jobs` also starts with
+    // `/crawl/`, so with the opposite order "jobs" would be looked up as a
+    // job ID and this endpoint would always answer 404.
+    if (req.method === 'GET' && path === '/crawl/jobs') {
+      const jobList = spectrawl.listCrawlJobs()
+      return json(res, { jobs: jobList })
+    }
+    // Status/results of a single crawl job: GET /crawl/<jobId>
+    if (req.method === 'GET' && path.startsWith('/crawl/')) {
+      const jobId = path.split('/crawl/')[1]
+      if (!jobId) return error(res, 400, 'job ID is required')
+      const job = spectrawl.getCrawlJob(jobId)
+      if (!job) return error(res, 404, 'job not found')
+      return json(res, job)
+    }
     if (req.method === 'POST' && path === '/act') {
       const body = await readBody(req)
       const { platform, action, ...params } = body