spectrawl 0.3.22 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  The unified web layer for AI agents. Search, browse, authenticate, and act on platforms — one package, self-hosted.
4
4
 
5
- **5,000 free searches/month** via Gemini Grounded Search. Full page scraping, stealth browsing, 19 platform adapters.
5
+ **5,000 free searches/month** via Gemini Grounded Search. Full site crawling, stealth browsing, 19 platform adapters.
6
6
 
7
7
  ## What It Does
8
8
 
@@ -57,6 +57,7 @@ Different tools for different needs.
57
57
  | Stealth browsing | No | Yes (Camoufox + Playwright) |
58
58
  | Platform posting | No | 19 adapters |
59
59
  | Auth management | No | Cookie store + auto-refresh |
60
+ | Site crawling | No | ✅ Free (Camoufox + Playwright) |
60
61
  | Cached repeats | No | <1ms |
61
62
 
62
63
  **Tavily** is fast and simple — great for agents that need quick answers. **Spectrawl** returns richer data and does more (browse, auth, post) — but it's slower. Choose based on your use case.
@@ -109,6 +110,41 @@ console.log(page.screenshot) // PNG buffer (if requested)
109
110
 
110
111
  Auto-fallback: if Jina and readability return too little content (<200 chars), Spectrawl renders the page with Playwright and extracts from the rendered DOM. Tavily can't do this — they fail on JS-heavy pages.
111
112
 
113
+ ## Crawl
114
+
115
+ Give your agent the ability to read an entire website in one call. Free, no API costs.
116
+
117
+ Uses Spectrawl's own browse engine (Camoufox) with stealth support for JS-heavy sites — no external crawl service required.
118
+
119
+ ```js
120
+ // Crawl a docs site — returns clean markdown for every page
121
+ const result = await web.crawl('https://docs.example.com', {
122
+ depth: 2, // how many levels deep (default: 2)
123
+ maxPages: 50, // max pages to crawl (default: 50)
124
+ format: 'markdown', // markdown | html | json
125
+ delay: 300, // ms between requests (be polite)
126
+ stealth: false, // use Camoufox for anti-detect
127
+ auth: 'account' // use stored cookies (crawl behind logins)
128
+ })
129
+
130
+ result.pages // [{ url, title, content, links, depth }]
131
+ result.stats // { total, crawled, failed, duration }
132
+ ```
133
+
134
+ **vs Cloudflare's /crawl:**
135
+ - ✅ Free (self-hosted, no per-request cost)
136
+ - ✅ Crawls sites that block Cloudflare IPs
137
+ - ✅ Auth-aware — crawl behind login walls with stored cookies
138
+ - ✅ Stealth mode — bypasses bot detection
139
+ - ✅ Works for AI agents (50-200 pages, not millions)
140
+
141
+ **HTTP API:**
142
+ ```bash
143
+ curl -X POST http://localhost:3900/crawl \
144
+ -H "Content-Type: application/json" \
145
+ -d '{ "url": "https://docs.example.com", "depth": 2, "maxPages": 50 }'
146
+ ```
147
+
112
148
  ## Auth
113
149
 
114
150
  Persistent cookie storage (SQLite), multi-account management, automatic expiry detection.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.22",
3
+ "version": "0.4.1",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/crawl.js ADDED
@@ -0,0 +1,311 @@
1
+ /**
2
+ * Spectrawl Crawl Engine v2
3
+ * Multi-page website crawler using our own browse engine (Camoufox).
4
+ * No external dependencies (no Jina, no Cloudflare).
5
+ * Supports sync + async (job-based) modes.
6
+ */
7
+
8
+ const crypto = require('crypto')
9
+
10
// Crawl defaults. Per-call opts override these; `undefined` values in opts
// are filtered out before merging, so they never clobber a default.
const DEFAULT_OPTS = {
  depth: 2,            // link-follow depth from the start URL
  maxPages: 50,        // hard cap on pages actually crawled
  format: 'markdown',  // markdown | html | json
  delay: 500,          // politeness delay between requests (ms)
  stealth: true,       // stealth browsing enabled by default
  scope: 'domain',     // domain | prefix | any
  timeout: 30000,      // per-page fetch timeout (ms)
  includeLinks: true,  // attach extracted links to each page record
  includePatterns: [], // wildcard allow-list (empty = allow everything)
  excludePatterns: [], // wildcard deny-list (takes priority over includes)
  merge: false,        // also emit all pages merged into one document
  skipPatterns: [
    // Static assets and binary downloads.
    /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
    // Framework/CDN asset paths.
    /\/_next\//,
    /\/static\//,
    /\/assets\//,
    /mintcdn\.com/,
    // Fragment links and non-navigable schemes.
    /#/,
    /^mailto:/,
    /^tel:/,
    /^javascript:/,
  ]
}

// In-memory store of async crawl jobs, keyed by job ID.
// Lives for the lifetime of the process.
const jobs = new Map()
37
+
38
class CrawlEngine {
  /**
   * @param {object} browseEngine - engine exposing browse(url, opts); each call
   *   must resolve to { content, title?, html? } for the fetched page
   * @param {object} cache - shared cache instance (stored; not used by crawl itself)
   */
  constructor(browseEngine, cache) {
    this.browseEngine = browseEngine
    this.cache = cache
  }

  /**
   * Crawl a website breadth-first starting from a URL (synchronous — waits
   * for completion).
   *
   * @param {string} startUrl - absolute start URL (throws if unparseable)
   * @param {object} opts - partial overrides of DEFAULT_OPTS
   * @param {*} cookies - opaque cookie payload forwarded to the browse engine
   * @returns {Promise<{startUrl: string, pages: object[], stats: object,
   *   failed?: object[], merged?: string}>}
   */
  async crawl(startUrl, opts = {}, cookies = null) {
    // Filter out undefined values from opts to avoid overriding defaults.
    const cleanOpts = Object.fromEntries(
      Object.entries(opts).filter(([, v]) => v !== undefined)
    )
    const config = { ...DEFAULT_OPTS, ...cleanOpts }
    const startTime = Date.now()

    const startParsed = new URL(startUrl) // fail fast on an invalid start URL
    const baseDomain = startParsed.hostname
    const basePrefix = startUrl.replace(/\/$/, '')

    const visited = new Set()  // normalized URLs already dequeued
    // Normalized URLs ever queued. Prevents the same URL being pushed onto
    // the queue many times (previously only deduped at dequeue, so the queue
    // could balloon with duplicates on link-heavy sites).
    const enqueued = new Set([normalizeUrl(startUrl)])
    const queue = [{ url: startUrl, depth: 0 }]
    const pages = []
    const failed = []

    while (queue.length > 0 && pages.length < config.maxPages) {
      const { url, depth } = queue.shift()
      const normalized = normalizeUrl(url)
      if (visited.has(normalized)) continue
      visited.add(normalized)

      // Filters run on the raw (un-normalized) URL, i.e. the link as found.
      if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
      if (config.skipPatterns.some(p => p.test(url))) continue
      if (!this._matchesFilters(url, config.includePatterns, config.excludePatterns)) continue

      try {
        const page = await this._fetchPage(url, config, cookies)
        if (!page) { failed.push({ url, error: 'empty' }); continue }

        const links = page.links || []
        pages.push({
          url,
          title: page.title || '',
          content: page.content || '',
          links: config.includeLinks ? links : undefined,
          depth
        })

        // Enqueue child links one level deeper, skipping anything already
        // visited or already queued (Set lookups, O(1) each).
        if (depth < config.depth) {
          for (const link of links) {
            const absLink = resolveUrl(link, url)
            if (!absLink) continue
            const normLink = normalizeUrl(absLink)
            if (!visited.has(normLink) && !enqueued.has(normLink)) {
              enqueued.add(normLink)
              queue.push({ url: absLink, depth: depth + 1 })
            }
          }
        }

        // Politeness delay between requests (skipped after the last page).
        if (queue.length > 0 && config.delay > 0) {
          await sleep(config.delay)
        }
      } catch (e) {
        failed.push({ url, error: e.message })
      }
    }

    const result = {
      startUrl,
      pages,
      stats: {
        total: visited.size,   // every dequeued URL, including filtered ones
        crawled: pages.length,
        failed: failed.length,
        duration: Date.now() - startTime
      },
      failed: failed.length > 0 ? failed : undefined
    }

    // Merge mode: combine all pages into a single markdown document.
    if (config.merge) {
      result.merged = pages.map(p => {
        return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
      }).join('\n\n---\n\n')
    }

    return result
  }

  /**
   * Start an async crawl job. Returns { jobId, status } immediately while
   * the crawl runs in the background and mutates the job record on settle.
   * NOTE(review): completed jobs stay in the module-level `jobs` map for the
   * life of the process — consider TTL eviction for long-running servers.
   */
  startJob(startUrl, opts = {}, cookies = null) {
    const jobId = crypto.randomUUID()
    const job = {
      id: jobId,
      startUrl,
      status: 'running',
      started: Date.now(),
      finished: 0,
      total: 0,
      pages: [],
      failed: [],
      error: null
    }
    jobs.set(jobId, job)

    // Fire-and-forget: the rejection handler below keeps this from becoming
    // an unhandled promise rejection.
    this.crawl(startUrl, opts, cookies)
      .then(result => {
        job.status = 'completed'
        job.pages = result.pages
        job.failed = result.failed || []
        job.finished = result.stats.crawled
        job.total = result.stats.total
        job.duration = result.stats.duration
      })
      .catch(err => {
        job.status = 'errored'
        job.error = err.message
      })

    return { jobId, status: 'running' }
  }

  /**
   * Get job status/results. Pages and failures are only included once the
   * job has completed; `duration` is undefined while still running.
   * @param {string} jobId
   * @returns {object|null} null when the job ID is unknown
   */
  getJob(jobId) {
    const job = jobs.get(jobId)
    if (!job) return null
    return {
      id: job.id,
      startUrl: job.startUrl,
      status: job.status,
      started: job.started,
      finished: job.finished,
      total: job.total,
      pageCount: job.pages.length,
      error: job.error,
      // Only include pages if completed
      pages: job.status === 'completed' ? job.pages : undefined,
      failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
      duration: job.duration
    }
  }

  /**
   * List a summary (no page bodies) of all jobs in the store.
   */
  listJobs() {
    return Array.from(jobs.values()).map(j => ({
      id: j.id,
      startUrl: j.startUrl,
      status: j.status,
      pageCount: j.pages.length,
      started: j.started
    }))
  }

  /**
   * Fetch a single page through the browse engine (no external services).
   * Requests raw HTML for link extraction and bypasses the cache so crawls
   * always see fresh content.
   * @returns {Promise<{title, content, links}|null>} null when the engine
   *   returned no content
   * @throws {Error} wrapping any browse-engine failure with the URL
   */
  async _fetchPage(url, config, cookies) {
    try {
      const result = await this.browseEngine.browse(url, {
        stealth: config.stealth,
        _cookies: cookies,
        timeout: config.timeout,
        html: true,    // request raw HTML for link extraction
        noCache: true  // always fetch fresh for crawling
      })
      if (result?.content) {
        // Prefer HTML for link extraction; fall back to markdown content.
        const linkSource = result.html || result.content
        return {
          title: result.title || '',
          content: result.content,
          links: extractLinks(linkSource, url)
        }
      }
    } catch (e) {
      throw new Error(`Failed to fetch ${url}: ${e.message}`)
    }

    return null
  }

  /**
   * Scope check. 'domain' allows the start hostname and its subdomains;
   * 'prefix' requires the URL to extend the start URL; 'any' allows all.
   * Unparseable URLs are always out of scope.
   */
  _inScope(url, baseDomain, basePrefix, scope) {
    try {
      const parsed = new URL(url)
      if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
      if (scope === 'prefix') return url.startsWith(basePrefix)
      return true // 'any'
    } catch {
      return false
    }
  }

  /**
   * Apply wildcard include/exclude filters. Exclude patterns take priority;
   * if include patterns exist the URL must match at least one of them.
   */
  _matchesFilters(url, includePatterns, excludePatterns) {
    if (excludePatterns && excludePatterns.length > 0) {
      for (const pattern of excludePatterns) {
        if (wildcardMatch(url, pattern)) return false
      }
    }
    if (includePatterns && includePatterns.length > 0) {
      return includePatterns.some(pattern => wildcardMatch(url, pattern))
    }
    return true
  }
}
254
+
255
/**
 * Wildcard matching: `*` matches any run of characters except `/`,
 * `**` matches everything including `/`. The entire string must match.
 *
 * Fix: `?` is now escaped along with the other regex metacharacters.
 * Previously a pattern like `https://x.com/page?id=*` compiled `?` as an
 * optional quantifier, so it also matched `https://x.com/pagid=…`-style
 * strings — a real problem since URL filters commonly contain query strings.
 *
 * @param {string} str - string to test (typically a URL)
 * @param {string} pattern - wildcard pattern
 * @returns {boolean}
 */
function wildcardMatch(str, pattern) {
  const regex = pattern
    .replace(/[.+?^${}()|[\]\\]/g, '\\$&') // escape regex chars (incl. `?`)
    .replace(/\*\*/g, '{{GLOBSTAR}}')
    .replace(/\*/g, '[^/]*')
    .replace(/\{\{GLOBSTAR\}\}/g, '.*')
  return new RegExp('^' + regex + '$').test(str)
}
266
+
267
/**
 * Extract absolute link URLs from page content.
 * Scans HTML `href="…"` attributes (resolving relative hrefs against
 * `baseUrl`) and absolute markdown links. Order is preserved and duplicates
 * are removed via a Set — the previous `Array.includes` per link was
 * accidental O(n²) on link-heavy pages.
 *
 * @param {string} content - HTML or markdown
 * @param {string} baseUrl - base URL for resolving relative hrefs
 * @returns {string[]} absolute URLs, first-seen order, no duplicates
 */
function extractLinks(content, baseUrl) {
  const seen = new Set()
  const links = []
  const add = (u) => {
    if (u && !seen.has(u)) {
      seen.add(u)
      links.push(u)
    }
  }
  // Extract from href attributes (HTML).
  for (const m of content.matchAll(/href=["']([^"']+)["']/gi)) {
    add(resolveUrl(m[1], baseUrl))
  }
  // Extract from markdown links (absolute http(s) targets only).
  for (const m of content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)) {
    add(m[2])
  }
  return links
}

/**
 * Resolve `url` against `base`. Absolute http(s) URLs pass through as-is;
 * anything unparseable yields null.
 * @param {string} url
 * @param {string} base
 * @returns {string|null}
 */
function resolveUrl(url, base) {
  try {
    if (url.startsWith('http')) return url
    return new URL(url, base).href
  } catch {
    return null
  }
}
291
+
292
/**
 * Canonical form of a URL for visited-set bookkeeping: drops the fragment
 * and a trailing slash (except on the root path). Input that cannot be
 * parsed as a URL is returned untouched.
 * @param {string} url
 * @returns {string}
 */
function normalizeUrl(url) {
  try {
    const parsed = new URL(url)
    parsed.hash = ''
    const { href, pathname } = parsed
    const stripSlash = href.endsWith('/') && pathname !== '/'
    return stripSlash ? href.slice(0, -1) : href
  } catch {
    return url
  }
}
306
+
307
/** Resolve after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
310
+
311
+ module.exports = { CrawlEngine }
package/src/index.js CHANGED
@@ -7,6 +7,7 @@ const { SearchEngine } = require('./search')
7
7
  const { BrowseEngine } = require('./browse')
8
8
  const { AuthManager } = require('./auth')
9
9
  const { ActEngine } = require('./act')
10
+ const { CrawlEngine } = require('./crawl')
10
11
  const { Cache } = require('./cache')
11
12
  const { EventEmitter, EVENTS } = require('./events')
12
13
  const { CookieRefresher } = require('./auth/refresh')
@@ -36,6 +37,7 @@ class Spectrawl {
36
37
  this.browseEngine = new BrowseEngine(this.config.browse, this.cache)
37
38
  this.auth = new AuthManager(this.config.auth)
38
39
  this.actEngine = new ActEngine(this.config, this.auth, this.browseEngine)
40
+ this.crawlEngine = new CrawlEngine(this.browseEngine, this.cache)
39
41
  this.refresher = new CookieRefresher(this.auth, this.events, this.config.auth)
40
42
  }
41
43
 
@@ -75,6 +77,42 @@ class Spectrawl {
75
77
  return this.browseEngine.browse(url, opts)
76
78
  }
77
79
 
80
+ /**
81
+ * Crawl a website recursively. Returns clean markdown for every page.
82
+ * Uses Jina Reader (free) with Playwright stealth fallback.
83
+ * @param {string} url - Starting URL
84
+ * @param {object} opts - { depth, maxPages, format, delay, stealth, scope, auth }
85
+ * @returns {Promise<{pages[], stats, failed?}>}
86
+ */
87
+ async crawl(url, opts = {}) {
88
+ let cookies = null
89
+ if (opts.auth) {
90
+ cookies = await this.auth.getCookies(opts.auth)
91
+ }
92
+ return this.crawlEngine.crawl(url, opts, cookies)
93
+ }
94
+
95
+ /**
96
+ * Start an async crawl job. Returns job ID immediately.
97
+ */
98
+ startCrawlJob(url, opts = {}) {
99
+ return this.crawlEngine.startJob(url, opts)
100
+ }
101
+
102
+ /**
103
+ * Get crawl job status/results.
104
+ */
105
+ getCrawlJob(jobId) {
106
+ return this.crawlEngine.getJob(jobId)
107
+ }
108
+
109
+ /**
110
+ * List all crawl jobs.
111
+ */
112
+ listCrawlJobs() {
113
+ return this.crawlEngine.listJobs()
114
+ }
115
+
78
116
  /**
79
117
  * Perform an authenticated action on a platform.
80
118
  * @param {string} platform - Platform name (x, reddit, devto, etc.)
package/src/server.js CHANGED
@@ -52,6 +52,37 @@ const server = http.createServer(async (req, res) => {
52
52
  return json(res, result)
53
53
  }
54
54
 
55
+ if (req.method === 'POST' && path === '/crawl') {
56
+ const body = await readBody(req)
57
+ const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
58
+ includePatterns, excludePatterns, merge, async: asyncMode } = body
59
+ if (!targetUrl) return error(res, 400, 'url is required')
60
+
61
+ const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge }
62
+
63
+ if (asyncMode) {
64
+ // Async mode: return job ID immediately
65
+ const job = spectrawl.startCrawlJob(targetUrl, opts)
66
+ return json(res, job)
67
+ }
68
+
69
+ const result = await spectrawl.crawl(targetUrl, opts)
70
+ return json(res, result)
71
+ }
72
+
73
+ if (req.method === 'GET' && path.startsWith('/crawl/')) {
74
+ const jobId = path.split('/crawl/')[1]
75
+ if (!jobId) return error(res, 400, 'job ID is required')
76
+ const job = spectrawl.getCrawlJob(jobId)
77
+ if (!job) return error(res, 404, 'job not found')
78
+ return json(res, job)
79
+ }
80
+
81
+ if (req.method === 'GET' && path === '/crawl/jobs') {
82
+ const jobList = spectrawl.listCrawlJobs()
83
+ return json(res, { jobs: jobList })
84
+ }
85
+
55
86
  if (req.method === 'POST' && path === '/act') {
56
87
  const body = await readBody(req)
57
88
  const { platform, action, ...params } = body