spectrawl 0.4.1 → 0.4.2

Files changed (3)
  1. package/package.json +1 -1
  2. package/src/crawl.js +108 -47
  3. package/src/server.js +12 -7
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "spectrawl",
-  "version": "0.4.1",
+  "version": "0.4.2",
   "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
   "main": "src/index.js",
   "types": "index.d.ts",
package/src/crawl.js CHANGED
@@ -3,22 +3,30 @@
  * Multi-page website crawler using our own browse engine (Camoufox).
  * No external dependencies (no Jina, no Cloudflare).
  * Supports sync + async (job-based) modes.
+ * Auto-detects system RAM and parallelizes crawling accordingly.
  */
 
 const crypto = require('crypto')
+const os = require('os')
+
+// ~250MB per browser tab (Camoufox average)
+const MB_PER_TAB = 250
+// Reserve this much RAM for OS + other processes
+const RESERVED_MB = 1500
 
 const DEFAULT_OPTS = {
   depth: 2,
   maxPages: 50,
-  format: 'markdown', // markdown | html | json
-  delay: 500, // ms between requests
-  stealth: true, // use stealth browsing by default
-  scope: 'domain', // domain | prefix | any
+  format: 'markdown',
+  delay: 300, // ms between batch launches
+  stealth: true,
+  scope: 'domain',
   timeout: 30000,
+  concurrency: 'auto', // 'auto' | number — auto-detect from RAM
   includeLinks: true,
-  includePatterns: [], // wildcard patterns to include
-  excludePatterns: [], // wildcard patterns to exclude
-  merge: false, // merge all pages into single result
+  includePatterns: [],
+  excludePatterns: [],
+  merge: false,
   skipPatterns: [
     /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
     /\/_next\//,
@@ -35,6 +43,21 @@ const DEFAULT_OPTS = {
 // In-memory job store for async crawls
 const jobs = new Map()
 
+/**
+ * Calculate max safe concurrency based on available system RAM.
+ */
+function detectConcurrency() {
+  const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
+  const freeMB = Math.floor(os.freemem() / 1024 / 1024)
+  // Use the lower of: (free RAM) or (total - reserved)
+  const availableMB = Math.min(freeMB, totalMB - RESERVED_MB)
+  const maxTabs = Math.max(1, Math.floor(availableMB / MB_PER_TAB))
+  // Cap at 10 — diminishing returns and politeness
+  const concurrency = Math.min(maxTabs, 10)
+  console.log(`[crawl] RAM: ${totalMB}MB total, ${freeMB}MB free → ${concurrency} concurrent tabs`)
+  return concurrency
+}
+
 class CrawlEngine {
   constructor(browseEngine, cache) {
     this.browseEngine = browseEngine
@@ -42,16 +65,21 @@ class CrawlEngine {
   }
 
   /**
-   * Crawl a website starting from a URL (synchronous — waits for completion).
+   * Crawl a website starting from a URL.
+   * Automatically parallelizes based on available RAM.
    */
   async crawl(startUrl, opts = {}, cookies = null) {
-    // Filter out undefined values from opts to avoid overriding defaults
     const cleanOpts = Object.fromEntries(
       Object.entries(opts).filter(([_, v]) => v !== undefined)
     )
     const config = { ...DEFAULT_OPTS, ...cleanOpts }
     const startTime = Date.now()
 
+    // Determine concurrency
+    const concurrency = config.concurrency === 'auto'
+      ? detectConcurrency()
+      : Math.max(1, Math.min(config.concurrency, 10))
+
     const startParsed = new URL(startUrl)
     const baseDomain = startParsed.hostname
     const basePrefix = startUrl.replace(/\/$/, '')
@@ -60,23 +88,14 @@ class CrawlEngine {
     const queue = [{ url: startUrl, depth: 0 }]
     const pages = []
     const failed = []
+    let activeCount = 0
 
-    while (queue.length > 0 && pages.length < config.maxPages) {
-      const { url, depth } = queue.shift()
-      const normalized = normalizeUrl(url)
-      if (visited.has(normalized)) continue
-      visited.add(normalized)
-
-      // Scope check
-      if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
-      // Skip pattern check
-      if (config.skipPatterns.some(p => p.test(url))) continue
-      // Include/exclude pattern check
-      if (!this._matchesFilters(url, config.includePatterns, config.excludePatterns)) continue
-
+    // Process queue with concurrency control
+    const processUrl = async (item) => {
+      const { url, depth } = item
       try {
         const page = await this._fetchPage(url, config, cookies)
-        if (!page) { failed.push({ url, error: 'empty' }); continue }
+        if (!page) { failed.push({ url, error: 'empty' }); return }
 
         const links = page.links || []
         pages.push({
@@ -93,20 +112,51 @@ class CrawlEngine {
           const absLink = resolveUrl(link, url)
           if (!absLink) continue
           const normLink = normalizeUrl(absLink)
-          if (!visited.has(normLink)) {
-            queue.push({ url: absLink, depth: depth + 1 })
-          }
+          if (visited.has(normLink)) continue
+          // Pre-filter before queueing
+          if (!this._inScope(absLink, baseDomain, basePrefix, config.scope)) continue
+          if (config.skipPatterns.some(p => p.test(absLink))) continue
+          if (!this._matchesFilters(absLink, config.includePatterns, config.excludePatterns)) continue
+          visited.add(normLink)
+          queue.push({ url: absLink, depth: depth + 1 })
         }
       }
+    } catch (e) {
+      failed.push({ url, error: e.message })
+    }
+  }
+
+    // Seed the first URL
+    visited.add(normalizeUrl(startUrl))
+
+    // BFS with parallel workers
+    while (queue.length > 0 || activeCount > 0) {
+      // Launch up to `concurrency` parallel fetches
+      const batch = []
+      while (queue.length > 0 && batch.length < concurrency && (pages.length + activeCount + batch.length) < config.maxPages) {
+        batch.push(queue.shift())
+      }
 
+      if (batch.length === 0 && activeCount === 0) break
+
+      if (batch.length > 0) {
+        activeCount += batch.length
+        const results = await Promise.allSettled(
+          batch.map(item => processUrl(item))
+        )
+        activeCount -= batch.length
+
+        // Small delay between batches to be polite
         if (queue.length > 0 && config.delay > 0) {
           await sleep(config.delay)
         }
-      } catch (e) {
-        failed.push({ url, error: e.message })
       }
+
+      // Stop if we've hit maxPages
+      if (pages.length >= config.maxPages) break
     }
 
+    const duration = Date.now() - startTime
     const result = {
       startUrl,
       pages,
@@ -114,12 +164,13 @@ class CrawlEngine {
         total: visited.size,
         crawled: pages.length,
         failed: failed.length,
-        duration: Date.now() - startTime
+        concurrency,
+        duration,
+        pagesPerSecond: pages.length > 0 ? +(pages.length / (duration / 1000)).toFixed(2) : 0
       },
       failed: failed.length > 0 ? failed : undefined
     }
 
-    // Merge mode: combine all pages into single content
     if (config.merge) {
       result.merged = pages.map(p => {
        return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
@@ -147,7 +198,6 @@ class CrawlEngine {
     }
     jobs.set(jobId, job)
 
-    // Run crawl in background
     this.crawl(startUrl, opts, cookies)
       .then(result => {
         job.status = 'completed'
@@ -156,6 +206,8 @@ class CrawlEngine {
         job.finished = result.stats.crawled
         job.total = result.stats.total
         job.duration = result.stats.duration
+        job.concurrency = result.stats.concurrency
+        job.pagesPerSecond = result.stats.pagesPerSecond
       })
       .catch(err => {
         job.status = 'errored'
@@ -179,8 +231,9 @@ class CrawlEngine {
       finished: job.finished,
       total: job.total,
       pageCount: job.pages.length,
+      concurrency: job.concurrency,
+      pagesPerSecond: job.pagesPerSecond,
       error: job.error,
-      // Only include pages if completed
       pages: job.status === 'completed' ? job.pages : undefined,
       failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
       duration: job.duration
@@ -200,18 +253,35 @@ class CrawlEngine {
     }))
   }
 
+  /**
+   * Get system info for crawl capacity estimation.
+   */
+  static getCapacity() {
+    const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
+    const freeMB = Math.floor(os.freemem() / 1024 / 1024)
+    const concurrency = detectConcurrency()
+    // Estimate: each page takes ~4s with stealth delays
+    const pagesPerMinute = concurrency * 15 // ~4s per page
+    return {
+      totalRamMB: totalMB,
+      freeRamMB: freeMB,
+      maxConcurrency: concurrency,
+      estimatedPagesPerMinute: pagesPerMinute,
+      estimate100pages: `~${Math.ceil(100 / pagesPerMinute)} min`,
+      estimate1000pages: `~${Math.ceil(1000 / pagesPerMinute)} min`
+    }
+  }
+
   async _fetchPage(url, config, cookies) {
-    // Use our own browse engine (Camoufox) — no external dependencies
     try {
       const result = await this.browseEngine.browse(url, {
         stealth: config.stealth,
         _cookies: cookies,
         timeout: config.timeout,
-        html: true, // request raw HTML for link extraction
-        noCache: true // always fetch fresh for crawling
+        html: true,
+        noCache: true
       })
       if (result?.content) {
-        // Extract links from HTML if available, otherwise from markdown content
         const linkSource = result.html || result.content
         return {
           title: result.title || '',
@@ -222,7 +292,6 @@ class CrawlEngine {
     } catch (e) {
       throw new Error(`Failed to fetch ${url}: ${e.message}`)
     }
-
     return null
   }
 
@@ -231,20 +300,18 @@ class CrawlEngine {
       const parsed = new URL(url)
       if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
       if (scope === 'prefix') return url.startsWith(basePrefix)
-      return true // 'any'
+      return true
     } catch {
       return false
     }
   }
 
   _matchesFilters(url, includePatterns, excludePatterns) {
-    // Exclude takes priority
     if (excludePatterns && excludePatterns.length > 0) {
       for (const pattern of excludePatterns) {
         if (wildcardMatch(url, pattern)) return false
       }
     }
-    // If include patterns specified, URL must match at least one
     if (includePatterns && includePatterns.length > 0) {
       return includePatterns.some(pattern => wildcardMatch(url, pattern))
     }
@@ -252,12 +319,9 @@ class CrawlEngine {
   }
 }
 
-/**
- * Wildcard matching: * matches anything except /, ** matches everything including /
- */
 function wildcardMatch(str, pattern) {
   const regex = pattern
-    .replace(/[.+^${}()|[\]\\]/g, '\\$&') // escape regex chars
+    .replace(/[.+^${}()|[\]\\]/g, '\\$&')
     .replace(/\*\*/g, '{{GLOBSTAR}}')
     .replace(/\*/g, '[^/]*')
     .replace(/\{\{GLOBSTAR\}\}/g, '.*')
@@ -266,13 +330,11 @@ function wildcardMatch(str, pattern) {
 
 function extractLinks(content, baseUrl) {
   const links = []
-  // Extract from href attributes (HTML)
   const hrefMatches = content.matchAll(/href=["']([^"']+)["']/gi)
   for (const m of hrefMatches) {
     const resolved = resolveUrl(m[1], baseUrl)
     if (resolved && !links.includes(resolved)) links.push(resolved)
   }
-  // Extract from markdown links
   const mdMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
   for (const m of mdMatches) {
     if (!links.includes(m[2])) links.push(m[2])
@@ -293,7 +355,6 @@ function normalizeUrl(url) {
   try {
     const u = new URL(url)
     u.hash = ''
-    // Remove trailing slash for consistency
    let href = u.href
    if (href.endsWith('/') && u.pathname !== '/') {
      href = href.slice(0, -1)
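
The new concurrency heuristic in crawl.js reduces to a small piece of arithmetic. Below is a minimal standalone sketch that reproduces that math for two hypothetical machines; the constants and the cap of 10 are copied from the diff above, while the machine sizes are made up for illustration.

// Sketch only: mirrors the detectConcurrency() math from crawl.js above.
const MB_PER_TAB = 250    // assumed RAM cost per Camoufox tab (as in the diff)
const RESERVED_MB = 1500  // RAM kept back for the OS (as in the diff)

function estimateConcurrency(totalMB, freeMB) {
  // Use the lower of free RAM or (total - reserved), then divide by per-tab cost
  const availableMB = Math.min(freeMB, totalMB - RESERVED_MB)
  const maxTabs = Math.max(1, Math.floor(availableMB / MB_PER_TAB))
  return Math.min(maxTabs, 10) // capped at 10 concurrent tabs
}

// Hypothetical 8 GB box with ~5 GB free:
// min(5120, 8192 - 1500) = 5120 → floor(5120 / 250) = 20 → capped to 10
console.log(estimateConcurrency(8192, 5120)) // 10

// Hypothetical 2 GB box with ~700 MB free:
// min(700, 2048 - 1500) = 548 → floor(548 / 250) = 2
console.log(estimateConcurrency(2048, 700))  // 2

In other words, small machines are throttled to one or two tabs, while anything with a few gigabytes of free RAM hits the ceiling of 10.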
package/src/server.js CHANGED
@@ -55,10 +55,10 @@ const server = http.createServer(async (req, res) => {
   if (req.method === 'POST' && path === '/crawl') {
     const body = await readBody(req)
     const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
-      includePatterns, excludePatterns, merge, async: asyncMode } = body
+      includePatterns, excludePatterns, merge, async: asyncMode, concurrency } = body
     if (!targetUrl) return error(res, 400, 'url is required')
 
-    const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge }
+    const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency }
 
     if (asyncMode) {
       // Async mode: return job ID immediately
@@ -70,6 +70,16 @@ const server = http.createServer(async (req, res) => {
     return json(res, result)
   }
 
+  if (req.method === 'GET' && path === '/crawl/jobs') {
+    const jobList = spectrawl.listCrawlJobs()
+    return json(res, { jobs: jobList })
+  }
+
+  if (req.method === 'GET' && path === '/crawl/capacity') {
+    const { CrawlEngine } = require('./crawl')
+    return json(res, CrawlEngine.getCapacity())
+  }
+
   if (req.method === 'GET' && path.startsWith('/crawl/')) {
     const jobId = path.split('/crawl/')[1]
     if (!jobId) return error(res, 400, 'job ID is required')
@@ -78,11 +88,6 @@ const server = http.createServer(async (req, res) => {
     return json(res, job)
   }
 
-  if (req.method === 'GET' && path === '/crawl/jobs') {
-    const jobList = spectrawl.listCrawlJobs()
-    return json(res, { jobs: jobList })
-  }
-
   if (req.method === 'POST' && path === '/act') {
     const body = await readBody(req)
     const { platform, action, ...params } = body
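
Taken together, the server changes thread the new concurrency option through POST /crawl and add two GET routes, /crawl/jobs and /crawl/capacity, both now registered before the /crawl/:jobId prefix match so they are no longer shadowed by it. A usage sketch against a locally running instance follows; the base URL and port are assumptions (the diff does not show them), and only fields visible in the diff are relied on.

// Sketch: exercising the new option and routes from src/server.js above.
// Assumes a spectrawl server listening on localhost:3000 (port is hypothetical).
const BASE = 'http://localhost:3000'

async function main() {
  // How many parallel tabs can this machine sustain, per CrawlEngine.getCapacity()?
  const capacity = await fetch(`${BASE}/crawl/capacity`).then(r => r.json())
  console.log(capacity.maxConcurrency, capacity.estimatedPagesPerMinute)

  // Start an async crawl; 'auto' lets the server pick concurrency from RAM,
  // a number is clamped to at most 10 server-side.
  const job = await fetch(`${BASE}/crawl`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      url: 'https://example.com',
      depth: 2,
      maxPages: 50,
      concurrency: 'auto',
      async: true
    })
  }).then(r => r.json())
  console.log(job)

  // List crawl jobs via the newly ordered /crawl/jobs route
  const jobs = await fetch(`${BASE}/crawl/jobs`).then(r => r.json())
  console.log(jobs)
}

main().catch(console.error)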