spectrawl 0.4.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.4.1",
3
+ "version": "0.4.3",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -127,12 +127,21 @@ class BrowseEngine {
127
127
 
128
128
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
129
129
 
130
- // Human-like delays
131
- await page.waitForTimeout(800 + Math.random() * 1500)
132
- await page.evaluate(() => {
133
- window.scrollBy({ top: Math.floor(Math.random() * 400) + 100, behavior: 'smooth' })
134
- })
135
- await page.waitForTimeout(300 + Math.random() * 700)
130
+ if (opts.fastMode) {
131
+ // Crawl mode: minimal delays, just enough for lazy-load triggers
132
+ await page.waitForTimeout(400)
133
+ await page.evaluate(() => {
134
+ window.scrollBy({ top: 500, behavior: 'instant' })
135
+ })
136
+ await page.waitForTimeout(200)
137
+ } else {
138
+ // Normal browse: full human-like delays
139
+ await page.waitForTimeout(800 + Math.random() * 1500)
140
+ await page.evaluate(() => {
141
+ window.scrollBy({ top: Math.floor(Math.random() * 400) + 100, behavior: 'smooth' })
142
+ })
143
+ await page.waitForTimeout(300 + Math.random() * 700)
144
+ }
136
145
 
137
146
  const result = {}
138
147
 
package/src/crawl.js CHANGED
@@ -3,22 +3,30 @@
3
3
  * Multi-page website crawler using our own browse engine (Camoufox).
4
4
  * No external dependencies (no Jina, no Cloudflare).
5
5
  * Supports sync + async (job-based) modes.
6
+ * Auto-detects system RAM and parallelizes crawling accordingly.
6
7
  */
7
8
 
8
9
  const crypto = require('crypto')
10
+ const os = require('os')
11
+
12
+ // ~250MB per browser tab (Camoufox average)
13
+ const MB_PER_TAB = 250
14
+ // Reserve this much RAM for OS + other processes
15
+ const RESERVED_MB = 1500
9
16
 
10
17
  const DEFAULT_OPTS = {
11
18
  depth: 2,
12
19
  maxPages: 50,
13
- format: 'markdown', // markdown | html | json
14
- delay: 500, // ms between requests
15
- stealth: true, // use stealth browsing by default
16
- scope: 'domain', // domain | prefix | any
20
+ format: 'markdown',
21
+ delay: 300, // ms between batch launches
22
+ stealth: true,
23
+ scope: 'domain',
17
24
  timeout: 30000,
25
+ concurrency: 'auto', // 'auto' | number — auto-detect from RAM
18
26
  includeLinks: true,
19
- includePatterns: [], // wildcard patterns to include
20
- excludePatterns: [], // wildcard patterns to exclude
21
- merge: false, // merge all pages into single result
27
+ includePatterns: [],
28
+ excludePatterns: [],
29
+ merge: false,
22
30
  skipPatterns: [
23
31
  /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
24
32
  /\/_next\//,
@@ -35,6 +43,21 @@ const DEFAULT_OPTS = {
35
43
  // In-memory job store for async crawls
36
44
  const jobs = new Map()
37
45
 
46
+ /**
47
+ * Calculate max safe concurrency based on available system RAM.
48
+ */
49
+ function detectConcurrency() {
50
+ const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
51
+ const freeMB = Math.floor(os.freemem() / 1024 / 1024)
52
+ // Use the lower of: (free RAM) or (total - reserved)
53
+ const availableMB = Math.min(freeMB, totalMB - RESERVED_MB)
54
+ const maxTabs = Math.max(1, Math.floor(availableMB / MB_PER_TAB))
55
+ // Cap at 10 — diminishing returns and politeness
56
+ const concurrency = Math.min(maxTabs, 10)
57
+ console.log(`[crawl] RAM: ${totalMB}MB total, ${freeMB}MB free → ${concurrency} concurrent tabs`)
58
+ return concurrency
59
+ }
60
+
38
61
  class CrawlEngine {
39
62
  constructor(browseEngine, cache) {
40
63
  this.browseEngine = browseEngine
@@ -42,16 +65,21 @@ class CrawlEngine {
42
65
  }
43
66
 
44
67
  /**
45
- * Crawl a website starting from a URL (synchronous — waits for completion).
68
+ * Crawl a website starting from a URL.
69
+ * Automatically parallelizes based on available RAM.
46
70
  */
47
71
  async crawl(startUrl, opts = {}, cookies = null) {
48
- // Filter out undefined values from opts to avoid overriding defaults
49
72
  const cleanOpts = Object.fromEntries(
50
73
  Object.entries(opts).filter(([_, v]) => v !== undefined)
51
74
  )
52
75
  const config = { ...DEFAULT_OPTS, ...cleanOpts }
53
76
  const startTime = Date.now()
54
77
 
78
+ // Determine concurrency
79
+ const concurrency = config.concurrency === 'auto'
80
+ ? detectConcurrency()
81
+ : Math.max(1, Math.min(config.concurrency, 10))
82
+
55
83
  const startParsed = new URL(startUrl)
56
84
  const baseDomain = startParsed.hostname
57
85
  const basePrefix = startUrl.replace(/\/$/, '')
@@ -60,23 +88,14 @@ class CrawlEngine {
60
88
  const queue = [{ url: startUrl, depth: 0 }]
61
89
  const pages = []
62
90
  const failed = []
91
+ let activeCount = 0
63
92
 
64
- while (queue.length > 0 && pages.length < config.maxPages) {
65
- const { url, depth } = queue.shift()
66
- const normalized = normalizeUrl(url)
67
- if (visited.has(normalized)) continue
68
- visited.add(normalized)
69
-
70
- // Scope check
71
- if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
72
- // Skip pattern check
73
- if (config.skipPatterns.some(p => p.test(url))) continue
74
- // Include/exclude pattern check
75
- if (!this._matchesFilters(url, config.includePatterns, config.excludePatterns)) continue
76
-
93
+ // Process queue with concurrency control
94
+ const processUrl = async (item) => {
95
+ const { url, depth } = item
77
96
  try {
78
97
  const page = await this._fetchPage(url, config, cookies)
79
- if (!page) { failed.push({ url, error: 'empty' }); continue }
98
+ if (!page) { failed.push({ url, error: 'empty' }); return }
80
99
 
81
100
  const links = page.links || []
82
101
  pages.push({
@@ -93,20 +112,51 @@ class CrawlEngine {
93
112
  const absLink = resolveUrl(link, url)
94
113
  if (!absLink) continue
95
114
  const normLink = normalizeUrl(absLink)
96
- if (!visited.has(normLink)) {
97
- queue.push({ url: absLink, depth: depth + 1 })
98
- }
115
+ if (visited.has(normLink)) continue
116
+ // Pre-filter before queueing
117
+ if (!this._inScope(absLink, baseDomain, basePrefix, config.scope)) continue
118
+ if (config.skipPatterns.some(p => p.test(absLink))) continue
119
+ if (!this._matchesFilters(absLink, config.includePatterns, config.excludePatterns)) continue
120
+ visited.add(normLink)
121
+ queue.push({ url: absLink, depth: depth + 1 })
99
122
  }
100
123
  }
124
+ } catch (e) {
125
+ failed.push({ url, error: e.message })
126
+ }
127
+ }
128
+
129
+ // Seed the first URL
130
+ visited.add(normalizeUrl(startUrl))
131
+
132
+ // BFS with parallel workers
133
+ while (queue.length > 0 || activeCount > 0) {
134
+ // Launch up to `concurrency` parallel fetches
135
+ const batch = []
136
+ while (queue.length > 0 && batch.length < concurrency && (pages.length + activeCount + batch.length) < config.maxPages) {
137
+ batch.push(queue.shift())
138
+ }
101
139
 
140
+ if (batch.length === 0 && activeCount === 0) break
141
+
142
+ if (batch.length > 0) {
143
+ activeCount += batch.length
144
+ const results = await Promise.allSettled(
145
+ batch.map(item => processUrl(item))
146
+ )
147
+ activeCount -= batch.length
148
+
149
+ // Small delay between batches to be polite
102
150
  if (queue.length > 0 && config.delay > 0) {
103
151
  await sleep(config.delay)
104
152
  }
105
- } catch (e) {
106
- failed.push({ url, error: e.message })
107
153
  }
154
+
155
+ // Stop if we've hit maxPages
156
+ if (pages.length >= config.maxPages) break
108
157
  }
109
158
 
159
+ const duration = Date.now() - startTime
110
160
  const result = {
111
161
  startUrl,
112
162
  pages,
@@ -114,12 +164,13 @@ class CrawlEngine {
114
164
  total: visited.size,
115
165
  crawled: pages.length,
116
166
  failed: failed.length,
117
- duration: Date.now() - startTime
167
+ concurrency,
168
+ duration,
169
+ pagesPerSecond: pages.length > 0 ? +(pages.length / (duration / 1000)).toFixed(2) : 0
118
170
  },
119
171
  failed: failed.length > 0 ? failed : undefined
120
172
  }
121
173
 
122
- // Merge mode: combine all pages into single content
123
174
  if (config.merge) {
124
175
  result.merged = pages.map(p => {
125
176
  return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
@@ -147,7 +198,6 @@ class CrawlEngine {
147
198
  }
148
199
  jobs.set(jobId, job)
149
200
 
150
- // Run crawl in background
151
201
  this.crawl(startUrl, opts, cookies)
152
202
  .then(result => {
153
203
  job.status = 'completed'
@@ -156,6 +206,8 @@ class CrawlEngine {
156
206
  job.finished = result.stats.crawled
157
207
  job.total = result.stats.total
158
208
  job.duration = result.stats.duration
209
+ job.concurrency = result.stats.concurrency
210
+ job.pagesPerSecond = result.stats.pagesPerSecond
159
211
  })
160
212
  .catch(err => {
161
213
  job.status = 'errored'
@@ -179,8 +231,9 @@ class CrawlEngine {
179
231
  finished: job.finished,
180
232
  total: job.total,
181
233
  pageCount: job.pages.length,
234
+ concurrency: job.concurrency,
235
+ pagesPerSecond: job.pagesPerSecond,
182
236
  error: job.error,
183
- // Only include pages if completed
184
237
  pages: job.status === 'completed' ? job.pages : undefined,
185
238
  failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
186
239
  duration: job.duration
@@ -200,18 +253,38 @@ class CrawlEngine {
200
253
  }))
201
254
  }
202
255
 
256
+ /**
257
+ * Get system info for crawl capacity estimation.
258
+ */
259
+ static getCapacity() {
260
+ const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
261
+ const freeMB = Math.floor(os.freemem() / 1024 / 1024)
262
+ const concurrency = detectConcurrency()
263
+ // Realistic: ~0.8s per page with fast mode, limited by shared browser pipeline
264
+ // Concurrency helps but not linearly — shared browser bottleneck
265
+ const effectiveConcurrency = Math.min(concurrency, 5) // diminishing returns past 5
266
+ const pagesPerMinute = Math.floor(effectiveConcurrency * 30) // ~2s effective per page with overhead
267
+ return {
268
+ totalRamMB: totalMB,
269
+ freeRamMB: freeMB,
270
+ maxConcurrency: concurrency,
271
+ estimatedPagesPerMinute: pagesPerMinute,
272
+ estimate100pages: `~${Math.ceil(100 / pagesPerMinute)} min`,
273
+ estimate1000pages: `~${Math.ceil(1000 / pagesPerMinute)} min`
274
+ }
275
+ }
276
+
203
277
  async _fetchPage(url, config, cookies) {
204
- // Use our own browse engine (Camoufox) — no external dependencies
205
278
  try {
206
279
  const result = await this.browseEngine.browse(url, {
207
280
  stealth: config.stealth,
208
281
  _cookies: cookies,
209
282
  timeout: config.timeout,
210
- html: true, // request raw HTML for link extraction
211
- noCache: true // always fetch fresh for crawling
283
+ html: true,
284
+ noCache: true,
285
+ fastMode: true // crawl mode: reduced delays for speed
212
286
  })
213
287
  if (result?.content) {
214
- // Extract links from HTML if available, otherwise from markdown content
215
288
  const linkSource = result.html || result.content
216
289
  return {
217
290
  title: result.title || '',
@@ -222,7 +295,6 @@ class CrawlEngine {
222
295
  } catch (e) {
223
296
  throw new Error(`Failed to fetch ${url}: ${e.message}`)
224
297
  }
225
-
226
298
  return null
227
299
  }
228
300
 
@@ -231,20 +303,18 @@ class CrawlEngine {
231
303
  const parsed = new URL(url)
232
304
  if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
233
305
  if (scope === 'prefix') return url.startsWith(basePrefix)
234
- return true // 'any'
306
+ return true
235
307
  } catch {
236
308
  return false
237
309
  }
238
310
  }
239
311
 
240
312
  _matchesFilters(url, includePatterns, excludePatterns) {
241
- // Exclude takes priority
242
313
  if (excludePatterns && excludePatterns.length > 0) {
243
314
  for (const pattern of excludePatterns) {
244
315
  if (wildcardMatch(url, pattern)) return false
245
316
  }
246
317
  }
247
- // If include patterns specified, URL must match at least one
248
318
  if (includePatterns && includePatterns.length > 0) {
249
319
  return includePatterns.some(pattern => wildcardMatch(url, pattern))
250
320
  }
@@ -252,12 +322,9 @@ class CrawlEngine {
252
322
  }
253
323
  }
254
324
 
255
- /**
256
- * Wildcard matching: * matches anything except /, ** matches everything including /
257
- */
258
325
  function wildcardMatch(str, pattern) {
259
326
  const regex = pattern
260
- .replace(/[.+^${}()|[\]\\]/g, '\\$&') // escape regex chars
327
+ .replace(/[.+^${}()|[\]\\]/g, '\\$&')
261
328
  .replace(/\*\*/g, '{{GLOBSTAR}}')
262
329
  .replace(/\*/g, '[^/]*')
263
330
  .replace(/\{\{GLOBSTAR\}\}/g, '.*')
@@ -266,13 +333,11 @@ function wildcardMatch(str, pattern) {
266
333
 
267
334
  function extractLinks(content, baseUrl) {
268
335
  const links = []
269
- // Extract from href attributes (HTML)
270
336
  const hrefMatches = content.matchAll(/href=["']([^"']+)["']/gi)
271
337
  for (const m of hrefMatches) {
272
338
  const resolved = resolveUrl(m[1], baseUrl)
273
339
  if (resolved && !links.includes(resolved)) links.push(resolved)
274
340
  }
275
- // Extract from markdown links
276
341
  const mdMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
277
342
  for (const m of mdMatches) {
278
343
  if (!links.includes(m[2])) links.push(m[2])
@@ -293,7 +358,6 @@ function normalizeUrl(url) {
293
358
  try {
294
359
  const u = new URL(url)
295
360
  u.hash = ''
296
- // Remove trailing slash for consistency
297
361
  let href = u.href
298
362
  if (href.endsWith('/') && u.pathname !== '/') {
299
363
  href = href.slice(0, -1)
package/src/server.js CHANGED
@@ -55,10 +55,10 @@ const server = http.createServer(async (req, res) => {
55
55
  if (req.method === 'POST' && path === '/crawl') {
56
56
  const body = await readBody(req)
57
57
  const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
58
- includePatterns, excludePatterns, merge, async: asyncMode } = body
58
+ includePatterns, excludePatterns, merge, async: asyncMode, concurrency } = body
59
59
  if (!targetUrl) return error(res, 400, 'url is required')
60
60
 
61
- const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge }
61
+ const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency }
62
62
 
63
63
  if (asyncMode) {
64
64
  // Async mode: return job ID immediately
@@ -70,6 +70,16 @@ const server = http.createServer(async (req, res) => {
70
70
  return json(res, result)
71
71
  }
72
72
 
73
+ if (req.method === 'GET' && path === '/crawl/jobs') {
74
+ const jobList = spectrawl.listCrawlJobs()
75
+ return json(res, { jobs: jobList })
76
+ }
77
+
78
+ if (req.method === 'GET' && path === '/crawl/capacity') {
79
+ const { CrawlEngine } = require('./crawl')
80
+ return json(res, CrawlEngine.getCapacity())
81
+ }
82
+
73
83
  if (req.method === 'GET' && path.startsWith('/crawl/')) {
74
84
  const jobId = path.split('/crawl/')[1]
75
85
  if (!jobId) return error(res, 400, 'job ID is required')
@@ -78,11 +88,6 @@ const server = http.createServer(async (req, res) => {
78
88
  return json(res, job)
79
89
  }
80
90
 
81
- if (req.method === 'GET' && path === '/crawl/jobs') {
82
- const jobList = spectrawl.listCrawlJobs()
83
- return json(res, { jobs: jobList })
84
- }
85
-
86
91
  if (req.method === 'POST' && path === '/act') {
87
92
  const body = await readBody(req)
88
93
  const { platform, action, ...params } = body