spectrawl 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/crawl.js +108 -47
- package/src/server.js +12 -7
package/package.json
CHANGED
package/src/crawl.js
CHANGED
|
@@ -3,22 +3,30 @@
|
|
|
3
3
|
* Multi-page website crawler using our own browse engine (Camoufox).
|
|
4
4
|
* No external dependencies (no Jina, no Cloudflare).
|
|
5
5
|
* Supports sync + async (job-based) modes.
|
|
6
|
+
* Auto-detects system RAM and parallelizes crawling accordingly.
|
|
6
7
|
*/
|
|
7
8
|
|
|
8
9
|
const crypto = require('crypto')
|
|
10
|
+
const os = require('os')
|
|
11
|
+
|
|
12
|
+
// ~250MB per browser tab (Camoufox average)
|
|
13
|
+
const MB_PER_TAB = 250
|
|
14
|
+
// Reserve this much RAM for OS + other processes
|
|
15
|
+
const RESERVED_MB = 1500
|
|
9
16
|
|
|
10
17
|
const DEFAULT_OPTS = {
|
|
11
18
|
depth: 2,
|
|
12
19
|
maxPages: 50,
|
|
13
|
-
format: 'markdown',
|
|
14
|
-
delay:
|
|
15
|
-
stealth: true,
|
|
16
|
-
scope: 'domain',
|
|
20
|
+
format: 'markdown',
|
|
21
|
+
delay: 300, // ms between batch launches
|
|
22
|
+
stealth: true,
|
|
23
|
+
scope: 'domain',
|
|
17
24
|
timeout: 30000,
|
|
25
|
+
concurrency: 'auto', // 'auto' | number — auto-detect from RAM
|
|
18
26
|
includeLinks: true,
|
|
19
|
-
includePatterns: [],
|
|
20
|
-
excludePatterns: [],
|
|
21
|
-
merge: false,
|
|
27
|
+
includePatterns: [],
|
|
28
|
+
excludePatterns: [],
|
|
29
|
+
merge: false,
|
|
22
30
|
skipPatterns: [
|
|
23
31
|
/\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
|
|
24
32
|
/\/_next\//,
|
|
@@ -35,6 +43,21 @@ const DEFAULT_OPTS = {
|
|
|
35
43
|
// In-memory job store for async crawls
|
|
36
44
|
const jobs = new Map()
|
|
37
45
|
|
|
46
|
+
/**
|
|
47
|
+
* Calculate max safe concurrency based on available system RAM.
|
|
48
|
+
*/
|
|
49
|
+
function detectConcurrency() {
|
|
50
|
+
const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
|
|
51
|
+
const freeMB = Math.floor(os.freemem() / 1024 / 1024)
|
|
52
|
+
// Use the lower of: (free RAM) or (total - reserved)
|
|
53
|
+
const availableMB = Math.min(freeMB, totalMB - RESERVED_MB)
|
|
54
|
+
const maxTabs = Math.max(1, Math.floor(availableMB / MB_PER_TAB))
|
|
55
|
+
// Cap at 10 — diminishing returns and politeness
|
|
56
|
+
const concurrency = Math.min(maxTabs, 10)
|
|
57
|
+
console.log(`[crawl] RAM: ${totalMB}MB total, ${freeMB}MB free → ${concurrency} concurrent tabs`)
|
|
58
|
+
return concurrency
|
|
59
|
+
}
|
|
60
|
+
|
|
38
61
|
class CrawlEngine {
|
|
39
62
|
constructor(browseEngine, cache) {
|
|
40
63
|
this.browseEngine = browseEngine
|
|
@@ -42,16 +65,21 @@ class CrawlEngine {
|
|
|
42
65
|
}
|
|
43
66
|
|
|
44
67
|
/**
|
|
45
|
-
* Crawl a website starting from a URL
|
|
68
|
+
* Crawl a website starting from a URL.
|
|
69
|
+
* Automatically parallelizes based on available RAM.
|
|
46
70
|
*/
|
|
47
71
|
async crawl(startUrl, opts = {}, cookies = null) {
|
|
48
|
-
// Filter out undefined values from opts to avoid overriding defaults
|
|
49
72
|
const cleanOpts = Object.fromEntries(
|
|
50
73
|
Object.entries(opts).filter(([_, v]) => v !== undefined)
|
|
51
74
|
)
|
|
52
75
|
const config = { ...DEFAULT_OPTS, ...cleanOpts }
|
|
53
76
|
const startTime = Date.now()
|
|
54
77
|
|
|
78
|
+
// Determine concurrency
|
|
79
|
+
const concurrency = config.concurrency === 'auto'
|
|
80
|
+
? detectConcurrency()
|
|
81
|
+
: Math.max(1, Math.min(config.concurrency, 10))
|
|
82
|
+
|
|
55
83
|
const startParsed = new URL(startUrl)
|
|
56
84
|
const baseDomain = startParsed.hostname
|
|
57
85
|
const basePrefix = startUrl.replace(/\/$/, '')
|
|
@@ -60,23 +88,14 @@ class CrawlEngine {
|
|
|
60
88
|
const queue = [{ url: startUrl, depth: 0 }]
|
|
61
89
|
const pages = []
|
|
62
90
|
const failed = []
|
|
91
|
+
let activeCount = 0
|
|
63
92
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
const
|
|
67
|
-
if (visited.has(normalized)) continue
|
|
68
|
-
visited.add(normalized)
|
|
69
|
-
|
|
70
|
-
// Scope check
|
|
71
|
-
if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
|
|
72
|
-
// Skip pattern check
|
|
73
|
-
if (config.skipPatterns.some(p => p.test(url))) continue
|
|
74
|
-
// Include/exclude pattern check
|
|
75
|
-
if (!this._matchesFilters(url, config.includePatterns, config.excludePatterns)) continue
|
|
76
|
-
|
|
93
|
+
// Process queue with concurrency control
|
|
94
|
+
const processUrl = async (item) => {
|
|
95
|
+
const { url, depth } = item
|
|
77
96
|
try {
|
|
78
97
|
const page = await this._fetchPage(url, config, cookies)
|
|
79
|
-
if (!page) { failed.push({ url, error: 'empty' }); continue }
|
|
98
|
+
if (!page) { failed.push({ url, error: 'empty' }); return }
|
|
80
99
|
|
|
81
100
|
const links = page.links || []
|
|
82
101
|
pages.push({
|
|
@@ -93,20 +112,51 @@ class CrawlEngine {
|
|
|
93
112
|
const absLink = resolveUrl(link, url)
|
|
94
113
|
if (!absLink) continue
|
|
95
114
|
const normLink = normalizeUrl(absLink)
|
|
96
|
-
if (
|
|
97
|
-
|
|
98
|
-
|
|
115
|
+
if (visited.has(normLink)) continue
|
|
116
|
+
// Pre-filter before queueing
|
|
117
|
+
if (!this._inScope(absLink, baseDomain, basePrefix, config.scope)) continue
|
|
118
|
+
if (config.skipPatterns.some(p => p.test(absLink))) continue
|
|
119
|
+
if (!this._matchesFilters(absLink, config.includePatterns, config.excludePatterns)) continue
|
|
120
|
+
visited.add(normLink)
|
|
121
|
+
queue.push({ url: absLink, depth: depth + 1 })
|
|
99
122
|
}
|
|
100
123
|
}
|
|
124
|
+
} catch (e) {
|
|
125
|
+
failed.push({ url, error: e.message })
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// Seed the first URL
|
|
130
|
+
visited.add(normalizeUrl(startUrl))
|
|
131
|
+
|
|
132
|
+
// BFS with parallel workers
|
|
133
|
+
while (queue.length > 0 || activeCount > 0) {
|
|
134
|
+
// Launch up to `concurrency` parallel fetches
|
|
135
|
+
const batch = []
|
|
136
|
+
while (queue.length > 0 && batch.length < concurrency && (pages.length + activeCount + batch.length) < config.maxPages) {
|
|
137
|
+
batch.push(queue.shift())
|
|
138
|
+
}
|
|
101
139
|
|
|
140
|
+
if (batch.length === 0 && activeCount === 0) break
|
|
141
|
+
|
|
142
|
+
if (batch.length > 0) {
|
|
143
|
+
activeCount += batch.length
|
|
144
|
+
const results = await Promise.allSettled(
|
|
145
|
+
batch.map(item => processUrl(item))
|
|
146
|
+
)
|
|
147
|
+
activeCount -= batch.length
|
|
148
|
+
|
|
149
|
+
// Small delay between batches to be polite
|
|
102
150
|
if (queue.length > 0 && config.delay > 0) {
|
|
103
151
|
await sleep(config.delay)
|
|
104
152
|
}
|
|
105
|
-
} catch (e) {
|
|
106
|
-
failed.push({ url, error: e.message })
|
|
107
153
|
}
|
|
154
|
+
|
|
155
|
+
// Stop if we've hit maxPages
|
|
156
|
+
if (pages.length >= config.maxPages) break
|
|
108
157
|
}
|
|
109
158
|
|
|
159
|
+
const duration = Date.now() - startTime
|
|
110
160
|
const result = {
|
|
111
161
|
startUrl,
|
|
112
162
|
pages,
|
|
@@ -114,12 +164,13 @@ class CrawlEngine {
|
|
|
114
164
|
total: visited.size,
|
|
115
165
|
crawled: pages.length,
|
|
116
166
|
failed: failed.length,
|
|
117
|
-
|
|
167
|
+
concurrency,
|
|
168
|
+
duration,
|
|
169
|
+
pagesPerSecond: pages.length > 0 ? +(pages.length / (duration / 1000)).toFixed(2) : 0
|
|
118
170
|
},
|
|
119
171
|
failed: failed.length > 0 ? failed : undefined
|
|
120
172
|
}
|
|
121
173
|
|
|
122
|
-
// Merge mode: combine all pages into single content
|
|
123
174
|
if (config.merge) {
|
|
124
175
|
result.merged = pages.map(p => {
|
|
125
176
|
return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
|
|
@@ -147,7 +198,6 @@ class CrawlEngine {
|
|
|
147
198
|
}
|
|
148
199
|
jobs.set(jobId, job)
|
|
149
200
|
|
|
150
|
-
// Run crawl in background
|
|
151
201
|
this.crawl(startUrl, opts, cookies)
|
|
152
202
|
.then(result => {
|
|
153
203
|
job.status = 'completed'
|
|
@@ -156,6 +206,8 @@ class CrawlEngine {
|
|
|
156
206
|
job.finished = result.stats.crawled
|
|
157
207
|
job.total = result.stats.total
|
|
158
208
|
job.duration = result.stats.duration
|
|
209
|
+
job.concurrency = result.stats.concurrency
|
|
210
|
+
job.pagesPerSecond = result.stats.pagesPerSecond
|
|
159
211
|
})
|
|
160
212
|
.catch(err => {
|
|
161
213
|
job.status = 'errored'
|
|
@@ -179,8 +231,9 @@ class CrawlEngine {
|
|
|
179
231
|
finished: job.finished,
|
|
180
232
|
total: job.total,
|
|
181
233
|
pageCount: job.pages.length,
|
|
234
|
+
concurrency: job.concurrency,
|
|
235
|
+
pagesPerSecond: job.pagesPerSecond,
|
|
182
236
|
error: job.error,
|
|
183
|
-
// Only include pages if completed
|
|
184
237
|
pages: job.status === 'completed' ? job.pages : undefined,
|
|
185
238
|
failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
|
|
186
239
|
duration: job.duration
|
|
@@ -200,18 +253,35 @@ class CrawlEngine {
|
|
|
200
253
|
}))
|
|
201
254
|
}
|
|
202
255
|
|
|
256
|
+
/**
|
|
257
|
+
* Get system info for crawl capacity estimation.
|
|
258
|
+
*/
|
|
259
|
+
static getCapacity() {
|
|
260
|
+
const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
|
|
261
|
+
const freeMB = Math.floor(os.freemem() / 1024 / 1024)
|
|
262
|
+
const concurrency = detectConcurrency()
|
|
263
|
+
// Estimate: each page takes ~4s with stealth delays
|
|
264
|
+
const pagesPerMinute = concurrency * 15 // ~4s per page
|
|
265
|
+
return {
|
|
266
|
+
totalRamMB: totalMB,
|
|
267
|
+
freeRamMB: freeMB,
|
|
268
|
+
maxConcurrency: concurrency,
|
|
269
|
+
estimatedPagesPerMinute: pagesPerMinute,
|
|
270
|
+
estimate100pages: `~${Math.ceil(100 / pagesPerMinute)} min`,
|
|
271
|
+
estimate1000pages: `~${Math.ceil(1000 / pagesPerMinute)} min`
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
|
|
203
275
|
async _fetchPage(url, config, cookies) {
|
|
204
|
-
// Use our own browse engine (Camoufox) — no external dependencies
|
|
205
276
|
try {
|
|
206
277
|
const result = await this.browseEngine.browse(url, {
|
|
207
278
|
stealth: config.stealth,
|
|
208
279
|
_cookies: cookies,
|
|
209
280
|
timeout: config.timeout,
|
|
210
|
-
html: true,
|
|
211
|
-
noCache: true
|
|
281
|
+
html: true,
|
|
282
|
+
noCache: true
|
|
212
283
|
})
|
|
213
284
|
if (result?.content) {
|
|
214
|
-
// Extract links from HTML if available, otherwise from markdown content
|
|
215
285
|
const linkSource = result.html || result.content
|
|
216
286
|
return {
|
|
217
287
|
title: result.title || '',
|
|
@@ -222,7 +292,6 @@ class CrawlEngine {
|
|
|
222
292
|
} catch (e) {
|
|
223
293
|
throw new Error(`Failed to fetch ${url}: ${e.message}`)
|
|
224
294
|
}
|
|
225
|
-
|
|
226
295
|
return null
|
|
227
296
|
}
|
|
228
297
|
|
|
@@ -231,20 +300,18 @@ class CrawlEngine {
|
|
|
231
300
|
const parsed = new URL(url)
|
|
232
301
|
if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
|
|
233
302
|
if (scope === 'prefix') return url.startsWith(basePrefix)
|
|
234
|
-
return true
|
|
303
|
+
return true
|
|
235
304
|
} catch {
|
|
236
305
|
return false
|
|
237
306
|
}
|
|
238
307
|
}
|
|
239
308
|
|
|
240
309
|
_matchesFilters(url, includePatterns, excludePatterns) {
|
|
241
|
-
// Exclude takes priority
|
|
242
310
|
if (excludePatterns && excludePatterns.length > 0) {
|
|
243
311
|
for (const pattern of excludePatterns) {
|
|
244
312
|
if (wildcardMatch(url, pattern)) return false
|
|
245
313
|
}
|
|
246
314
|
}
|
|
247
|
-
// If include patterns specified, URL must match at least one
|
|
248
315
|
if (includePatterns && includePatterns.length > 0) {
|
|
249
316
|
return includePatterns.some(pattern => wildcardMatch(url, pattern))
|
|
250
317
|
}
|
|
@@ -252,12 +319,9 @@ class CrawlEngine {
|
|
|
252
319
|
}
|
|
253
320
|
}
|
|
254
321
|
|
|
255
|
-
/**
|
|
256
|
-
* Wildcard matching: * matches anything except /, ** matches everything including /
|
|
257
|
-
*/
|
|
258
322
|
function wildcardMatch(str, pattern) {
|
|
259
323
|
const regex = pattern
|
|
260
|
-
.replace(/[.+^${}()|[\]\\]/g, '\\$&')
|
|
324
|
+
.replace(/[.+^${}()|[\]\\]/g, '\\$&')
|
|
261
325
|
.replace(/\*\*/g, '{{GLOBSTAR}}')
|
|
262
326
|
.replace(/\*/g, '[^/]*')
|
|
263
327
|
.replace(/\{\{GLOBSTAR\}\}/g, '.*')
|
|
@@ -266,13 +330,11 @@ function wildcardMatch(str, pattern) {
|
|
|
266
330
|
|
|
267
331
|
function extractLinks(content, baseUrl) {
|
|
268
332
|
const links = []
|
|
269
|
-
// Extract from href attributes (HTML)
|
|
270
333
|
const hrefMatches = content.matchAll(/href=["']([^"']+)["']/gi)
|
|
271
334
|
for (const m of hrefMatches) {
|
|
272
335
|
const resolved = resolveUrl(m[1], baseUrl)
|
|
273
336
|
if (resolved && !links.includes(resolved)) links.push(resolved)
|
|
274
337
|
}
|
|
275
|
-
// Extract from markdown links
|
|
276
338
|
const mdMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
|
|
277
339
|
for (const m of mdMatches) {
|
|
278
340
|
if (!links.includes(m[2])) links.push(m[2])
|
|
@@ -293,7 +355,6 @@ function normalizeUrl(url) {
|
|
|
293
355
|
try {
|
|
294
356
|
const u = new URL(url)
|
|
295
357
|
u.hash = ''
|
|
296
|
-
// Remove trailing slash for consistency
|
|
297
358
|
let href = u.href
|
|
298
359
|
if (href.endsWith('/') && u.pathname !== '/') {
|
|
299
360
|
href = href.slice(0, -1)
|
package/src/server.js
CHANGED
|
@@ -55,10 +55,10 @@ const server = http.createServer(async (req, res) => {
|
|
|
55
55
|
if (req.method === 'POST' && path === '/crawl') {
|
|
56
56
|
const body = await readBody(req)
|
|
57
57
|
const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
|
|
58
|
-
includePatterns, excludePatterns, merge, async: asyncMode } = body
|
|
58
|
+
includePatterns, excludePatterns, merge, async: asyncMode, concurrency } = body
|
|
59
59
|
if (!targetUrl) return error(res, 400, 'url is required')
|
|
60
60
|
|
|
61
|
-
const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge }
|
|
61
|
+
const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency }
|
|
62
62
|
|
|
63
63
|
if (asyncMode) {
|
|
64
64
|
// Async mode: return job ID immediately
|
|
@@ -70,6 +70,16 @@ const server = http.createServer(async (req, res) => {
|
|
|
70
70
|
return json(res, result)
|
|
71
71
|
}
|
|
72
72
|
|
|
73
|
+
if (req.method === 'GET' && path === '/crawl/jobs') {
|
|
74
|
+
const jobList = spectrawl.listCrawlJobs()
|
|
75
|
+
return json(res, { jobs: jobList })
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if (req.method === 'GET' && path === '/crawl/capacity') {
|
|
79
|
+
const { CrawlEngine } = require('./crawl')
|
|
80
|
+
return json(res, CrawlEngine.getCapacity())
|
|
81
|
+
}
|
|
82
|
+
|
|
73
83
|
if (req.method === 'GET' && path.startsWith('/crawl/')) {
|
|
74
84
|
const jobId = path.split('/crawl/')[1]
|
|
75
85
|
if (!jobId) return error(res, 400, 'job ID is required')
|
|
@@ -78,11 +88,6 @@ const server = http.createServer(async (req, res) => {
|
|
|
78
88
|
return json(res, job)
|
|
79
89
|
}
|
|
80
90
|
|
|
81
|
-
if (req.method === 'GET' && path === '/crawl/jobs') {
|
|
82
|
-
const jobList = spectrawl.listCrawlJobs()
|
|
83
|
-
return json(res, { jobs: jobList })
|
|
84
|
-
}
|
|
85
|
-
|
|
86
91
|
if (req.method === 'POST' && path === '/act') {
|
|
87
92
|
const body = await readBody(req)
|
|
88
93
|
const { platform, action, ...params } = body
|