spectrawl 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/browse/index.js +15 -6
- package/src/crawl.js +111 -47
- package/src/server.js +12 -7
package/package.json
CHANGED
package/src/browse/index.js
CHANGED
|
@@ -127,12 +127,21 @@ class BrowseEngine {
|
|
|
127
127
|
|
|
128
128
|
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
|
|
129
129
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
130
|
+
if (opts.fastMode) {
|
|
131
|
+
// Crawl mode: minimal delays, just enough for lazy-load triggers
|
|
132
|
+
await page.waitForTimeout(400)
|
|
133
|
+
await page.evaluate(() => {
|
|
134
|
+
window.scrollBy({ top: 500, behavior: 'instant' })
|
|
135
|
+
})
|
|
136
|
+
await page.waitForTimeout(200)
|
|
137
|
+
} else {
|
|
138
|
+
// Normal browse: full human-like delays
|
|
139
|
+
await page.waitForTimeout(800 + Math.random() * 1500)
|
|
140
|
+
await page.evaluate(() => {
|
|
141
|
+
window.scrollBy({ top: Math.floor(Math.random() * 400) + 100, behavior: 'smooth' })
|
|
142
|
+
})
|
|
143
|
+
await page.waitForTimeout(300 + Math.random() * 700)
|
|
144
|
+
}
|
|
136
145
|
|
|
137
146
|
const result = {}
|
|
138
147
|
|
package/src/crawl.js
CHANGED
|
@@ -3,22 +3,30 @@
|
|
|
3
3
|
* Multi-page website crawler using our own browse engine (Camoufox).
|
|
4
4
|
* No external dependencies (no Jina, no Cloudflare).
|
|
5
5
|
* Supports sync + async (job-based) modes.
|
|
6
|
+
* Auto-detects system RAM and parallelizes crawling accordingly.
|
|
6
7
|
*/
|
|
7
8
|
|
|
8
9
|
const crypto = require('crypto')
|
|
10
|
+
const os = require('os')
|
|
11
|
+
|
|
12
|
+
// ~250MB per browser tab (Camoufox average)
|
|
13
|
+
const MB_PER_TAB = 250
|
|
14
|
+
// Reserve this much RAM for OS + other processes
|
|
15
|
+
const RESERVED_MB = 1500
|
|
9
16
|
|
|
10
17
|
const DEFAULT_OPTS = {
|
|
11
18
|
depth: 2,
|
|
12
19
|
maxPages: 50,
|
|
13
|
-
format: 'markdown',
|
|
14
|
-
delay:
|
|
15
|
-
stealth: true,
|
|
16
|
-
scope: 'domain',
|
|
20
|
+
format: 'markdown',
|
|
21
|
+
delay: 300, // ms between batch launches
|
|
22
|
+
stealth: true,
|
|
23
|
+
scope: 'domain',
|
|
17
24
|
timeout: 30000,
|
|
25
|
+
concurrency: 'auto', // 'auto' | number — auto-detect from RAM
|
|
18
26
|
includeLinks: true,
|
|
19
|
-
includePatterns: [],
|
|
20
|
-
excludePatterns: [],
|
|
21
|
-
merge: false,
|
|
27
|
+
includePatterns: [],
|
|
28
|
+
excludePatterns: [],
|
|
29
|
+
merge: false,
|
|
22
30
|
skipPatterns: [
|
|
23
31
|
/\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
|
|
24
32
|
/\/_next\//,
|
|
@@ -35,6 +43,21 @@ const DEFAULT_OPTS = {
|
|
|
35
43
|
// In-memory job store for async crawls
|
|
36
44
|
const jobs = new Map()
|
|
37
45
|
|
|
46
|
+
/**
|
|
47
|
+
* Calculate max safe concurrency based on available system RAM.
|
|
48
|
+
*/
|
|
49
|
+
function detectConcurrency() {
|
|
50
|
+
const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
|
|
51
|
+
const freeMB = Math.floor(os.freemem() / 1024 / 1024)
|
|
52
|
+
// Use the lower of: (free RAM) or (total - reserved)
|
|
53
|
+
const availableMB = Math.min(freeMB, totalMB - RESERVED_MB)
|
|
54
|
+
const maxTabs = Math.max(1, Math.floor(availableMB / MB_PER_TAB))
|
|
55
|
+
// Cap at 10 — diminishing returns and politeness
|
|
56
|
+
const concurrency = Math.min(maxTabs, 10)
|
|
57
|
+
console.log(`[crawl] RAM: ${totalMB}MB total, ${freeMB}MB free → ${concurrency} concurrent tabs`)
|
|
58
|
+
return concurrency
|
|
59
|
+
}
|
|
60
|
+
|
|
38
61
|
class CrawlEngine {
|
|
39
62
|
constructor(browseEngine, cache) {
|
|
40
63
|
this.browseEngine = browseEngine
|
|
@@ -42,16 +65,21 @@ class CrawlEngine {
|
|
|
42
65
|
}
|
|
43
66
|
|
|
44
67
|
/**
|
|
45
|
-
* Crawl a website starting from a URL
|
|
68
|
+
* Crawl a website starting from a URL.
|
|
69
|
+
* Automatically parallelizes based on available RAM.
|
|
46
70
|
*/
|
|
47
71
|
async crawl(startUrl, opts = {}, cookies = null) {
|
|
48
|
-
// Filter out undefined values from opts to avoid overriding defaults
|
|
49
72
|
const cleanOpts = Object.fromEntries(
|
|
50
73
|
Object.entries(opts).filter(([_, v]) => v !== undefined)
|
|
51
74
|
)
|
|
52
75
|
const config = { ...DEFAULT_OPTS, ...cleanOpts }
|
|
53
76
|
const startTime = Date.now()
|
|
54
77
|
|
|
78
|
+
// Determine concurrency
|
|
79
|
+
const concurrency = config.concurrency === 'auto'
|
|
80
|
+
? detectConcurrency()
|
|
81
|
+
: Math.max(1, Math.min(config.concurrency, 10))
|
|
82
|
+
|
|
55
83
|
const startParsed = new URL(startUrl)
|
|
56
84
|
const baseDomain = startParsed.hostname
|
|
57
85
|
const basePrefix = startUrl.replace(/\/$/, '')
|
|
@@ -60,23 +88,14 @@ class CrawlEngine {
|
|
|
60
88
|
const queue = [{ url: startUrl, depth: 0 }]
|
|
61
89
|
const pages = []
|
|
62
90
|
const failed = []
|
|
91
|
+
let activeCount = 0
|
|
63
92
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
const
|
|
67
|
-
if (visited.has(normalized)) continue
|
|
68
|
-
visited.add(normalized)
|
|
69
|
-
|
|
70
|
-
// Scope check
|
|
71
|
-
if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
|
|
72
|
-
// Skip pattern check
|
|
73
|
-
if (config.skipPatterns.some(p => p.test(url))) continue
|
|
74
|
-
// Include/exclude pattern check
|
|
75
|
-
if (!this._matchesFilters(url, config.includePatterns, config.excludePatterns)) continue
|
|
76
|
-
|
|
93
|
+
// Process queue with concurrency control
|
|
94
|
+
const processUrl = async (item) => {
|
|
95
|
+
const { url, depth } = item
|
|
77
96
|
try {
|
|
78
97
|
const page = await this._fetchPage(url, config, cookies)
|
|
79
|
-
if (!page) { failed.push({ url, error: 'empty' });
|
|
98
|
+
if (!page) { failed.push({ url, error: 'empty' }); return }
|
|
80
99
|
|
|
81
100
|
const links = page.links || []
|
|
82
101
|
pages.push({
|
|
@@ -93,20 +112,51 @@ class CrawlEngine {
|
|
|
93
112
|
const absLink = resolveUrl(link, url)
|
|
94
113
|
if (!absLink) continue
|
|
95
114
|
const normLink = normalizeUrl(absLink)
|
|
96
|
-
if (
|
|
97
|
-
|
|
98
|
-
|
|
115
|
+
if (visited.has(normLink)) continue
|
|
116
|
+
// Pre-filter before queueing
|
|
117
|
+
if (!this._inScope(absLink, baseDomain, basePrefix, config.scope)) continue
|
|
118
|
+
if (config.skipPatterns.some(p => p.test(absLink))) continue
|
|
119
|
+
if (!this._matchesFilters(absLink, config.includePatterns, config.excludePatterns)) continue
|
|
120
|
+
visited.add(normLink)
|
|
121
|
+
queue.push({ url: absLink, depth: depth + 1 })
|
|
99
122
|
}
|
|
100
123
|
}
|
|
124
|
+
} catch (e) {
|
|
125
|
+
failed.push({ url, error: e.message })
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// Seed the first URL
|
|
130
|
+
visited.add(normalizeUrl(startUrl))
|
|
131
|
+
|
|
132
|
+
// BFS with parallel workers
|
|
133
|
+
while (queue.length > 0 || activeCount > 0) {
|
|
134
|
+
// Launch up to `concurrency` parallel fetches
|
|
135
|
+
const batch = []
|
|
136
|
+
while (queue.length > 0 && batch.length < concurrency && (pages.length + activeCount + batch.length) < config.maxPages) {
|
|
137
|
+
batch.push(queue.shift())
|
|
138
|
+
}
|
|
101
139
|
|
|
140
|
+
if (batch.length === 0 && activeCount === 0) break
|
|
141
|
+
|
|
142
|
+
if (batch.length > 0) {
|
|
143
|
+
activeCount += batch.length
|
|
144
|
+
const results = await Promise.allSettled(
|
|
145
|
+
batch.map(item => processUrl(item))
|
|
146
|
+
)
|
|
147
|
+
activeCount -= batch.length
|
|
148
|
+
|
|
149
|
+
// Small delay between batches to be polite
|
|
102
150
|
if (queue.length > 0 && config.delay > 0) {
|
|
103
151
|
await sleep(config.delay)
|
|
104
152
|
}
|
|
105
|
-
} catch (e) {
|
|
106
|
-
failed.push({ url, error: e.message })
|
|
107
153
|
}
|
|
154
|
+
|
|
155
|
+
// Stop if we've hit maxPages
|
|
156
|
+
if (pages.length >= config.maxPages) break
|
|
108
157
|
}
|
|
109
158
|
|
|
159
|
+
const duration = Date.now() - startTime
|
|
110
160
|
const result = {
|
|
111
161
|
startUrl,
|
|
112
162
|
pages,
|
|
@@ -114,12 +164,13 @@ class CrawlEngine {
|
|
|
114
164
|
total: visited.size,
|
|
115
165
|
crawled: pages.length,
|
|
116
166
|
failed: failed.length,
|
|
117
|
-
|
|
167
|
+
concurrency,
|
|
168
|
+
duration,
|
|
169
|
+
pagesPerSecond: pages.length > 0 ? +(pages.length / (duration / 1000)).toFixed(2) : 0
|
|
118
170
|
},
|
|
119
171
|
failed: failed.length > 0 ? failed : undefined
|
|
120
172
|
}
|
|
121
173
|
|
|
122
|
-
// Merge mode: combine all pages into single content
|
|
123
174
|
if (config.merge) {
|
|
124
175
|
result.merged = pages.map(p => {
|
|
125
176
|
return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
|
|
@@ -147,7 +198,6 @@ class CrawlEngine {
|
|
|
147
198
|
}
|
|
148
199
|
jobs.set(jobId, job)
|
|
149
200
|
|
|
150
|
-
// Run crawl in background
|
|
151
201
|
this.crawl(startUrl, opts, cookies)
|
|
152
202
|
.then(result => {
|
|
153
203
|
job.status = 'completed'
|
|
@@ -156,6 +206,8 @@ class CrawlEngine {
|
|
|
156
206
|
job.finished = result.stats.crawled
|
|
157
207
|
job.total = result.stats.total
|
|
158
208
|
job.duration = result.stats.duration
|
|
209
|
+
job.concurrency = result.stats.concurrency
|
|
210
|
+
job.pagesPerSecond = result.stats.pagesPerSecond
|
|
159
211
|
})
|
|
160
212
|
.catch(err => {
|
|
161
213
|
job.status = 'errored'
|
|
@@ -179,8 +231,9 @@ class CrawlEngine {
|
|
|
179
231
|
finished: job.finished,
|
|
180
232
|
total: job.total,
|
|
181
233
|
pageCount: job.pages.length,
|
|
234
|
+
concurrency: job.concurrency,
|
|
235
|
+
pagesPerSecond: job.pagesPerSecond,
|
|
182
236
|
error: job.error,
|
|
183
|
-
// Only include pages if completed
|
|
184
237
|
pages: job.status === 'completed' ? job.pages : undefined,
|
|
185
238
|
failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
|
|
186
239
|
duration: job.duration
|
|
@@ -200,18 +253,38 @@ class CrawlEngine {
|
|
|
200
253
|
}))
|
|
201
254
|
}
|
|
202
255
|
|
|
256
|
+
/**
|
|
257
|
+
* Get system info for crawl capacity estimation.
|
|
258
|
+
*/
|
|
259
|
+
static getCapacity() {
|
|
260
|
+
const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
|
|
261
|
+
const freeMB = Math.floor(os.freemem() / 1024 / 1024)
|
|
262
|
+
const concurrency = detectConcurrency()
|
|
263
|
+
// Realistic: ~0.8s per page with fast mode, limited by shared browser pipeline
|
|
264
|
+
// Concurrency helps but not linearly — shared browser bottleneck
|
|
265
|
+
const effectiveConcurrency = Math.min(concurrency, 5) // diminishing returns past 5
|
|
266
|
+
const pagesPerMinute = Math.floor(effectiveConcurrency * 30) // ~2s effective per page with overhead
|
|
267
|
+
return {
|
|
268
|
+
totalRamMB: totalMB,
|
|
269
|
+
freeRamMB: freeMB,
|
|
270
|
+
maxConcurrency: concurrency,
|
|
271
|
+
estimatedPagesPerMinute: pagesPerMinute,
|
|
272
|
+
estimate100pages: `~${Math.ceil(100 / pagesPerMinute)} min`,
|
|
273
|
+
estimate1000pages: `~${Math.ceil(1000 / pagesPerMinute)} min`
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
|
|
203
277
|
async _fetchPage(url, config, cookies) {
|
|
204
|
-
// Use our own browse engine (Camoufox) — no external dependencies
|
|
205
278
|
try {
|
|
206
279
|
const result = await this.browseEngine.browse(url, {
|
|
207
280
|
stealth: config.stealth,
|
|
208
281
|
_cookies: cookies,
|
|
209
282
|
timeout: config.timeout,
|
|
210
|
-
html: true,
|
|
211
|
-
noCache: true
|
|
283
|
+
html: true,
|
|
284
|
+
noCache: true,
|
|
285
|
+
fastMode: true // crawl mode: reduced delays for speed
|
|
212
286
|
})
|
|
213
287
|
if (result?.content) {
|
|
214
|
-
// Extract links from HTML if available, otherwise from markdown content
|
|
215
288
|
const linkSource = result.html || result.content
|
|
216
289
|
return {
|
|
217
290
|
title: result.title || '',
|
|
@@ -222,7 +295,6 @@ class CrawlEngine {
|
|
|
222
295
|
} catch (e) {
|
|
223
296
|
throw new Error(`Failed to fetch ${url}: ${e.message}`)
|
|
224
297
|
}
|
|
225
|
-
|
|
226
298
|
return null
|
|
227
299
|
}
|
|
228
300
|
|
|
@@ -231,20 +303,18 @@ class CrawlEngine {
|
|
|
231
303
|
const parsed = new URL(url)
|
|
232
304
|
if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
|
|
233
305
|
if (scope === 'prefix') return url.startsWith(basePrefix)
|
|
234
|
-
return true
|
|
306
|
+
return true
|
|
235
307
|
} catch {
|
|
236
308
|
return false
|
|
237
309
|
}
|
|
238
310
|
}
|
|
239
311
|
|
|
240
312
|
_matchesFilters(url, includePatterns, excludePatterns) {
|
|
241
|
-
// Exclude takes priority
|
|
242
313
|
if (excludePatterns && excludePatterns.length > 0) {
|
|
243
314
|
for (const pattern of excludePatterns) {
|
|
244
315
|
if (wildcardMatch(url, pattern)) return false
|
|
245
316
|
}
|
|
246
317
|
}
|
|
247
|
-
// If include patterns specified, URL must match at least one
|
|
248
318
|
if (includePatterns && includePatterns.length > 0) {
|
|
249
319
|
return includePatterns.some(pattern => wildcardMatch(url, pattern))
|
|
250
320
|
}
|
|
@@ -252,12 +322,9 @@ class CrawlEngine {
|
|
|
252
322
|
}
|
|
253
323
|
}
|
|
254
324
|
|
|
255
|
-
/**
|
|
256
|
-
* Wildcard matching: * matches anything except /, ** matches everything including /
|
|
257
|
-
*/
|
|
258
325
|
function wildcardMatch(str, pattern) {
|
|
259
326
|
const regex = pattern
|
|
260
|
-
.replace(/[.+^${}()|[\]\\]/g, '\\$&')
|
|
327
|
+
.replace(/[.+^${}()|[\]\\]/g, '\\$&')
|
|
261
328
|
.replace(/\*\*/g, '{{GLOBSTAR}}')
|
|
262
329
|
.replace(/\*/g, '[^/]*')
|
|
263
330
|
.replace(/\{\{GLOBSTAR\}\}/g, '.*')
|
|
@@ -266,13 +333,11 @@ function wildcardMatch(str, pattern) {
|
|
|
266
333
|
|
|
267
334
|
function extractLinks(content, baseUrl) {
|
|
268
335
|
const links = []
|
|
269
|
-
// Extract from href attributes (HTML)
|
|
270
336
|
const hrefMatches = content.matchAll(/href=["']([^"']+)["']/gi)
|
|
271
337
|
for (const m of hrefMatches) {
|
|
272
338
|
const resolved = resolveUrl(m[1], baseUrl)
|
|
273
339
|
if (resolved && !links.includes(resolved)) links.push(resolved)
|
|
274
340
|
}
|
|
275
|
-
// Extract from markdown links
|
|
276
341
|
const mdMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
|
|
277
342
|
for (const m of mdMatches) {
|
|
278
343
|
if (!links.includes(m[2])) links.push(m[2])
|
|
@@ -293,7 +358,6 @@ function normalizeUrl(url) {
|
|
|
293
358
|
try {
|
|
294
359
|
const u = new URL(url)
|
|
295
360
|
u.hash = ''
|
|
296
|
-
// Remove trailing slash for consistency
|
|
297
361
|
let href = u.href
|
|
298
362
|
if (href.endsWith('/') && u.pathname !== '/') {
|
|
299
363
|
href = href.slice(0, -1)
|
package/src/server.js
CHANGED
|
@@ -55,10 +55,10 @@ const server = http.createServer(async (req, res) => {
|
|
|
55
55
|
if (req.method === 'POST' && path === '/crawl') {
|
|
56
56
|
const body = await readBody(req)
|
|
57
57
|
const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
|
|
58
|
-
includePatterns, excludePatterns, merge, async: asyncMode } = body
|
|
58
|
+
includePatterns, excludePatterns, merge, async: asyncMode, concurrency } = body
|
|
59
59
|
if (!targetUrl) return error(res, 400, 'url is required')
|
|
60
60
|
|
|
61
|
-
const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge }
|
|
61
|
+
const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency }
|
|
62
62
|
|
|
63
63
|
if (asyncMode) {
|
|
64
64
|
// Async mode: return job ID immediately
|
|
@@ -70,6 +70,16 @@ const server = http.createServer(async (req, res) => {
|
|
|
70
70
|
return json(res, result)
|
|
71
71
|
}
|
|
72
72
|
|
|
73
|
+
if (req.method === 'GET' && path === '/crawl/jobs') {
|
|
74
|
+
const jobList = spectrawl.listCrawlJobs()
|
|
75
|
+
return json(res, { jobs: jobList })
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if (req.method === 'GET' && path === '/crawl/capacity') {
|
|
79
|
+
const { CrawlEngine } = require('./crawl')
|
|
80
|
+
return json(res, CrawlEngine.getCapacity())
|
|
81
|
+
}
|
|
82
|
+
|
|
73
83
|
if (req.method === 'GET' && path.startsWith('/crawl/')) {
|
|
74
84
|
const jobId = path.split('/crawl/')[1]
|
|
75
85
|
if (!jobId) return error(res, 400, 'job ID is required')
|
|
@@ -78,11 +88,6 @@ const server = http.createServer(async (req, res) => {
|
|
|
78
88
|
return json(res, job)
|
|
79
89
|
}
|
|
80
90
|
|
|
81
|
-
if (req.method === 'GET' && path === '/crawl/jobs') {
|
|
82
|
-
const jobList = spectrawl.listCrawlJobs()
|
|
83
|
-
return json(res, { jobs: jobList })
|
|
84
|
-
}
|
|
85
|
-
|
|
86
91
|
if (req.method === 'POST' && path === '/act') {
|
|
87
92
|
const body = await readBody(req)
|
|
88
93
|
const { platform, action, ...params } = body
|