spectrawl 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/crawl.js +233 -110
- package/src/index.js +21 -0
- package/src/server.js +30 -2
package/package.json
CHANGED
package/src/crawl.js
CHANGED
|
@@ -1,23 +1,38 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Spectrawl Crawl Engine
|
|
3
|
-
*
|
|
4
|
-
*
|
|
2
|
+
* Spectrawl Crawl Engine v2
|
|
3
|
+
* Multi-page website crawler using our own browse engine (Camoufox).
|
|
4
|
+
* No external dependencies (no Jina, no Cloudflare).
|
|
5
|
+
* Supports sync + async (job-based) modes.
|
|
6
|
+
* Auto-detects system RAM and parallelizes crawling accordingly.
|
|
5
7
|
*/
|
|
6
8
|
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
+
const crypto = require('crypto')
|
|
10
|
+
const os = require('os')
|
|
11
|
+
|
|
12
|
+
// ~250MB per browser tab (Camoufox average)
|
|
13
|
+
const MB_PER_TAB = 250
|
|
14
|
+
// Reserve this much RAM for OS + other processes
|
|
15
|
+
const RESERVED_MB = 1500
|
|
9
16
|
|
|
10
17
|
const DEFAULT_OPTS = {
|
|
11
|
-
depth:
|
|
18
|
+
depth: 2,
|
|
12
19
|
maxPages: 50,
|
|
13
|
-
format: 'markdown',
|
|
14
|
-
delay: 300,
|
|
15
|
-
stealth:
|
|
16
|
-
scope: 'domain',
|
|
17
|
-
timeout:
|
|
20
|
+
format: 'markdown',
|
|
21
|
+
delay: 300, // ms between batch launches
|
|
22
|
+
stealth: true,
|
|
23
|
+
scope: 'domain',
|
|
24
|
+
timeout: 30000,
|
|
25
|
+
concurrency: 'auto', // 'auto' | number — auto-detect from RAM
|
|
18
26
|
includeLinks: true,
|
|
27
|
+
includePatterns: [],
|
|
28
|
+
excludePatterns: [],
|
|
29
|
+
merge: false,
|
|
19
30
|
skipPatterns: [
|
|
20
|
-
/\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css)
|
|
31
|
+
/\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
|
|
32
|
+
/\/_next\//,
|
|
33
|
+
/\/static\//,
|
|
34
|
+
/\/assets\//,
|
|
35
|
+
/mintcdn\.com/,
|
|
21
36
|
/#/,
|
|
22
37
|
/^mailto:/,
|
|
23
38
|
/^tel:/,
|
|
@@ -25,6 +40,24 @@ const DEFAULT_OPTS = {
|
|
|
25
40
|
]
|
|
26
41
|
}
|
|
27
42
|
|
|
43
|
+
// In-memory job store for async crawls
|
|
44
|
+
const jobs = new Map()
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* Calculate max safe concurrency based on available system RAM.
|
|
48
|
+
*/
|
|
49
|
+
function detectConcurrency() {
|
|
50
|
+
const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
|
|
51
|
+
const freeMB = Math.floor(os.freemem() / 1024 / 1024)
|
|
52
|
+
// Use the lower of: (free RAM) or (total - reserved)
|
|
53
|
+
const availableMB = Math.min(freeMB, totalMB - RESERVED_MB)
|
|
54
|
+
const maxTabs = Math.max(1, Math.floor(availableMB / MB_PER_TAB))
|
|
55
|
+
// Cap at 10 — diminishing returns and politeness
|
|
56
|
+
const concurrency = Math.min(maxTabs, 10)
|
|
57
|
+
console.log(`[crawl] RAM: ${totalMB}MB total, ${freeMB}MB free → ${concurrency} concurrent tabs`)
|
|
58
|
+
return concurrency
|
|
59
|
+
}
|
|
60
|
+
|
|
28
61
|
class CrawlEngine {
|
|
29
62
|
constructor(browseEngine, cache) {
|
|
30
63
|
this.browseEngine = browseEngine
|
|
@@ -33,14 +66,20 @@ class CrawlEngine {
|
|
|
33
66
|
|
|
34
67
|
/**
|
|
35
68
|
* Crawl a website starting from a URL.
|
|
36
|
-
*
|
|
37
|
-
* @param {object} opts - Crawl options
|
|
38
|
-
* @param {object} cookies - Optional auth cookies
|
|
69
|
+
* Automatically parallelizes based on available RAM.
|
|
39
70
|
*/
|
|
40
71
|
async crawl(startUrl, opts = {}, cookies = null) {
|
|
41
|
-
const
|
|
72
|
+
const cleanOpts = Object.fromEntries(
|
|
73
|
+
Object.entries(opts).filter(([_, v]) => v !== undefined)
|
|
74
|
+
)
|
|
75
|
+
const config = { ...DEFAULT_OPTS, ...cleanOpts }
|
|
42
76
|
const startTime = Date.now()
|
|
43
77
|
|
|
78
|
+
// Determine concurrency
|
|
79
|
+
const concurrency = config.concurrency === 'auto'
|
|
80
|
+
? detectConcurrency()
|
|
81
|
+
: Math.max(1, Math.min(config.concurrency, 10))
|
|
82
|
+
|
|
44
83
|
const startParsed = new URL(startUrl)
|
|
45
84
|
const baseDomain = startParsed.hostname
|
|
46
85
|
const basePrefix = startUrl.replace(/\/$/, '')
|
|
@@ -49,21 +88,14 @@ class CrawlEngine {
|
|
|
49
88
|
const queue = [{ url: startUrl, depth: 0 }]
|
|
50
89
|
const pages = []
|
|
51
90
|
const failed = []
|
|
91
|
+
let activeCount = 0
|
|
52
92
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
const
|
|
56
|
-
if (visited.has(normalized)) continue
|
|
57
|
-
visited.add(normalized)
|
|
58
|
-
|
|
59
|
-
// Scope check
|
|
60
|
-
if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
|
|
61
|
-
// Skip pattern check
|
|
62
|
-
if (config.skipPatterns.some(p => p.test(url))) continue
|
|
63
|
-
|
|
93
|
+
// Process queue with concurrency control
|
|
94
|
+
const processUrl = async (item) => {
|
|
95
|
+
const { url, depth } = item
|
|
64
96
|
try {
|
|
65
97
|
const page = await this._fetchPage(url, config, cookies)
|
|
66
|
-
if (!page) { failed.push({ url, error: 'empty' });
|
|
98
|
+
if (!page) { failed.push({ url, error: 'empty' }); return }
|
|
67
99
|
|
|
68
100
|
const links = page.links || []
|
|
69
101
|
pages.push({
|
|
@@ -80,134 +112,236 @@ class CrawlEngine {
|
|
|
80
112
|
const absLink = resolveUrl(link, url)
|
|
81
113
|
if (!absLink) continue
|
|
82
114
|
const normLink = normalizeUrl(absLink)
|
|
83
|
-
if (
|
|
84
|
-
|
|
85
|
-
|
|
115
|
+
if (visited.has(normLink)) continue
|
|
116
|
+
// Pre-filter before queueing
|
|
117
|
+
if (!this._inScope(absLink, baseDomain, basePrefix, config.scope)) continue
|
|
118
|
+
if (config.skipPatterns.some(p => p.test(absLink))) continue
|
|
119
|
+
if (!this._matchesFilters(absLink, config.includePatterns, config.excludePatterns)) continue
|
|
120
|
+
visited.add(normLink)
|
|
121
|
+
queue.push({ url: absLink, depth: depth + 1 })
|
|
86
122
|
}
|
|
87
123
|
}
|
|
124
|
+
} catch (e) {
|
|
125
|
+
failed.push({ url, error: e.message })
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// Seed the first URL
|
|
130
|
+
visited.add(normalizeUrl(startUrl))
|
|
131
|
+
|
|
132
|
+
// BFS with parallel workers
|
|
133
|
+
while (queue.length > 0 || activeCount > 0) {
|
|
134
|
+
// Launch up to `concurrency` parallel fetches
|
|
135
|
+
const batch = []
|
|
136
|
+
while (queue.length > 0 && batch.length < concurrency && (pages.length + activeCount + batch.length) < config.maxPages) {
|
|
137
|
+
batch.push(queue.shift())
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
if (batch.length === 0 && activeCount === 0) break
|
|
141
|
+
|
|
142
|
+
if (batch.length > 0) {
|
|
143
|
+
activeCount += batch.length
|
|
144
|
+
const results = await Promise.allSettled(
|
|
145
|
+
batch.map(item => processUrl(item))
|
|
146
|
+
)
|
|
147
|
+
activeCount -= batch.length
|
|
88
148
|
|
|
149
|
+
// Small delay between batches to be polite
|
|
89
150
|
if (queue.length > 0 && config.delay > 0) {
|
|
90
151
|
await sleep(config.delay)
|
|
91
152
|
}
|
|
92
|
-
} catch (e) {
|
|
93
|
-
failed.push({ url, error: e.message })
|
|
94
153
|
}
|
|
154
|
+
|
|
155
|
+
// Stop if we've hit maxPages
|
|
156
|
+
if (pages.length >= config.maxPages) break
|
|
95
157
|
}
|
|
96
158
|
|
|
97
|
-
|
|
159
|
+
const duration = Date.now() - startTime
|
|
160
|
+
const result = {
|
|
98
161
|
startUrl,
|
|
99
162
|
pages,
|
|
100
163
|
stats: {
|
|
101
164
|
total: visited.size,
|
|
102
165
|
crawled: pages.length,
|
|
103
166
|
failed: failed.length,
|
|
104
|
-
|
|
167
|
+
concurrency,
|
|
168
|
+
duration,
|
|
169
|
+
pagesPerSecond: pages.length > 0 ? +(pages.length / (duration / 1000)).toFixed(2) : 0
|
|
105
170
|
},
|
|
106
171
|
failed: failed.length > 0 ? failed : undefined
|
|
107
172
|
}
|
|
173
|
+
|
|
174
|
+
if (config.merge) {
|
|
175
|
+
result.merged = pages.map(p => {
|
|
176
|
+
return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
|
|
177
|
+
}).join('\n\n---\n\n')
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
return result
|
|
108
181
|
}
|
|
109
182
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
183
|
+
/**
|
|
184
|
+
* Start an async crawl job. Returns job ID immediately.
|
|
185
|
+
*/
|
|
186
|
+
startJob(startUrl, opts = {}, cookies = null) {
|
|
187
|
+
const jobId = crypto.randomUUID()
|
|
188
|
+
const job = {
|
|
189
|
+
id: jobId,
|
|
190
|
+
startUrl,
|
|
191
|
+
status: 'running',
|
|
192
|
+
started: Date.now(),
|
|
193
|
+
finished: 0,
|
|
194
|
+
total: 0,
|
|
195
|
+
pages: [],
|
|
196
|
+
failed: [],
|
|
197
|
+
error: null
|
|
198
|
+
}
|
|
199
|
+
jobs.set(jobId, job)
|
|
200
|
+
|
|
201
|
+
this.crawl(startUrl, opts, cookies)
|
|
202
|
+
.then(result => {
|
|
203
|
+
job.status = 'completed'
|
|
204
|
+
job.pages = result.pages
|
|
205
|
+
job.failed = result.failed || []
|
|
206
|
+
job.finished = result.stats.crawled
|
|
207
|
+
job.total = result.stats.total
|
|
208
|
+
job.duration = result.stats.duration
|
|
209
|
+
job.concurrency = result.stats.concurrency
|
|
210
|
+
job.pagesPerSecond = result.stats.pagesPerSecond
|
|
211
|
+
})
|
|
212
|
+
.catch(err => {
|
|
213
|
+
job.status = 'errored'
|
|
214
|
+
job.error = err.message
|
|
119
215
|
})
|
|
120
216
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
217
|
+
return { jobId, status: 'running' }
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
/**
|
|
221
|
+
* Get job status/results.
|
|
222
|
+
*/
|
|
223
|
+
getJob(jobId) {
|
|
224
|
+
const job = jobs.get(jobId)
|
|
225
|
+
if (!job) return null
|
|
226
|
+
return {
|
|
227
|
+
id: job.id,
|
|
228
|
+
startUrl: job.startUrl,
|
|
229
|
+
status: job.status,
|
|
230
|
+
started: job.started,
|
|
231
|
+
finished: job.finished,
|
|
232
|
+
total: job.total,
|
|
233
|
+
pageCount: job.pages.length,
|
|
234
|
+
concurrency: job.concurrency,
|
|
235
|
+
pagesPerSecond: job.pagesPerSecond,
|
|
236
|
+
error: job.error,
|
|
237
|
+
pages: job.status === 'completed' ? job.pages : undefined,
|
|
238
|
+
failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
|
|
239
|
+
duration: job.duration
|
|
126
240
|
}
|
|
241
|
+
}
|
|
127
242
|
|
|
128
|
-
|
|
243
|
+
/**
|
|
244
|
+
* List all jobs.
|
|
245
|
+
*/
|
|
246
|
+
listJobs() {
|
|
247
|
+
return Array.from(jobs.values()).map(j => ({
|
|
248
|
+
id: j.id,
|
|
249
|
+
startUrl: j.startUrl,
|
|
250
|
+
status: j.status,
|
|
251
|
+
pageCount: j.pages.length,
|
|
252
|
+
started: j.started
|
|
253
|
+
}))
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
/**
|
|
257
|
+
* Get system info for crawl capacity estimation.
|
|
258
|
+
*/
|
|
259
|
+
static getCapacity() {
|
|
260
|
+
const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
|
|
261
|
+
const freeMB = Math.floor(os.freemem() / 1024 / 1024)
|
|
262
|
+
const concurrency = detectConcurrency()
|
|
263
|
+
// Estimate: each page takes ~4s with stealth delays
|
|
264
|
+
const pagesPerMinute = concurrency * 15 // ~4s per page
|
|
265
|
+
return {
|
|
266
|
+
totalRamMB: totalMB,
|
|
267
|
+
freeRamMB: freeMB,
|
|
268
|
+
maxConcurrency: concurrency,
|
|
269
|
+
estimatedPagesPerMinute: pagesPerMinute,
|
|
270
|
+
estimate100pages: `~${Math.ceil(100 / pagesPerMinute)} min`,
|
|
271
|
+
estimate1000pages: `~${Math.ceil(1000 / pagesPerMinute)} min`
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
async _fetchPage(url, config, cookies) {
|
|
129
276
|
try {
|
|
130
277
|
const result = await this.browseEngine.browse(url, {
|
|
131
278
|
stealth: config.stealth,
|
|
132
279
|
_cookies: cookies,
|
|
133
|
-
timeout: config.timeout
|
|
280
|
+
timeout: config.timeout,
|
|
281
|
+
html: true,
|
|
282
|
+
noCache: true
|
|
134
283
|
})
|
|
135
284
|
if (result?.content) {
|
|
285
|
+
const linkSource = result.html || result.content
|
|
136
286
|
return {
|
|
137
287
|
title: result.title || '',
|
|
138
288
|
content: result.content,
|
|
139
|
-
links: extractLinks(
|
|
289
|
+
links: extractLinks(linkSource, url)
|
|
140
290
|
}
|
|
141
291
|
}
|
|
142
292
|
} catch (e) {
|
|
143
293
|
throw new Error(`Failed to fetch ${url}: ${e.message}`)
|
|
144
294
|
}
|
|
145
|
-
|
|
146
295
|
return null
|
|
147
296
|
}
|
|
148
297
|
|
|
149
298
|
_inScope(url, baseDomain, basePrefix, scope) {
|
|
150
299
|
try {
|
|
151
300
|
const parsed = new URL(url)
|
|
152
|
-
if (scope === 'domain') return parsed.hostname === baseDomain
|
|
301
|
+
if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
|
|
153
302
|
if (scope === 'prefix') return url.startsWith(basePrefix)
|
|
154
|
-
return true
|
|
303
|
+
return true
|
|
155
304
|
} catch {
|
|
156
305
|
return false
|
|
157
306
|
}
|
|
158
307
|
}
|
|
159
|
-
}
|
|
160
308
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
const contentLines = []
|
|
167
|
-
let inLinksSummary = false
|
|
168
|
-
|
|
169
|
-
for (const line of lines) {
|
|
170
|
-
if (line.startsWith('Title:')) {
|
|
171
|
-
title = line.replace('Title:', '').trim()
|
|
172
|
-
} else if (line.startsWith('Links/Buttons:') || line.includes('## Links')) {
|
|
173
|
-
inLinksSummary = true
|
|
174
|
-
} else if (inLinksSummary) {
|
|
175
|
-
// Extract markdown links [text](url)
|
|
176
|
-
const matches = line.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
|
|
177
|
-
for (const m of matches) links.push(m[2])
|
|
178
|
-
} else {
|
|
179
|
-
contentLines.push(line)
|
|
309
|
+
_matchesFilters(url, includePatterns, excludePatterns) {
|
|
310
|
+
if (excludePatterns && excludePatterns.length > 0) {
|
|
311
|
+
for (const pattern of excludePatterns) {
|
|
312
|
+
if (wildcardMatch(url, pattern)) return false
|
|
313
|
+
}
|
|
180
314
|
}
|
|
315
|
+
if (includePatterns && includePatterns.length > 0) {
|
|
316
|
+
return includePatterns.some(pattern => wildcardMatch(url, pattern))
|
|
317
|
+
}
|
|
318
|
+
return true
|
|
181
319
|
}
|
|
320
|
+
}
|
|
182
321
|
|
|
183
|
-
|
|
184
|
-
const
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
return
|
|
190
|
-
title: title || extractTitleFromMarkdown(contentLines.join('\n')),
|
|
191
|
-
content: contentLines.join('\n').trim(),
|
|
192
|
-
links: [...new Set(links)]
|
|
193
|
-
}
|
|
322
|
+
function wildcardMatch(str, pattern) {
|
|
323
|
+
const regex = pattern
|
|
324
|
+
.replace(/[.+^${}()|[\]\\]/g, '\\$&')
|
|
325
|
+
.replace(/\*\*/g, '{{GLOBSTAR}}')
|
|
326
|
+
.replace(/\*/g, '[^/]*')
|
|
327
|
+
.replace(/\{\{GLOBSTAR\}\}/g, '.*')
|
|
328
|
+
return new RegExp('^' + regex + '$').test(str)
|
|
194
329
|
}
|
|
195
330
|
|
|
196
|
-
function extractLinks(
|
|
331
|
+
function extractLinks(content, baseUrl) {
|
|
197
332
|
const links = []
|
|
198
|
-
const
|
|
199
|
-
for (const m of
|
|
333
|
+
const hrefMatches = content.matchAll(/href=["']([^"']+)["']/gi)
|
|
334
|
+
for (const m of hrefMatches) {
|
|
200
335
|
const resolved = resolveUrl(m[1], baseUrl)
|
|
201
336
|
if (resolved && !links.includes(resolved)) links.push(resolved)
|
|
202
337
|
}
|
|
338
|
+
const mdMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
|
|
339
|
+
for (const m of mdMatches) {
|
|
340
|
+
if (!links.includes(m[2])) links.push(m[2])
|
|
341
|
+
}
|
|
203
342
|
return links
|
|
204
343
|
}
|
|
205
344
|
|
|
206
|
-
function extractTitleFromMarkdown(content) {
|
|
207
|
-
const match = content.match(/^#\s+(.+)/m)
|
|
208
|
-
return match ? match[1].trim() : ''
|
|
209
|
-
}
|
|
210
|
-
|
|
211
345
|
function resolveUrl(url, base) {
|
|
212
346
|
try {
|
|
213
347
|
if (url.startsWith('http')) return url
|
|
@@ -221,27 +355,16 @@ function normalizeUrl(url) {
|
|
|
221
355
|
try {
|
|
222
356
|
const u = new URL(url)
|
|
223
357
|
u.hash = ''
|
|
224
|
-
|
|
358
|
+
let href = u.href
|
|
359
|
+
if (href.endsWith('/') && u.pathname !== '/') {
|
|
360
|
+
href = href.slice(0, -1)
|
|
361
|
+
}
|
|
362
|
+
return href
|
|
225
363
|
} catch {
|
|
226
364
|
return url
|
|
227
365
|
}
|
|
228
366
|
}
|
|
229
367
|
|
|
230
|
-
function fetchText(url, headers = {}) {
|
|
231
|
-
return new Promise((resolve, reject) => {
|
|
232
|
-
const mod = url.startsWith('https') ? https : http
|
|
233
|
-
const req = mod.request(url, { headers: { 'User-Agent': 'Spectrawl/1.0', ...headers } }, res => {
|
|
234
|
-
if (res.statusCode >= 400) { reject(new Error(`HTTP ${res.statusCode}`)); return }
|
|
235
|
-
let d = ''
|
|
236
|
-
res.on('data', c => d += c)
|
|
237
|
-
res.on('end', () => resolve(d))
|
|
238
|
-
})
|
|
239
|
-
req.setTimeout(15000, () => { req.destroy(); reject(new Error('timeout')) })
|
|
240
|
-
req.on('error', reject)
|
|
241
|
-
req.end()
|
|
242
|
-
})
|
|
243
|
-
}
|
|
244
|
-
|
|
245
368
|
function sleep(ms) {
|
|
246
369
|
return new Promise(r => setTimeout(r, ms))
|
|
247
370
|
}
|
package/src/index.js
CHANGED
|
@@ -92,6 +92,27 @@ class Spectrawl {
|
|
|
92
92
|
return this.crawlEngine.crawl(url, opts, cookies)
|
|
93
93
|
}
|
|
94
94
|
|
|
95
|
+
/**
|
|
96
|
+
* Start an async crawl job. Returns job ID immediately.
|
|
97
|
+
*/
|
|
98
|
+
startCrawlJob(url, opts = {}) {
|
|
99
|
+
return this.crawlEngine.startJob(url, opts)
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Get crawl job status/results.
|
|
104
|
+
*/
|
|
105
|
+
getCrawlJob(jobId) {
|
|
106
|
+
return this.crawlEngine.getJob(jobId)
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* List all crawl jobs.
|
|
111
|
+
*/
|
|
112
|
+
listCrawlJobs() {
|
|
113
|
+
return this.crawlEngine.listJobs()
|
|
114
|
+
}
|
|
115
|
+
|
|
95
116
|
/**
|
|
96
117
|
* Perform an authenticated action on a platform.
|
|
97
118
|
* @param {string} platform - Platform name (x, reddit, devto, etc.)
|
package/src/server.js
CHANGED
|
@@ -54,12 +54,40 @@ const server = http.createServer(async (req, res) => {
|
|
|
54
54
|
|
|
55
55
|
if (req.method === 'POST' && path === '/crawl') {
|
|
56
56
|
const body = await readBody(req)
|
|
57
|
-
const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth
|
|
57
|
+
const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
|
|
58
|
+
includePatterns, excludePatterns, merge, async: asyncMode, concurrency } = body
|
|
58
59
|
if (!targetUrl) return error(res, 400, 'url is required')
|
|
59
|
-
|
|
60
|
+
|
|
61
|
+
const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency }
|
|
62
|
+
|
|
63
|
+
if (asyncMode) {
|
|
64
|
+
// Async mode: return job ID immediately
|
|
65
|
+
const job = spectrawl.startCrawlJob(targetUrl, opts)
|
|
66
|
+
return json(res, job)
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
const result = await spectrawl.crawl(targetUrl, opts)
|
|
60
70
|
return json(res, result)
|
|
61
71
|
}
|
|
62
72
|
|
|
73
|
+
if (req.method === 'GET' && path === '/crawl/jobs') {
|
|
74
|
+
const jobList = spectrawl.listCrawlJobs()
|
|
75
|
+
return json(res, { jobs: jobList })
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if (req.method === 'GET' && path === '/crawl/capacity') {
|
|
79
|
+
const { CrawlEngine } = require('./crawl')
|
|
80
|
+
return json(res, CrawlEngine.getCapacity())
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
if (req.method === 'GET' && path.startsWith('/crawl/')) {
|
|
84
|
+
const jobId = path.split('/crawl/')[1]
|
|
85
|
+
if (!jobId) return error(res, 400, 'job ID is required')
|
|
86
|
+
const job = spectrawl.getCrawlJob(jobId)
|
|
87
|
+
if (!job) return error(res, 404, 'job not found')
|
|
88
|
+
return json(res, job)
|
|
89
|
+
}
|
|
90
|
+
|
|
63
91
|
if (req.method === 'POST' && path === '/act') {
|
|
64
92
|
const body = await readBody(req)
|
|
65
93
|
const { platform, action, ...params } = body
|