spectrawl 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/crawl.js +151 -89
- package/src/index.js +21 -0
- package/src/server.js +25 -2
package/package.json
CHANGED
package/src/crawl.js
CHANGED
|
@@ -1,23 +1,30 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Spectrawl Crawl Engine
|
|
3
|
-
*
|
|
4
|
-
*
|
|
2
|
+
* Spectrawl Crawl Engine v2
|
|
3
|
+
* Multi-page website crawler using our own browse engine (Camoufox).
|
|
4
|
+
* No external dependencies (no Jina, no Cloudflare).
|
|
5
|
+
* Supports sync + async (job-based) modes.
|
|
5
6
|
*/
|
|
6
7
|
|
|
7
|
-
const
|
|
8
|
-
const http = require('http')
|
|
8
|
+
const crypto = require('crypto')
|
|
9
9
|
|
|
10
10
|
const DEFAULT_OPTS = {
|
|
11
|
-
depth:
|
|
11
|
+
depth: 2,
|
|
12
12
|
maxPages: 50,
|
|
13
13
|
format: 'markdown', // markdown | html | json
|
|
14
|
-
delay:
|
|
15
|
-
stealth:
|
|
14
|
+
delay: 500, // ms between requests
|
|
15
|
+
stealth: true, // use stealth browsing by default
|
|
16
16
|
scope: 'domain', // domain | prefix | any
|
|
17
|
-
timeout:
|
|
17
|
+
timeout: 30000,
|
|
18
18
|
includeLinks: true,
|
|
19
|
+
includePatterns: [], // wildcard patterns to include
|
|
20
|
+
excludePatterns: [], // wildcard patterns to exclude
|
|
21
|
+
merge: false, // merge all pages into single result
|
|
19
22
|
skipPatterns: [
|
|
20
|
-
/\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css)
|
|
23
|
+
/\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
|
|
24
|
+
/\/_next\//,
|
|
25
|
+
/\/static\//,
|
|
26
|
+
/\/assets\//,
|
|
27
|
+
/mintcdn\.com/,
|
|
21
28
|
/#/,
|
|
22
29
|
/^mailto:/,
|
|
23
30
|
/^tel:/,
|
|
@@ -25,6 +32,9 @@ const DEFAULT_OPTS = {
|
|
|
25
32
|
]
|
|
26
33
|
}
|
|
27
34
|
|
|
35
|
+
// In-memory job store for async crawls
|
|
36
|
+
const jobs = new Map()
|
|
37
|
+
|
|
28
38
|
class CrawlEngine {
|
|
29
39
|
constructor(browseEngine, cache) {
|
|
30
40
|
this.browseEngine = browseEngine
|
|
@@ -32,13 +42,14 @@ class CrawlEngine {
|
|
|
32
42
|
}
|
|
33
43
|
|
|
34
44
|
/**
|
|
35
|
-
* Crawl a website starting from a URL.
|
|
36
|
-
* @param {string} startUrl - Starting URL
|
|
37
|
-
* @param {object} opts - Crawl options
|
|
38
|
-
* @param {object} cookies - Optional auth cookies
|
|
45
|
+
* Crawl a website starting from a URL (synchronous — waits for completion).
|
|
39
46
|
*/
|
|
40
47
|
async crawl(startUrl, opts = {}, cookies = null) {
|
|
41
|
-
|
|
48
|
+
// Filter out undefined values from opts to avoid overriding defaults
|
|
49
|
+
const cleanOpts = Object.fromEntries(
|
|
50
|
+
Object.entries(opts).filter(([_, v]) => v !== undefined)
|
|
51
|
+
)
|
|
52
|
+
const config = { ...DEFAULT_OPTS, ...cleanOpts }
|
|
42
53
|
const startTime = Date.now()
|
|
43
54
|
|
|
44
55
|
const startParsed = new URL(startUrl)
|
|
@@ -60,6 +71,8 @@ class CrawlEngine {
|
|
|
60
71
|
if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
|
|
61
72
|
// Skip pattern check
|
|
62
73
|
if (config.skipPatterns.some(p => p.test(url))) continue
|
|
74
|
+
// Include/exclude pattern check
|
|
75
|
+
if (!this._matchesFilters(url, config.includePatterns, config.excludePatterns)) continue
|
|
63
76
|
|
|
64
77
|
try {
|
|
65
78
|
const page = await this._fetchPage(url, config, cookies)
|
|
@@ -94,7 +107,7 @@ class CrawlEngine {
|
|
|
94
107
|
}
|
|
95
108
|
}
|
|
96
109
|
|
|
97
|
-
|
|
110
|
+
const result = {
|
|
98
111
|
startUrl,
|
|
99
112
|
pages,
|
|
100
113
|
stats: {
|
|
@@ -105,38 +118,105 @@ class CrawlEngine {
|
|
|
105
118
|
},
|
|
106
119
|
failed: failed.length > 0 ? failed : undefined
|
|
107
120
|
}
|
|
121
|
+
|
|
122
|
+
// Merge mode: combine all pages into single content
|
|
123
|
+
if (config.merge) {
|
|
124
|
+
result.merged = pages.map(p => {
|
|
125
|
+
return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
|
|
126
|
+
}).join('\n\n---\n\n')
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
return result
|
|
108
130
|
}
|
|
109
131
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
132
|
+
/**
|
|
133
|
+
* Start an async crawl job. Returns job ID immediately.
|
|
134
|
+
*/
|
|
135
|
+
startJob(startUrl, opts = {}, cookies = null) {
|
|
136
|
+
const jobId = crypto.randomUUID()
|
|
137
|
+
const job = {
|
|
138
|
+
id: jobId,
|
|
139
|
+
startUrl,
|
|
140
|
+
status: 'running',
|
|
141
|
+
started: Date.now(),
|
|
142
|
+
finished: 0,
|
|
143
|
+
total: 0,
|
|
144
|
+
pages: [],
|
|
145
|
+
failed: [],
|
|
146
|
+
error: null
|
|
147
|
+
}
|
|
148
|
+
jobs.set(jobId, job)
|
|
149
|
+
|
|
150
|
+
// Run crawl in background
|
|
151
|
+
this.crawl(startUrl, opts, cookies)
|
|
152
|
+
.then(result => {
|
|
153
|
+
job.status = 'completed'
|
|
154
|
+
job.pages = result.pages
|
|
155
|
+
job.failed = result.failed || []
|
|
156
|
+
job.finished = result.stats.crawled
|
|
157
|
+
job.total = result.stats.total
|
|
158
|
+
job.duration = result.stats.duration
|
|
159
|
+
})
|
|
160
|
+
.catch(err => {
|
|
161
|
+
job.status = 'errored'
|
|
162
|
+
job.error = err.message
|
|
119
163
|
})
|
|
120
164
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
165
|
+
return { jobId, status: 'running' }
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* Get job status/results.
|
|
170
|
+
*/
|
|
171
|
+
getJob(jobId) {
|
|
172
|
+
const job = jobs.get(jobId)
|
|
173
|
+
if (!job) return null
|
|
174
|
+
return {
|
|
175
|
+
id: job.id,
|
|
176
|
+
startUrl: job.startUrl,
|
|
177
|
+
status: job.status,
|
|
178
|
+
started: job.started,
|
|
179
|
+
finished: job.finished,
|
|
180
|
+
total: job.total,
|
|
181
|
+
pageCount: job.pages.length,
|
|
182
|
+
error: job.error,
|
|
183
|
+
// Only include pages if completed
|
|
184
|
+
pages: job.status === 'completed' ? job.pages : undefined,
|
|
185
|
+
failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
|
|
186
|
+
duration: job.duration
|
|
126
187
|
}
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
/**
|
|
191
|
+
* List all jobs.
|
|
192
|
+
*/
|
|
193
|
+
listJobs() {
|
|
194
|
+
return Array.from(jobs.values()).map(j => ({
|
|
195
|
+
id: j.id,
|
|
196
|
+
startUrl: j.startUrl,
|
|
197
|
+
status: j.status,
|
|
198
|
+
pageCount: j.pages.length,
|
|
199
|
+
started: j.started
|
|
200
|
+
}))
|
|
201
|
+
}
|
|
127
202
|
|
|
128
|
-
|
|
203
|
+
async _fetchPage(url, config, cookies) {
|
|
204
|
+
// Use our own browse engine (Camoufox) — no external dependencies
|
|
129
205
|
try {
|
|
130
206
|
const result = await this.browseEngine.browse(url, {
|
|
131
207
|
stealth: config.stealth,
|
|
132
208
|
_cookies: cookies,
|
|
133
|
-
timeout: config.timeout
|
|
209
|
+
timeout: config.timeout,
|
|
210
|
+
html: true, // request raw HTML for link extraction
|
|
211
|
+
noCache: true // always fetch fresh for crawling
|
|
134
212
|
})
|
|
135
213
|
if (result?.content) {
|
|
214
|
+
// Extract links from HTML if available, otherwise from markdown content
|
|
215
|
+
const linkSource = result.html || result.content
|
|
136
216
|
return {
|
|
137
217
|
title: result.title || '',
|
|
138
218
|
content: result.content,
|
|
139
|
-
links: extractLinks(
|
|
219
|
+
links: extractLinks(linkSource, url)
|
|
140
220
|
}
|
|
141
221
|
}
|
|
142
222
|
} catch (e) {
|
|
@@ -149,65 +229,57 @@ class CrawlEngine {
|
|
|
149
229
|
_inScope(url, baseDomain, basePrefix, scope) {
|
|
150
230
|
try {
|
|
151
231
|
const parsed = new URL(url)
|
|
152
|
-
if (scope === 'domain') return parsed.hostname === baseDomain
|
|
232
|
+
if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
|
|
153
233
|
if (scope === 'prefix') return url.startsWith(basePrefix)
|
|
154
234
|
return true // 'any'
|
|
155
235
|
} catch {
|
|
156
236
|
return false
|
|
157
237
|
}
|
|
158
238
|
}
|
|
159
|
-
}
|
|
160
239
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
let inLinksSummary = false
|
|
168
|
-
|
|
169
|
-
for (const line of lines) {
|
|
170
|
-
if (line.startsWith('Title:')) {
|
|
171
|
-
title = line.replace('Title:', '').trim()
|
|
172
|
-
} else if (line.startsWith('Links/Buttons:') || line.includes('## Links')) {
|
|
173
|
-
inLinksSummary = true
|
|
174
|
-
} else if (inLinksSummary) {
|
|
175
|
-
// Extract markdown links [text](url)
|
|
176
|
-
const matches = line.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
|
|
177
|
-
for (const m of matches) links.push(m[2])
|
|
178
|
-
} else {
|
|
179
|
-
contentLines.push(line)
|
|
240
|
+
_matchesFilters(url, includePatterns, excludePatterns) {
|
|
241
|
+
// Exclude takes priority
|
|
242
|
+
if (excludePatterns && excludePatterns.length > 0) {
|
|
243
|
+
for (const pattern of excludePatterns) {
|
|
244
|
+
if (wildcardMatch(url, pattern)) return false
|
|
245
|
+
}
|
|
180
246
|
}
|
|
247
|
+
// If include patterns specified, URL must match at least one
|
|
248
|
+
if (includePatterns && includePatterns.length > 0) {
|
|
249
|
+
return includePatterns.some(pattern => wildcardMatch(url, pattern))
|
|
250
|
+
}
|
|
251
|
+
return true
|
|
181
252
|
}
|
|
253
|
+
}
|
|
182
254
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
}
|
|
255
|
+
/**
 * Wildcard matching: * matches anything except /, ** matches everything including /.
 * Every other character of the pattern is matched literally, and the pattern
 * must cover the whole string.
 *
 * @param {string} str - String to test (a URL in this file)
 * @param {string} pattern - Wildcard pattern
 * @returns {boolean} true when `str` matches `pattern` in full
 */
function wildcardMatch(str, pattern) {
  // Single pass: wildcards become regex fragments, every regex metacharacter
  // (including `?`, which shows up in query strings) is escaped. Doing it in
  // one replace avoids the need for an intermediate placeholder token.
  const source = pattern.replace(/\*\*|\*|[.+?^${}()|[\]\\]/g, token => {
    if (token === '**') return '.*'
    if (token === '*') return '[^/]*'
    return '\\' + token
  })
  return new RegExp('^' + source + '$').test(str)
}
|
|
195
266
|
|
|
196
|
-
function extractLinks(
|
|
267
|
+
/**
 * Collect the links found in a page, de-duplicated and in first-seen order.
 *
 * Two sources are scanned: `href="..."` attributes (resolved against
 * `baseUrl` through resolveUrl, skipping anything it cannot resolve) and
 * absolute markdown links of the form `[text](http...)`.
 *
 * @param {string} content - Page HTML or markdown
 * @param {string} baseUrl - URL of the page, used to resolve relative hrefs
 * @returns {string[]} Unique link URLs
 */
function extractLinks(content, baseUrl) {
  // Nothing to scan (also guards against a missing body from the fetcher)
  if (typeof content !== 'string' || content.length === 0) return []

  // A Set keeps insertion order and makes de-duplication O(1) per link
  const found = new Set()

  // Extract from href attributes (HTML)
  for (const m of content.matchAll(/href=["']([^"']+)["']/gi)) {
    const resolved = resolveUrl(m[1], baseUrl)
    if (resolved) found.add(resolved)
  }

  // Extract from markdown links; an optional "title" after the URL is not part of it
  for (const m of content.matchAll(/\[[^\]]*\]\((https?:\/\/[^)\s]+)(?:\s+"[^"]*")?\)/g)) {
    found.add(m[1])
  }

  return [...found]
}
|
|
205
282
|
|
|
206
|
-
function extractTitleFromMarkdown(content) {
|
|
207
|
-
const match = content.match(/^#\s+(.+)/m)
|
|
208
|
-
return match ? match[1].trim() : ''
|
|
209
|
-
}
|
|
210
|
-
|
|
211
283
|
function resolveUrl(url, base) {
|
|
212
284
|
try {
|
|
213
285
|
if (url.startsWith('http')) return url
|
|
@@ -221,27 +293,17 @@ function normalizeUrl(url) {
|
|
|
221
293
|
try {
|
|
222
294
|
const u = new URL(url)
|
|
223
295
|
u.hash = ''
|
|
224
|
-
|
|
296
|
+
// Remove trailing slash for consistency
|
|
297
|
+
let href = u.href
|
|
298
|
+
if (href.endsWith('/') && u.pathname !== '/') {
|
|
299
|
+
href = href.slice(0, -1)
|
|
300
|
+
}
|
|
301
|
+
return href
|
|
225
302
|
} catch {
|
|
226
303
|
return url
|
|
227
304
|
}
|
|
228
305
|
}
|
|
229
306
|
|
|
230
|
-
function fetchText(url, headers = {}) {
|
|
231
|
-
return new Promise((resolve, reject) => {
|
|
232
|
-
const mod = url.startsWith('https') ? https : http
|
|
233
|
-
const req = mod.request(url, { headers: { 'User-Agent': 'Spectrawl/1.0', ...headers } }, res => {
|
|
234
|
-
if (res.statusCode >= 400) { reject(new Error(`HTTP ${res.statusCode}`)); return }
|
|
235
|
-
let d = ''
|
|
236
|
-
res.on('data', c => d += c)
|
|
237
|
-
res.on('end', () => resolve(d))
|
|
238
|
-
})
|
|
239
|
-
req.setTimeout(15000, () => { req.destroy(); reject(new Error('timeout')) })
|
|
240
|
-
req.on('error', reject)
|
|
241
|
-
req.end()
|
|
242
|
-
})
|
|
243
|
-
}
|
|
244
|
-
|
|
245
307
|
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed to setTimeout
 * @returns {Promise<void>} Resolves (with no value) once the timer fires
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms)
  })
}
|
package/src/index.js
CHANGED
|
@@ -92,6 +92,27 @@ class Spectrawl {
|
|
|
92
92
|
return this.crawlEngine.crawl(url, opts, cookies)
|
|
93
93
|
}
|
|
94
94
|
|
|
95
|
+
/**
|
|
96
|
+
* Start an async crawl job. Returns job ID immediately.
|
|
97
|
+
*/
|
|
98
|
+
startCrawlJob(url, opts = {}) {
|
|
99
|
+
return this.crawlEngine.startJob(url, opts)
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Get crawl job status/results.
|
|
104
|
+
*/
|
|
105
|
+
getCrawlJob(jobId) {
|
|
106
|
+
return this.crawlEngine.getJob(jobId)
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* List all crawl jobs.
|
|
111
|
+
*/
|
|
112
|
+
listCrawlJobs() {
|
|
113
|
+
return this.crawlEngine.listJobs()
|
|
114
|
+
}
|
|
115
|
+
|
|
95
116
|
/**
|
|
96
117
|
* Perform an authenticated action on a platform.
|
|
97
118
|
* @param {string} platform - Platform name (x, reddit, devto, etc.)
|
package/src/server.js
CHANGED
|
@@ -54,12 +54,35 @@ const server = http.createServer(async (req, res) => {
|
|
|
54
54
|
|
|
55
55
|
if (req.method === 'POST' && path === '/crawl') {
|
|
56
56
|
const body = await readBody(req)
|
|
57
|
-
const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth
|
|
57
|
+
const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
|
|
58
|
+
includePatterns, excludePatterns, merge, async: asyncMode } = body
|
|
58
59
|
if (!targetUrl) return error(res, 400, 'url is required')
|
|
59
|
-
|
|
60
|
+
|
|
61
|
+
const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge }
|
|
62
|
+
|
|
63
|
+
if (asyncMode) {
|
|
64
|
+
// Async mode: return job ID immediately
|
|
65
|
+
const job = spectrawl.startCrawlJob(targetUrl, opts)
|
|
66
|
+
return json(res, job)
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
const result = await spectrawl.crawl(targetUrl, opts)
|
|
60
70
|
return json(res, result)
|
|
61
71
|
}
|
|
62
72
|
|
|
73
|
+
if (req.method === 'GET' && path.startsWith('/crawl/')) {
|
|
74
|
+
const jobId = path.split('/crawl/')[1]
|
|
75
|
+
if (!jobId) return error(res, 400, 'job ID is required')
|
|
76
|
+
const job = spectrawl.getCrawlJob(jobId)
|
|
77
|
+
if (!job) return error(res, 404, 'job not found')
|
|
78
|
+
return json(res, job)
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
if (req.method === 'GET' && path === '/crawl/jobs') {
|
|
82
|
+
const jobList = spectrawl.listCrawlJobs()
|
|
83
|
+
return json(res, { jobs: jobList })
|
|
84
|
+
}
|
|
85
|
+
|
|
63
86
|
if (req.method === 'POST' && path === '/act') {
|
|
64
87
|
const body = await readBody(req)
|
|
65
88
|
const { platform, action, ...params } = body
|