spectrawl 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.4.0",
3
+ "version": "0.4.2",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/crawl.js CHANGED
@@ -1,23 +1,38 @@
1
1
  /**
2
- * Spectrawl Crawl Engine
3
- * Recursively crawls a website using Jina Reader (free) with Playwright fallback.
4
- * Designed for AI agents: returns clean markdown, not raw HTML.
2
+ * Spectrawl Crawl Engine v2
3
+ * Multi-page website crawler using our own browse engine (Camoufox).
4
+ * No external dependencies (no Jina, no Cloudflare).
5
+ * Supports sync + async (job-based) modes.
6
+ * Auto-detects system RAM and parallelizes crawling accordingly.
5
7
  */
6
8
 
7
- const https = require('https')
8
- const http = require('http')
9
+ const crypto = require('crypto')
10
+ const os = require('os')
11
+
12
+ // ~250MB per browser tab (Camoufox average)
13
+ const MB_PER_TAB = 250
14
+ // Reserve this much RAM for OS + other processes
15
+ const RESERVED_MB = 1500
9
16
 
10
17
  const DEFAULT_OPTS = {
11
- depth: 1,
18
+ depth: 2,
12
19
  maxPages: 50,
13
- format: 'markdown', // markdown | html | json
14
- delay: 300, // ms between requests
15
- stealth: false,
16
- scope: 'domain', // domain | prefix | any
17
- timeout: 15000,
20
+ format: 'markdown',
21
+ delay: 300, // ms between batch launches
22
+ stealth: true,
23
+ scope: 'domain',
24
+ timeout: 30000,
25
+ concurrency: 'auto', // 'auto' | number — auto-detect from RAM
18
26
  includeLinks: true,
27
+ includePatterns: [],
28
+ excludePatterns: [],
29
+ merge: false,
19
30
  skipPatterns: [
20
- /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css)$/i,
31
+ /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
32
+ /\/_next\//,
33
+ /\/static\//,
34
+ /\/assets\//,
35
+ /mintcdn\.com/,
21
36
  /#/,
22
37
  /^mailto:/,
23
38
  /^tel:/,
@@ -25,6 +40,24 @@ const DEFAULT_OPTS = {
25
40
  ]
26
41
  }
27
42
 
43
+ // In-memory job store for async crawls
44
+ const jobs = new Map()
45
+
46
+ /**
47
+ * Calculate max safe concurrency based on available system RAM.
48
+ */
49
+ function detectConcurrency() {
50
+ const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
51
+ const freeMB = Math.floor(os.freemem() / 1024 / 1024)
52
+ // Use the lower of: (free RAM) or (total - reserved)
53
+ const availableMB = Math.min(freeMB, totalMB - RESERVED_MB)
54
+ const maxTabs = Math.max(1, Math.floor(availableMB / MB_PER_TAB))
55
+ // Cap at 10 — diminishing returns and politeness
56
+ const concurrency = Math.min(maxTabs, 10)
57
+ console.log(`[crawl] RAM: ${totalMB}MB total, ${freeMB}MB free → ${concurrency} concurrent tabs`)
58
+ return concurrency
59
+ }
60
+
28
61
  class CrawlEngine {
29
62
  constructor(browseEngine, cache) {
30
63
  this.browseEngine = browseEngine
@@ -33,14 +66,20 @@ class CrawlEngine {
33
66
 
34
67
  /**
35
68
  * Crawl a website starting from a URL.
36
- * @param {string} startUrl - Starting URL
37
- * @param {object} opts - Crawl options
38
- * @param {object} cookies - Optional auth cookies
69
+ * Automatically parallelizes based on available RAM.
39
70
  */
40
71
  async crawl(startUrl, opts = {}, cookies = null) {
41
- const config = { ...DEFAULT_OPTS, ...opts }
72
+ const cleanOpts = Object.fromEntries(
73
+ Object.entries(opts).filter(([_, v]) => v !== undefined)
74
+ )
75
+ const config = { ...DEFAULT_OPTS, ...cleanOpts }
42
76
  const startTime = Date.now()
43
77
 
78
+ // Determine concurrency
79
+ const concurrency = config.concurrency === 'auto'
80
+ ? detectConcurrency()
81
+ : Math.max(1, Math.min(config.concurrency, 10))
82
+
44
83
  const startParsed = new URL(startUrl)
45
84
  const baseDomain = startParsed.hostname
46
85
  const basePrefix = startUrl.replace(/\/$/, '')
@@ -49,21 +88,14 @@ class CrawlEngine {
49
88
  const queue = [{ url: startUrl, depth: 0 }]
50
89
  const pages = []
51
90
  const failed = []
91
+ let activeCount = 0
52
92
 
53
- while (queue.length > 0 && pages.length < config.maxPages) {
54
- const { url, depth } = queue.shift()
55
- const normalized = normalizeUrl(url)
56
- if (visited.has(normalized)) continue
57
- visited.add(normalized)
58
-
59
- // Scope check
60
- if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
61
- // Skip pattern check
62
- if (config.skipPatterns.some(p => p.test(url))) continue
63
-
93
+ // Process queue with concurrency control
94
+ const processUrl = async (item) => {
95
+ const { url, depth } = item
64
96
  try {
65
97
  const page = await this._fetchPage(url, config, cookies)
66
- if (!page) { failed.push({ url, error: 'empty' }); continue }
98
+ if (!page) { failed.push({ url, error: 'empty' }); return }
67
99
 
68
100
  const links = page.links || []
69
101
  pages.push({
@@ -80,134 +112,236 @@ class CrawlEngine {
80
112
  const absLink = resolveUrl(link, url)
81
113
  if (!absLink) continue
82
114
  const normLink = normalizeUrl(absLink)
83
- if (!visited.has(normLink)) {
84
- queue.push({ url: absLink, depth: depth + 1 })
85
- }
115
+ if (visited.has(normLink)) continue
116
+ // Pre-filter before queueing
117
+ if (!this._inScope(absLink, baseDomain, basePrefix, config.scope)) continue
118
+ if (config.skipPatterns.some(p => p.test(absLink))) continue
119
+ if (!this._matchesFilters(absLink, config.includePatterns, config.excludePatterns)) continue
120
+ visited.add(normLink)
121
+ queue.push({ url: absLink, depth: depth + 1 })
86
122
  }
87
123
  }
124
+ } catch (e) {
125
+ failed.push({ url, error: e.message })
126
+ }
127
+ }
128
+
129
+ // Seed the first URL
130
+ visited.add(normalizeUrl(startUrl))
131
+
132
+ // BFS with parallel workers
133
+ while (queue.length > 0 || activeCount > 0) {
134
+ // Launch up to `concurrency` parallel fetches
135
+ const batch = []
136
+ while (queue.length > 0 && batch.length < concurrency && (pages.length + activeCount + batch.length) < config.maxPages) {
137
+ batch.push(queue.shift())
138
+ }
139
+
140
+ if (batch.length === 0 && activeCount === 0) break
141
+
142
+ if (batch.length > 0) {
143
+ activeCount += batch.length
144
+ const results = await Promise.allSettled(
145
+ batch.map(item => processUrl(item))
146
+ )
147
+ activeCount -= batch.length
88
148
 
149
+ // Small delay between batches to be polite
89
150
  if (queue.length > 0 && config.delay > 0) {
90
151
  await sleep(config.delay)
91
152
  }
92
- } catch (e) {
93
- failed.push({ url, error: e.message })
94
153
  }
154
+
155
+ // Stop if we've hit maxPages
156
+ if (pages.length >= config.maxPages) break
95
157
  }
96
158
 
97
- return {
159
+ const duration = Date.now() - startTime
160
+ const result = {
98
161
  startUrl,
99
162
  pages,
100
163
  stats: {
101
164
  total: visited.size,
102
165
  crawled: pages.length,
103
166
  failed: failed.length,
104
- duration: Date.now() - startTime
167
+ concurrency,
168
+ duration,
169
+ pagesPerSecond: pages.length > 0 ? +(pages.length / (duration / 1000)).toFixed(2) : 0
105
170
  },
106
171
  failed: failed.length > 0 ? failed : undefined
107
172
  }
173
+
174
+ if (config.merge) {
175
+ result.merged = pages.map(p => {
176
+ return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
177
+ }).join('\n\n---\n\n')
178
+ }
179
+
180
+ return result
108
181
  }
109
182
 
110
- async _fetchPage(url, config, cookies) {
111
- // Try Jina Reader first (free, fast, clean markdown)
112
- try {
113
- const jinaUrl = `https://r.jina.ai/${url}`
114
- const content = await fetchText(jinaUrl, {
115
- 'Accept': 'text/markdown',
116
- 'X-Return-Format': config.format === 'html' ? 'html' : 'markdown',
117
- 'X-With-Links-Summary': 'true',
118
- 'X-Timeout': '10'
183
+ /**
184
+ * Start an async crawl job. Returns job ID immediately.
185
+ */
186
+ startJob(startUrl, opts = {}, cookies = null) {
187
+ const jobId = crypto.randomUUID()
188
+ const job = {
189
+ id: jobId,
190
+ startUrl,
191
+ status: 'running',
192
+ started: Date.now(),
193
+ finished: 0,
194
+ total: 0,
195
+ pages: [],
196
+ failed: [],
197
+ error: null
198
+ }
199
+ jobs.set(jobId, job)
200
+
201
+ this.crawl(startUrl, opts, cookies)
202
+ .then(result => {
203
+ job.status = 'completed'
204
+ job.pages = result.pages
205
+ job.failed = result.failed || []
206
+ job.finished = result.stats.crawled
207
+ job.total = result.stats.total
208
+ job.duration = result.stats.duration
209
+ job.concurrency = result.stats.concurrency
210
+ job.pagesPerSecond = result.stats.pagesPerSecond
211
+ })
212
+ .catch(err => {
213
+ job.status = 'errored'
214
+ job.error = err.message
119
215
  })
120
216
 
121
- if (content && content.length > 100) {
122
- return parseJinaResponse(content, url)
123
- }
124
- } catch (e) {
125
- // fall through to Playwright
217
+ return { jobId, status: 'running' }
218
+ }
219
+
220
+ /**
221
+ * Get job status/results.
222
+ */
223
+ getJob(jobId) {
224
+ const job = jobs.get(jobId)
225
+ if (!job) return null
226
+ return {
227
+ id: job.id,
228
+ startUrl: job.startUrl,
229
+ status: job.status,
230
+ started: job.started,
231
+ finished: job.finished,
232
+ total: job.total,
233
+ pageCount: job.pages.length,
234
+ concurrency: job.concurrency,
235
+ pagesPerSecond: job.pagesPerSecond,
236
+ error: job.error,
237
+ pages: job.status === 'completed' ? job.pages : undefined,
238
+ failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
239
+ duration: job.duration
126
240
  }
241
+ }
127
242
 
128
- // Playwright fallback (stealth mode)
243
+ /**
244
+ * List all jobs.
245
+ */
246
+ listJobs() {
247
+ return Array.from(jobs.values()).map(j => ({
248
+ id: j.id,
249
+ startUrl: j.startUrl,
250
+ status: j.status,
251
+ pageCount: j.pages.length,
252
+ started: j.started
253
+ }))
254
+ }
255
+
256
+ /**
257
+ * Get system info for crawl capacity estimation.
258
+ */
259
+ static getCapacity() {
260
+ const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
261
+ const freeMB = Math.floor(os.freemem() / 1024 / 1024)
262
+ const concurrency = detectConcurrency()
263
+ // Estimate: each page takes ~4s with stealth delays
264
+ const pagesPerMinute = concurrency * 15 // ~4s per page
265
+ return {
266
+ totalRamMB: totalMB,
267
+ freeRamMB: freeMB,
268
+ maxConcurrency: concurrency,
269
+ estimatedPagesPerMinute: pagesPerMinute,
270
+ estimate100pages: `~${Math.ceil(100 / pagesPerMinute)} min`,
271
+ estimate1000pages: `~${Math.ceil(1000 / pagesPerMinute)} min`
272
+ }
273
+ }
274
+
275
+ async _fetchPage(url, config, cookies) {
129
276
  try {
130
277
  const result = await this.browseEngine.browse(url, {
131
278
  stealth: config.stealth,
132
279
  _cookies: cookies,
133
- timeout: config.timeout
280
+ timeout: config.timeout,
281
+ html: true,
282
+ noCache: true
134
283
  })
135
284
  if (result?.content) {
285
+ const linkSource = result.html || result.content
136
286
  return {
137
287
  title: result.title || '',
138
288
  content: result.content,
139
- links: extractLinks(result.html || result.content, url)
289
+ links: extractLinks(linkSource, url)
140
290
  }
141
291
  }
142
292
  } catch (e) {
143
293
  throw new Error(`Failed to fetch ${url}: ${e.message}`)
144
294
  }
145
-
146
295
  return null
147
296
  }
148
297
 
149
298
  _inScope(url, baseDomain, basePrefix, scope) {
150
299
  try {
151
300
  const parsed = new URL(url)
152
- if (scope === 'domain') return parsed.hostname === baseDomain
301
+ if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
153
302
  if (scope === 'prefix') return url.startsWith(basePrefix)
154
- return true // 'any'
303
+ return true
155
304
  } catch {
156
305
  return false
157
306
  }
158
307
  }
159
- }
160
308
 
161
- function parseJinaResponse(content, sourceUrl) {
162
- // Jina returns markdown with a header block
163
- const lines = content.split('\n')
164
- let title = ''
165
- const links = []
166
- const contentLines = []
167
- let inLinksSummary = false
168
-
169
- for (const line of lines) {
170
- if (line.startsWith('Title:')) {
171
- title = line.replace('Title:', '').trim()
172
- } else if (line.startsWith('Links/Buttons:') || line.includes('## Links')) {
173
- inLinksSummary = true
174
- } else if (inLinksSummary) {
175
- // Extract markdown links [text](url)
176
- const matches = line.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
177
- for (const m of matches) links.push(m[2])
178
- } else {
179
- contentLines.push(line)
309
+ _matchesFilters(url, includePatterns, excludePatterns) {
310
+ if (excludePatterns && excludePatterns.length > 0) {
311
+ for (const pattern of excludePatterns) {
312
+ if (wildcardMatch(url, pattern)) return false
313
+ }
180
314
  }
315
+ if (includePatterns && includePatterns.length > 0) {
316
+ return includePatterns.some(pattern => wildcardMatch(url, pattern))
317
+ }
318
+ return true
181
319
  }
320
+ }
182
321
 
183
- // Also extract inline links from content
184
- const inlineMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
185
- for (const m of inlineMatches) {
186
- if (!links.includes(m[2])) links.push(m[2])
187
- }
188
-
189
- return {
190
- title: title || extractTitleFromMarkdown(contentLines.join('\n')),
191
- content: contentLines.join('\n').trim(),
192
- links: [...new Set(links)]
193
- }
322
+ function wildcardMatch(str, pattern) {
323
+ const regex = pattern
324
+ .replace(/[.+^${}()|[\]\\]/g, '\\$&')
325
+ .replace(/\*\*/g, '{{GLOBSTAR}}')
326
+ .replace(/\*/g, '[^/]*')
327
+ .replace(/\{\{GLOBSTAR\}\}/g, '.*')
328
+ return new RegExp('^' + regex + '$').test(str)
194
329
  }
195
330
 
196
- function extractLinks(html, baseUrl) {
331
+ function extractLinks(content, baseUrl) {
197
332
  const links = []
198
- const matches = html.matchAll(/href=["']([^"']+)["']/gi)
199
- for (const m of matches) {
333
+ const hrefMatches = content.matchAll(/href=["']([^"']+)["']/gi)
334
+ for (const m of hrefMatches) {
200
335
  const resolved = resolveUrl(m[1], baseUrl)
201
336
  if (resolved && !links.includes(resolved)) links.push(resolved)
202
337
  }
338
+ const mdMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
339
+ for (const m of mdMatches) {
340
+ if (!links.includes(m[2])) links.push(m[2])
341
+ }
203
342
  return links
204
343
  }
205
344
 
206
- function extractTitleFromMarkdown(content) {
207
- const match = content.match(/^#\s+(.+)/m)
208
- return match ? match[1].trim() : ''
209
- }
210
-
211
345
  function resolveUrl(url, base) {
212
346
  try {
213
347
  if (url.startsWith('http')) return url
@@ -221,27 +355,16 @@ function normalizeUrl(url) {
221
355
  try {
222
356
  const u = new URL(url)
223
357
  u.hash = ''
224
- return u.href.replace(/\/$/, '')
358
+ let href = u.href
359
+ if (href.endsWith('/') && u.pathname !== '/') {
360
+ href = href.slice(0, -1)
361
+ }
362
+ return href
225
363
  } catch {
226
364
  return url
227
365
  }
228
366
  }
229
367
 
230
- function fetchText(url, headers = {}) {
231
- return new Promise((resolve, reject) => {
232
- const mod = url.startsWith('https') ? https : http
233
- const req = mod.request(url, { headers: { 'User-Agent': 'Spectrawl/1.0', ...headers } }, res => {
234
- if (res.statusCode >= 400) { reject(new Error(`HTTP ${res.statusCode}`)); return }
235
- let d = ''
236
- res.on('data', c => d += c)
237
- res.on('end', () => resolve(d))
238
- })
239
- req.setTimeout(15000, () => { req.destroy(); reject(new Error('timeout')) })
240
- req.on('error', reject)
241
- req.end()
242
- })
243
- }
244
-
245
368
  function sleep(ms) {
246
369
  return new Promise(r => setTimeout(r, ms))
247
370
  }
package/src/index.js CHANGED
@@ -92,6 +92,27 @@ class Spectrawl {
92
92
  return this.crawlEngine.crawl(url, opts, cookies)
93
93
  }
94
94
 
95
+ /**
96
+ * Start an async crawl job. Returns job ID immediately.
97
+ */
98
+ startCrawlJob(url, opts = {}) {
99
+ return this.crawlEngine.startJob(url, opts)
100
+ }
101
+
102
+ /**
103
+ * Get crawl job status/results.
104
+ */
105
+ getCrawlJob(jobId) {
106
+ return this.crawlEngine.getJob(jobId)
107
+ }
108
+
109
+ /**
110
+ * List all crawl jobs.
111
+ */
112
+ listCrawlJobs() {
113
+ return this.crawlEngine.listJobs()
114
+ }
115
+
95
116
  /**
96
117
  * Perform an authenticated action on a platform.
97
118
  * @param {string} platform - Platform name (x, reddit, devto, etc.)
package/src/server.js CHANGED
@@ -54,12 +54,40 @@ const server = http.createServer(async (req, res) => {
54
54
 
55
55
  if (req.method === 'POST' && path === '/crawl') {
56
56
  const body = await readBody(req)
57
- const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth } = body
57
+ const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
58
+ includePatterns, excludePatterns, merge, async: asyncMode, concurrency } = body
58
59
  if (!targetUrl) return error(res, 400, 'url is required')
59
- const result = await spectrawl.crawl(targetUrl, { depth, maxPages, format, delay, stealth, scope, auth })
60
+
61
+ const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge, concurrency }
62
+
63
+ if (asyncMode) {
64
+ // Async mode: return job ID immediately
65
+ const job = spectrawl.startCrawlJob(targetUrl, opts)
66
+ return json(res, job)
67
+ }
68
+
69
+ const result = await spectrawl.crawl(targetUrl, opts)
60
70
  return json(res, result)
61
71
  }
62
72
 
73
+ if (req.method === 'GET' && path === '/crawl/jobs') {
74
+ const jobList = spectrawl.listCrawlJobs()
75
+ return json(res, { jobs: jobList })
76
+ }
77
+
78
+ if (req.method === 'GET' && path === '/crawl/capacity') {
79
+ const { CrawlEngine } = require('./crawl')
80
+ return json(res, CrawlEngine.getCapacity())
81
+ }
82
+
83
+ if (req.method === 'GET' && path.startsWith('/crawl/')) {
84
+ const jobId = path.split('/crawl/')[1]
85
+ if (!jobId) return error(res, 400, 'job ID is required')
86
+ const job = spectrawl.getCrawlJob(jobId)
87
+ if (!job) return error(res, 404, 'job not found')
88
+ return json(res, job)
89
+ }
90
+
63
91
  if (req.method === 'POST' && path === '/act') {
64
92
  const body = await readBody(req)
65
93
  const { platform, action, ...params } = body