spectrawl 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.4.0",
3
+ "version": "0.4.1",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), stealth browse, auth, act on 24 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/crawl.js CHANGED
@@ -1,23 +1,30 @@
1
1
  /**
2
- * Spectrawl Crawl Engine
3
- * Recursively crawls a website using Jina Reader (free) with Playwright fallback.
4
- * Designed for AI agents: returns clean markdown, not raw HTML.
2
+ * Spectrawl Crawl Engine v2
3
+ * Multi-page website crawler using our own browse engine (Camoufox).
4
+ * No external dependencies (no Jina, no Cloudflare).
5
+ * Supports sync + async (job-based) modes.
5
6
  */
6
7
 
7
- const https = require('https')
8
- const http = require('http')
8
+ const crypto = require('crypto')
9
9
 
10
10
  const DEFAULT_OPTS = {
11
- depth: 1,
11
+ depth: 2,
12
12
  maxPages: 50,
13
13
  format: 'markdown', // markdown | html | json
14
- delay: 300, // ms between requests
15
- stealth: false,
14
+ delay: 500, // ms between requests
15
+ stealth: true, // use stealth browsing by default
16
16
  scope: 'domain', // domain | prefix | any
17
- timeout: 15000,
17
+ timeout: 30000,
18
18
  includeLinks: true,
19
+ includePatterns: [], // wildcard patterns to include
20
+ excludePatterns: [], // wildcard patterns to exclude
21
+ merge: false, // merge all pages into single result
19
22
  skipPatterns: [
20
- /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css)$/i,
23
+ /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css|js)(\?|$)/i,
24
+ /\/_next\//,
25
+ /\/static\//,
26
+ /\/assets\//,
27
+ /mintcdn\.com/,
21
28
  /#/,
22
29
  /^mailto:/,
23
30
  /^tel:/,
@@ -25,6 +32,9 @@ const DEFAULT_OPTS = {
25
32
  ]
26
33
  }
27
34
 
35
+ // In-memory job store for async crawls
36
+ const jobs = new Map()
37
+
28
38
  class CrawlEngine {
29
39
  constructor(browseEngine, cache) {
30
40
  this.browseEngine = browseEngine
@@ -32,13 +42,14 @@ class CrawlEngine {
32
42
  }
33
43
 
34
44
  /**
35
- * Crawl a website starting from a URL.
36
- * @param {string} startUrl - Starting URL
37
- * @param {object} opts - Crawl options
38
- * @param {object} cookies - Optional auth cookies
45
+ * Crawl a website starting from a URL (synchronous — waits for completion).
39
46
  */
40
47
  async crawl(startUrl, opts = {}, cookies = null) {
41
- const config = { ...DEFAULT_OPTS, ...opts }
48
+ // Filter out undefined values from opts to avoid overriding defaults
49
+ const cleanOpts = Object.fromEntries(
50
+ Object.entries(opts).filter(([_, v]) => v !== undefined)
51
+ )
52
+ const config = { ...DEFAULT_OPTS, ...cleanOpts }
42
53
  const startTime = Date.now()
43
54
 
44
55
  const startParsed = new URL(startUrl)
@@ -60,6 +71,8 @@ class CrawlEngine {
60
71
  if (!this._inScope(url, baseDomain, basePrefix, config.scope)) continue
61
72
  // Skip pattern check
62
73
  if (config.skipPatterns.some(p => p.test(url))) continue
74
+ // Include/exclude pattern check
75
+ if (!this._matchesFilters(url, config.includePatterns, config.excludePatterns)) continue
63
76
 
64
77
  try {
65
78
  const page = await this._fetchPage(url, config, cookies)
@@ -94,7 +107,7 @@ class CrawlEngine {
94
107
  }
95
108
  }
96
109
 
97
- return {
110
+ const result = {
98
111
  startUrl,
99
112
  pages,
100
113
  stats: {
@@ -105,38 +118,105 @@ class CrawlEngine {
105
118
  },
106
119
  failed: failed.length > 0 ? failed : undefined
107
120
  }
121
+
122
+ // Merge mode: combine all pages into single content
123
+ if (config.merge) {
124
+ result.merged = pages.map(p => {
125
+ return `<!-- Source: ${p.url} -->\n# ${p.title || p.url}\n\n${p.content}`
126
+ }).join('\n\n---\n\n')
127
+ }
128
+
129
+ return result
108
130
  }
109
131
 
110
- async _fetchPage(url, config, cookies) {
111
- // Try Jina Reader first (free, fast, clean markdown)
112
- try {
113
- const jinaUrl = `https://r.jina.ai/${url}`
114
- const content = await fetchText(jinaUrl, {
115
- 'Accept': 'text/markdown',
116
- 'X-Return-Format': config.format === 'html' ? 'html' : 'markdown',
117
- 'X-With-Links-Summary': 'true',
118
- 'X-Timeout': '10'
132
+ /**
133
+ * Start an async crawl job. Returns job ID immediately.
134
+ */
135
+ startJob(startUrl, opts = {}, cookies = null) {
136
+ const jobId = crypto.randomUUID()
137
+ const job = {
138
+ id: jobId,
139
+ startUrl,
140
+ status: 'running',
141
+ started: Date.now(),
142
+ finished: 0,
143
+ total: 0,
144
+ pages: [],
145
+ failed: [],
146
+ error: null
147
+ }
148
+ jobs.set(jobId, job)
149
+
150
+ // Run crawl in background
151
+ this.crawl(startUrl, opts, cookies)
152
+ .then(result => {
153
+ job.status = 'completed'
154
+ job.pages = result.pages
155
+ job.failed = result.failed || []
156
+ job.finished = result.stats.crawled
157
+ job.total = result.stats.total
158
+ job.duration = result.stats.duration
159
+ })
160
+ .catch(err => {
161
+ job.status = 'errored'
162
+ job.error = err.message
119
163
  })
120
164
 
121
- if (content && content.length > 100) {
122
- return parseJinaResponse(content, url)
123
- }
124
- } catch (e) {
125
- // fall through to Playwright
165
+ return { jobId, status: 'running' }
166
+ }
167
+
168
+ /**
169
+ * Get job status/results.
170
+ */
171
+ getJob(jobId) {
172
+ const job = jobs.get(jobId)
173
+ if (!job) return null
174
+ return {
175
+ id: job.id,
176
+ startUrl: job.startUrl,
177
+ status: job.status,
178
+ started: job.started,
179
+ finished: job.finished,
180
+ total: job.total,
181
+ pageCount: job.pages.length,
182
+ error: job.error,
183
+ // Only include pages if completed
184
+ pages: job.status === 'completed' ? job.pages : undefined,
185
+ failed: job.status === 'completed' ? (job.failed.length > 0 ? job.failed : undefined) : undefined,
186
+ duration: job.duration
126
187
  }
188
+ }
189
+
190
+ /**
191
+ * List all jobs.
192
+ */
193
+ listJobs() {
194
+ return Array.from(jobs.values()).map(j => ({
195
+ id: j.id,
196
+ startUrl: j.startUrl,
197
+ status: j.status,
198
+ pageCount: j.pages.length,
199
+ started: j.started
200
+ }))
201
+ }
127
202
 
128
- // Playwright fallback (stealth mode)
203
+ async _fetchPage(url, config, cookies) {
204
+ // Use our own browse engine (Camoufox) — no external dependencies
129
205
  try {
130
206
  const result = await this.browseEngine.browse(url, {
131
207
  stealth: config.stealth,
132
208
  _cookies: cookies,
133
- timeout: config.timeout
209
+ timeout: config.timeout,
210
+ html: true, // request raw HTML for link extraction
211
+ noCache: true // always fetch fresh for crawling
134
212
  })
135
213
  if (result?.content) {
214
+ // Extract links from HTML if available, otherwise from markdown content
215
+ const linkSource = result.html || result.content
136
216
  return {
137
217
  title: result.title || '',
138
218
  content: result.content,
139
- links: extractLinks(result.html || result.content, url)
219
+ links: extractLinks(linkSource, url)
140
220
  }
141
221
  }
142
222
  } catch (e) {
@@ -149,65 +229,57 @@ class CrawlEngine {
149
229
  _inScope(url, baseDomain, basePrefix, scope) {
150
230
  try {
151
231
  const parsed = new URL(url)
152
- if (scope === 'domain') return parsed.hostname === baseDomain
232
+ if (scope === 'domain') return parsed.hostname === baseDomain || parsed.hostname.endsWith('.' + baseDomain)
153
233
  if (scope === 'prefix') return url.startsWith(basePrefix)
154
234
  return true // 'any'
155
235
  } catch {
156
236
  return false
157
237
  }
158
238
  }
159
- }
160
239
 
161
- function parseJinaResponse(content, sourceUrl) {
162
- // Jina returns markdown with a header block
163
- const lines = content.split('\n')
164
- let title = ''
165
- const links = []
166
- const contentLines = []
167
- let inLinksSummary = false
168
-
169
- for (const line of lines) {
170
- if (line.startsWith('Title:')) {
171
- title = line.replace('Title:', '').trim()
172
- } else if (line.startsWith('Links/Buttons:') || line.includes('## Links')) {
173
- inLinksSummary = true
174
- } else if (inLinksSummary) {
175
- // Extract markdown links [text](url)
176
- const matches = line.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
177
- for (const m of matches) links.push(m[2])
178
- } else {
179
- contentLines.push(line)
240
+ _matchesFilters(url, includePatterns, excludePatterns) {
241
+ // Exclude takes priority
242
+ if (excludePatterns && excludePatterns.length > 0) {
243
+ for (const pattern of excludePatterns) {
244
+ if (wildcardMatch(url, pattern)) return false
245
+ }
180
246
  }
247
+ // If include patterns specified, URL must match at least one
248
+ if (includePatterns && includePatterns.length > 0) {
249
+ return includePatterns.some(pattern => wildcardMatch(url, pattern))
250
+ }
251
+ return true
181
252
  }
253
+ }
182
254
 
183
- // Also extract inline links from content
184
- const inlineMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
185
- for (const m of inlineMatches) {
186
- if (!links.includes(m[2])) links.push(m[2])
187
- }
188
-
189
- return {
190
- title: title || extractTitleFromMarkdown(contentLines.join('\n')),
191
- content: contentLines.join('\n').trim(),
192
- links: [...new Set(links)]
193
- }
255
+ /**
256
+ * Wildcard matching: * matches anything except /, ** matches everything including /
257
+ */
258
+ function wildcardMatch(str, pattern) {
259
+ const regex = pattern
260
+ .replace(/[.+^${}()|[\]\\]/g, '\\$&') // escape regex chars
261
+ .replace(/\*\*/g, '{{GLOBSTAR}}')
262
+ .replace(/\*/g, '[^/]*')
263
+ .replace(/\{\{GLOBSTAR\}\}/g, '.*')
264
+ return new RegExp('^' + regex + '$').test(str)
194
265
  }
195
266
 
196
- function extractLinks(html, baseUrl) {
267
+ function extractLinks(content, baseUrl) {
197
268
  const links = []
198
- const matches = html.matchAll(/href=["']([^"']+)["']/gi)
199
- for (const m of matches) {
269
+ // Extract from href attributes (HTML)
270
+ const hrefMatches = content.matchAll(/href=["']([^"']+)["']/gi)
271
+ for (const m of hrefMatches) {
200
272
  const resolved = resolveUrl(m[1], baseUrl)
201
273
  if (resolved && !links.includes(resolved)) links.push(resolved)
202
274
  }
275
+ // Extract from markdown links
276
+ const mdMatches = content.matchAll(/\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g)
277
+ for (const m of mdMatches) {
278
+ if (!links.includes(m[2])) links.push(m[2])
279
+ }
203
280
  return links
204
281
  }
205
282
 
206
- function extractTitleFromMarkdown(content) {
207
- const match = content.match(/^#\s+(.+)/m)
208
- return match ? match[1].trim() : ''
209
- }
210
-
211
283
  function resolveUrl(url, base) {
212
284
  try {
213
285
  if (url.startsWith('http')) return url
@@ -221,27 +293,17 @@ function normalizeUrl(url) {
221
293
  try {
222
294
  const u = new URL(url)
223
295
  u.hash = ''
224
- return u.href.replace(/\/$/, '')
296
+ // Remove trailing slash for consistency
297
+ let href = u.href
298
+ if (href.endsWith('/') && u.pathname !== '/') {
299
+ href = href.slice(0, -1)
300
+ }
301
+ return href
225
302
  } catch {
226
303
  return url
227
304
  }
228
305
  }
229
306
 
230
- function fetchText(url, headers = {}) {
231
- return new Promise((resolve, reject) => {
232
- const mod = url.startsWith('https') ? https : http
233
- const req = mod.request(url, { headers: { 'User-Agent': 'Spectrawl/1.0', ...headers } }, res => {
234
- if (res.statusCode >= 400) { reject(new Error(`HTTP ${res.statusCode}`)); return }
235
- let d = ''
236
- res.on('data', c => d += c)
237
- res.on('end', () => resolve(d))
238
- })
239
- req.setTimeout(15000, () => { req.destroy(); reject(new Error('timeout')) })
240
- req.on('error', reject)
241
- req.end()
242
- })
243
- }
244
-
245
307
  function sleep(ms) {
246
308
  return new Promise(r => setTimeout(r, ms))
247
309
  }
package/src/index.js CHANGED
@@ -92,6 +92,27 @@ class Spectrawl {
92
92
  return this.crawlEngine.crawl(url, opts, cookies)
93
93
  }
94
94
 
95
+ /**
96
+ * Start an async crawl job. Returns job ID immediately.
97
+ */
98
+ startCrawlJob(url, opts = {}) {
99
+ return this.crawlEngine.startJob(url, opts)
100
+ }
101
+
102
+ /**
103
+ * Get crawl job status/results.
104
+ */
105
+ getCrawlJob(jobId) {
106
+ return this.crawlEngine.getJob(jobId)
107
+ }
108
+
109
+ /**
110
+ * List all crawl jobs.
111
+ */
112
+ listCrawlJobs() {
113
+ return this.crawlEngine.listJobs()
114
+ }
115
+
95
116
  /**
96
117
  * Perform an authenticated action on a platform.
97
118
  * @param {string} platform - Platform name (x, reddit, devto, etc.)
package/src/server.js CHANGED
@@ -54,12 +54,35 @@ const server = http.createServer(async (req, res) => {
54
54
 
55
55
  if (req.method === 'POST' && path === '/crawl') {
56
56
  const body = await readBody(req)
57
- const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth } = body
57
+ const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth,
58
+ includePatterns, excludePatterns, merge, async: asyncMode } = body
58
59
  if (!targetUrl) return error(res, 400, 'url is required')
59
- const result = await spectrawl.crawl(targetUrl, { depth, maxPages, format, delay, stealth, scope, auth })
60
+
61
+ const opts = { depth, maxPages, format, delay, stealth, scope, auth, includePatterns, excludePatterns, merge }
62
+
63
+ if (asyncMode) {
64
+ // Async mode: return job ID immediately
65
+ const job = spectrawl.startCrawlJob(targetUrl, opts)
66
+ return json(res, job)
67
+ }
68
+
69
+ const result = await spectrawl.crawl(targetUrl, opts)
60
70
  return json(res, result)
61
71
  }
62
72
 
73
+ if (req.method === 'GET' && path.startsWith('/crawl/')) {
74
+ const jobId = path.split('/crawl/')[1]
75
+ if (!jobId) return error(res, 400, 'job ID is required')
76
+ const job = spectrawl.getCrawlJob(jobId)
77
+ if (!job) return error(res, 404, 'job not found')
78
+ return json(res, job)
79
+ }
80
+
81
+ if (req.method === 'GET' && path === '/crawl/jobs') {
82
+ const jobList = spectrawl.listCrawlJobs()
83
+ return json(res, { jobs: jobList })
84
+ }
85
+
63
86
  if (req.method === 'POST' && path === '/act') {
64
87
  const body = await readBody(req)
65
88
  const { platform, action, ...params } = body