spectrawl 0.3.19 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  The unified web layer for AI agents. Search, browse, authenticate, and act on platforms — one package, self-hosted.
4
4
 
5
- **5,000 free searches/month** via Gemini Grounded Search. Full page scraping, stealth browsing, 24 platform adapters.
5
+ **5,000 free searches/month** via Gemini Grounded Search. Full site crawling, stealth browsing, 19 platform adapters.
6
6
 
7
7
  ## What It Does
8
8
 
@@ -55,8 +55,9 @@ Different tools for different needs.
55
55
  | Returns | Snippets + AI answer | Full page content + snippets |
56
56
  | Self-hosted | No | Yes |
57
57
  | Stealth browsing | No | Yes (Camoufox + Playwright) |
58
- | Platform posting | No | 24 adapters |
58
+ | Platform posting | No | 19 adapters |
59
59
  | Auth management | No | Cookie store + auto-refresh |
60
+ | Site crawling | No | ✅ Free (Jina + Playwright) |
60
61
  | Cached repeats | No | <1ms |
61
62
 
62
63
  **Tavily** is fast and simple — great for agents that need quick answers. **Spectrawl** returns richer data and does more (browse, auth, post) — but it's slower. Choose based on your use case.
@@ -109,6 +110,41 @@ console.log(page.screenshot) // PNG buffer (if requested)
109
110
 
110
111
  Auto-fallback: if Jina and readability return too little content (<200 chars), Spectrawl renders the page with Playwright and extracts from the rendered DOM. Tavily can't do this — they fail on JS-heavy pages.
111
112
 
113
+ ## Crawl
114
+
115
+ Give your agent the ability to read an entire website in one call. Free, no API costs.
116
+
117
+ Uses [Jina Reader](https://jina.ai/reader) (free, unlimited) with Playwright stealth fallback for JS-heavy sites.
118
+
119
+ ```js
120
+ // Crawl a docs site — returns clean markdown for every page
121
+ const result = await web.crawl('https://docs.example.com', {
122
+ depth: 2, // how many levels deep (default: 1)
123
+ maxPages: 50, // max pages to crawl (default: 50)
124
+ format: 'markdown', // markdown | html | json
125
+ delay: 300, // ms between requests (be polite)
126
+ stealth: false, // use Camoufox for anti-detect
127
+ auth: 'account' // use stored cookies (crawl behind logins)
128
+ })
129
+
130
+ result.pages // [{ url, title, content, links, depth }]
131
+ result.stats // { total, crawled, failed, duration }
132
+ ```
133
+
134
+ **vs Cloudflare's /crawl:**
135
+ - ✅ Free (self-hosted, no per-request cost)
136
+ - ✅ Crawls sites that block Cloudflare IPs
137
+ - ✅ Auth-aware — crawl behind login walls with stored cookies
138
+ - ✅ Stealth mode — bypasses bot detection
139
+ - ✅ Works for AI agents (50-200 pages, not millions)
140
+
141
+ **HTTP API:**
142
+ ```bash
143
+ curl -X POST http://localhost:3900/crawl \
144
+ -H "Content-Type: application/json" \
145
+ -d '{ "url": "https://docs.example.com", "depth": 2, "maxPages": 50 }'
146
+ ```
147
+
112
148
  ## Auth
113
149
 
114
150
  Persistent cookie storage (SQLite), multi-account management, automatic expiry detection.
@@ -124,9 +160,9 @@ const accounts = await web.auth.getStatus()
124
160
 
125
161
  Cookie refresh cron fires `cookie_expiring` and `cookie_expired` events before accounts go stale.
126
162
 
127
- ## Act — 24 Platform Adapters
163
+ ## Act — 19 Platform Adapters
128
164
 
129
- Post to 24+ platforms with one API:
165
+ Post to 19 platforms with one API:
130
166
 
131
167
  ```js
132
168
  await web.act('github', 'create-issue', { repo: 'user/repo', title: 'Bug report', body: '...' })
@@ -135,7 +171,7 @@ await web.act('devto', 'post', { title: '...', body: '...', tags: ['ai'] })
135
171
  await web.act('huggingface', 'create-repo', { name: 'my-model', type: 'model' })
136
172
  ```
137
173
 
138
- **Live tested:** GitHub ✅, Reddit ✅, Dev.to ✅, HuggingFace ✅, X (reads) ✅
174
+ **Live tested:** GitHub ✅, Reddit ✅, Dev.to ✅, HuggingFace ✅, X (reads) ✅, Hashnode ✅, Discord ✅, Product Hunt
139
175
 
140
176
  | Platform | Auth Method | Actions |
141
177
  |----------|-------------|---------|
@@ -154,7 +190,10 @@ await web.act('huggingface', 'create-repo', { name: 'my-model', type: 'model' })
154
190
  | Quora | Browser automation | answer |
155
191
  | HuggingFace | Hub API | repo, model card, upload |
156
192
  | BetaList | REST API | submit |
157
- | **14 Directories** | Generic adapter | submit |
193
+ | AlternativeTo | Cookie session | submit, claim |
194
+ | DevHunt | Supabase auth | submit, upvote |
195
+ | SaaSHub | Generic adapter | submit |
196
+ | **Generic Directory** | Configurable | submit |
158
197
 
159
198
  Built-in rate limiting, content dedup (MD5, 24h window), and dead letter queue for retries.
160
199
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.19",
3
+ "version": "0.4.0",
4
4
  "description": "The unified web layer for AI agents. Search (8 engines), crawl, stealth browse, auth, act on 19 platforms. Self-hosted.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/crawl.js ADDED
@@ -0,0 +1,249 @@
1
+ /**
2
+ * Spectrawl Crawl Engine
3
+ * Recursively crawls a website using Jina Reader (free) with Playwright fallback.
4
+ * Designed for AI agents: returns clean markdown, not raw HTML.
5
+ */
6
+
7
+ const https = require('https')
8
+ const http = require('http')
9
+
10
// Default crawl options. Callers may override any field via the `opts`
// argument to CrawlEngine.crawl(); unspecified fields fall back to these.
const DEFAULT_OPTS = {
  depth: 1,            // link levels to follow below the start URL
  maxPages: 50,        // hard cap on pages fetched per crawl
  format: 'markdown',  // output format: markdown | html | json
  delay: 300,          // ms pause between requests (politeness)
  stealth: false,      // use Camoufox anti-detect browser for fallback fetches
  scope: 'domain',     // which links to follow: domain | prefix | any
  timeout: 15000,      // per-page fetch timeout (ms)
  includeLinks: true,  // include each page's outbound links in results
  // URLs matching any of these patterns are never fetched.
  // NOTE: the previous bare /#/ pattern skipped EVERY URL containing a
  // fragment, so pages reachable only via "page#section" links were never
  // crawled (normalizeUrl already dedupes on the fragment-free URL).
  skipPatterns: [
    // Binary/static assets — also matched when followed by ?query or #fragment.
    /\.(png|jpg|jpeg|gif|svg|ico|webp|pdf|zip|gz|tar|mp4|mp3|woff|woff2|ttf|css)([?#]|$)/i,
    // Non-HTTP link schemes.
    /^mailto:/,
    /^tel:/,
    /^javascript:/,
  ]
}
27
+
28
class CrawlEngine {
  /**
   * @param {object} browseEngine - BrowseEngine used as the rendering fallback
   * @param {object} cache - shared cache instance (stored; not used directly here)
   */
  constructor(browseEngine, cache) {
    this.browseEngine = browseEngine
    this.cache = cache
  }

  /**
   * Crawl a website breadth-first starting from a URL.
   * @param {string} startUrl - Starting URL
   * @param {object} opts - Crawl options (see DEFAULT_OPTS)
   * @param {object} cookies - Optional auth cookies, forwarded to the browser fallback
   * @returns {Promise<{startUrl: string, pages: Array, stats: object, failed?: Array}>}
   */
  async crawl(startUrl, opts = {}, cookies = null) {
    const config = { ...DEFAULT_OPTS, ...opts }
    const startTime = Date.now()

    const baseDomain = new URL(startUrl).hostname
    const basePrefix = startUrl.replace(/\/$/, '')

    const visited = new Set()
    const queue = [{ url: startUrl, depth: 0 }]
    const pages = []
    const failed = []

    while (queue.length > 0 && pages.length < config.maxPages) {
      const { url, depth } = queue.shift()
      // Work with the fragment-free, trailing-slash-free form for dedup,
      // scoping, skip checks AND the fetch itself: fragments are client-side
      // only and previously caused fragment-bearing links to be mishandled.
      const normalized = normalizeUrl(url)
      if (visited.has(normalized)) continue
      visited.add(normalized)

      // Scope check (domain / prefix / any)
      if (!this._inScope(normalized, baseDomain, basePrefix, config.scope)) continue
      // Skip binary assets and non-HTTP schemes
      if (config.skipPatterns.some(p => p.test(normalized))) continue

      try {
        const page = await this._fetchPage(normalized, config, cookies)
        if (!page) { failed.push({ url: normalized, error: 'empty' }); continue }

        const links = page.links || []
        pages.push({
          url: normalized,
          title: page.title || '',
          content: page.content || '',
          links: config.includeLinks ? links : undefined,
          depth
        })

        // Enqueue children only while below the configured depth.
        if (depth < config.depth) {
          for (const link of links) {
            const absLink = resolveUrl(link, normalized)
            if (!absLink) continue
            if (!visited.has(normalizeUrl(absLink))) {
              queue.push({ url: absLink, depth: depth + 1 })
            }
          }
        }

        // Politeness delay between requests (skipped after the last page).
        if (queue.length > 0 && config.delay > 0) {
          await sleep(config.delay)
        }
      } catch (e) {
        failed.push({ url, error: e.message })
      }
    }

    return {
      startUrl,
      pages,
      stats: {
        total: visited.size,      // unique URLs considered, incl. skipped ones
        crawled: pages.length,
        failed: failed.length,
        duration: Date.now() - startTime
      },
      failed: failed.length > 0 ? failed : undefined
    }
  }

  /**
   * Fetch a single page: Jina Reader first (free, returns clean markdown),
   * then the Playwright-based browse engine for JS-heavy pages.
   * Returns null when neither source yields usable content.
   */
  async _fetchPage(url, config, cookies) {
    // Try Jina Reader first (free, fast, clean markdown)
    try {
      const jinaUrl = `https://r.jina.ai/${url}`
      const content = await fetchText(jinaUrl, {
        'Accept': 'text/markdown',
        'X-Return-Format': config.format === 'html' ? 'html' : 'markdown',
        'X-With-Links-Summary': 'true',
        'X-Timeout': '10'
      })

      // <=100 chars is treated as a failed extraction (e.g. a bot-wall stub).
      if (content && content.length > 100) {
        return parseJinaResponse(content, url)
      }
    } catch (e) {
      // fall through to Playwright
    }

    // Playwright fallback (stealth mode)
    try {
      const result = await this.browseEngine.browse(url, {
        stealth: config.stealth,
        _cookies: cookies,
        timeout: config.timeout
      })
      if (result?.content) {
        return {
          title: result.title || '',
          content: result.content,
          links: extractLinks(result.html || result.content, url)
        }
      }
    } catch (e) {
      throw new Error(`Failed to fetch ${url}: ${e.message}`)
    }

    return null
  }

  /**
   * True when `url` falls inside the crawl scope.
   * Malformed URLs are always out of scope.
   */
  _inScope(url, baseDomain, basePrefix, scope) {
    try {
      const parsed = new URL(url)
      if (scope === 'domain') return parsed.hostname === baseDomain
      if (scope === 'prefix') return url.startsWith(basePrefix)
      return true // 'any'
    } catch {
      return false
    }
  }
}
160
+
161
function parseJinaResponse(content, sourceUrl) {
  // Jina prefixes its markdown with header lines ("Title: ...") and may
  // append a links summary block; separate those from the page body.
  const MD_LINK = /\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g
  let pageTitle = ''
  let inLinksBlock = false
  const bodyLines = []
  const collected = []

  for (const rawLine of content.split('\n')) {
    if (rawLine.startsWith('Title:')) {
      pageTitle = rawLine.replace('Title:', '').trim()
      continue
    }
    if (rawLine.startsWith('Links/Buttons:') || rawLine.includes('## Links')) {
      inLinksBlock = true
      continue
    }
    if (inLinksBlock) {
      // Pull markdown-style [text](url) targets out of the summary block.
      for (const m of rawLine.matchAll(MD_LINK)) collected.push(m[2])
    } else {
      bodyLines.push(rawLine)
    }
  }

  // Inline links from the full raw response count too (order kept, deduped).
  for (const m of content.matchAll(MD_LINK)) {
    if (!collected.includes(m[2])) collected.push(m[2])
  }

  const body = bodyLines.join('\n')
  return {
    title: pageTitle || extractTitleFromMarkdown(body),
    content: body.trim(),
    links: [...new Set(collected)]
  }
}
195
+
196
function extractLinks(html, baseUrl) {
  // Collect every href="..."/href='...' value from raw HTML and resolve it
  // against the page URL. Insertion order is preserved; duplicates dropped.
  const seen = new Set()
  for (const [, href] of html.matchAll(/href=["']([^"']+)["']/gi)) {
    const absolute = resolveUrl(href, baseUrl)
    if (absolute) seen.add(absolute)
  }
  return [...seen]
}
205
+
206
function extractTitleFromMarkdown(content) {
  // First level-1 ATX heading ("# Title") at any line start, or '' if none.
  const heading = /^#\s+(.+)/m.exec(content)
  return heading === null ? '' : heading[1].trim()
}
210
+
211
/**
 * Resolve a possibly-relative link against a base URL.
 * @param {string} url - href value (absolute, relative, or non-HTTP scheme)
 * @param {string} base - page URL the link appeared on
 * @returns {string|null} absolute URL, or null when unparseable
 *
 * Fix: the previous `url.startsWith('http')` shortcut wrongly treated
 * relative paths such as "http-errors.html" as already absolute and returned
 * them unresolved; `new URL(url, base)` handles absolute and relative alike.
 */
function resolveUrl(url, base) {
  try {
    return new URL(url, base).href
  } catch {
    return null
  }
}
219
+
220
function normalizeUrl(url) {
  // Canonical form used for visited-set dedup: drop the #fragment and any
  // single trailing slash. Strings that fail to parse are returned untouched.
  try {
    const parsed = new URL(url)
    parsed.hash = ''
    const href = parsed.href
    return href.endsWith('/') ? href.slice(0, -1) : href
  } catch {
    return url
  }
}
229
+
230
/**
 * GET a URL and resolve with its body as a UTF-8 string.
 * @param {string} url - http(s) URL to fetch
 * @param {object} headers - extra request headers (merged over the default UA)
 * @returns {Promise<string>} response body
 * @throws {Error} on HTTP status >= 400, network error, or 15s timeout
 *
 * Fixes: error responses are now drained (`res.resume()`) so the socket is
 * released instead of lingering until the timeout; `setEncoding('utf8')`
 * prevents corruption of multi-byte characters split across chunk
 * boundaries (the old Buffer `+=` coercion decoded each chunk separately).
 */
function fetchText(url, headers = {}) {
  return new Promise((resolve, reject) => {
    const mod = url.startsWith('https') ? https : http
    const req = mod.request(url, { headers: { 'User-Agent': 'Spectrawl/1.0', ...headers } }, res => {
      if (res.statusCode >= 400) {
        res.resume() // drain so the socket is freed
        reject(new Error(`HTTP ${res.statusCode}`))
        return
      }
      res.setEncoding('utf8')
      let body = ''
      res.on('data', chunk => { body += chunk })
      res.on('end', () => resolve(body))
    })
    req.setTimeout(15000, () => { req.destroy(); reject(new Error('timeout')) })
    req.on('error', reject)
    req.end()
  })
}
244
+
245
/** Resolve after `ms` milliseconds (used for the politeness delay). */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms)
  })
}
248
+
249
+ module.exports = { CrawlEngine }
package/src/index.js CHANGED
@@ -7,6 +7,7 @@ const { SearchEngine } = require('./search')
7
7
  const { BrowseEngine } = require('./browse')
8
8
  const { AuthManager } = require('./auth')
9
9
  const { ActEngine } = require('./act')
10
+ const { CrawlEngine } = require('./crawl')
10
11
  const { Cache } = require('./cache')
11
12
  const { EventEmitter, EVENTS } = require('./events')
12
13
  const { CookieRefresher } = require('./auth/refresh')
@@ -36,6 +37,7 @@ class Spectrawl {
36
37
  this.browseEngine = new BrowseEngine(this.config.browse, this.cache)
37
38
  this.auth = new AuthManager(this.config.auth)
38
39
  this.actEngine = new ActEngine(this.config, this.auth, this.browseEngine)
40
+ this.crawlEngine = new CrawlEngine(this.browseEngine, this.cache)
39
41
  this.refresher = new CookieRefresher(this.auth, this.events, this.config.auth)
40
42
  }
41
43
 
@@ -75,6 +77,21 @@ class Spectrawl {
75
77
  return this.browseEngine.browse(url, opts)
76
78
  }
77
79
 
80
+ /**
81
+ * Crawl a website recursively. Returns clean markdown for every page.
82
+ * Uses Jina Reader (free) with Playwright stealth fallback.
83
+ * @param {string} url - Starting URL
84
+ * @param {object} opts - { depth, maxPages, format, delay, stealth, scope, auth }
85
+ * @returns {Promise<{pages[], stats, failed?}>}
86
+ */
87
+ async crawl(url, opts = {}) {
88
+ let cookies = null
89
+ if (opts.auth) {
90
+ cookies = await this.auth.getCookies(opts.auth)
91
+ }
92
+ return this.crawlEngine.crawl(url, opts, cookies)
93
+ }
94
+
78
95
  /**
79
96
  * Perform an authenticated action on a platform.
80
97
  * @param {string} platform - Platform name (x, reddit, devto, etc.)
package/src/server.js CHANGED
@@ -52,6 +52,14 @@ const server = http.createServer(async (req, res) => {
52
52
  return json(res, result)
53
53
  }
54
54
 
55
  // POST /crawl — recursively crawl a site.
  // Body: { url, depth?, maxPages?, format?, delay?, stealth?, scope?, auth? }
  // Responds with the CrawlEngine result ({ pages, stats, failed? }).
  if (req.method === 'POST' && path === '/crawl') {
    const body = await readBody(req)
    // Destructure with rename: `url` is already the parsed request URL above.
    const { url: targetUrl, depth, maxPages, format, delay, stealth, scope, auth } = body
    if (!targetUrl) return error(res, 400, 'url is required')
    const result = await spectrawl.crawl(targetUrl, { depth, maxPages, format, delay, stealth, scope, auth })
    return json(res, result)
  }
62
+
55
63
  if (req.method === 'POST' && path === '/act') {
56
64
  const body = await readBody(req)
57
65
  const { platform, action, ...params } = body
@@ -61,6 +69,65 @@ const server = http.createServer(async (req, res) => {
61
69
  return json(res, result)
62
70
  }
63
71
 
72
+ // Threads OAuth callback
73
+ if (req.method === 'GET' && path === '/auth/callback/threads') {
74
+ const code = url.searchParams.get('code')
75
+ const errParam = url.searchParams.get('error')
76
+ if (errParam) {
77
+ res.writeHead(200, { 'Content-Type': 'text/html' })
78
+ return res.end(`<h2>❌ Auth error: ${errParam}</h2>`)
79
+ }
80
+ if (!code) {
81
+ res.writeHead(400, { 'Content-Type': 'text/html' })
82
+ return res.end('<h2>❌ No code received</h2>')
83
+ }
84
+ try {
85
+ // Exchange code for token
86
+ const fetch = require('node:https')
87
+ const params = new URLSearchParams({
88
+ client_id: '1574846783732558',
89
+ client_secret: 'f8589ca3523b0ea5bab3fac2c2ae4c15',
90
+ code,
91
+ grant_type: 'authorization_code',
92
+ redirect_uri: 'https://gateway.xanos.org/auth/callback/threads'
93
+ })
94
+ const tokenRes = await new Promise((resolve, reject) => {
95
+ const postData = params.toString()
96
+ const options = {
97
+ hostname: 'graph.threads.net',
98
+ path: '/oauth/access_token',
99
+ method: 'POST',
100
+ headers: {
101
+ 'Content-Type': 'application/x-www-form-urlencoded',
102
+ 'Content-Length': Buffer.byteLength(postData)
103
+ }
104
+ }
105
+ const req2 = fetch.request(options, (r) => {
106
+ let data = ''
107
+ r.on('data', chunk => data += chunk)
108
+ r.on('end', () => resolve(JSON.parse(data)))
109
+ })
110
+ req2.on('error', reject)
111
+ req2.write(postData)
112
+ req2.end()
113
+ })
114
+ // Save to credentials
115
+ const fs = require('fs')
116
+ const credsPath = '/root/.openclaw/workspace-dijiclaw/.openclaw/credentials/threads-api.json'
117
+ const creds = JSON.parse(fs.readFileSync(credsPath, 'utf8'))
118
+ creds.user_token = tokenRes.access_token
119
+ creds.user_id = tokenRes.user_id
120
+ creds.token_type = tokenRes.token_type
121
+ creds.note = 'User token saved via OAuth callback'
122
+ fs.writeFileSync(credsPath, JSON.stringify(creds, null, 2))
123
+ res.writeHead(200, { 'Content-Type': 'text/html' })
124
+ return res.end('<h2>✅ Threads connected! You can close this tab.</h2>')
125
+ } catch (e) {
126
+ res.writeHead(500, { 'Content-Type': 'text/html' })
127
+ return res.end(`<h2>❌ Token exchange failed: ${e.message}</h2>`)
128
+ }
129
+ }
130
+
64
131
  return error(res, 404, 'Not found')
65
132
  } catch (err) {
66
133
  console.error('Server error:', err)