spectrawl 0.3.14 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.d.ts +1 -0
- package/package.json +1 -1
- package/src/browse/captcha-solver.js +162 -0
- package/src/browse/index.js +4 -0
- package/src/browse/install-stealth.js +51 -6
- package/src/search/scraper.js +2 -2
package/index.d.ts
CHANGED
|
@@ -17,6 +17,7 @@ declare module 'spectrawl' {
|
|
|
17
17
|
defaultEngine?: string
|
|
18
18
|
proxy?: { type: string; host: string; port: number; username?: string; password?: string }
|
|
19
19
|
humanlike?: { minDelay?: number; maxDelay?: number; scrollBehavior?: boolean }
|
|
20
|
+
captcha?: { apiKey?: string; model?: string }
|
|
20
21
|
}
|
|
21
22
|
auth?: {
|
|
22
23
|
refreshInterval?: string
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.14",
|
|
3
|
+
"version": "0.3.16",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
|
package/src/browse/captcha-solver.js
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
const https = require('https')
|
|
2
|
+
const fs = require('fs')
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* CAPTCHA solver using Gemini Vision.
|
|
6
|
+
* Free tier: 1,500 req/day (gemini-2.0-flash).
|
|
7
|
+
*
|
|
8
|
+
* Handles: image CAPTCHAs, text CAPTCHAs, simple challenges.
|
|
9
|
+
* Does NOT handle: reCAPTCHA v2/v3, hCaptcha, Cloudflare Turnstile
|
|
10
|
+
* (those require token solving services like 2captcha).
|
|
11
|
+
*
|
|
12
|
+
* Strategy: Playwright stealth bypasses most CAPTCHAs.
|
|
13
|
+
* This is the fallback when a visual CAPTCHA appears.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
class CaptchaSolver {
  /**
   * @param {{ apiKey?: string, model?: string }} [config]
   *   apiKey falls back to the GEMINI_API_KEY environment variable;
   *   model falls back to 'gemini-2.0-flash'.
   */
  constructor(config = {}) {
    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
    this.model = config.model || 'gemini-2.0-flash'
  }

  /**
   * Detect whether the page shows a CAPTCHA challenge.
   *
   * Runs inside the page via page.evaluate and always resolves to an object
   * (never null): { hasCaptcha, type, selector }. `type` is one of
   * 'hcaptcha' | 'turnstile' | 'recaptcha' | 'image' | 'text' | 'unknown',
   * or null when nothing was found. `selector` is the selector list that
   * matched, or null for text-only detection / no detection.
   *
   * @param {object} page - object exposing evaluate(fn)
   * @returns {Promise<{ hasCaptcha: boolean, type: string|null, selector: string|null }>}
   */
  async detect(page) {
    return page.evaluate(() => {
      // Order matters: hCaptcha and Turnstile widgets also carry a plain
      // `data-sitekey` attribute, so their specific markers must be tested
      // before the reCAPTCHA entry that contains the generic `[data-sitekey]`.
      const indicators = [
        // hCaptcha
        { selector: '.h-captcha, [data-hcaptcha-sitekey]', type: 'hcaptcha' },
        // Cloudflare Turnstile
        { selector: '.cf-turnstile, [data-turnstile-sitekey]', type: 'turnstile' },
        // reCAPTCHA (includes the generic sitekey attribute, hence last of the three)
        { selector: '.g-recaptcha, #recaptcha, [data-sitekey]', type: 'recaptcha' },
        // Image CAPTCHA (solvable with vision)
        { selector: 'img[src*="captcha"], img[alt*="captcha"], .captcha-image', type: 'image' },
        // Text/math CAPTCHA
        { selector: '[class*="captcha"] input, #captcha-input', type: 'text' },
      ]

      for (const { selector, type } of indicators) {
        if (document.querySelector(selector)) {
          return { hasCaptcha: true, type, selector }
        }
      }

      // No known element: fall back to phrases in the visible page text.
      const bodyText = document.body?.innerText?.toLowerCase() || ''
      if (bodyText.includes('verify you are human') || bodyText.includes('complete the captcha')) {
        return { hasCaptcha: true, type: 'unknown', selector: null }
      }

      return { hasCaptcha: false, type: null, selector: null }
    })
  }

  /**
   * Attempt to read a visual CAPTCHA using Gemini Vision.
   * Screenshots the element matching `captchaSelector`, sends the PNG to the
   * generateContent endpoint and returns the trimmed text of the first candidate.
   *
   * @param {object} page - object exposing $(selector)
   * @param {string} captchaSelector - selector of the CAPTCHA element
   * @returns {Promise<string|null>} the answer, or null when the response has no text
   * @throws {Error} when no API key is configured, the element is missing,
   *   the request fails, or the API responds with an error payload
   */
  async solveImage(page, captchaSelector) {
    if (!this.apiKey) {
      throw new Error('GEMINI_API_KEY required for CAPTCHA solving')
    }

    // Screenshot the CAPTCHA element
    const element = await page.$(captchaSelector)
    if (!element) throw new Error(`CAPTCHA element not found: ${captchaSelector}`)

    const screenshot = await element.screenshot({ type: 'png' })
    const base64 = screenshot.toString('base64')

    // Encode the key so an unusual character cannot corrupt the query string.
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${this.model}:generateContent?key=${encodeURIComponent(this.apiKey)}`
    const body = JSON.stringify({
      contents: [{
        parts: [
          { text: 'This is a CAPTCHA image. What text, numbers, or answer does it show? Reply with ONLY the answer, nothing else.' },
          { inline_data: { mime_type: 'image/png', data: base64 } }
        ]
      }],
      generationConfig: { temperature: 0, maxOutputTokens: 100 }
    })

    const data = await this._post(url, body)

    // Surface API-level failures (bad key, quota, unknown model) instead of
    // reporting them as "could not read captcha".
    if (data?.error) {
      throw new Error(`Gemini API error: ${data.error.message || JSON.stringify(data.error)}`)
    }

    const answer = data?.candidates?.[0]?.content?.parts?.[0]?.text?.trim()
    return answer || null
  }

  /**
   * Solve flow: detect → solve → fill. The form is NOT submitted.
   *
   * Never throws for solve failures; errors from the image path are reported
   * through `reason`.
   *
   * @param {object} page - object exposing evaluate, $ and fill
   * @returns {Promise<{ solved: boolean, reason?: string, answer?: string, type?: string }>}
   *   `solved: true` only when an image CAPTCHA was read and its answer was
   *   typed into a nearby text input.
   */
  async trySolve(page) {
    const detection = await this.detect(page)
    if (!detection?.hasCaptcha) return { solved: false, reason: 'no captcha detected' }

    // Token-based CAPTCHAs — can't solve with vision
    if (['recaptcha', 'hcaptcha', 'turnstile'].includes(detection.type)) {
      return { solved: false, reason: `${detection.type} requires token solving (2captcha/CapSolver)` }
    }

    // Image CAPTCHA — solve with Gemini Vision
    if (detection.type === 'image') {
      try {
        const answer = await this.solveImage(page, detection.selector)
        if (!answer) return { solved: false, reason: 'gemini could not read captcha' }

        // Find the input field near the CAPTCHA (same form, else same parent)
        const inputSelector = await page.evaluate((captchaSelector) => {
          const captcha = document.querySelector(captchaSelector)
          const scope = captcha?.closest('form') || captcha?.parentElement
          const input = scope?.querySelector('input[type="text"], input:not([type])')
          if (!input) return null
          // Tag inputs without an id so they can be addressed afterwards.
          input.id = input.id || '__spectrawl_captcha_input'
          // Existing ids may contain characters that are not valid in a bare
          // `#id` selector (dots, colons, leading digits) — escape them.
          return '#' + CSS.escape(input.id)
        }, detection.selector)

        if (inputSelector) {
          await page.fill(inputSelector, answer)
          return { solved: true, answer, type: 'image' }
        }
        return { solved: false, reason: 'could not find captcha input field', answer }
      } catch (e) {
        return { solved: false, reason: e.message }
      }
    }

    return { solved: false, reason: `unsupported captcha type: ${detection.type}` }
  }

  /**
   * POST a JSON body over HTTPS and resolve with the parsed JSON response.
   *
   * @param {string} url - absolute https URL
   * @param {string} body - already-serialized JSON payload
   * @returns {Promise<any>} parsed response body
   * @throws {Error} on network failure, 15s timeout, non-2xx status, or a
   *   body that is not valid JSON
   */
  _post(url, body) {
    return new Promise((resolve, reject) => {
      const urlObj = new URL(url)
      const req = https.request({
        hostname: urlObj.hostname,
        path: urlObj.pathname + urlObj.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(body)
        }
      }, res => {
        // Decode as UTF-8 up front so a multibyte character split across
        // chunks is not corrupted by per-chunk string conversion.
        res.setEncoding('utf8')
        let raw = ''
        res.on('data', chunk => { raw += chunk })
        res.on('error', reject)
        res.on('end', () => {
          let parsed
          let parsedOk = true
          try { parsed = JSON.parse(raw) } catch (e) { parsedOk = false }

          const status = res.statusCode
          if (status < 200 || status >= 300) {
            const detail = (parsedOk && parsed?.error?.message) || 'no error details'
            reject(new Error(`Gemini API error (HTTP ${status}): ${detail}`))
            return
          }
          if (!parsedOk) {
            reject(new Error('Invalid Gemini response'))
            return
          }
          resolve(parsed)
        })
      })
      req.on('error', reject)
      // Destroying with an error routes the timeout through the 'error' handler.
      req.setTimeout(15000, () => { req.destroy(new Error('Gemini vision timeout')) })
      req.write(body)
      req.end()
    })
  }
}
|
|
161
|
+
|
|
162
|
+
module.exports = { CaptchaSolver }
|
package/src/browse/index.js
CHANGED
|
@@ -12,6 +12,7 @@ const os = require('os')
|
|
|
12
12
|
const path = require('path')
|
|
13
13
|
const { CamoufoxClient } = require('./camoufox')
|
|
14
14
|
const { getCamoufoxPath, isInstalled } = require('./install-stealth')
|
|
15
|
+
const { CaptchaSolver } = require('./captcha-solver')
|
|
15
16
|
|
|
16
17
|
class BrowseEngine {
|
|
17
18
|
constructor(config = {}, cache) {
|
|
@@ -23,6 +24,9 @@ class BrowseEngine {
|
|
|
23
24
|
this.remoteCamoufox = config.camoufox?.url ? new CamoufoxClient(config.camoufox) : null
|
|
24
25
|
this._remoteCamoufoxAvailable = null
|
|
25
26
|
|
|
27
|
+
// CAPTCHA solver (Gemini Vision fallback)
|
|
28
|
+
this.captchaSolver = new CaptchaSolver(config.captcha || {})
|
|
29
|
+
|
|
26
30
|
// Which engine we're using
|
|
27
31
|
this._engine = null
|
|
28
32
|
}
|
|
package/src/browse/install-stealth.js
CHANGED
|
@@ -114,15 +114,60 @@ async function install() {
|
|
|
114
114
|
// Download
|
|
115
115
|
await download(url, zipPath)
|
|
116
116
|
|
|
117
|
-
// Extract
|
|
117
|
+
// Extract — try multiple methods (large zip64 files break some tools)
|
|
118
118
|
console.log(' Extracting...')
|
|
119
119
|
fs.mkdirSync(extractDir, { recursive: true })
|
|
120
120
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
//
|
|
125
|
-
execSync(`
|
|
121
|
+
const extractMethods = [
|
|
122
|
+
// 1. unzip (most common on Linux/Mac)
|
|
123
|
+
() => execSync(`unzip -o "${zipPath}" -d "${extractDir}"`, { stdio: 'pipe' }),
|
|
124
|
+
// 2. 7z (handles zip64 reliably)
|
|
125
|
+
() => execSync(`7z x -o"${extractDir}" -y "${zipPath}"`, { stdio: 'pipe' }),
|
|
126
|
+
// 3. bsdtar (available on many systems, handles zip64)
|
|
127
|
+
() => execSync(`bsdtar -xf "${zipPath}" -C "${extractDir}"`, { stdio: 'pipe' }),
|
|
128
|
+
// 4. Node.js built-in (no external deps, handles zip64)
|
|
129
|
+
() => {
|
|
130
|
+
const { execSync: es } = require('child_process')
|
|
131
|
+
es(`node -e "
|
|
132
|
+
const fs = require('fs');
|
|
133
|
+
const zlib = require('zlib');
|
|
134
|
+
const { execFileSync } = require('child_process');
|
|
135
|
+
// Use jar if available (JDK)
|
|
136
|
+
execFileSync('jar', ['xf', '${zipPath}'], { cwd: '${extractDir}', stdio: 'pipe' });
|
|
137
|
+
"`, { stdio: 'pipe' })
|
|
138
|
+
},
|
|
139
|
+
// 5. Python with explicit zip64 support
|
|
140
|
+
() => execSync(`python3 -c "
|
|
141
|
+
import zipfile, sys
|
|
142
|
+
try:
|
|
143
|
+
z = zipfile.ZipFile('${zipPath}', allowZip64=True)
|
|
144
|
+
z.extractall('${extractDir}')
|
|
145
|
+
z.close()
|
|
146
|
+
except Exception as e:
|
|
147
|
+
print(f'Python extract failed: {e}', file=sys.stderr)
|
|
148
|
+
sys.exit(1)
|
|
149
|
+
"`, { stdio: 'pipe' }),
|
|
150
|
+
]
|
|
151
|
+
|
|
152
|
+
let extracted = false
|
|
153
|
+
for (const method of extractMethods) {
|
|
154
|
+
try {
|
|
155
|
+
method()
|
|
156
|
+
extracted = true
|
|
157
|
+
break
|
|
158
|
+
} catch (e) {
|
|
159
|
+
continue
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
if (!extracted) {
|
|
164
|
+
fs.unlinkSync(zipPath)
|
|
165
|
+
throw new Error(
|
|
166
|
+
'Could not extract Camoufox archive. Install one of: unzip, 7z, or bsdtar.\n' +
|
|
167
|
+
' Ubuntu/Debian: sudo apt-get install unzip\n' +
|
|
168
|
+
' macOS: brew install p7zip\n' +
|
|
169
|
+
' Alpine: apk add unzip'
|
|
170
|
+
)
|
|
126
171
|
}
|
|
127
172
|
|
|
128
173
|
// Clean up zip
|
package/src/search/scraper.js
CHANGED
|
@@ -12,7 +12,7 @@ const { jinaExtract } = require('./engines/jina')
|
|
|
12
12
|
*/
|
|
13
13
|
async function scrapeUrls(urls, opts = {}) {
|
|
14
14
|
const results = {}
|
|
15
|
-
const timeout = opts.timeout ||
|
|
15
|
+
const timeout = opts.timeout || 5000 // 5s per URL — balances speed vs quality
|
|
16
16
|
const concurrent = opts.concurrent || 5
|
|
17
17
|
const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
|
|
18
18
|
|
|
@@ -35,7 +35,7 @@ async function scrapeUrls(urls, opts = {}) {
|
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
async function scrapeUrl(url, opts = {}) {
|
|
38
|
-
const { timeout =
|
|
38
|
+
const { timeout = 5000, engine = 'auto', browse } = opts
|
|
39
39
|
|
|
40
40
|
// Try Jina first if available (better markdown output)
|
|
41
41
|
if (engine === 'jina' || engine === 'auto') {
|