spectrawl 0.3.14 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.d.ts CHANGED
@@ -17,6 +17,7 @@ declare module 'spectrawl' {
17
17
  defaultEngine?: string
18
18
  proxy?: { type: string; host: string; port: number; username?: string; password?: string }
19
19
  humanlike?: { minDelay?: number; maxDelay?: number; scrollBehavior?: boolean }
20
+ captcha?: { apiKey?: string; model?: string }
20
21
  }
21
22
  auth?: {
22
23
  refreshInterval?: string
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.14",
3
+ "version": "0.3.16",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -0,0 +1,162 @@
1
+ const https = require('https')
2
+ const fs = require('fs')
3
+
4
+ /**
5
+ * CAPTCHA solver using Gemini Vision.
6
+ * Free tier: 1,500 req/day (gemini-2.0-flash).
7
+ *
8
+ * Handles: image CAPTCHAs, text CAPTCHAs, simple challenges.
9
+ * Does NOT handle: reCAPTCHA v2/v3, hCaptcha, Cloudflare Turnstile
10
+ * (those require token solving services like 2captcha).
11
+ *
12
+ * Strategy: Playwright stealth bypasses most CAPTCHAs.
13
+ * This is the fallback when a visual CAPTCHA appears.
14
+ */
15
+
16
class CaptchaSolver {
  /**
   * @param {{ apiKey?: string, model?: string }} [config]
   *   apiKey falls back to the GEMINI_API_KEY environment variable;
   *   model falls back to 'gemini-2.0-flash'.
   */
  constructor(config = {}) {
    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
    this.model = config.model || 'gemini-2.0-flash'
  }

  /**
   * Detect whether a page shows a CAPTCHA challenge.
   *
   * Runs inside the page via page.evaluate and checks a fixed list of
   * selectors in order (first match wins), then falls back to scanning the
   * body text for two well-known phrases.
   *
   * @param {object} page - object exposing evaluate(fn) (e.g. a Playwright page)
   * @returns {Promise<{ hasCaptcha: boolean, type: string|null, selector: string|null }>}
   *   Always an object, never null. `type` is one of 'recaptcha', 'hcaptcha',
   *   'turnstile', 'image', 'text', 'unknown', or null when nothing was found.
   */
  async detect(page) {
    return page.evaluate(() => {
      // Check for common CAPTCHA indicators
      const indicators = [
        // reCAPTCHA
        { selector: '.g-recaptcha, #recaptcha, [data-sitekey]', type: 'recaptcha' },
        // hCaptcha
        { selector: '.h-captcha, [data-hcaptcha-sitekey]', type: 'hcaptcha' },
        // Cloudflare Turnstile
        { selector: '.cf-turnstile, [data-turnstile-sitekey]', type: 'turnstile' },
        // Image CAPTCHA (solvable with vision)
        { selector: 'img[src*="captcha"], img[alt*="captcha"], .captcha-image', type: 'image' },
        // Text/math CAPTCHA
        { selector: '[class*="captcha"] input, #captcha-input', type: 'text' },
      ]

      for (const { selector, type } of indicators) {
        if (document.querySelector(selector)) {
          return { hasCaptcha: true, type, selector }
        }
      }

      // Check page text for CAPTCHA mentions
      const bodyText = document.body?.innerText?.toLowerCase() || ''
      if (bodyText.includes('verify you are human') || bodyText.includes('complete the captcha')) {
        return { hasCaptcha: true, type: 'unknown', selector: null }
      }

      return { hasCaptcha: false, type: null, selector: null }
    })
  }

  /**
   * Attempt to read a visual CAPTCHA using Gemini Vision.
   * Screenshots the CAPTCHA element, sends it to Gemini, returns the answer.
   *
   * @param {object} page - object exposing $(selector)
   * @param {string} captchaSelector - selector of the CAPTCHA element
   * @returns {Promise<string|null>} trimmed answer text, or null when the
   *   response carries no usable text
   * @throws {Error} when no API key is configured, the element is missing,
   *   or the Gemini request fails (network error, timeout, non-2xx status,
   *   unparseable body)
   */
  async solveImage(page, captchaSelector) {
    if (!this.apiKey) {
      throw new Error('GEMINI_API_KEY required for CAPTCHA solving')
    }

    // Screenshot the CAPTCHA element
    const element = await page.$(captchaSelector)
    if (!element) throw new Error(`CAPTCHA element not found: ${captchaSelector}`)

    const screenshot = await element.screenshot({ type: 'png' })
    const base64 = screenshot.toString('base64')

    // The key travels in a header rather than the query string so it does
    // not end up in URLs that get logged or echoed in error messages.
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${this.model}:generateContent`
    const body = JSON.stringify({
      contents: [{
        parts: [
          { text: 'This is a CAPTCHA image. What text, numbers, or answer does it show? Reply with ONLY the answer, nothing else.' },
          { inline_data: { mime_type: 'image/png', data: base64 } }
        ]
      }],
      generationConfig: { temperature: 0, maxOutputTokens: 100 }
    })

    const data = await this._post(url, body, { 'x-goog-api-key': this.apiKey })
    const answer = data?.candidates?.[0]?.content?.parts?.[0]?.text?.trim()
    return answer || null
  }

  /**
   * Solve flow: detect → solve → fill. The form is NOT submitted; the caller
   * decides what to do once the answer has been typed in.
   *
   * @param {object} page - object exposing evaluate, $ and fill
   * @returns {Promise<{ solved: boolean, reason?: string, answer?: string, type?: string }>}
   *   `solved: true` means the answer was filled into the input field.
   *   Failures (including errors thrown while solving an image CAPTCHA) are
   *   reported through `reason` instead of being thrown.
   */
  async trySolve(page) {
    const detection = await this.detect(page)
    if (!detection?.hasCaptcha) return { solved: false, reason: 'no captcha detected' }

    // Token-based CAPTCHAs — can't solve with vision
    if (['recaptcha', 'hcaptcha', 'turnstile'].includes(detection.type)) {
      return { solved: false, reason: `${detection.type} requires token solving (2captcha/CapSolver)` }
    }

    if (detection.type !== 'image') {
      return { solved: false, reason: `unsupported captcha type: ${detection.type}` }
    }

    // Image CAPTCHA — solve with Gemini Vision
    try {
      const answer = await this.solveImage(page, detection.selector)
      if (!answer) return { solved: false, reason: 'gemini could not read captcha' }

      // Find the input field near the CAPTCHA
      const inputSelector = await page.evaluate((captchaSelector) => {
        const captcha = document.querySelector(captchaSelector)
        // Look for nearby input
        const parent = captcha?.closest('form') || captcha?.parentElement
        const input = parent?.querySelector('input[type="text"], input:not([type])')
        if (!input) return null
        input.id = input.id || '__spectrawl_captcha_input'
        // Pre-existing ids may contain characters such as ':' or '.' that
        // are not valid in a bare #id selector, so escape them.
        return '#' + CSS.escape(input.id)
      }, detection.selector)

      if (!inputSelector) {
        return { solved: false, reason: 'could not find captcha input field', answer }
      }

      await page.fill(inputSelector, answer)
      return { solved: true, answer, type: 'image' }
    } catch (e) {
      return { solved: false, reason: e.message }
    }
  }

  /**
   * POST a JSON body over HTTPS and resolve with the parsed JSON response.
   *
   * @param {string} url - absolute https URL
   * @param {string} body - serialized JSON payload
   * @param {object} [headers] - extra request headers (merged over the defaults)
   * @returns {Promise<any>} parsed response body
   *   Rejects on network error, on a 15s timeout, on a non-2xx status, or
   *   when the body is not valid JSON.
   */
  _post(url, body, headers = {}) {
    return new Promise((resolve, reject) => {
      const req = https.request(new URL(url), {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(body),
          ...headers
        }
      }, res => {
        // Decode as a stream so multi-byte characters split across chunks
        // are not corrupted.
        res.setEncoding('utf8')
        let raw = ''
        res.on('data', chunk => { raw += chunk })
        res.on('error', reject)
        res.on('end', () => {
          try {
            resolve(this._parseResponse(res.statusCode, raw))
          } catch (e) {
            reject(e)
          }
        })
      })
      req.on('error', reject)
      // destroy(err) emits 'error' with this exact error, so the timeout
      // rejects through the single handler above.
      req.setTimeout(15000, () => req.destroy(new Error('Gemini vision timeout')))
      req.write(body)
      req.end()
    })
  }

  /**
   * Turn a raw HTTP response into parsed JSON, or throw a descriptive error.
   *
   * @param {number} statusCode - HTTP status of the response
   * @param {string} raw - response body text
   * @returns {any} parsed JSON body (2xx responses only)
   * @throws {Error} 'Gemini API error (HTTP <status>): <detail>' on non-2xx,
   *   'Invalid Gemini response' when a 2xx body is not valid JSON
   */
  _parseResponse(statusCode, raw) {
    let parsed
    let invalidJson = false
    try {
      parsed = JSON.parse(raw)
    } catch (e) {
      invalidJson = true
    }

    if (statusCode < 200 || statusCode >= 300) {
      // Prefer the API's own error message; fall back to a body excerpt.
      const detail = parsed?.error?.message || raw.slice(0, 200) || 'no response body'
      throw new Error(`Gemini API error (HTTP ${statusCode}): ${detail}`)
    }
    if (invalidJson) throw new Error('Invalid Gemini response')
    return parsed
  }
}
161
+
162
+ module.exports = { CaptchaSolver }
@@ -12,6 +12,7 @@ const os = require('os')
12
12
  const path = require('path')
13
13
  const { CamoufoxClient } = require('./camoufox')
14
14
  const { getCamoufoxPath, isInstalled } = require('./install-stealth')
15
+ const { CaptchaSolver } = require('./captcha-solver')
15
16
 
16
17
  class BrowseEngine {
17
18
  constructor(config = {}, cache) {
@@ -23,6 +24,9 @@ class BrowseEngine {
23
24
  this.remoteCamoufox = config.camoufox?.url ? new CamoufoxClient(config.camoufox) : null
24
25
  this._remoteCamoufoxAvailable = null
25
26
 
27
+ // CAPTCHA solver (Gemini Vision fallback)
28
+ this.captchaSolver = new CaptchaSolver(config.captcha || {})
29
+
26
30
  // Which engine we're using
27
31
  this._engine = null
28
32
  }
@@ -114,15 +114,60 @@ async function install() {
114
114
  // Download
115
115
  await download(url, zipPath)
116
116
 
117
- // Extract
117
+ // Extract — try multiple methods (large zip64 files break some tools)
118
118
  console.log(' Extracting...')
119
119
  fs.mkdirSync(extractDir, { recursive: true })
120
120
 
121
- try {
122
- execSync(`unzip -o "${zipPath}" -d "${extractDir}"`, { stdio: 'pipe' })
123
- } catch (e) {
124
- // Try with built-in tools on systems without unzip
125
- execSync(`python3 -c "import zipfile; zipfile.ZipFile('${zipPath}').extractall('${extractDir}')"`, { stdio: 'pipe' })
121
+ const extractMethods = [
122
+ // 1. unzip (most common on Linux/Mac)
123
+ () => execSync(`unzip -o "${zipPath}" -d "${extractDir}"`, { stdio: 'pipe' }),
124
+ // 2. 7z (handles zip64 reliably)
125
+ () => execSync(`7z x -o"${extractDir}" -y "${zipPath}"`, { stdio: 'pipe' }),
126
+ // 3. bsdtar (available on many systems, handles zip64)
127
+ () => execSync(`bsdtar -xf "${zipPath}" -C "${extractDir}"`, { stdio: 'pipe' }),
128
+ // 4. jar from a JDK, run through a child node process (requires Java to be installed)
129
+ () => {
130
+ const { execSync: es } = require('child_process')
131
+ es(`node -e "
132
+ const fs = require('fs');
133
+ const zlib = require('zlib');
134
+ const { execFileSync } = require('child_process');
135
+ // Use jar if available (JDK)
136
+ execFileSync('jar', ['xf', '${zipPath}'], { cwd: '${extractDir}', stdio: 'pipe' });
137
+ "`, { stdio: 'pipe' })
138
+ },
139
+ // 5. Python with explicit zip64 support
140
+ () => execSync(`python3 -c "
141
+ import zipfile, sys
142
+ try:
143
+ z = zipfile.ZipFile('${zipPath}', allowZip64=True)
144
+ z.extractall('${extractDir}')
145
+ z.close()
146
+ except Exception as e:
147
+ print(f'Python extract failed: {e}', file=sys.stderr)
148
+ sys.exit(1)
149
+ "`, { stdio: 'pipe' }),
150
+ ]
151
+
152
+ let extracted = false
153
+ for (const method of extractMethods) {
154
+ try {
155
+ method()
156
+ extracted = true
157
+ break
158
+ } catch (e) {
159
+ continue
160
+ }
161
+ }
162
+
163
+ if (!extracted) {
164
+ fs.unlinkSync(zipPath)
165
+ throw new Error(
166
+ 'Could not extract Camoufox archive. Install one of: unzip, 7z, or bsdtar.\n' +
167
+ ' Ubuntu/Debian: sudo apt-get install unzip\n' +
168
+ ' macOS: brew install p7zip\n' +
169
+ ' Alpine: apk add unzip'
170
+ )
126
171
  }
127
172
 
128
173
  // Clean up zip
@@ -12,7 +12,7 @@ const { jinaExtract } = require('./engines/jina')
12
12
  */
13
13
  async function scrapeUrls(urls, opts = {}) {
14
14
  const results = {}
15
- const timeout = opts.timeout || 3000 // 3s hard cutoff per URL (was 10s)
15
+ const timeout = opts.timeout || 5000 // 5s per URL balances speed vs quality
16
16
  const concurrent = opts.concurrent || 5
17
17
  const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
18
18
 
@@ -35,7 +35,7 @@ async function scrapeUrls(urls, opts = {}) {
35
35
  }
36
36
 
37
37
  async function scrapeUrl(url, opts = {}) {
38
- const { timeout = 3000, engine = 'auto', browse } = opts
38
+ const { timeout = 5000, engine = 'auto', browse } = opts
39
39
 
40
40
  // Try Jina first if available (better markdown output)
41
41
  if (engine === 'jina' || engine === 'auto') {