spectrawl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ const https = require('https')
2
+
3
/**
 * Serper.dev — 2500 free Google SERP queries.
 * Requires SERPER_API_KEY in config or env.
 */
async function serperSearch(query, config = {}) {
  const apiKey = config.apiKey || process.env.SERPER_API_KEY
  if (!apiKey) throw new Error('Serper API key not configured')

  // Serper caps `num` per request; default to a single page of 10.
  const payload = JSON.stringify({
    q: query,
    num: config.maxResults || 10
  })

  const headers = {
    'X-API-KEY': apiKey,
    'Content-Type': 'application/json'
  }

  const data = await postJson('https://google.serper.dev/search', payload, headers)

  const organic = data.organic
  if (!organic) return []

  // Normalize to the shared result shape used by all engines.
  return organic.map((item) => ({
    url: item.link,
    title: item.title,
    snippet: item.snippet || '',
    engine: 'serper'
  }))
}
31
+
32
/**
 * POST a JSON body over HTTPS and resolve with the parsed JSON response.
 *
 * @param {string} url - absolute https URL
 * @param {string} body - pre-serialized JSON payload
 * @param {object} [headers] - extra request headers (merged with defaults)
 * @returns {Promise<object>} parsed response body
 * Rejects on network error, 10s timeout, or a non-JSON response body.
 */
function postJson(url, body, headers = {}) {
  return new Promise((resolve, reject) => {
    const urlObj = new URL(url)
    const opts = {
      hostname: urlObj.hostname,
      // BUG FIX: an explicit port in the URL was previously ignored.
      port: urlObj.port || 443,
      // BUG FIX: the query string was dropped (pathname only).
      path: urlObj.pathname + urlObj.search,
      method: 'POST',
      headers: {
        ...headers,
        'User-Agent': 'Spectrawl/0.1.0',
        // Byte length, not string length — the body may contain multibyte chars.
        'Content-Length': Buffer.byteLength(body)
      }
    }

    const req = https.request(opts, (res) => {
      let data = ''
      res.on('data', chunk => data += chunk)
      res.on('end', () => {
        try {
          resolve(JSON.parse(data))
        } catch (e) {
          // Include a snippet of the raw body so API errors are diagnosable.
          reject(new Error(`Serper API returned invalid JSON: ${data.slice(0, 200)}`))
        }
      })
    })
    req.on('error', reject)
    req.setTimeout(10000, () => { req.destroy(); reject(new Error('Serper API timeout')) })
    req.write(body)
    req.end()
  })
}

module.exports = { serperSearch }
@@ -0,0 +1,104 @@
1
+ const { ddgSearch } = require('./engines/ddg')
2
+ const { braveSearch } = require('./engines/brave')
3
+ const { serperSearch } = require('./engines/serper')
4
+ const { searxngSearch } = require('./engines/searxng')
5
+ const { googleCseSearch } = require('./engines/google-cse')
6
+ const { jinaSearch } = require('./engines/jina')
7
+ const { scrapeUrls } = require('./scraper')
8
+ const { Summarizer } = require('./summarizer')
9
+
10
// Engine registry: maps the names used in the cascade config to their
// search functions. Fallback order is decided by SearchEngine's
// `cascade` list, not by key order here.
const ENGINES = {
  searxng: searxngSearch,
  ddg: ddgSearch,
  brave: braveSearch,
  serper: serperSearch,
  'google-cse': googleCseSearch,
  jina: jinaSearch
}
18
+
19
/**
 * Cascading meta-search: queries engines in order until enough results
 * are collected, dedupes by URL, optionally scrapes the top hits for
 * full content, and optionally summarizes with an LLM.
 */
class SearchEngine {
  /**
   * @param {object} config - { cascade, scrapeTop, llm, <engineName>: per-engine config }
   * @param {object} [cache] - optional cache exposing get(ns, key) / set(ns, key, value)
   */
  constructor(config = {}, cache) {
    this.config = config
    this.cache = cache
    this.cascade = config.cascade || ['ddg', 'brave', 'serper']
    // BUG FIX: use ?? instead of || so an explicit `scrapeTop: 0`
    // disables scraping. `||` silently promoted 0 back to 3, which was
    // inconsistent with how `opts.scrapeTop ?? this.scrapeTop` is
    // handled inside search().
    this.scrapeTop = config.scrapeTop ?? 3
    this.summarizer = config.llm ? new Summarizer(config.llm) : null
  }

  /**
   * Search using the cascade strategy.
   * Tries free/unlimited engines first, escalates to quota-limited ones if needed.
   *
   * @param {string} query - non-empty search query
   * @param {object} [opts] - { minResults, scrapeTop, summarize }
   * @returns {Promise<{answer: ?string, sources: object[], cached: boolean}>}
   * @throws {Error} when query is missing or blank
   */
  async search(query, opts = {}) {
    if (!query || !query.trim()) {
      throw new Error('Search query is required')
    }

    // Check cache first — keyed on query plus the full opts object so
    // different scrape/summarize settings don't collide.
    const cacheKey = `${query}:${JSON.stringify(opts)}`
    const cached = this.cache?.get('search', cacheKey)
    if (cached) return { ...cached, cached: true }

    let results = []
    const minResults = opts.minResults || 5

    // Cascade through engines until we have enough results. A failing
    // engine is logged and skipped rather than aborting the search.
    for (const engineName of this.cascade) {
      const engine = ENGINES[engineName]
      if (!engine) continue

      try {
        const engineResults = await engine(query, this.config[engineName] || {})
        results = dedupeResults([...results, ...engineResults])

        if (results.length >= minResults) break
      } catch (err) {
        console.warn(`Search engine ${engineName} failed:`, err.message)
      }
    }

    // Scrape top N results for full content
    const scrapeCount = opts.scrapeTop ?? this.scrapeTop
    if (scrapeCount > 0 && results.length > 0) {
      const urls = results.slice(0, scrapeCount).map(r => r.url)
      const scraped = await scrapeUrls(urls)

      for (const result of results) {
        const scrapedContent = scraped[result.url]
        if (scrapedContent) {
          result.fullContent = scrapedContent
        }
      }
    }

    // LLM summarization (optional — requires both the opt and an llm config)
    let answer = null
    if (opts.summarize && this.config.llm) {
      answer = await this._summarize(query, results)
    }

    const response = { answer, sources: results, cached: false }

    // Cache the result (stored with cached: false; the flag is flipped on hits)
    this.cache?.set('search', cacheKey, response)

    return response
  }

  // Delegate to the configured summarizer, if any.
  async _summarize(query, results) {
    if (!this.summarizer) return null
    return this.summarizer.summarize(query, results)
  }
}
94
+
95
/**
 * Drop duplicate results, keeping the first occurrence of each URL.
 */
function dedupeResults(results) {
  const byUrl = new Map()
  for (const result of results) {
    if (!byUrl.has(result.url)) {
      byUrl.set(result.url, result)
    }
  }
  return [...byUrl.values()]
}
103
+
104
+ module.exports = { SearchEngine }
@@ -0,0 +1,170 @@
1
+ const https = require('https')
2
+ const http = require('http')
3
+ const { URL } = require('url')
4
+ const { jinaExtract } = require('./engines/jina')
5
+
6
/**
 * Scrape URLs for full content.
 * Dual engine approach (like tavily-open):
 * 1. Jina Reader (fast, AI-optimized markdown) — if available
 * 2. Readability (built-in, no deps) — fallback
 * 3. Browser (Camoufox/Playwright) — for JS-heavy/blocked pages
 *
 * Returns a { url: content } map; failed URLs are simply omitted.
 */
async function scrapeUrls(urls, opts = {}) {
  const timeout = opts.timeout || 10000
  const batchSize = opts.concurrent || 3
  const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'

  const results = {}
  let offset = 0

  // Process in fixed-size batches so we never have more than
  // `batchSize` in-flight requests at once.
  while (offset < urls.length) {
    const batch = urls.slice(offset, offset + batchSize)
    const contents = await Promise.all(
      batch.map((url) => scrapeUrl(url, { timeout, engine }).catch(() => null))
    )

    contents.forEach((content, idx) => {
      if (content) {
        results[batch[idx]] = content
      }
    })

    offset += batchSize
  }

  return results
}
33
+
34
/**
 * Scrape a single URL, preferring Jina Reader and falling back to the
 * built-in fetch + readability pipeline.
 */
async function scrapeUrl(url, opts = {}) {
  const { timeout = 10000, engine = 'auto' } = opts

  // Prefer Jina when allowed: it returns cleaner, AI-friendly markdown.
  if (engine === 'auto' || engine === 'jina') {
    try {
      const result = await jinaExtract(url)
      // Ignore trivially short extractions — they usually indicate a
      // blocked or empty page, which the fallback may handle better.
      if (result.content && result.content.length > 100) {
        return result.content
      }
    } catch (e) {
      // Jina unavailable or failed — fall through to readability.
    }
  }

  // Readability fallback: raw fetch + HTML→markdown conversion.
  const html = await fetchPage(url, timeout)
  return extractMarkdown(html)
}
53
+
54
/**
 * Fetch a page body over HTTP(S), following up to `redirects` redirects.
 *
 * @param {string} url - absolute http/https URL
 * @param {number} [timeout] - per-request timeout in ms
 * @param {number} [redirects] - remaining redirect budget
 * @returns {Promise<string>} the raw HTML body
 * Rejects on non-200 status, network error, timeout, or redirect-loop.
 */
function fetchPage(url, timeout = 10000, redirects = 3) {
  return new Promise((resolve, reject) => {
    if (redirects <= 0) return reject(new Error('Too many redirects'))

    const urlObj = new URL(url)
    const client = urlObj.protocol === 'https:' ? https : http

    const opts = {
      hostname: urlObj.hostname,
      // BUG FIX: an explicit port in the URL (e.g. http://host:8080/)
      // was previously ignored, connecting to the default port instead.
      port: urlObj.port || (urlObj.protocol === 'https:' ? 443 : 80),
      path: urlObj.pathname + urlObj.search,
      method: 'GET',
      headers: {
        // Browser-like headers to reduce trivial bot-blocking.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        // 'identity' so we never have to gunzip the response ourselves.
        'Accept-Encoding': 'identity'
      }
    }

    const req = client.request(opts, (res) => {
      // Follow redirects, resolving relative Location headers against
      // the current URL.
      if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
        const redirectUrl = new URL(res.headers.location, url).toString()
        return fetchPage(redirectUrl, timeout, redirects - 1).then(resolve).catch(reject)
      }

      if (res.statusCode !== 200) {
        return reject(new Error(`HTTP ${res.statusCode}`))
      }

      let data = ''
      res.on('data', chunk => data += chunk)
      res.on('end', () => resolve(data))
    })

    req.on('error', reject)
    req.setTimeout(timeout, () => { req.destroy(); reject(new Error('Scrape timeout')) })
    req.end()
  })
}
93
+
94
/**
 * Extract content as clean markdown (improved over basic readability).
 * Handles: headings, lists, code blocks, tables, links, bold/italic.
 *
 * @param {string} html - raw HTML document or fragment
 * @returns {string} markdown, truncated to ~15k characters
 */
function extractMarkdown(html) {
  // Remove noise
  let content = html
    .replace(/<script[\s\S]*?<\/script>/gi, '')
    .replace(/<style[\s\S]*?<\/style>/gi, '')
    .replace(/<!--[\s\S]*?-->/g, '')
    .replace(/<nav[\s\S]*?<\/nav>/gi, '')
    .replace(/<footer[\s\S]*?<\/footer>/gi, '')
    .replace(/<header[\s\S]*?<\/header>/gi, '')
    .replace(/<aside[\s\S]*?<\/aside>/gi, '')
    .replace(/<iframe[\s\S]*?<\/iframe>/gi, '')
    .replace(/<svg[\s\S]*?<\/svg>/gi, '')

  // Try to find main content
  const mainMatch = content.match(/<main[\s\S]*?<\/main>/i) ||
    content.match(/<article[\s\S]*?<\/article>/i) ||
    content.match(/<div[^>]*(?:content|article|post|entry|main)[^>]*>[\s\S]*?<\/div>/i)

  if (mainMatch) content = mainMatch[0]

  // Convert to markdown
  content = content
    // Headings
    .replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n# $1\n')
    .replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n## $1\n')
    .replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n### $1\n')
    .replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n#### $1\n')
    .replace(/<h5[^>]*>([\s\S]*?)<\/h5>/gi, '\n##### $1\n')
    .replace(/<h6[^>]*>([\s\S]*?)<\/h6>/gi, '\n###### $1\n')
    // Bold/italic
    .replace(/<(?:strong|b)>([\s\S]*?)<\/(?:strong|b)>/gi, '**$1**')
    .replace(/<(?:em|i)>([\s\S]*?)<\/(?:em|i)>/gi, '*$1*')
    // Links
    .replace(/<a[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)')
    // Code blocks
    .replace(/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, '\n```\n$1\n```\n')
    .replace(/<code>([\s\S]*?)<\/code>/gi, '`$1`')
    // Lists
    .replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, '- $1\n')
    .replace(/<\/?[ou]l[^>]*>/gi, '\n')
    // Table (basic)
    .replace(/<tr[^>]*>([\s\S]*?)<\/tr>/gi, '|$1|\n')
    .replace(/<t[hd][^>]*>([\s\S]*?)<\/t[hd]>/gi, ' $1 |')
    // Paragraphs/breaks
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/p>/gi, '\n\n')
    .replace(/<p[^>]*>/gi, '')
    // Strip remaining tags
    .replace(/<[^>]+>/g, '')
    // Decode entities. BUG FIX: `&amp;` must be decoded LAST — decoding
    // it first turned double-encoded text like `&amp;lt;` into `<`
    // instead of the correct `&lt;`.
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&mdash;/g, '—')
    .replace(/&ndash;/g, '–')
    // Generic numeric entities (decimal); invalid code points are left as-is.
    .replace(/&#(\d+);/g, (match, code) => {
      const cp = Number(code)
      return cp <= 0x10FFFF ? String.fromCodePoint(cp) : match
    })
    .replace(/&amp;/g, '&')
    // Clean whitespace
    .replace(/\n{3,}/g, '\n\n')
    .replace(/[ \t]+/g, ' ')
    .replace(/^ +/gm, '')
    .trim()

  // Truncate so downstream LLM contexts stay bounded
  if (content.length > 15000) {
    content = content.slice(0, 15000) + '\n\n...(truncated)'
  }

  return content
}
169
+
170
+ module.exports = { scrapeUrls, scrapeUrl, extractMarkdown }
@@ -0,0 +1,156 @@
1
+ const https = require('https')
2
+
3
/**
 * LLM summarization for search results.
 * Supports: openai, anthropic, minimax, xai, ollama
 */
class Summarizer {
  /**
   * @param {object} config - { provider, model, apiKey, baseUrl }
   */
  constructor(config = {}) {
    this.provider = config.provider || 'openai'
    // BUG FIX: the default model must match the provider. Previously
    // every provider defaulted to 'gpt-4o-mini', so the per-provider
    // fallbacks in _anthropic/_ollama were dead code and Anthropic was
    // sent an OpenAI model name.
    const defaultModels = {
      openai: 'gpt-4o-mini',
      anthropic: 'claude-3-5-haiku-20241022',
      ollama: 'llama3'
    }
    this.model = config.model || defaultModels[this.provider] || 'gpt-4o-mini'
    this.apiKey = config.apiKey || process.env[this._envKey()]
    this.baseUrl = config.baseUrl || null
  }

  // Env var name holding the API key for the configured provider.
  _envKey() {
    const keys = {
      openai: 'OPENAI_API_KEY',
      anthropic: 'ANTHROPIC_API_KEY',
      minimax: 'MINIMAX_API_KEY',
      xai: 'XAI_API_KEY'
    }
    return keys[this.provider] || 'OPENAI_API_KEY'
  }

  /**
   * Summarize search results into a cited answer for `query`.
   * Returns null (never throws) when no key is configured or the
   * provider call fails — summarization is best-effort.
   */
  async summarize(query, sources) {
    // BUG FIX: Ollama runs locally and needs no API key; previously the
    // ollama provider was unusable without a pointless key being set.
    if (!this.apiKey && this.provider !== 'ollama') return null

    const context = sources
      .slice(0, 5)
      .map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0, 1000)}`)
      .join('\n\n')

    const prompt = `Based on the following search results, provide a concise answer to the query: "${query}"

Include citations as [1], [2], etc. referencing the source numbers below.
Be direct and factual. If the sources don't contain enough information, say so.

Sources:
${context}

Answer:`

    try {
      return await this._call(prompt)
    } catch (err) {
      console.warn('Summarization failed:', err.message)
      return null
    }
  }

  // Dispatch to the provider-specific transport.
  async _call(prompt) {
    switch (this.provider) {
      case 'openai':
      case 'minimax':
      case 'xai':
        return this._openaiCompatible(prompt)
      case 'anthropic':
        return this._anthropic(prompt)
      case 'ollama':
        return this._ollama(prompt)
      default:
        return this._openaiCompatible(prompt)
    }
  }

  // OpenAI-style /chat/completions (also used by Minimax and xAI).
  async _openaiCompatible(prompt) {
    const urls = {
      openai: 'https://api.openai.com/v1/chat/completions',
      minimax: 'https://api.minimax.chat/v1/text/chatcompletion_v2',
      xai: 'https://api.x.ai/v1/chat/completions'
    }
    const url = this.baseUrl || urls[this.provider] || urls.openai

    const body = JSON.stringify({
      model: this.model,
      messages: [
        { role: 'system', content: 'You are a concise search assistant. Answer with citations.' },
        { role: 'user', content: prompt }
      ],
      max_tokens: 500,
      temperature: 0.3
    })

    const data = await postJson(url, body, {
      'Authorization': `Bearer ${this.apiKey}`,
      'Content-Type': 'application/json'
    })

    return data.choices?.[0]?.message?.content || null
  }

  // Anthropic Messages API.
  async _anthropic(prompt) {
    const url = this.baseUrl || 'https://api.anthropic.com/v1/messages'
    const body = JSON.stringify({
      model: this.model,
      max_tokens: 500,
      messages: [{ role: 'user', content: prompt }]
    })

    const data = await postJson(url, body, {
      'x-api-key': this.apiKey,
      'anthropic-version': '2023-06-01',
      'Content-Type': 'application/json'
    })

    return data.content?.[0]?.text || null
  }

  // Local Ollama /api/generate (non-streaming).
  async _ollama(prompt) {
    const url = this.baseUrl || 'http://localhost:11434/api/generate'
    const body = JSON.stringify({
      model: this.model,
      prompt,
      stream: false
    })

    const data = await postJson(url, body, {
      'Content-Type': 'application/json'
    })

    return data.response || null
  }
}
124
+
125
/**
 * Minimal JSON-over-POST helper shared by all providers.
 * Resolves with the parsed response body; rejects on network error,
 * 30s timeout, or a non-JSON response.
 */
function postJson(url, body, headers = {}) {
  return new Promise((resolve, reject) => {
    const target = new URL(url)
    const secure = target.protocol === 'https:'
    const client = secure ? https : require('http')

    const requestOpts = {
      hostname: target.hostname,
      port: target.port || (secure ? 443 : 80),
      path: target.pathname + target.search,
      method: 'POST',
      headers: {
        ...headers,
        'Content-Length': Buffer.byteLength(body)
      }
    }

    const req = client.request(requestOpts, (res) => {
      const chunks = []
      res.on('data', (chunk) => chunks.push(chunk))
      res.on('end', () => {
        const raw = chunks.join('')
        try {
          resolve(JSON.parse(raw))
        } catch (e) {
          reject(new Error(`Invalid JSON: ${raw.slice(0, 200)}`))
        }
      })
    })

    req.on('error', reject)
    req.setTimeout(30000, () => { req.destroy(); reject(new Error('LLM timeout')) })
    req.write(body)
    req.end()
  })
}

module.exports = { Summarizer }
package/src/server.js ADDED
@@ -0,0 +1,111 @@
1
+ const http = require('http')
2
+ const { Spectrawl } = require('./index')
3
+ const { loadConfig } = require('./config')
4
+
5
const config = loadConfig()
const spectrawl = new Spectrawl()

/**
 * Plain-node HTTP API in front of Spectrawl.
 * Routes: GET /health, GET /status, POST /search, POST /browse, POST /act.
 */
const server = http.createServer(async (req, res) => {
  // Permissive CORS so browser clients can call the API directly.
  res.setHeader('Access-Control-Allow-Origin', '*')
  res.setHeader('Access-Control-Allow-Methods', 'GET, POST, OPTIONS')
  res.setHeader('Access-Control-Allow-Headers', 'Content-Type, Authorization')

  if (req.method === 'OPTIONS') {
    res.writeHead(204)
    return res.end()
  }

  const { pathname } = new URL(req.url, `http://${req.headers.host}`)

  try {
    const route = `${req.method} ${pathname}`

    if (route === 'GET /health') {
      return json(res, { status: 'ok', version: '0.1.0' })
    }

    if (route === 'GET /status') {
      const accounts = await spectrawl.status()
      return json(res, { accounts })
    }

    if (route === 'POST /search') {
      const { query, summarize, scrapeTop, minResults } = await readBody(req)
      if (!query) return error(res, 400, 'query is required')

      const results = await spectrawl.search(query, { summarize, scrapeTop, minResults })
      return json(res, results)
    }

    if (route === 'POST /browse') {
      const { url: targetUrl, auth, screenshot, html, stealth } = await readBody(req)
      if (!targetUrl) return error(res, 400, 'url is required')

      const result = await spectrawl.browse(targetUrl, { auth, screenshot, html, stealth })

      // Screenshots come back as Buffers; serialize for the JSON response.
      if (result.screenshot) {
        result.screenshot = result.screenshot.toString('base64')
      }
      return json(res, result)
    }

    if (route === 'POST /act') {
      const { platform, action, ...params } = await readBody(req)
      if (!platform || !action) return error(res, 400, 'platform and action are required')

      const result = await spectrawl.act(platform, action, params)
      return json(res, result)
    }

    return error(res, 404, 'Not found')
  } catch (err) {
    console.error('Server error:', err)
    return error(res, 500, err.message)
  }
})
70
+
71
/**
 * Write `data` as a JSON response with the given status (default 200).
 */
function json(res, data, status = 200) {
  const payload = JSON.stringify(data)
  res.writeHead(status, { 'Content-Type': 'application/json' })
  res.end(payload)
}

/**
 * Write a JSON error envelope: { error: message }.
 */
function error(res, status, message) {
  json(res, { error: message }, status)
}
79
+
80
/**
 * Collect the full request body and parse it as JSON.
 * Rejects with "Invalid JSON body" on malformed (or empty) input,
 * and propagates stream errors.
 */
function readBody(req) {
  return new Promise((resolve, reject) => {
    const chunks = []
    req.on('data', (chunk) => chunks.push(chunk))
    req.on('end', () => {
      try {
        resolve(JSON.parse(chunks.join('')))
      } catch (e) {
        reject(new Error('Invalid JSON body'))
      }
    })
    req.on('error', reject)
  })
}
91
+
92
// Bind to the configured port (default 3900) and print a route summary.
// NOTE(review): assumes loadConfig() may supply `port`; 0 would fall back
// to 3900 because of `||` — confirm an ephemeral port is never intended.
const port = config.port || 3900
server.listen(port, () => {
  console.log(`🌐 Spectrawl server running on http://localhost:${port}`)
  console.log(`   POST /search — search the web`)
  console.log(`   POST /browse — stealth browse`)
  console.log(`   POST /act — platform actions`)
  console.log(`   GET /status — auth health`)
  console.log(`   GET /health — server health`)
})
101
+
102
// Graceful shutdown — shared by SIGTERM (container stop) and SIGINT
// (Ctrl+C). Previously the identical handler body was duplicated for
// both signals; a repeated signal also re-entered spectrawl.close().
let shuttingDown = false
async function shutdown() {
  if (shuttingDown) return // ignore repeated signals while closing
  shuttingDown = true
  await spectrawl.close()
  server.close()
}

process.on('SIGTERM', shutdown)
process.on('SIGINT', shutdown)