npm - spectrawl - Versions diffs - 0.2.0 → 0.3.0 - Mend

spectrawl 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/README.md +26 -5
package/package.json +2 -2
package/src/index.js +12 -0
package/src/search/index.js +119 -0
package/src/search/query-expander.js +122 -0
package/src/search/reranker.js +114 -0
package/src/search/summarizer.js +16 -1

package/README.md CHANGED Viewed

@@ -19,7 +19,14 @@ npm install spectrawl
 **Auth** — Persistent cookie storage (SQLite), multi-account management, automatic cookie refresh, expiry alerts.
-**Act** — Post to X, Reddit, Dev.to, Hashnode, LinkedIn, IndieHackers. Rate limiting, content dedup, dead letter queue for retries.
+**Act** — 24 platform adapters covering 30+ sites:
+- **Content platforms:** X, Reddit, LinkedIn, Dev.to, Hashnode, IndieHackers, Medium, Hacker News, Quora
+- **Developer:** GitHub (repos, issues, releases), HuggingFace (models, datasets), Discord (bot + webhooks)
+- **Launch/SEO:** Product Hunt, BetaList, AlternativeTo, SaaSHub, DevHunt, AppSumo
+- **Directories:** Generic adapter for MicroLaunch, Uneed, Peerlist, Fazier, BetaPage, LaunchingNext, StartupStash, SideProjectors, TAIFT, Futurepedia, Crunchbase, G2, StackShare, YouTube
+- Rate limiting, content dedup, dead letter queue for retries.
+**Proxy** — Rotating proxy server. One endpoint (`localhost:8080`) for all your tools. Round-robin, random, or least-used strategies. Health checking with auto-failover.
 ## Quick Start
@@ -123,6 +130,19 @@ Configure the cascade in `spectrawl.json`:
 | Hashnode | GraphQL API | post |
 | LinkedIn | Cookie API (Voyager) | post |
 | IndieHackers | Browser automation | post, comment, upvote |
+| Medium | REST API | post (markdown) |
+| GitHub | REST v3 | repo, file, issue, release |
+| Discord | Bot API + webhooks | send, thread |
+| Product Hunt | GraphQL v2 | launch, comment, upvote |
+| Hacker News | Cookie/form POST | submit, comment, upvote |
+| YouTube | Data API v3 | comment, playlist, update |
+| Quora | Browser automation | answer, question |
+| HuggingFace | Hub API | repo, model card, upload |
+| BetaList | REST API | submit |
+| AlternativeTo | Browser automation | submit |
+| SaaSHub | Browser automation | submit |
+| DevHunt | Browser automation | submit |
+| **30+ Directories** | Generic adapter | submit (MicroLaunch, Uneed, TAIFT, Futurepedia, Crunchbase, G2, etc.) |
 ## Configuration
@@ -141,10 +161,11 @@ Configure the cascade in `spectrawl.json`:
     "scrapeTtl": 24
   },
   "proxy": {
-    "host": "proxy.example.com",
-    "port": "8080",
-    "username": "user",
-    "password": "pass"
+    "localPort": 8080,
+    "strategy": "round-robin",
+    "upstreams": [
+      { "url": "http://user:pass@proxy1.example.com:8080" }
+    ]
   },
   "camoufox": {
     "url": "http://localhost:9869"

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "spectrawl",
-  "version": "0.2.0",
-  "description": "The unified web layer for AI agents. Search, browse, authenticate, act — one tool, self-hosted, free.",
+  "version": "0.3.0",
+  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
   "main": "src/index.js",
   "types": "index.d.ts",
   "bin": {

package/src/index.js CHANGED Viewed

@@ -34,6 +34,18 @@ class Spectrawl {
     return this.searchEngine.search(query, opts)
   }
+  /**
+   * Deep search — Tavily-equivalent "advanced" mode.
+   * Query expansion → parallel search → rerank → scrape → AI answer with citations.
+   * Requires GEMINI_API_KEY (free tier) or configured LLM.
+   * Thin pass-through: both arguments are handed unchanged to
+   * this.searchEngine.deepSearch and its promise is returned as-is, so any
+   * validation or rejection comes from that call.
+   * @param {string} query - Search query
+   * @param {object} opts - { scrapeTop, expand, rerank }
+   * @returns {Promise<{answer, sources[], queries[], cached}>}
+   */
+  async deepSearch(query, opts = {}) {
+    return this.searchEngine.deepSearch(query, opts)
+  }
   /**
    * Browse a URL with stealth and optional auth.
    * @param {string} url - URL to browse

package/src/search/index.js CHANGED Viewed

@@ -6,6 +6,8 @@ const { googleCseSearch } = require('./engines/google-cse')
 const { jinaSearch } = require('./engines/jina')
 const { scrapeUrls } = require('./scraper')
 const { Summarizer } = require('./summarizer')
+const { Reranker } = require('./reranker')
+const { QueryExpander } = require('./query-expander')
 const ENGINES = {
   searxng: searxngSearch,
@@ -23,6 +25,11 @@ class SearchEngine {
     this.cascade = config.cascade || ['ddg', 'brave', 'serper']
     this.scrapeTop = config.scrapeTop || 3
     this.summarizer = config.llm ? new Summarizer(config.llm) : null
+    // Gemini-powered features (free tier)
+    const geminiKey = config.geminiKey || process.env.GEMINI_API_KEY
+    this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
+    this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
   }
   /**
@@ -86,6 +93,118 @@ class SearchEngine {
     return response
   }
+  /**
+   * Deep search — Tavily-equivalent "advanced" mode.
+   * Query expansion → parallel search → merge/dedup → rerank → scrape top N → summarize with citations.
+   *
+   * Returns: { answer, sources: [{title, url, content, score}], cached }
+   */
+  async deepSearch(query, opts = {}) {
+    if (!query || !query.trim()) {
+      throw new Error('Search query is required')
+    }
+    // Check cache
+    const cacheKey = `deep:${query}:${JSON.stringify(opts)}`
+    const cached = this.cache?.get('search', cacheKey)
+    if (cached) return { ...cached, cached: true }
+    // Step 1: Query expansion
+    let queries = [query]
+    if (this.expander && opts.expand !== false) {
+      queries = await this.expander.expand(query)
+    }
+    // Step 2: Search across all query variants (with stagger to avoid rate limits)
+    const resultSets = []
+    for (const q of queries) {
+      try {
+        const r = await this._rawSearch(q, opts)
+        resultSets.push(r)
+      } catch (e) {
+        resultSets.push([])
+      }
+      // Small delay between queries to avoid rate limiting
+      if (queries.length > 1) await new Promise(r => setTimeout(r, 300))
+    }
+    // Step 3: Merge and deduplicate
+    let results = this.expander
+      ? this.expander.mergeResults(resultSets)
+      : dedupeResults(resultSets.flat())
+    // Step 4: Rerank by relevance
+    if (this.reranker && opts.rerank !== false) {
+      results = await this.reranker.rerank(query, results)
+    }
+    // Step 5: Parallel scrape top N for full content
+    const scrapeCount = opts.scrapeTop ?? this.scrapeTop ?? 5
+    if (scrapeCount > 0 && results.length > 0) {
+      const urls = results.slice(0, scrapeCount).map(r => r.url)
+      const scraped = await scrapeUrls(urls)
+      for (const result of results) {
+        const scrapedContent = scraped[result.url]
+        if (scrapedContent) {
+          result.fullContent = scrapedContent
+        }
+      }
+    }
+    // Step 6: Summarize with citations
+    let answer = null
+    const summarizer = this.summarizer || (this.reranker ? new Summarizer({
+      provider: 'gemini',
+      model: 'gemini-2.0-flash',
+      apiKey: process.env.GEMINI_API_KEY
+    }) : null)
+    if (summarizer) {
+      answer = await summarizer.summarize(query, results)
+    }
+    const response = {
+      answer,
+      sources: results.map(r => ({
+        title: r.title,
+        url: r.url,
+        snippet: r.snippet,
+        content: r.fullContent?.slice(0, 2000) || r.snippet || '',
+        score: r.score || null
+      })),
+      queries, // show which queries were used
+      cached: false
+    }
+    this.cache?.set('search', cacheKey, response)
+    return response
+  }
+  /**
+   * Raw search without reranking or summarization.
+   * Used internally by deepSearch for parallel query variants.
+   */
+  async _rawSearch(query, opts = {}) {
+    let results = []
+    const minResults = opts.minResults || 5
+    for (const engineName of this.cascade) {
+      const engine = ENGINES[engineName]
+      if (!engine) continue
+      try {
+        const engineResults = await engine(query, this.config[engineName] || {})
+        results = dedupeResults([...results, ...engineResults])
+        if (results.length >= minResults) break
+      } catch (err) {
+        continue
+      }
+    }
+    return results
+  }
   async _summarize(query, results) {
     if (!this.summarizer) return null
     return this.summarizer.summarize(query, results)

package/src/search/query-expander.js ADDED Viewed

@@ -0,0 +1,122 @@
+const https = require('https')
+/**
+ * Query expansion — generates variant queries to catch what one search misses.
+ * "best CRM" → ["top CRM software 2026", "CRM comparison startups", "best CRM for small business"]
+ * Merges and deduplicates results across all variants.
+ */
+class QueryExpander {
+  constructor(config = {}) {
+    this.provider = config.provider || 'gemini'
+    this.model = config.model || 'gemini-2.0-flash'
+    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
+    this.variants = config.variants || 3
+  }
+  /**
+   * Expand a query into multiple search variants.
+   * Returns array of query strings (including the original).
+   */
+  async expand(query) {
+    if (!this.apiKey) return [query]
+    const prompt = `Generate ${this.variants} alternative search queries for: "${query}"
+Requirements:
+- Each should find different but relevant results
+- Include synonyms, related terms, different phrasings
+- One should be more specific, one broader, one from a different angle
+Respond with ONLY a JSON array of strings. No explanation.
+Example: ["alternative query 1", "alternative query 2", "alternative query 3"]`
+    try {
+      const text = await this._call(prompt)
+      const match = text.match(/\[[\s\S]*?\]/)
+      if (!match) return [query]
+      const variants = JSON.parse(match[0])
+      if (!Array.isArray(variants)) return [query]
+      return [query, ...variants.slice(0, this.variants)]
+    } catch (err) {
+      console.warn('Query expansion failed:', err.message)
+      return [query]
+    }
+  }
+  /**
+   * Merge and deduplicate results from multiple queries.
+   * Keeps highest-scored version of each URL.
+   */
+  mergeResults(resultSets) {
+    const seen = new Map() // url → result
+    for (const results of resultSets) {
+      for (const r of results) {
+        const url = r.url?.toLowerCase()
+        if (!url) continue
+        const existing = seen.get(url)
+        if (!existing || (r.score || 0) > (existing.score || 0)) {
+          seen.set(url, r)
+        }
+      }
+    }
+    return Array.from(seen.values())
+  }
+  async _call(prompt) {
+    if (this.provider === 'gemini') {
+      const model = this.model || 'gemini-2.0-flash'
+      const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
+      const body = JSON.stringify({
+        contents: [{ parts: [{ text: prompt }] }],
+        generationConfig: { temperature: 0.7, maxOutputTokens: 200 }
+      })
+      const data = await postJson(url, body)
+      return data.candidates?.[0]?.content?.parts?.[0]?.text || '[]'
+    }
+    const url = 'https://api.openai.com/v1/chat/completions'
+    const body = JSON.stringify({
+      model: this.model,
+      messages: [{ role: 'user', content: prompt }],
+      max_tokens: 200,
+      temperature: 0.7
+    })
+    const data = await postJson(url, body, { 'Authorization': `Bearer ${this.apiKey}` })
+    return data.choices?.[0]?.message?.content || '[]'
+  }
+}
+function postJson(url, body, extraHeaders = {}) {
+  return new Promise((resolve, reject) => {
+    const urlObj = new URL(url)
+    const opts = {
+      hostname: urlObj.hostname,
+      path: urlObj.pathname + urlObj.search,
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'Content-Length': Buffer.byteLength(body),
+        ...extraHeaders
+      }
+    }
+    const req = https.request(opts, res => {
+      let data = ''
+      res.on('data', c => data += c)
+      res.on('end', () => {
+        try { resolve(JSON.parse(data)) }
+        catch (e) { reject(new Error(`Invalid response: ${data.slice(0, 200)}`)) }
+      })
+    })
+    req.on('error', reject)
+    req.setTimeout(15000, () => { req.destroy(); reject(new Error('Expander timeout')) })
+    req.write(body)
+    req.end()
+  })
+}
+module.exports = { QueryExpander }

package/src/search/reranker.js ADDED Viewed

@@ -0,0 +1,114 @@
+const https = require('https')
+/**
+ * AI result reranker — scores search results by relevance.
+ * Uses Gemini Flash by default (free, fast).
+ * This is Tavily's secret sauce: AI-scored relevance, not raw search order.
+ */
+class Reranker {
+  constructor(config = {}) {
+    this.provider = config.provider || 'gemini'
+    this.model = config.model || 'gemini-2.0-flash'
+    this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
+  }
+  /**
+   * Rerank results by relevance to query.
+   * Returns results sorted by score (highest first) with score field added.
+   */
+  async rerank(query, results) {
+    if (!this.apiKey || results.length <= 1) return results
+    const batch = results.slice(0, 20) // Max 20 results to rerank
+    const prompt = `Score each search result's relevance to the query on a scale of 0.0 to 1.0.
+Query: "${query}"
+Results:
+${batch.map((r, i) => `[${i}] ${r.title}\n${(r.snippet || r.content || '').slice(0, 200)}`).join('\n\n')}
+Respond with ONLY a JSON array of scores, one per result. Example: [0.95, 0.72, 0.31]
+No explanation, just the array.`
+    try {
+      const text = await this._call(prompt)
+      const scores = JSON.parse(text.match(/\[[\d.,\s]+\]/)?.[0] || '[]')
+      if (scores.length !== batch.length) return results
+      // Attach scores and sort
+      const scored = batch.map((r, i) => ({ ...r, score: scores[i] || 0 }))
+      scored.sort((a, b) => b.score - a.score)
+      // Append any results beyond the batch limit
+      if (results.length > 20) {
+        scored.push(...results.slice(20).map(r => ({ ...r, score: 0 })))
+      }
+      return scored
+    } catch (err) {
+      console.warn('Reranking failed, using original order:', err.message)
+      return results
+    }
+  }
+  async _call(prompt) {
+    if (this.provider === 'gemini') {
+      const model = this.model || 'gemini-2.0-flash'
+      const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
+      const body = JSON.stringify({
+        contents: [{ parts: [{ text: prompt }] }],
+        generationConfig: { temperature: 0, maxOutputTokens: 200 }
+      })
+      const data = await postJson(url, body)
+      return data.candidates?.[0]?.content?.parts?.[0]?.text || '[]'
+    }
+    // Fallback: OpenAI-compatible
+    const url = this.provider === 'minimax'
+      ? 'https://api.minimax.chat/v1/text/chatcompletion_v2'
+      : 'https://api.openai.com/v1/chat/completions'
+    const body = JSON.stringify({
+      model: this.model,
+      messages: [{ role: 'user', content: prompt }],
+      max_tokens: 200,
+      temperature: 0
+    })
+    const data = await postJson(url, body, {
+      'Authorization': `Bearer ${this.apiKey}`
+    })
+    return data.choices?.[0]?.message?.content || '[]'
+  }
+}
+function postJson(url, body, extraHeaders = {}) {
+  return new Promise((resolve, reject) => {
+    const urlObj = new URL(url)
+    const opts = {
+      hostname: urlObj.hostname,
+      path: urlObj.pathname + urlObj.search,
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'Content-Length': Buffer.byteLength(body),
+        ...extraHeaders
+      }
+    }
+    const req = https.request(opts, res => {
+      let data = ''
+      res.on('data', c => data += c)
+      res.on('end', () => {
+        try { resolve(JSON.parse(data)) }
+        catch (e) { reject(new Error(`Invalid response: ${data.slice(0, 200)}`)) }
+      })
+    })
+    req.on('error', reject)
+    req.setTimeout(15000, () => { req.destroy(); reject(new Error('Reranker timeout')) })
+    req.write(body)
+    req.end()
+  })
+}
+module.exports = { Reranker }

package/src/search/summarizer.js CHANGED Viewed

@@ -17,7 +17,8 @@ class Summarizer {
       openai: 'OPENAI_API_KEY',
       anthropic: 'ANTHROPIC_API_KEY',
       minimax: 'MINIMAX_API_KEY',
-      xai: 'XAI_API_KEY'
+      xai: 'XAI_API_KEY',
+      gemini: 'GEMINI_API_KEY'
     }
     return keys[this.provider] || 'OPENAI_API_KEY'
   }
@@ -56,6 +57,8 @@ Answer:`
         return this._openaiCompatible(prompt)
       case 'anthropic':
         return this._anthropic(prompt)
+      case 'gemini':
+        return this._gemini(prompt)
       case 'ollama':
         return this._ollama(prompt)
       default:
@@ -106,6 +109,18 @@ Answer:`
     return data.content?.[0]?.text || null
   }
+  async _gemini(prompt) {
+    const model = this.model || 'gemini-2.0-flash'
+    const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
+    const body = JSON.stringify({
+      contents: [{ parts: [{ text: prompt }] }],
+      generationConfig: { temperature: 0.3, maxOutputTokens: 500 }
+    })
+    const data = await postJson(url, body, { 'Content-Type': 'application/json' })
+    return data.candidates?.[0]?.content?.parts?.[0]?.text || null
+  }
   async _ollama(prompt) {
     const url = this.baseUrl || 'http://localhost:11434/api/generate'
     const body = JSON.stringify({