npm - spectrawl - Versions diffs - 0.3.0 → 0.3.1 - Mend

spectrawl 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json +1 -1
package/src/search/engines/gemini-grounded.js +90 -0
package/src/search/index.js +7 -3
package/src/search/scraper.js +14 -11

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "spectrawl",
-  "version": "0.3.0",
+  "version": "0.3.1",
   "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
   "main": "src/index.js",
   "types": "index.d.ts",

package/src/search/engines/gemini-grounded.js ADDED Viewed

@@ -0,0 +1,90 @@
+const https = require('https')
+/**
+ * Gemini Grounded Search — uses Google's Gemini API with built-in Google Search.
+ * Free tier: 1,500 req/day for Flash.
+ * Returns both an AI answer AND the search results it found.
+ *
+ * This is basically free Google search + AI summarization in one call.
+ */
+async function geminiGroundedSearch(query, config = {}) {
+  const apiKey = config.apiKey || process.env.GEMINI_API_KEY
+  if (!apiKey) throw new Error('GEMINI_API_KEY required for grounded search')
+  const model = config.model || 'gemini-2.0-flash'
+  const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${apiKey}`
+  const body = JSON.stringify({
+    contents: [{
+      parts: [{ text: `Search the web and provide relevant results for: ${query}` }]
+    }],
+    tools: [{ google_search: {} }],
+    generationConfig: {
+      temperature: 0.1,
+      maxOutputTokens: 1000
+    }
+  })
+  const data = await post(url, body)
+  if (data.error) {
+    throw new Error(`Gemini grounded search: ${data.error.message}`)
+  }
+  // Extract grounding metadata (search results)
+  const candidate = data.candidates?.[0]
+  const grounding = candidate?.groundingMetadata
+  const chunks = grounding?.groundingChunks || []
+  const answer = candidate?.content?.parts?.map(p => p.text).filter(Boolean).join('\n') || ''
+  // Convert grounding chunks to standard search result format
+  const results = chunks.map((chunk, i) => ({
+    title: chunk.web?.title || `Result ${i + 1}`,
+    url: chunk.web?.uri || '',
+    snippet: '', // Gemini doesn't give per-result snippets
+    source: 'gemini-grounded'
+  })).filter(r => r.url)
+  // Also try to extract URLs from grounding support
+  const supports = grounding?.groundingSupports || []
+  for (const support of supports) {
+    const indices = support.groundingChunkIndices || []
+    // Already captured above
+  }
+  // Attach the AI answer as metadata
+  if (results.length > 0) {
+    results._groundedAnswer = answer
+  }
+  return results
+}
+function post(url, body) {
+  return new Promise((resolve, reject) => {
+    const urlObj = new URL(url)
+    const opts = {
+      hostname: urlObj.hostname,
+      path: urlObj.pathname + urlObj.search,
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'Content-Length': Buffer.byteLength(body)
+      }
+    }
+    const req = https.request(opts, res => {
+      let data = ''
+      res.on('data', c => data += c)
+      res.on('end', () => {
+        try { resolve(JSON.parse(data)) }
+        catch (e) { reject(new Error(`Invalid Gemini response: ${data.slice(0, 200)}`)) }
+      })
+    })
+    req.on('error', reject)
+    req.setTimeout(15000, () => { req.destroy(); reject(new Error('Gemini grounded search timeout')) })
+    req.write(body)
+    req.end()
+  })
+}
+module.exports = { geminiGroundedSearch }

package/src/search/index.js CHANGED Viewed

@@ -4,6 +4,7 @@ const { serperSearch } = require('./engines/serper')
 const { searxngSearch } = require('./engines/searxng')
 const { googleCseSearch } = require('./engines/google-cse')
 const { jinaSearch } = require('./engines/jina')
+const { geminiGroundedSearch } = require('./engines/gemini-grounded')
 const { scrapeUrls } = require('./scraper')
 const { Summarizer } = require('./summarizer')
 const { Reranker } = require('./reranker')
@@ -15,7 +16,9 @@ const ENGINES = {
   brave: braveSearch,
   serper: serperSearch,
   'google-cse': googleCseSearch,
-  jina: jinaSearch
+  jina: jinaSearch,
+  'gemini-grounded': geminiGroundedSearch,
+  gemini: geminiGroundedSearch
 }
 class SearchEngine {
@@ -109,9 +112,10 @@ class SearchEngine {
     const cached = this.cache?.get('search', cacheKey)
     if (cached) return { ...cached, cached: true }
-    // Step 1: Query expansion
+    // Step 1: Query expansion (skip if using Gemini grounded — it searches Google natively)
     let queries = [query]
-    if (this.expander && opts.expand !== false) {
+    const usesGrounded = this.cascade.includes('gemini-grounded') || this.cascade.includes('gemini')
+    if (this.expander && opts.expand !== false && !usesGrounded) {
       queries = await this.expander.expand(query)
     }

package/src/search/scraper.js CHANGED Viewed

@@ -16,17 +16,20 @@ async function scrapeUrls(urls, opts = {}) {
   const concurrent = opts.concurrent || 3
   const engine = opts.engine || 'auto' // 'jina', 'readability', 'auto'
-  for (let i = 0; i < urls.length; i += concurrent) {
-    const batch = urls.slice(i, i + concurrent)
-    const promises = batch.map(url => scrapeUrl(url, { timeout, engine }).catch(() => null))
-    const batchResults = await Promise.all(promises)
-    batch.forEach((url, idx) => {
-      if (batchResults[idx]) {
-        results[url] = batchResults[idx]
-      }
-    })
-  }
+  // All URLs in parallel (with per-URL timeout)
+  const promises = urls.map(url => {
+    const p = scrapeUrl(url, { timeout, engine }).catch(() => null)
+    // Hard timeout per URL
+    const timer = new Promise(resolve => setTimeout(() => resolve(null), timeout + 1000))
+    return Promise.race([p, timer])
+  })
+  const allResults = await Promise.all(promises)
+  urls.forEach((url, idx) => {
+    if (allResults[idx]) {
+      results[url] = allResults[idx]
+    }
+  })
   return results
 }