spectrawl 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.2",
3
+ "version": "0.3.4",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
@@ -144,8 +144,8 @@ class SearchEngine {
144
144
  // Step 3: Merge and deduplicate
145
145
  let results = dedupeResults(resultSets.flat())
146
146
 
147
- // Step 4: Rerank by relevance
148
- if (this.reranker && opts.rerank !== false) {
147
+ // Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
148
+ if (this.reranker && opts.rerank !== false && !usesGrounded) {
149
149
  results = await this.reranker.rerank(query, results)
150
150
  }
151
151
 
@@ -167,7 +167,7 @@ class SearchEngine {
167
167
  let answer = null
168
168
  const summarizer = this.summarizer || (this.reranker ? new Summarizer({
169
169
  provider: 'gemini',
170
- model: 'gemini-2.0-flash',
170
+ model: 'gemini-2.5-flash',
171
171
  apiKey: process.env.GEMINI_API_KEY
172
172
  }) : null)
173
173
 
@@ -177,12 +177,12 @@ class SearchEngine {
177
177
 
178
178
  const response = {
179
179
  answer,
180
- sources: results.map(r => ({
180
+ sources: results.map((r, i) => ({
181
181
  title: r.title,
182
182
  url: r.url,
183
183
  snippet: r.snippet,
184
184
  content: r.fullContent?.slice(0, 2000) || r.snippet || '',
185
- score: r.score || null
185
+ score: r.score || r.confidence || Math.max(0.5, 1 - (i * 0.05))
186
186
  })),
187
187
  queries, // show which queries were used
188
188
  cached: false
@@ -8,7 +8,7 @@ const https = require('https')
8
8
  class QueryExpander {
9
9
  constructor(config = {}) {
10
10
  this.provider = config.provider || 'gemini'
11
- this.model = config.model || 'gemini-2.0-flash'
11
+ this.model = config.model || 'gemini-2.5-flash'
12
12
  this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
13
13
  this.variants = config.variants || 3
14
14
  }
@@ -69,7 +69,7 @@ Example: ["alternative query 1", "alternative query 2", "alternative query 3"]`
69
69
 
70
70
  async _call(prompt) {
71
71
  if (this.provider === 'gemini') {
72
- const model = this.model || 'gemini-2.0-flash'
72
+ const model = this.model || 'gemini-2.5-flash'
73
73
  const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
74
74
  const body = JSON.stringify({
75
75
  contents: [{ parts: [{ text: prompt }] }],
@@ -8,7 +8,7 @@ const https = require('https')
8
8
  class Reranker {
9
9
  constructor(config = {}) {
10
10
  this.provider = config.provider || 'gemini'
11
- this.model = config.model || 'gemini-2.0-flash'
11
+ this.model = config.model || 'gemini-2.5-flash'
12
12
  this.apiKey = config.apiKey || process.env.GEMINI_API_KEY
13
13
  }
14
14
 
@@ -55,7 +55,7 @@ No explanation, just the array.`
55
55
 
56
56
  async _call(prompt) {
57
57
  if (this.provider === 'gemini') {
58
- const model = this.model || 'gemini-2.0-flash'
58
+ const model = this.model || 'gemini-2.5-flash'
59
59
  const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
60
60
  const body = JSON.stringify({
61
61
  contents: [{ parts: [{ text: prompt }] }],
@@ -35,13 +35,13 @@ async function scrapeUrls(urls, opts = {}) {
35
35
  }
36
36
 
37
37
  async function scrapeUrl(url, opts = {}) {
38
- const { timeout = 10000, engine = 'auto' } = opts
38
+ const { timeout = 10000, engine = 'auto', browse } = opts
39
39
 
40
40
  // Try Jina first if available (better markdown output)
41
41
  if (engine === 'jina' || engine === 'auto') {
42
42
  try {
43
43
  const result = await jinaExtract(url)
44
- if (result.content && result.content.length > 100) {
44
+ if (result.content && result.content.length > 200) {
45
45
  return result.content
46
46
  }
47
47
  } catch (e) {
@@ -49,9 +49,44 @@ async function scrapeUrl(url, opts = {}) {
49
49
  }
50
50
  }
51
51
 
52
- // Readability fallback
53
- const html = await fetchPage(url, timeout)
54
- return extractMarkdown(html)
52
+ // Readability fallback (HTTP fetch + HTML→markdown)
53
+ try {
54
+ const html = await fetchPage(url, timeout)
55
+ const content = extractMarkdown(html)
56
+ if (content && content.length > 200) {
57
+ return content
58
+ }
59
+ } catch (e) {
60
+ // Fall through to browser
61
+ }
62
+
63
+ // Browser fallback for JS-rendered pages or when extraction is too short
64
+ // This is where we beat Tavily — they can't render JS pages
65
+ if (browse !== false) {
66
+ try {
67
+ const { BrowseEngine } = require('../browse')
68
+ const browser = new BrowseEngine()
69
+ const result = await browser.browse(url, {
70
+ timeout,
71
+ extractText: true,
72
+ screenshot: false
73
+ })
74
+ await browser.close()
75
+ if (result.text && result.text.length > 200) {
76
+ return result.text
77
+ }
78
+ } catch (e) {
79
+ // All methods exhausted
80
+ }
81
+ }
82
+
83
+ // Return whatever we got, even if short
84
+ try {
85
+ const html = await fetchPage(url, timeout)
86
+ return extractMarkdown(html)
87
+ } catch (e) {
88
+ return ''
89
+ }
55
90
  }
56
91
 
57
92
  function fetchPage(url, timeout = 10000, redirects = 3) {
@@ -110,7 +110,7 @@ Answer:`
110
110
  }
111
111
 
112
112
  async _gemini(prompt) {
113
- const model = this.model || 'gemini-2.0-flash'
113
+ const model = this.model || 'gemini-2.5-flash'
114
114
  const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${this.apiKey}`
115
115
  const body = JSON.stringify({
116
116
  contents: [{ parts: [{ text: prompt }] }],