spectrawl 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/search/index.js +2 -2
- package/src/search/scraper.js +40 -5
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "spectrawl",
|
|
3
|
-
"version": "0.3.2",
|
|
3
|
+
"version": "0.3.3",
|
|
4
4
|
"description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"types": "index.d.ts",
|
package/src/search/index.js
CHANGED
|
@@ -144,8 +144,8 @@ class SearchEngine {
|
|
|
144
144
|
// Step 3: Merge and deduplicate
|
|
145
145
|
let results = dedupeResults(resultSets.flat())
|
|
146
146
|
|
|
147
|
-
// Step 4: Rerank by relevance
|
|
148
|
-
if (this.reranker && opts.rerank !== false) {
|
|
147
|
+
// Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
|
|
148
|
+
if (this.reranker && opts.rerank !== false && !usesGrounded) {
|
|
149
149
|
results = await this.reranker.rerank(query, results)
|
|
150
150
|
}
|
|
151
151
|
|
package/src/search/scraper.js
CHANGED
|
@@ -35,13 +35,13 @@ async function scrapeUrls(urls, opts = {}) {
|
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
async function scrapeUrl(url, opts = {}) {
|
|
38
|
-
const { timeout = 10000, engine = 'auto' } = opts
|
|
38
|
+
const { timeout = 10000, engine = 'auto', browse } = opts
|
|
39
39
|
|
|
40
40
|
// Try Jina first if available (better markdown output)
|
|
41
41
|
if (engine === 'jina' || engine === 'auto') {
|
|
42
42
|
try {
|
|
43
43
|
const result = await jinaExtract(url)
|
|
44
|
-
if (result.content && result.content.length >
|
|
44
|
+
if (result.content && result.content.length > 200) {
|
|
45
45
|
return result.content
|
|
46
46
|
}
|
|
47
47
|
} catch (e) {
|
|
@@ -49,9 +49,44 @@ async function scrapeUrl(url, opts = {}) {
|
|
|
49
49
|
}
|
|
50
50
|
}
|
|
51
51
|
|
|
52
|
-
// Readability fallback
|
|
53
|
-
|
|
54
|
-
|
|
52
|
+
// Readability fallback (HTTP fetch + HTML→markdown)
|
|
53
|
+
try {
|
|
54
|
+
const html = await fetchPage(url, timeout)
|
|
55
|
+
const content = extractMarkdown(html)
|
|
56
|
+
if (content && content.length > 200) {
|
|
57
|
+
return content
|
|
58
|
+
}
|
|
59
|
+
} catch (e) {
|
|
60
|
+
// Fall through to browser
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// Browser fallback for JS-rendered pages or when extraction is too short
|
|
64
|
+
// This is where we beat Tavily — they can't render JS pages
|
|
65
|
+
if (browse !== false) {
|
|
66
|
+
try {
|
|
67
|
+
const { BrowseEngine } = require('../browse')
|
|
68
|
+
const browser = new BrowseEngine()
|
|
69
|
+
const result = await browser.browse(url, {
|
|
70
|
+
timeout,
|
|
71
|
+
extractText: true,
|
|
72
|
+
screenshot: false
|
|
73
|
+
})
|
|
74
|
+
await browser.close()
|
|
75
|
+
if (result.text && result.text.length > 200) {
|
|
76
|
+
return result.text
|
|
77
|
+
}
|
|
78
|
+
} catch (e) {
|
|
79
|
+
// All methods exhausted
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// Return whatever we got, even if short
|
|
84
|
+
try {
|
|
85
|
+
const html = await fetchPage(url, timeout)
|
|
86
|
+
return extractMarkdown(html)
|
|
87
|
+
} catch (e) {
|
|
88
|
+
return ''
|
|
89
|
+
}
|
|
55
90
|
}
|
|
56
91
|
|
|
57
92
|
function fetchPage(url, timeout = 10000, redirects = 3) {
|