spectrawl 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.4",
3
+ "version": "0.3.5",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/index.js CHANGED
@@ -12,9 +12,24 @@ const { EventEmitter, EVENTS } = require('./events')
12
12
  const { CookieRefresher } = require('./auth/refresh')
13
13
  const { loadConfig } = require('./config')
14
14
 
15
/**
 * Tell whether a value is a plain object literal (prototype is
 * Object.prototype or null). Arrays, Dates, RegExps, Maps and class
 * instances are NOT plain and must be copied by reference, not flattened.
 *
 * @param {*} value - Any value.
 * @returns {boolean} True only for plain `{}`-style objects.
 */
function isPlainConfigObject(value) {
  if (value === null || typeof value !== 'object') return false
  const proto = Object.getPrototypeOf(value)
  return proto === Object.prototype || proto === null
}

/**
 * Recursively merge `source` over `target` and return a new object.
 * Neither argument is mutated.
 *
 * - Plain objects are merged key by key.
 * - Everything else in `source` (primitives, arrays, Dates, class
 *   instances, null) replaces the target value wholesale.
 * - When the target value for a key is not a plain object, it is discarded
 *   rather than spread, so a string/array default never leaks index keys
 *   into the merged result.
 * - An own `__proto__` key (e.g. from JSON.parse) is skipped so a config
 *   object cannot change the prototype of the merged result.
 *
 * @param {object} target - Base values (typically defaults).
 * @param {object} source - Overrides applied on top of `target`.
 * @returns {object} A new merged object.
 */
function deepMergeConfig(target, source) {
  const base = isPlainConfigObject(target) ? target : {}
  const result = { ...base }
  if (source === null || typeof source !== 'object') return result

  for (const key of Object.keys(source)) {
    // Assigning result['__proto__'] would swap the prototype, not set a key.
    if (key === '__proto__') continue

    const incoming = source[key]
    if (isPlainConfigObject(incoming)) {
      const existing = Object.prototype.hasOwnProperty.call(base, key) ? base[key] : undefined
      result[key] = deepMergeConfig(isPlainConfigObject(existing) ? existing : {}, incoming)
    } else {
      result[key] = incoming
    }
  }
  return result
}
26
+
15
27
  class Spectrawl {
16
28
  constructor(configPath) {
17
- this.config = loadConfig(configPath)
29
+ // Accept either a file path (string) or a config object
30
+ this.config = (typeof configPath === 'object' && configPath !== null)
31
+ ? deepMergeConfig(loadConfig(null), configPath)
32
+ : loadConfig(configPath)
18
33
  this.events = new EventEmitter()
19
34
  this.cache = new Cache(this.config.cache)
20
35
  this.searchEngine = new SearchEngine(this.config.search, this.cache)
@@ -9,6 +9,7 @@ const { scrapeUrls } = require('./scraper')
9
9
  const { Summarizer } = require('./summarizer')
10
10
  const { Reranker } = require('./reranker')
11
11
  const { QueryExpander } = require('./query-expander')
12
+ const { SourceRanker } = require('./source-ranker')
12
13
 
13
14
  const ENGINES = {
14
15
  searxng: searxngSearch,
@@ -33,6 +34,7 @@ class SearchEngine {
33
34
  const geminiKey = config.geminiKey || process.env.GEMINI_API_KEY
34
35
  this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
35
36
  this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
37
+ this.sourceRanker = new SourceRanker(config.sourceRanker || {})
36
38
  }
37
39
 
38
40
  /**
@@ -90,8 +92,10 @@ class SearchEngine {
90
92
 
91
93
  const response = { answer, sources: results, cached: false }
92
94
 
93
- // Cache the result
94
- this.cache?.set('search', cacheKey, response)
95
+ // Only cache if we got results
96
+ if (results.length > 0) {
97
+ this.cache?.set('search', cacheKey, response)
98
+ }
95
99
 
96
100
  return response
97
101
  }
@@ -123,12 +127,23 @@ class SearchEngine {
123
127
  // When using Gemini Grounded, also run DDG in parallel for volume
124
128
  const resultSets = []
125
129
  if (usesGrounded) {
126
- // Parallel: Gemini for quality + DDG for volume
130
+ // Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
131
+ const delay = ms => new Promise(r => setTimeout(r, ms))
127
132
  const [groundedResults, ddgResults] = await Promise.all([
128
- this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(() => []),
129
- this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(() => [])
133
+ this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(e => { console.warn('Gemini grounded failed:', e.message); return [] }),
134
+ delay(500).then(() => this._rawSearch(query, { ...opts, engines: ['ddg'] })).catch(e => { console.warn('DDG failed:', e.message); return [] })
130
135
  ])
136
+ if (process.env.SPECTRAWL_DEBUG) {
137
+ console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
138
+ }
131
139
  resultSets.push(groundedResults, ddgResults)
140
+
141
+ // If primary failed, retry with a different approach
142
+ if (groundedResults.length === 0 && ddgResults.length === 0) {
143
+ await delay(1000)
144
+ const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
145
+ resultSets.push(retry)
146
+ }
132
147
  } else {
133
148
  for (const q of queries) {
134
149
  try {
@@ -142,13 +157,21 @@ class SearchEngine {
142
157
  }
143
158
 
144
159
  // Step 3: Merge and deduplicate
145
- let results = dedupeResults(resultSets.flat())
160
+ const flatResults = resultSets.flat()
161
+ let results = dedupeResults(flatResults)
162
+ if (process.env.SPECTRAWL_DEBUG) {
163
+ console.log('[deepSearch] resultSets lengths:', resultSets.map(s => s.length))
164
+ console.log('[deepSearch] flat:', flatResults.length, '→ deduped:', results.length)
165
+ }
146
166
 
147
- // Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
167
+ // Step 4a: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
148
168
  if (this.reranker && opts.rerank !== false && !usesGrounded) {
149
169
  results = await this.reranker.rerank(query, results)
150
170
  }
151
171
 
172
+ // Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
173
+ results = this.sourceRanker.rank(results)
174
+
152
175
  // Step 5: Parallel scrape top N for full content (skip in fast mode)
153
176
  const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
154
177
  if (scrapeCount > 0 && results.length > 0) {
@@ -188,7 +211,10 @@ class SearchEngine {
188
211
  cached: false
189
212
  }
190
213
 
191
- this.cache?.set('search', cacheKey, response)
214
+ // Only cache if we got results — never cache failures
215
+ if (response.sources.length > 0) {
216
+ this.cache?.set('search', cacheKey, response)
217
+ }
192
218
  return response
193
219
  }
194
220
 
@@ -0,0 +1,138 @@
1
+ /**
2
+ * Source quality ranker — boost trusted sources, penalize SEO spam.
3
+ * This is something Tavily doesn't have.
4
+ *
5
+ * Users can customize weights per domain or use built-in presets.
6
+ */
7
+
8
+ // Built-in domain quality tiers
9
// Built-in domain quality tiers. Keys are hostnames; values are score
// multipliers (>1 boosts, <1 penalizes).
const DEFAULT_WEIGHTS = {
  // Tier 1: Primary sources, high trust (1.2x–1.3x boost)
  'github.com': 1.3,
  'stackoverflow.com': 1.3,
  'news.ycombinator.com': 1.3,
  'arxiv.org': 1.3,
  'docs.google.com': 1.2,
  'developer.mozilla.org': 1.3,
  'wikipedia.org': 1.2,
  'en.wikipedia.org': 1.2,

  // Tier 2: Quality community/editorial (1.1x–1.2x boost)
  'reddit.com': 1.15,
  'www.reddit.com': 1.15,
  'dev.to': 1.15,
  'medium.com': 1.1,
  'blog.logrocket.com': 1.15,
  'css-tricks.com': 1.15,
  'smashingmagazine.com': 1.15,
  'web.dev': 1.2,
  'npmjs.com': 1.15,
  'www.npmjs.com': 1.15,
  'pypi.org': 1.15,

  // Tier 3: Known SEO farms / thin content (0.7x–0.85x penalty)
  'w3schools.com': 0.8,
  'www.w3schools.com': 0.8,
  'geeksforgeeks.org': 0.85,
  'www.geeksforgeeks.org': 0.85,
  'tutorialspoint.com': 0.7,
  'www.tutorialspoint.com': 0.7,
  'javatpoint.com': 0.7,
  'www.javatpoint.com': 0.7,
}

// Content-type signals that indicate quality
const QUALITY_SIGNALS = {
  // URL patterns that suggest high quality
  positive: [
    /\/blog\//i,                        // Blog posts (usually more detailed)
    /\/docs\//i,                        // Documentation
    /\/guide/i,                         // Guides
    /\/tutorial/i,                      // Tutorials
    /github\.com\/[\w-]+\/[\w-]+$/,     // Repo pages (not search)
    /\/wiki\//i,                        // Wiki pages
    /\/research\//i,                    // Research
  ],
  // URL patterns that suggest low quality
  negative: [
    /\/tag\//i,                         // Tag listing pages
    /\/category\//i,                    // Category pages
    /\/page\/\d+/i,                     // Pagination
    /\?utm_/i,                          // Tracking URLs
    /\/amp\//i,                         // AMP pages (usually stripped)
    /\/slideshow/i,                     // Slideshow spam
    /\/gallery/i,                       // Gallery spam
    /\/listicle/i,                      // Listicle spam
  ]
}

// A recent year (2024–2039) that stands alone, i.e. is not part of a longer
// digit run such as a numeric article ID.
const RECENT_YEAR = /(?<!\d)20(2[4-9]|3\d)(?!\d)/

/**
 * Extract the lowercase hostname of a URL, without any trailing root dot.
 *
 * @param {string|URL} url - Candidate URL.
 * @returns {string|null} Hostname, or null when `url` cannot be parsed.
 */
function parseHostname(url) {
  try {
    return new URL(url).hostname.toLowerCase().replace(/\.$/, '')
  } catch {
    return null
  }
}

/**
 * Test a hostname against one block/boost list entry on label boundaries.
 *
 * - An entry containing a dot ("x.com") matches that domain and its
 *   subdomains only, so "x.com" does not match "netflix.com" and "t.co"
 *   does not match "reddit.com".
 * - A bare entry ("pinterest") matches any whole hostname label, so it
 *   still covers "www.pinterest.com" and "pinterest.co.uk".
 *
 * @param {string} host - Lowercase hostname.
 * @param {string} entry - Configured domain or bare name.
 * @returns {boolean} True when the entry applies to the host.
 */
function hostMatchesDomain(host, entry) {
  const needle = String(entry).trim().toLowerCase().replace(/^\.+|\.+$/g, '')
  if (!needle) return false
  if (needle.includes('.')) return host === needle || host.endsWith(`.${needle}`)
  return host.split('.').includes(needle)
}

/**
 * Find the weight for a hostname, preferring the MOST specific configured
 * domain: the host itself is tried first, then each parent domain in turn.
 * This lets a user-supplied weight for "gist.github.com" win over the
 * built-in "github.com" entry.
 *
 * @param {Object<string, number>} weights - Domain → multiplier table.
 * @param {string} host - Lowercase hostname.
 * @returns {number} The matching weight, or 1.0 when nothing matches.
 */
function lookupDomainWeight(weights, host) {
  let candidate = host
  while (candidate) {
    if (Object.prototype.hasOwnProperty.call(weights, candidate)) return weights[candidate]
    const dot = candidate.indexOf('.')
    if (dot === -1) break
    candidate = candidate.slice(dot + 1)
  }
  return 1.0
}

class SourceRanker {
  /**
   * @param {object} [config]
   * @param {Object<string, number>} [config.weights] - Per-domain multipliers
   *   merged over DEFAULT_WEIGHTS (user entries win on the same key).
   * @param {string[]} [config.boost] - Domains that always get a 1.3x boost.
   * @param {string[]} [config.block] - Domains that are always excluded.
   */
  constructor(config = {}) {
    const { weights, boost, block } = config ?? {}
    this.weights = { ...DEFAULT_WEIGHTS, ...(weights || {}) }
    this.boostDomains = boost || []   // Always boost these domains
    this.blockDomains = block || []   // Always exclude these domains
  }

  /**
   * Apply source quality scoring to search results.
   *
   * Returns a NEW array of NEW result objects (inputs are not mutated),
   * with blocked domains removed, `score` replaced by the adjusted score
   * (capped at 1) and `_multiplier` recording the applied factor. Output
   * is sorted by adjusted score, highest first. Results whose URL cannot
   * be parsed are kept with a multiplier of 1. Empty or missing input is
   * returned as-is.
   *
   * @param {Array<object>} results - Results with `url` and optional
   *   `score` / `confidence`.
   * @returns {Array<object>} Filtered, rescored and reordered results.
   */
  rank(results) {
    if (!results || results.length === 0) return results

    const ranked = []
    for (const r of results) {
      const host = parseHostname(r.url)

      // Filter blocked domains (unparseable URLs cannot be matched, keep them)
      if (host !== null && this.blockDomains.some(d => hostMatchesDomain(host, d))) continue

      let multiplier = 1.0
      if (host !== null) {
        const fullUrl = String(r.url)

        // Domain weight (most specific configured domain wins)
        multiplier *= lookupDomainWeight(this.weights, host)

        // Boost domains
        if (this.boostDomains.some(d => hostMatchesDomain(host, d))) multiplier *= 1.3

        // URL quality signals: at most one positive and one negative hit
        if (QUALITY_SIGNALS.positive.some(p => p.test(fullUrl))) multiplier *= 1.05
        if (QUALITY_SIGNALS.negative.some(p => p.test(fullUrl))) multiplier *= 0.85

        // Freshness signal (standalone recent year in URL)
        if (RECENT_YEAR.test(fullUrl)) multiplier *= 1.05
      }

      // Falsy score/confidence (missing, 0, NaN) falls back to the neutral
      // default of 0.5, matching how upstream engines leave scores unset.
      const baseScore = r.score || r.confidence || 0.5
      ranked.push({ ...r, score: Math.min(1, baseScore * multiplier), _multiplier: multiplier })
    }

    // Sort by adjusted score (Array.prototype.sort is stable, so ties keep
    // their incoming order)
    ranked.sort((a, b) => (b.score || 0) - (a.score || 0))

    return ranked
  }
}
137
+
138
+ module.exports = { SourceRanker, DEFAULT_WEIGHTS }
@@ -31,10 +31,15 @@ class Summarizer {
31
31
  .map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0, 1000)}`)
32
32
  .join('\n\n')
33
33
 
34
- const prompt = `Based on the following search results, provide a concise answer to the query: "${query}"
34
+ const prompt = `Answer this question directly: "${query}"
35
35
 
36
- Include citations as [1], [2], etc. referencing the source numbers below.
37
- Be direct and factual. If the sources don't contain enough information, say so.
36
+ Rules:
37
+ - Give a clear, specific answer. Name things, list tools, state facts.
38
+ - Use [1], [2] etc. to cite sources inline.
39
+ - Never say "based on the provided sources" or "according to search results."
40
+ - Never hedge with "it appears" or "it seems." Be direct.
41
+ - If sources disagree, note it briefly.
42
+ - Keep it concise — 2-4 paragraphs max.
38
43
 
39
44
  Sources:
40
45
  ${context}
@@ -77,7 +82,7 @@ Answer:`
77
82
  const body = JSON.stringify({
78
83
  model: this.model,
79
84
  messages: [
80
- { role: 'system', content: 'You are a concise search assistant. Answer with citations.' },
85
+ { role: 'system', content: 'You are a search engine. Give direct, specific answers with numbered citations. Never hedge or qualify with "based on sources" — just answer the question.' },
81
86
  { role: 'user', content: prompt }
82
87
  ],
83
88
  max_tokens: 500,