spectrawl 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "spectrawl",
3
- "version": "0.3.4",
3
+ "version": "0.3.6",
4
4
  "description": "The unified web layer for AI agents. Search (6 engines), stealth browse (Camoufox + Playwright), auth (cookies, multi-account), act (24 adapters, 30+ platforms), proxy rotation. Self-hosted, free.",
5
5
  "main": "src/index.js",
6
6
  "types": "index.d.ts",
package/src/index.js CHANGED
@@ -12,9 +12,24 @@ const { EventEmitter, EVENTS } = require('./events')
12
12
  const { CookieRefresher } = require('./auth/refresh')
13
13
  const { loadConfig } = require('./config')
14
14
 
15
/**
 * Report whether `value` is a plain object: an object literal or an object
 * with a null prototype. Arrays, null, functions and class instances
 * (Date, RegExp, Map, ...) are not plain and must be copied by reference
 * instead of being merged key by key.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isPlainConfigObject(value) {
  if (value === null || typeof value !== 'object') return false
  const proto = Object.getPrototypeOf(value)
  return proto === Object.prototype || proto === null
}

/**
 * Recursively merge `source` over `target` and return a new object.
 * Neither argument is mutated.
 *
 * - Plain-object values are merged key by key.
 * - Everything else (primitives, arrays, null, class instances) replaces the
 *   value in `target` wholesale.
 * - A `__proto__` key in `source` is ignored: assigning it would swap the
 *   prototype of the merged object instead of creating a config entry.
 *
 * @param {object} target - Base values; treated as `{}` when not a plain object.
 * @param {object} source - Overrides; ignored when null or not an object.
 * @returns {object} The merged copy.
 */
function deepMergeConfig(target, source) {
  const base = isPlainConfigObject(target) ? target : {}
  const result = { ...base }
  if (source === null || typeof source !== 'object') return result

  for (const key of Object.keys(source)) {
    if (key === '__proto__') continue

    const incoming = source[key]
    // Only recurse when the override is a plain object. The recursive call
    // falls back to {} when the base value is not mergeable (e.g. a string).
    result[key] = isPlainConfigObject(incoming)
      ? deepMergeConfig(base[key], incoming)
      : incoming
  }
  return result
}
26
+
15
27
  class Spectrawl {
16
28
  constructor(configPath) {
17
- this.config = loadConfig(configPath)
29
+ // Accept either a file path (string) or a config object
30
+ this.config = (typeof configPath === 'object' && configPath !== null)
31
+ ? deepMergeConfig(loadConfig(null), configPath)
32
+ : loadConfig(configPath)
18
33
  this.events = new EventEmitter()
19
34
  this.cache = new Cache(this.config.cache)
20
35
  this.searchEngine = new SearchEngine(this.config.search, this.cache)
@@ -0,0 +1,123 @@
1
+ const https = require('https')
2
+ const { URL } = require('url')
3
+
4
/**
 * Bing web search — scrapes Bing HTML results.
 * No API key needed.
 *
 * Best-effort by design: any failure while fetching or parsing resolves to an
 * empty array instead of rejecting. Set SPECTRAWL_DEBUG to see why a search
 * came back empty.
 *
 * The page is always handed to the parser. A block or CAPTCHA page contains no
 * result entries and therefore parses to []. Checking the raw HTML for words
 * like "captcha" would also throw away valid pages whenever the query or a
 * result mentions them.
 *
 * @param {string} query - Search terms.
 * @param {{ maxResults?: number }} [config] - Optional settings; `maxResults`
 *   defaults to 10 when missing or falsy. A null config is tolerated.
 * @returns {Promise<Array<{url: string, title: string, snippet: string, engine: string}>>}
 */
async function bingSearch(query, config = {}) {
  const maxResults = (config && config.maxResults) || 10

  try {
    const html = await fetchBing(query)
    const results = parseBingResults(html, maxResults)

    if (results.length === 0 && process.env.SPECTRAWL_DEBUG) {
      console.warn('[bing] no results parsed; response length:', html.length)
    }
    return results
  } catch (e) {
    if (process.env.SPECTRAWL_DEBUG) {
      console.warn('[bing] search failed:', e.message)
    }
    return []
  }
}
25
+
26
/**
 * Decide whether a result link should be dropped: Bing-internal pages,
 * relative or unparseable hrefs, and non-HTTP(S) schemes.
 * Matching is done on the parsed hostname, so an external URL that merely
 * contains the text "bing.com" in its path or query is kept.
 *
 * @param {string} url
 * @returns {boolean} true when the link must be skipped.
 */
function isUnusableBingLink(url) {
  let parsed
  try {
    parsed = new URL(url)
  } catch {
    return true // relative or malformed href
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return true

  const host = parsed.hostname
  if (host === 'bing.com' || host.endsWith('.bing.com')) return true

  const isMicrosoft = host === 'microsoft.com' || host.endsWith('.microsoft.com')
  return isMicrosoft && parsed.pathname.startsWith('/bing')
}

/**
 * Extract organic results from a Bing results page.
 *
 * @param {string} html - Raw HTML of the results page.
 * @param {number} maxResults - Upper bound on returned entries; a missing or
 *   non-positive value yields [].
 * @returns {Array<{url: string, title: string, snippet: string, engine: 'bing'}>}
 */
function parseBingResults(html, maxResults) {
  const results = []
  if (typeof html !== 'string' || !(maxResults > 0)) return results

  // Result blocks are <li> elements whose class list contains b_algo. Extra
  // classes or attributes on the <li> are allowed.
  const blockRegex = /<li\b[^>]*\bclass="[^"]*\bb_algo\b[^"]*"[^>]*>([\s\S]*?)<\/li>/g
  let block
  while (results.length < maxResults && (block = blockRegex.exec(html)) !== null) {
    const content = block[1]

    // URL and title come from <h2><a href="...">title</a></h2>
    const linkMatch = content.match(/<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i)
    if (!linkMatch) continue

    // Attribute values are HTML-escaped; restore "&" so query strings work.
    const url = linkMatch[1].replace(/&amp;/g, '&')
    if (isUnusableBingLink(url)) continue

    const title = stripHtml(linkMatch[2])

    // Snippet: first <p>, otherwise the b_caption container
    const snippetMatch = content.match(/<p[^>]*>([\s\S]*?)<\/p>/i) ||
      content.match(/<div\s+class="b_caption"[^>]*>([\s\S]*?)<\/div>/i)
    const snippet = snippetMatch ? stripHtml(snippetMatch[1]) : ''

    results.push({ url, title, snippet, engine: 'bing' })
  }

  return results
}
55
+
56
/**
 * Download the Bing results page for `query`.
 *
 * Sends browser-like headers and asks for an uncompressed body
 * (`Accept-Encoding: identity`) so the response can be read as text.
 * Redirects (301/302/303/307/308) are followed through fetchUrl.
 *
 * @param {string} query - Search terms; URL-encoded into the `q` parameter.
 * @returns {Promise<string>} Response body as text.
 *   Rejects on a network error, an invalid redirect target, or after
 *   8 seconds of socket inactivity ("Bing timeout").
 */
function fetchBing(query) {
  return new Promise((resolve, reject) => {
    const path = `/search?q=${encodeURIComponent(query)}&setlang=en&count=15`
    const opts = {
      hostname: 'www.bing.com',
      path,
      method: 'GET',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'identity',
        'DNT': '1'
      }
    }

    const req = https.get(opts, res => {
      // Follow redirects
      if ([301, 302, 303, 307, 308].includes(res.statusCode) && res.headers.location) {
        // Drain the unused body so the socket is released.
        res.resume()

        let target
        try {
          // Resolves both absolute and relative Location values.
          target = new URL(res.headers.location, 'https://www.bing.com').toString()
        } catch (e) {
          return reject(e)
        }
        return fetchUrl(target).then(resolve, reject)
      }

      // Decode through the stream so multi-byte characters that straddle a
      // chunk boundary are not corrupted.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', chunk => { data += chunk })
      res.on('end', () => resolve(data))
      res.on('error', reject)
    })
    req.on('error', reject)
    req.setTimeout(8000, () => { req.destroy(); reject(new Error('Bing timeout')) })
  })
}
90
+
91
/**
 * Fetch an absolute http(s) URL and resolve with its body as text.
 * Used to follow redirects issued by Bing; it does not follow further
 * redirects itself.
 *
 * @param {string} url - Absolute URL. An explicit port is honoured.
 * @returns {Promise<string>} Response body as text.
 *   Rejects when the URL is invalid, on a network error, or after
 *   8 seconds of socket inactivity.
 */
function fetchUrl(url) {
  return new Promise((resolve, reject) => {
    // A malformed URL throws here; the Promise executor turns that into a rejection.
    const urlObj = new URL(url)
    const client = urlObj.protocol === 'https:' ? https : require('http')

    const req = client.get({
      hostname: urlObj.hostname,
      // Empty string means "default port for the protocol".
      port: urlObj.port || undefined,
      path: urlObj.pathname + urlObj.search,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html'
      }
    }, res => {
      // Decode through the stream so multi-byte characters that straddle a
      // chunk boundary are not corrupted.
      res.setEncoding('utf8')
      let data = ''
      res.on('data', c => { data += c })
      res.on('end', () => resolve(data))
      res.on('error', reject)
    })
    req.on('error', reject)
    // Without a timeout a stalled redirect target would hang the search forever.
    req.setTimeout(8000, () => { req.destroy(); reject(new Error('Bing redirect timeout')) })
  })
}
109
+
110
// Named character references that stripHtml decodes.
const NAMED_HTML_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' ']
])

/**
 * Turn an HTML fragment into plain text: drop tags, decode character
 * references, collapse whitespace runs to single spaces and trim.
 *
 * Entities are decoded in a single pass, so already-escaped text such as
 * "&amp;lt;" becomes "&lt;" and is not decoded a second time. Decimal and
 * hexadecimal numeric references are supported. Unknown named entities and
 * out-of-range code points are left untouched.
 *
 * @param {string} html - HTML fragment; null/undefined yields ''.
 * @returns {string} Plain text.
 */
function stripHtml(html) {
  if (html == null) return ''

  return String(html)
    .replace(/<[^>]+>/g, '')
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity, body) => {
      if (body[0] !== '#') {
        const named = NAMED_HTML_ENTITIES.get(body.toLowerCase())
        return named === undefined ? entity : named
      }

      const isHex = body[1] === 'x' || body[1] === 'X'
      const code = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10)
      // String.fromCodePoint throws outside this range; keep the raw text instead.
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity
    })
    .replace(/\s+/g, ' ')
    .trim()
}
122
+
123
+ module.exports = { bingSearch }
@@ -1,31 +1,46 @@
1
1
  const https = require('https')
2
+ const http = require('http')
2
3
  const { URL } = require('url')
3
4
 
4
5
  /**
5
6
  * DuckDuckGo search — free, unlimited, no API key needed.
6
- * Uses JSON API + HTML fallback. Filters ads automatically.
7
+ * Uses JSON API + HTML fallback + lite fallback.
8
+ * Built-in retry with backoff for datacenter IP rate limiting.
9
+ * Optional proxy support for reliable results.
7
10
  */
8
11
  async function ddgSearch(query, config = {}) {
9
12
  const maxResults = config.maxResults || 10
10
-
11
- // Strategy 1: JSON API (instant answers)
12
- try {
13
- const results = await ddgJsonApi(query, maxResults)
14
- if (results.length > 0) return results
15
- } catch (e) { /* fall through */ }
16
-
17
- // Strategy 2: HTML search
18
- try {
19
- const results = await ddgHtmlSearch(query, maxResults)
20
- if (results.length > 0) return results
21
- } catch (e) { /* fall through */ }
13
+ const proxy = config.proxy || null
14
+
15
+ // Try up to 2 times with backoff
16
+ for (let attempt = 0; attempt < 2; attempt++) {
17
+ if (attempt > 0) await delay(1000 + Math.random() * 1000)
18
+
19
+ // Strategy 1: JSON API (instant answers — most reliable from datacenter)
20
+ try {
21
+ const results = await ddgJsonApi(query, maxResults, proxy)
22
+ if (results.length > 0) return results
23
+ } catch (e) { /* fall through */ }
24
+
25
+ // Strategy 2: HTML search (html.duckduckgo.com)
26
+ try {
27
+ const results = await ddgHtmlSearch(query, maxResults, 'html.duckduckgo.com', proxy)
28
+ if (results.length > 0) return results
29
+ } catch (e) { /* fall through */ }
30
+
31
+ // Strategy 3: Lite search (lite.duckduckgo.com — simpler, less likely to CAPTCHA)
32
+ try {
33
+ const results = await ddgHtmlSearch(query, maxResults, 'lite.duckduckgo.com', proxy)
34
+ if (results.length > 0) return results
35
+ } catch (e) { /* fall through */ }
36
+ }
22
37
 
23
38
  return []
24
39
  }
25
40
 
26
- async function ddgJsonApi(query, maxResults) {
41
+ async function ddgJsonApi(query, maxResults, proxy) {
27
42
  const url = `https://api.duckduckgo.com/?q=${encodeURIComponent(query)}&format=json&no_html=1&skip_disambig=1`
28
- const data = await fetchJson(url)
43
+ const data = await fetchJson(url, proxy)
29
44
 
30
45
  const results = []
31
46
 
@@ -82,9 +97,14 @@ async function ddgJsonApi(query, maxResults) {
82
97
  return results
83
98
  }
84
99
 
85
- async function ddgHtmlSearch(query, maxResults) {
86
- const url = `https://html.duckduckgo.com/html/?q=${encodeURIComponent(query)}`
87
- const html = await fetchHtml(url)
100
+ async function ddgHtmlSearch(query, maxResults, hostname, proxy) {
101
+ const path = `/html/?q=${encodeURIComponent(query)}`
102
+ const html = await fetchHtml(`https://${hostname}${path}`, proxy)
103
+
104
+ // Detect CAPTCHA / rate limit
105
+ if (html.includes('g-recaptcha') || html.includes('bot detected') || html.length < 500) {
106
+ return []
107
+ }
88
108
 
89
109
  const results = []
90
110
 
@@ -95,11 +115,30 @@ async function ddgHtmlSearch(query, maxResults) {
95
115
  let match
96
116
  while ((match = resultRegex.exec(html)) !== null) {
97
117
  const url = decodeUddg(match[1])
98
- // Filter ads — DDG ads go through duckduckgo.com/y.js
99
118
  if (isAd(url)) continue
100
119
  links.push({ url, title: stripHtml(match[2]) })
101
120
  }
102
121
 
122
+ // Lite endpoint uses different selectors
123
+ if (links.length === 0) {
124
+ const liteRegex = /<a[^>]+class="result-link"[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/g
125
+ while ((match = liteRegex.exec(html)) !== null) {
126
+ const url = decodeUddg(match[1])
127
+ if (isAd(url)) continue
128
+ links.push({ url, title: stripHtml(match[2]) })
129
+ }
130
+ // Even simpler: just grab all non-DDG links from lite results
131
+ if (links.length === 0) {
132
+ const anyLink = /<a[^>]*href="(https?:\/\/(?!duckduckgo)[^"]+)"[^>]*>([\s\S]*?)<\/a>/g
133
+ while ((match = anyLink.exec(html)) !== null) {
134
+ if (results.length >= maxResults) break
135
+ const url = match[1]
136
+ if (isAd(url)) continue
137
+ links.push({ url, title: stripHtml(match[2]) })
138
+ }
139
+ }
140
+ }
141
+
103
142
  const snippets = []
104
143
  while ((match = snippetRegex.exec(html)) !== null) {
105
144
  snippets.push(stripHtml(match[1]))
@@ -117,18 +156,11 @@ async function ddgHtmlSearch(query, maxResults) {
117
156
  return results
118
157
  }
119
158
 
120
- /**
121
- * Filter out DDG ads.
122
- */
123
159
  function isAd(url) {
124
160
  if (!url) return true
125
161
  if (url.includes('duckduckgo.com/y.js')) return true
126
162
  if (url.includes('ad_provider=')) return true
127
163
  if (url.includes('ad_domain=')) return true
128
- if (url.startsWith('//duckduckgo.com/l/?')) {
129
- // This is a redirect — might be organic
130
- return false
131
- }
132
164
  return false
133
165
  }
134
166
 
@@ -140,28 +172,34 @@ function decodeUddg(url) {
140
172
  return url
141
173
  }
142
174
 
143
- function fetchJson(url) {
175
+ function delay(ms) { return new Promise(r => setTimeout(r, ms)) }
176
+
177
+ function fetchJson(url, proxy) {
144
178
  return new Promise((resolve, reject) => {
145
179
  const urlObj = new URL(url)
146
- https.get({
180
+ const opts = {
147
181
  hostname: urlObj.hostname,
148
182
  path: urlObj.pathname + urlObj.search,
149
- headers: { 'User-Agent': 'Spectrawl/0.1.0' }
150
- }, res => {
183
+ headers: { 'User-Agent': 'Spectrawl/0.3' }
184
+ }
185
+
186
+ const req = https.get(opts, res => {
151
187
  let data = ''
152
188
  res.on('data', chunk => data += chunk)
153
189
  res.on('end', () => {
154
190
  try { resolve(JSON.parse(data)) }
155
191
  catch (e) { reject(new Error('Invalid JSON from DDG API')) }
156
192
  })
157
- }).on('error', reject)
193
+ })
194
+ req.on('error', reject)
195
+ req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG timeout')) })
158
196
  })
159
197
  }
160
198
 
161
- function fetchHtml(url) {
199
+ function fetchHtml(url, proxy) {
162
200
  return new Promise((resolve, reject) => {
163
201
  const urlObj = new URL(url)
164
- https.get({
202
+ const opts = {
165
203
  hostname: urlObj.hostname,
166
204
  path: urlObj.pathname + urlObj.search,
167
205
  headers: {
@@ -169,11 +207,19 @@ function fetchHtml(url) {
169
207
  'Accept': 'text/html',
170
208
  'Accept-Language': 'en-US,en;q=0.9'
171
209
  }
172
- }, res => {
210
+ }
211
+
212
+ const req = https.get(opts, res => {
213
+ // Follow redirects
214
+ if ([301, 302, 303].includes(res.statusCode) && res.headers.location) {
215
+ return fetchHtml(res.headers.location, proxy).then(resolve).catch(reject)
216
+ }
173
217
  let data = ''
174
218
  res.on('data', chunk => data += chunk)
175
219
  res.on('end', () => resolve(data))
176
- }).on('error', reject)
220
+ })
221
+ req.on('error', reject)
222
+ req.setTimeout(8000, () => { req.destroy(); reject(new Error('DDG HTML timeout')) })
177
223
  })
178
224
  }
179
225
 
@@ -4,11 +4,13 @@ const { serperSearch } = require('./engines/serper')
4
4
  const { searxngSearch } = require('./engines/searxng')
5
5
  const { googleCseSearch } = require('./engines/google-cse')
6
6
  const { jinaSearch } = require('./engines/jina')
7
+ const { bingSearch } = require('./engines/bing')
7
8
  const { geminiGroundedSearch } = require('./engines/gemini-grounded')
8
9
  const { scrapeUrls } = require('./scraper')
9
10
  const { Summarizer } = require('./summarizer')
10
11
  const { Reranker } = require('./reranker')
11
12
  const { QueryExpander } = require('./query-expander')
13
+ const { SourceRanker } = require('./source-ranker')
12
14
 
13
15
  const ENGINES = {
14
16
  searxng: searxngSearch,
@@ -18,7 +20,8 @@ const ENGINES = {
18
20
  'google-cse': googleCseSearch,
19
21
  jina: jinaSearch,
20
22
  'gemini-grounded': geminiGroundedSearch,
21
- gemini: geminiGroundedSearch
23
+ gemini: geminiGroundedSearch,
24
+ bing: bingSearch
22
25
  }
23
26
 
24
27
  class SearchEngine {
@@ -33,6 +36,7 @@ class SearchEngine {
33
36
  const geminiKey = config.geminiKey || process.env.GEMINI_API_KEY
34
37
  this.reranker = geminiKey ? new Reranker({ apiKey: geminiKey, ...config.reranker }) : null
35
38
  this.expander = geminiKey ? new QueryExpander({ apiKey: geminiKey, ...config.expander }) : null
39
+ this.sourceRanker = new SourceRanker(config.sourceRanker || {})
36
40
  }
37
41
 
38
42
  /**
@@ -90,8 +94,10 @@ class SearchEngine {
90
94
 
91
95
  const response = { answer, sources: results, cached: false }
92
96
 
93
- // Cache the result
94
- this.cache?.set('search', cacheKey, response)
97
+ // Only cache if we got results
98
+ if (results.length > 0) {
99
+ this.cache?.set('search', cacheKey, response)
100
+ }
95
101
 
96
102
  return response
97
103
  }
@@ -123,12 +129,23 @@ class SearchEngine {
123
129
  // When using Gemini Grounded, also run DDG in parallel for volume
124
130
  const resultSets = []
125
131
  if (usesGrounded) {
126
- // Parallel: Gemini for quality + DDG for volume
132
+ // Parallel with staggered DDG start (DDG rate-limits concurrent requests from same IP)
133
+ const delay = ms => new Promise(r => setTimeout(r, ms))
127
134
  const [groundedResults, ddgResults] = await Promise.all([
128
- this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(() => []),
129
- this._rawSearch(query, { ...opts, engines: ['ddg'] }).catch(() => [])
135
+ this._rawSearch(query, { ...opts, engines: ['gemini-grounded', 'gemini'] }).catch(e => { console.warn('Gemini grounded failed:', e.message); return [] }),
136
+ delay(500).then(() => this._rawSearch(query, { ...opts, engines: ['ddg'] })).catch(e => { console.warn('DDG failed:', e.message); return [] })
130
137
  ])
138
+ if (process.env.SPECTRAWL_DEBUG) {
139
+ console.log('[deepSearch] Gemini results:', groundedResults.length, '| DDG results:', ddgResults.length)
140
+ }
131
141
  resultSets.push(groundedResults, ddgResults)
142
+
143
+ // If primary failed, retry with a different approach
144
+ if (groundedResults.length === 0 && ddgResults.length === 0) {
145
+ await delay(1000)
146
+ const retry = await this._rawSearch(query, { ...opts, engines: this.cascade }).catch(() => [])
147
+ resultSets.push(retry)
148
+ }
132
149
  } else {
133
150
  for (const q of queries) {
134
151
  try {
@@ -142,13 +159,21 @@ class SearchEngine {
142
159
  }
143
160
 
144
161
  // Step 3: Merge and deduplicate
145
- let results = dedupeResults(resultSets.flat())
162
+ const flatResults = resultSets.flat()
163
+ let results = dedupeResults(flatResults)
164
+ if (process.env.SPECTRAWL_DEBUG) {
165
+ console.log('[deepSearch] resultSets lengths:', resultSets.map(s => s.length))
166
+ console.log('[deepSearch] flat:', flatResults.length, '→ deduped:', results.length)
167
+ }
146
168
 
147
- // Step 4: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
169
+ // Step 4a: Rerank by relevance (skip for Gemini Grounded — it already returns scored results)
148
170
  if (this.reranker && opts.rerank !== false && !usesGrounded) {
149
171
  results = await this.reranker.rerank(query, results)
150
172
  }
151
173
 
174
+ // Step 4b: Source quality ranking — boost trusted domains, penalize SEO spam
175
+ results = this.sourceRanker.rank(results)
176
+
152
177
  // Step 5: Parallel scrape top N for full content (skip in fast mode)
153
178
  const scrapeCount = opts.mode === 'fast' ? 0 : (opts.scrapeTop ?? this.scrapeTop ?? 5)
154
179
  if (scrapeCount > 0 && results.length > 0) {
@@ -188,7 +213,10 @@ class SearchEngine {
188
213
  cached: false
189
214
  }
190
215
 
191
- this.cache?.set('search', cacheKey, response)
216
+ // Only cache if we got results — never cache failures
217
+ if (response.sources.length > 0) {
218
+ this.cache?.set('search', cacheKey, response)
219
+ }
192
220
  return response
193
221
  }
194
222
 
@@ -0,0 +1,138 @@
1
+ /**
2
+ * Source quality ranker — boost trusted sources, penalize SEO spam.
3
+ * This is something Tavily doesn't have.
4
+ *
5
+ * Users can customize weights per domain or use built-in presets.
6
+ */
7
+
8
// Built-in domain quality weights. A result's score is multiplied by the
// weight of the most specific matching domain (exact host or parent domain).
const DEFAULT_WEIGHTS = {
  // Tier 1: Primary sources, high trust (1.2x–1.3x boost)
  'github.com': 1.3,
  'stackoverflow.com': 1.3,
  'news.ycombinator.com': 1.3,
  'arxiv.org': 1.3,
  'docs.google.com': 1.2,
  'developer.mozilla.org': 1.3,
  'wikipedia.org': 1.2,
  'en.wikipedia.org': 1.2,

  // Tier 2: Quality community/editorial (1.1x–1.2x boost)
  'reddit.com': 1.15,
  'www.reddit.com': 1.15,
  'dev.to': 1.15,
  'medium.com': 1.1,
  'blog.logrocket.com': 1.15,
  'css-tricks.com': 1.15,
  'smashingmagazine.com': 1.15,
  'web.dev': 1.2,
  'npmjs.com': 1.15,
  'www.npmjs.com': 1.15,
  'pypi.org': 1.15,

  // Tier 3: Known SEO farms / thin content (0.7x–0.85x penalty)
  'w3schools.com': 0.8,
  'www.w3schools.com': 0.8,
  'geeksforgeeks.org': 0.85,
  'www.geeksforgeeks.org': 0.85,
  'tutorialspoint.com': 0.7,
  'www.tutorialspoint.com': 0.7,
  'javatpoint.com': 0.7,
  'www.javatpoint.com': 0.7,
}

// URL patterns used as content-type quality signals. At most one positive
// and one negative pattern is applied per result.
const QUALITY_SIGNALS = {
  // URL patterns that suggest high quality
  positive: [
    /\/blog\//i,                    // Blog posts (usually more detailed)
    /\/docs\//i,                    // Documentation
    /\/guide/i,                     // Guides
    /\/tutorial/i,                  // Tutorials
    /github\.com\/[\w-]+\/[\w-]+$/, // Repo pages (not search)
    /\/wiki\//i,                    // Wiki pages
    /\/research\//i,                // Research
  ],
  // URL patterns that suggest low quality
  negative: [
    /\/tag\//i,        // Tag listing pages
    /\/category\//i,   // Category pages
    /\/page\/\d+/i,    // Pagination
    /[?&]utm_/i,       // Tracking URLs (utm_* anywhere in the query string)
    /\/amp\//i,        // AMP pages (usually stripped)
    /\/slideshow/i,    // Slideshow spam
    /\/gallery/i,      // Gallery spam
    /\/listicle/i,     // Listicle spam
  ]
}

// A year from 2024 to 2039 that is not part of a longer run of digits, so an
// id such as "120245" does not count as a date.
const RECENT_YEAR = /(?<!\d)20(2[4-9]|3\d)(?!\d)/

/**
 * Test a hostname against a user-supplied boost/block entry.
 * An entry containing a dot is treated as a domain and matches that host and
 * its subdomains ("x.com" matches "x.com" and "m.x.com", not "netflix.com").
 * An entry without a dot is treated as a keyword and matches as a substring.
 *
 * @param {string} host - Lower-case hostname of the result.
 * @param {string} entry - Configured domain or keyword.
 * @returns {boolean}
 */
function hostMatchesEntry(host, entry) {
  const needle = String(entry).trim().toLowerCase()
  if (!needle) return false
  if (!needle.includes('.')) return host.includes(needle)
  return host === needle || host.endsWith('.' + needle)
}

class SourceRanker {
  /**
   * @param {object} [config]
   * @param {Object<string, number>} [config.weights] - Per-domain multipliers,
   *   merged over DEFAULT_WEIGHTS (user entries win on the same key).
   * @param {string[]} [config.boost] - Domains that get an extra 1.3x.
   * @param {string[]} [config.block] - Domains removed from the results.
   */
  constructor(config = {}) {
    this.weights = { ...DEFAULT_WEIGHTS, ...(config.weights || {}) }
    this.boostDomains = config.boost || []   // Always boost these domains
    this.blockDomains = config.block || []   // Always exclude these domains
  }

  /**
   * Weight of the most specific configured domain that covers `host`,
   * or 1 when none does. "Most specific" is the longest matching domain, so
   * an entry for "docs.github.com" takes precedence over "github.com".
   */
  _domainWeight(host) {
    let bestWeight = 1.0
    let bestLength = 0
    for (const [domain, weight] of Object.entries(this.weights)) {
      const name = domain.toLowerCase()
      if (name.length <= bestLength) continue
      if (host === name || host.endsWith('.' + name)) {
        bestWeight = weight
        bestLength = name.length
      }
    }
    return bestWeight
  }

  /**
   * Apply source quality scoring to search results.
   *
   * Blocked domains are removed. Every remaining result is copied with an
   * adjusted `score` (base score x multiplier, capped at 1) and the applied
   * `_multiplier`. The input objects are not modified. The returned array is
   * ordered by adjusted score, highest first. Results whose URL cannot be
   * parsed are kept with a multiplier of 1.
   *
   * @param {Array<object>} results - Items with `url` and optionally `score`
   *   or `confidence` (base score falls back to 0.5).
   * @returns {Array<object>} New array of adjusted copies; an empty or missing
   *   input is returned as-is.
   */
  rank(results) {
    if (!results || results.length === 0) return results

    const ranked = []
    for (const r of results) {
      let host = null
      try {
        host = new URL(r.url).hostname
      } catch { /* invalid URL: keep the result, no adjustment */ }

      let multiplier = 1.0

      if (host !== null) {
        // Filter blocked domains
        if (this.blockDomains.some(d => hostMatchesEntry(host, d))) continue

        // Domain weight
        multiplier *= this._domainWeight(host)

        // Boost domains
        if (this.boostDomains.some(d => hostMatchesEntry(host, d))) {
          multiplier *= 1.3
        }

        // URL quality signals
        const fullUrl = String(r.url)
        if (QUALITY_SIGNALS.positive.some(pattern => pattern.test(fullUrl))) {
          multiplier *= 1.05
        }
        if (QUALITY_SIGNALS.negative.some(pattern => pattern.test(fullUrl))) {
          multiplier *= 0.85
        }

        // Freshness signal (year in URL)
        if (RECENT_YEAR.test(fullUrl)) multiplier *= 1.05
      }

      const baseScore = r.score || r.confidence || 0.5
      ranked.push({ ...r, score: Math.min(1, baseScore * multiplier), _multiplier: multiplier })
    }

    // Sort by adjusted score
    ranked.sort((a, b) => (b.score || 0) - (a.score || 0))

    return ranked
  }
}
137
+
138
+ module.exports = { SourceRanker, DEFAULT_WEIGHTS }
@@ -31,10 +31,15 @@ class Summarizer {
31
31
  .map((s, i) => `[${i + 1}] ${s.title}\n${s.url}\n${(s.fullContent || s.snippet || '').slice(0, 1000)}`)
32
32
  .join('\n\n')
33
33
 
34
- const prompt = `Based on the following search results, provide a concise answer to the query: "${query}"
34
+ const prompt = `Answer this question directly: "${query}"
35
35
 
36
- Include citations as [1], [2], etc. referencing the source numbers below.
37
- Be direct and factual. If the sources don't contain enough information, say so.
36
+ Rules:
37
+ - Give a clear, specific answer. Name things, list tools, state facts.
38
+ - Use [1], [2] etc. to cite sources inline.
39
+ - Never say "based on the provided sources" or "according to search results."
40
+ - Never hedge with "it appears" or "it seems." Be direct.
41
+ - If sources disagree, note it briefly.
42
+ - Keep it concise — 2-4 paragraphs max.
38
43
 
39
44
  Sources:
40
45
  ${context}
@@ -77,7 +82,7 @@ Answer:`
77
82
  const body = JSON.stringify({
78
83
  model: this.model,
79
84
  messages: [
80
- { role: 'system', content: 'You are a concise search assistant. Answer with citations.' },
85
+ { role: 'system', content: 'You are a search engine. Give direct, specific answers with numbered citations. Never hedge or qualify with "based on sources" — just answer the question.' },
81
86
  { role: 'user', content: prompt }
82
87
  ],
83
88
  max_tokens: 500,