web-search-plus-plugin 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.template +4 -0
- package/LICENSE +21 -0
- package/README.md +82 -0
- package/index.ts +129 -0
- package/openclaw.plugin.json +11 -0
- package/package.json +22 -0
- package/scripts/search.py +2526 -0
- package/scripts/setup.py +453 -0
|
@@ -0,0 +1,2526 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Web Search Plus — Unified Multi-Provider Search with Intelligent Auto-Routing
|
|
4
|
+
Supports: Serper (Google), Tavily (Research), Exa (Neural), Perplexity (Direct Answers), You.com (RAG snippets), SearXNG (self-hosted meta-search)
|
|
5
|
+
|
|
6
|
+
Smart Routing uses multi-signal analysis:
|
|
7
|
+
- Query intent classification (shopping, research, discovery)
|
|
8
|
+
- Linguistic pattern detection (how much vs how does)
|
|
9
|
+
- Product/brand recognition
|
|
10
|
+
- URL detection
|
|
11
|
+
- Confidence scoring
|
|
12
|
+
|
|
13
|
+
Usage:
|
|
14
|
+
python3 search.py --query "..." # Auto-route based on query
|
|
15
|
+
python3 search.py --provider [serper|tavily|exa] --query "..." [options]
|
|
16
|
+
|
|
17
|
+
Examples:
|
|
18
|
+
python3 search.py -q "iPhone 16 Pro price" # → Serper (shopping intent)
|
|
19
|
+
python3 search.py -q "how does quantum entanglement work" # → Tavily (research intent)
|
|
20
|
+
python3 search.py -q "startups similar to Notion" # → Exa (discovery intent)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import hashlib
|
|
25
|
+
import json
|
|
26
|
+
import os
|
|
27
|
+
import re
|
|
28
|
+
import sys
|
|
29
|
+
import time
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Optional, List, Dict, Any, Tuple
|
|
32
|
+
from urllib.request import Request, urlopen
|
|
33
|
+
from urllib.error import HTTPError, URLError
|
|
34
|
+
from urllib.parse import quote, urlparse
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# =============================================================================
|
|
38
|
+
# Result Caching
|
|
39
|
+
# =============================================================================
|
|
40
|
+
|
|
41
|
+
# Cache location: the WSP_CACHE_DIR environment variable when set, otherwise a
# ".cache" folder in the directory one level above this script's own folder.
CACHE_DIR = Path(
    os.environ.get(
        "WSP_CACHE_DIR",
        str(Path(os.path.abspath(__file__)).parent.parent / ".cache"),
    )
)
# Stored next to the cached results; cache_clear() and cache_stats() skip it.
PROVIDER_HEALTH_FILE = CACHE_DIR.joinpath("provider_health.json")
# Default lifetime of a cached result, in seconds (one hour).
DEFAULT_CACHE_TTL = 60 * 60
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _build_cache_payload(query: str, provider: str, max_results: int, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Assemble the dictionary that is hashed to form a cache key.

    Args:
        query: The search query
        provider: The search provider
        max_results: Maximum results requested
        params: Optional extra request parameters to fold into the payload

    Returns:
        A new dict holding the three base fields plus every entry of params.
    """
    merged: Dict[str, Any] = dict(query=query, provider=provider, max_results=max_results)
    if params:
        # Extras are applied last, so a colliding name replaces the base field.
        merged.update(params)
    return merged
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _get_cache_key(query: str, provider: str, max_results: int, params: Optional[Dict[str, Any]] = None) -> str:
    """Derive a 32-hex-character cache key from the request parameters.

    The payload is serialised as compact JSON with sorted keys, so the key
    does not depend on the order in which parameters were supplied.
    """
    canonical = json.dumps(
        _build_cache_payload(query, provider, max_results, params),
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
    )
    digest = hashlib.sha256(canonical.encode("utf-8"))
    # Only the first 32 hex characters of the SHA-256 digest are kept.
    return digest.hexdigest()[:32]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _get_cache_path(cache_key: str) -> Path:
    """Return the location of the JSON file that stores the given cache key."""
    file_name = f"{cache_key}.json"
    return CACHE_DIR.joinpath(file_name)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _ensure_cache_dir() -> None:
    """Make sure CACHE_DIR exists, creating missing parent directories too."""
    # exist_ok: an already-present directory is not an error.
    os.makedirs(CACHE_DIR, exist_ok=True)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _discard_cache_file(cache_path: Path) -> None:
    """Delete a cache file, ignoring failures (the cache is best-effort)."""
    try:
        cache_path.unlink()
    except OSError:
        # Already gone, not a regular file, or not removable: nothing more to do.
        pass


def cache_get(query: str, provider: str, max_results: int, ttl: int = DEFAULT_CACHE_TTL, params: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
    """
    Retrieve cached search results if they exist and are not expired.

    An entry that is expired, unreadable, not valid JSON, not a JSON object,
    or that carries a non-numeric timestamp is removed and treated as a miss.

    Args:
        query: The search query
        provider: The search provider
        max_results: Maximum results requested
        ttl: Time-to-live in seconds (default: 1 hour)
        params: Optional extra request parameters that are part of the cache key

    Returns:
        Cached result dict or None if not found/expired
    """
    cache_key = _get_cache_key(query, provider, max_results, params)
    cache_path = _get_cache_path(cache_key)

    # Open directly instead of checking exists() first: the file may vanish
    # between the check and the read.
    try:
        with open(cache_path, "r", encoding="utf-8") as f:
            cached = json.load(f)
    except FileNotFoundError:
        return None
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        _discard_cache_file(cache_path)
        return None

    # Valid JSON is not necessarily a usable entry (e.g. a list or a string).
    if not isinstance(cached, dict):
        _discard_cache_file(cache_path)
        return None

    cached_time = cached.get("_cache_timestamp", 0)
    if isinstance(cached_time, bool) or not isinstance(cached_time, (int, float)):
        # A malformed timestamp cannot be compared against the TTL.
        _discard_cache_file(cache_path)
        return None

    if time.time() - cached_time > ttl:
        # Cache expired, remove it
        _discard_cache_file(cache_path)
        return None

    return cached
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def cache_put(query: str, provider: str, max_results: int, result: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> None:
    """
    Store search results in cache.

    Writing is best-effort: any failure (directory creation, file I/O, or a
    result that cannot be serialised to JSON) is reported on stderr as a JSON
    object and otherwise ignored. The entry is written to a temporary file
    and then moved into place, so a failed write never leaves a truncated
    cache entry behind.

    Args:
        query: The search query
        provider: The search provider
        max_results: Maximum results requested
        result: The search result to cache
        params: Optional extra request parameters that are part of the cache key
    """
    cache_key = _get_cache_key(query, provider, max_results, params)
    cache_path = _get_cache_path(cache_key)

    # Add cache metadata (on a shallow copy so the caller's dict is untouched)
    cached_result = result.copy()
    cached_result["_cache_timestamp"] = time.time()
    cached_result["_cache_key"] = cache_key
    cached_result["_cache_query"] = query
    cached_result["_cache_provider"] = provider
    cached_result["_cache_max_results"] = max_results
    cached_result["_cache_params"] = params or {}

    # Per-process temporary name; it does not end in ".json", so the cache
    # statistics and clearing code never mistake it for a finished entry.
    tmp_path = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.tmp")

    try:
        _ensure_cache_dir()
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cached_result, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, cache_path)
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError: the result contains something JSON cannot encode.
        try:
            tmp_path.unlink()
        except OSError:
            # The temporary file was never created or is already gone.
            pass
        # Non-fatal: log to stderr but don't fail
        print(json.dumps({"cache_write_error": str(e)}), file=sys.stderr)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def cache_clear() -> Dict[str, Any]:
    """
    Clear all cached results.

    Every "*.json" file directly inside CACHE_DIR is removed except the
    provider health file. Files that cannot be removed are skipped and are
    not included in the reported totals.

    Returns:
        Stats about what was cleared: "cleared" (number of files removed),
        "size_freed_bytes", "size_freed_kb" and a human-readable "message".
        When the cache directory does not exist only "cleared" and
        "message" are present.
    """
    if not CACHE_DIR.exists():
        return {"cleared": 0, "message": "Cache directory does not exist"}

    count = 0
    size_freed = 0

    for cache_file in CACHE_DIR.glob("*.json"):
        if cache_file.name == PROVIDER_HEALTH_FILE.name:
            continue
        try:
            file_size = cache_file.stat().st_size
            cache_file.unlink()
        except OSError:
            # Best-effort: leave files that cannot be inspected or removed.
            continue
        # Count the bytes only once the file is really gone.
        size_freed += file_size
        count += 1

    return {
        "cleared": count,
        "size_freed_bytes": size_freed,
        "size_freed_kb": round(size_freed / 1024, 2),
        "message": f"Cleared {count} cached entries"
    }
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def cache_stats() -> Dict[str, Any]:
    """
    Get statistics about the cache.

    Every "*.json" file directly inside CACHE_DIR, except the provider
    health file, counts as an entry. Entries that cannot be read or parsed
    still count towards "total_entries" (and towards the size when their
    size could be read) but contribute nothing else. Entries without a
    positive numeric "_cache_timestamp" are ignored when determining the
    oldest and newest entry.

    Returns:
        Dict with cache statistics: "total_entries", "total_size_bytes",
        "total_size_kb", "providers" (entry count per provider), "oldest" and
        "newest" (each None or a dict with "timestamp", "age_seconds" and
        "query"), "cache_dir" and "exists".
    """
    if not CACHE_DIR.exists():
        return {
            "total_entries": 0,
            "total_size_bytes": 0,
            "total_size_kb": 0,
            "providers": {},
            "oldest": None,
            "newest": None,
            "cache_dir": str(CACHE_DIR),
            "exists": False
        }

    entries = [p for p in CACHE_DIR.glob("*.json") if p.name != PROVIDER_HEALTH_FILE.name]
    total_size = 0
    oldest = None  # (timestamp, query) of the oldest usable entry
    newest = None  # (timestamp, query) of the newest usable entry
    provider_counts: Dict[str, int] = {}

    for cache_file in entries:
        try:
            total_size += cache_file.stat().st_size
            with open(cache_file, "r", encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError):
            # Unreadable, not UTF-8 or not JSON: nothing more to learn from it.
            continue

        if not isinstance(cached, dict):
            continue

        provider = cached.get("_cache_provider", "unknown")
        if not isinstance(provider, str):
            # Guard against unhashable or otherwise malformed values.
            provider = "unknown"
        provider_counts[provider] = provider_counts.get(provider, 0) + 1

        ts = cached.get("_cache_timestamp")
        if isinstance(ts, bool) or not isinstance(ts, (int, float)) or ts <= 0:
            # One entry without a usable timestamp must not hide the others.
            continue

        query = cached.get("_cache_query", "unknown")
        if oldest is None or ts < oldest[0]:
            oldest = (ts, query)
        if newest is None or ts > newest[0]:
            newest = (ts, query)

    now = time.time()

    def _describe(extreme):
        """Turn a (timestamp, query) pair into the reported dict, or None."""
        if extreme is None:
            return None
        timestamp, query = extreme
        return {
            "timestamp": timestamp,
            "age_seconds": int(now - timestamp),
            "query": query
        }

    return {
        "total_entries": len(entries),
        "total_size_bytes": total_size,
        "total_size_kb": round(total_size / 1024, 2),
        "providers": provider_counts,
        "oldest": _describe(oldest),
        "newest": _describe(newest),
        "cache_dir": str(CACHE_DIR),
        "exists": True
    }
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# =============================================================================
|
|
244
|
+
# Auto-load .env from skill directory (if exists)
|
|
245
|
+
# =============================================================================
|
|
246
|
+
def _load_env_file(env_path: Optional[Path] = None) -> None:
    """Load KEY=VALUE pairs from a .env file into os.environ.

    Blank lines, lines starting with "#" and lines without "=" are skipped.
    A leading "export " is accepted, and surrounding double and single quotes
    are stripped from the value. Variables that are already present in the
    environment are never overwritten.

    The file is always decoded as UTF-8 (a leading byte-order mark is
    dropped), so the result does not depend on the platform's locale.

    Args:
        env_path: File to read. Defaults to ".env" in the skill root, i.e.
            the directory one level above this script's folder.
    """
    if env_path is None:
        env_path = Path(__file__).parent.parent / ".env"
    else:
        env_path = Path(env_path)

    if not env_path.exists():
        return

    # "utf-8-sig" reads plain UTF-8 as well and removes a BOM if one is
    # present, which would otherwise become part of the first key.
    with open(env_path, encoding="utf-8-sig") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            # Handle export VAR=value or VAR=value
            if line.startswith("export "):
                line = line[len("export "):]
            key, _, value = line.partition("=")
            key = key.strip()
            value = value.strip().strip('"').strip("'")
            if key and key not in os.environ:
                os.environ[key] = value


_load_env_file()
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
# =============================================================================
|
|
267
|
+
# Configuration
|
|
268
|
+
# =============================================================================
|
|
269
|
+
|
|
270
|
+
# Built-in settings: one section per provider plus the general and routing
# sections.
DEFAULT_CONFIG = {
    "defaults": {"provider": "serper", "max_results": 5},
    "auto_routing": {
        "enabled": True,
        "fallback_provider": "serper",
        "provider_priority": ["tavily", "exa", "perplexity", "serper", "you", "searxng"],
        "disabled_providers": [],
        # Below this, note low confidence
        "confidence_threshold": 0.3,
    },
    "serper": {"country": "us", "language": "en", "type": "search"},
    "tavily": {"depth": "basic", "topic": "general"},
    "exa": {"type": "neural"},
    "perplexity": {
        "api_url": "https://api.kilo.ai/api/gateway/chat/completions",
        "model": "perplexity/sonar-pro",
    },
    "you": {"country": "us", "safesearch": "moderate"},
    "searxng": {
        # Required - user must set their own instance
        "instance_url": None,
        # 0=off, 1=moderate, 2=strict
        "safesearch": 0,
        # Optional list of engines to use
        "engines": None,
        "language": "en",
    },
}
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def load_config() -> Dict[str, Any]:
    """Load configuration from config.json if it exists, with defaults.

    config.json is looked up in the directory one level above this script's
    folder and read as UTF-8. Each top-level section given as a JSON object
    is merged key-by-key over the matching default section; any other value
    replaces the default outright. If the file cannot be read, is not valid
    JSON, or its top-level value is not an object, a warning is printed to
    stderr as JSON and the defaults are used.

    Returns:
        A configuration dict that shares no mutable state with
        DEFAULT_CONFIG, so callers may modify it freely.
    """
    import copy

    # Deep copy: a shallow copy would hand out the default sections
    # themselves, and changing the result would silently alter the defaults.
    config = copy.deepcopy(DEFAULT_CONFIG)
    config_path = Path(__file__).parent.parent / "config.json"

    if not config_path.exists():
        return config

    try:
        with open(config_path, encoding="utf-8") as f:
            user_config = json.load(f)
        if not isinstance(user_config, dict):
            raise ValueError("top-level JSON value must be an object")
    except (ValueError, OSError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(json.dumps({
            "warning": f"Could not load config.json: {e}",
            "using": "default configuration"
        }), file=sys.stderr)
        return config

    for key, value in user_config.items():
        if isinstance(value, dict) and isinstance(config.get(key), dict):
            # Section-level merge: user keys win, unspecified keys keep defaults.
            config[key] = {**config[key], **value}
        else:
            config[key] = value

    return config
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def get_api_key(provider: str, config: Dict[str, Any] = None) -> Optional[str]:
    """Look up the credential for a provider.

    Lookup order: the provider's section in the given config ("api_key",
    then "apiKey"), then the provider's environment variable. Values loaded
    from the .env file live in the environment too, but the loader never
    overrides a variable that was already set, so a real environment
    variable takes precedence over .env.

    Note: SearXNG doesn't require an API key; for it the configured instance
    URL is returned instead.

    Returns:
        The key (or SearXNG instance URL), or None when nothing is configured
        or the provider has no known environment variable.
    """
    # Special case: SearXNG uses instance_url instead of API key
    if provider == "searxng":
        return get_searxng_instance_url(config)

    # Config section first (only usable when it is a dict)
    section = config.get(provider, {}) if config else None
    if isinstance(section, dict):
        configured = section.get("api_key") or section.get("apiKey")
        if configured:
            return configured

    # Fall back to the environment
    env_names = {
        "serper": "SERPER_API_KEY",
        "tavily": "TAVILY_API_KEY",
        "exa": "EXA_API_KEY",
        "you": "YOU_API_KEY",
        "perplexity": "KILOCODE_API_KEY",
    }
    variable = env_names.get(provider, "")
    return os.environ.get(variable)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _validate_searxng_url(url: str) -> str:
    """Validate a SearXNG instance URL to prevent SSRF.

    Enforces the http/https scheme, rejects known cloud metadata hostnames
    and, unless SEARXNG_ALLOW_PRIVATE=1 is set in the environment, resolves
    the hostname and rejects it when any resulting address is loopback,
    private, link-local, reserved, multicast or unspecified.

    Args:
        url: The instance URL to check.

    Returns:
        The URL, unchanged, when it passes every check.

    Raises:
        ValueError: If the scheme is not http/https, the hostname is missing,
            blocked or unresolvable, or it resolves to a disallowed address.
    """
    import ipaddress
    import socket
    from urllib.parse import urlparse

    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError(f"SearXNG URL must use http or https scheme, got: {parsed.scheme}")
    if not parsed.hostname:
        raise ValueError("SearXNG URL must include a hostname")

    # A fully-qualified name with a trailing dot resolves like the name
    # without it, so normalise before comparing against the blocklist.
    hostname = parsed.hostname.rstrip(".")
    if not hostname:
        raise ValueError("SearXNG URL must include a hostname")

    # Block cloud metadata endpoints by hostname
    BLOCKED_HOSTS = {
        "169.254.169.254",  # AWS/GCP/Azure metadata
        "metadata.google.internal",
        "metadata.internal",
    }
    if hostname in BLOCKED_HOSTS:
        raise ValueError(f"SearXNG URL blocked: {hostname} is a cloud metadata endpoint")

    # Resolve hostname and check for private/internal IPs
    # Operators who intentionally self-host on private networks can opt out
    allow_private = os.environ.get("SEARXNG_ALLOW_PRIVATE", "").strip() == "1"
    if not allow_private:
        # Use the scheme's own default port when the URL does not name one.
        port = parsed.port or (443 if parsed.scheme == "https" else 80)
        try:
            resolved_ips = socket.getaddrinfo(hostname, port, proto=socket.IPPROTO_TCP)
        except socket.gaierror as err:
            raise ValueError(f"SearXNG URL blocked: cannot resolve hostname {hostname}") from err

        for _family, _type, _proto, _canonname, sockaddr in resolved_ips:
            # Drop an IPv6 zone id ("fe80::1%eth0") before parsing the address.
            ip = ipaddress.ip_address(sockaddr[0].split("%", 1)[0])
            if (
                ip.is_loopback
                or ip.is_private
                or ip.is_link_local
                or ip.is_reserved
                or ip.is_multicast
                or ip.is_unspecified
            ):
                raise ValueError(
                    f"SearXNG URL blocked: {hostname} resolves to private/internal IP {ip}. "
                    f"If this is intentional, set SEARXNG_ALLOW_PRIVATE=1 in your environment."
                )

    return url
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def get_searxng_instance_url(config: Dict[str, Any] = None) -> Optional[str]:
    """Return the configured SearXNG instance URL, if any.

    SearXNG is self-hosted, so no API key is needed - just the instance URL.
    Priority: "instance_url" in the config's "searxng" section, then the
    SEARXNG_INSTANCE_URL environment variable.

    Security: whichever URL is found is passed through _validate_searxng_url,
    which raises ValueError for non-http(s) URLs, cloud metadata hosts and,
    unless SEARXNG_ALLOW_PRIVATE=1 is set, hosts that resolve to
    private/internal addresses.

    Returns:
        The validated URL, or None when neither source provides one.
    """
    # Config section first (only usable when it is a dict)
    section = config.get("searxng", {}) if config else None
    configured = section.get("instance_url") if isinstance(section, dict) else None
    if configured:
        return _validate_searxng_url(configured)

    # Fall back to the environment
    from_env = os.environ.get("SEARXNG_INSTANCE_URL")
    return _validate_searxng_url(from_env) if from_env else None
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
# Backward compatibility alias
|
|
436
|
+
def get_env_key(provider: str) -> Optional[str]:
    """Legacy entry point: same as get_api_key(provider) with no config."""
    api_key = get_api_key(provider)
    return api_key
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def validate_api_key(provider: str, config: Dict[str, Any] = None) -> str:
    """Validate and return API key (or instance URL for SearXNG), with helpful error messages.

    On any validation failure a JSON error description is printed to stderr
    and the process exits with status 1.

    Args:
        provider: Provider name, e.g. "serper", "tavily", "exa", "you",
            "perplexity" or "searxng".
        config: Loaded configuration, passed through to get_api_key.

    Returns:
        The API key, or for SearXNG the instance URL.
    """
    key = get_api_key(provider, config)

    # Special handling for SearXNG - it needs instance URL, not API key
    if provider == "searxng":
        if not key:
            error_msg = {
                "error": "Missing SearXNG instance URL",
                "env_var": "SEARXNG_INSTANCE_URL",
                "how_to_fix": [
                    "1. Set up your own SearXNG instance: https://docs.searxng.org/admin/installation.html",
                    "2. Add to config.json: \"searxng\": {\"instance_url\": \"https://your-instance.example.com\"}",
                    "3. Or set environment variable: export SEARXNG_INSTANCE_URL=\"https://your-instance.example.com\"",
                    "Note: SearXNG requires a self-hosted instance with JSON format enabled.",
                ],
                "provider": provider
            }
            print(json.dumps(error_msg, indent=2), file=sys.stderr)
            sys.exit(1)

        # Validate URL format
        if not key.startswith(("http://", "https://")):
            print(json.dumps({
                "error": "SearXNG instance URL must start with http:// or https://",
                "provided": key,
                "provider": provider
            }, indent=2), file=sys.stderr)
            sys.exit(1)

        return key

    if not key:
        env_vars = {
            "serper": "SERPER_API_KEY",
            "tavily": "TAVILY_API_KEY",
            "exa": "EXA_API_KEY",
            "you": "YOU_API_KEY",
            "perplexity": "KILOCODE_API_KEY"
        }

        urls = {
            "serper": "https://serper.dev",
            "tavily": "https://tavily.com",
            "exa": "https://exa.ai",
            "you": "https://api.you.com",
            "perplexity": "https://api.kilo.ai"
        }

        if provider not in env_vars:
            # Report an unrecognised provider in the same structured way as
            # every other failure instead of crashing on a dictionary lookup.
            print(json.dumps({
                "error": f"Unknown provider: {provider}",
                "supported_providers": sorted(env_vars) + ["searxng"],
                "provider": provider
            }, indent=2), file=sys.stderr)
            sys.exit(1)

        env_var = env_vars[provider]

        error_msg = {
            "error": f"Missing API key for {provider}",
            "env_var": env_var,
            "how_to_fix": [
                f"1. Get your API key from {urls[provider]}",
                f"2. Add to config.json: \"{provider}\": {{\"api_key\": \"your-key\"}}",
                f"3. Or set environment variable: export {env_var}=\"your-key\"",
            ],
            "provider": provider
        }
        print(json.dumps(error_msg, indent=2), file=sys.stderr)
        sys.exit(1)

    # Keys shorter than 10 characters are treated as placeholders or typos.
    if len(key) < 10:
        print(json.dumps({
            "error": f"API key for {provider} appears invalid (too short)",
            "provider": provider
        }, indent=2), file=sys.stderr)
        sys.exit(1)

    return key
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
# =============================================================================
|
|
514
|
+
# Intelligent Auto-Routing Engine
|
|
515
|
+
# =============================================================================
|
|
516
|
+
|
|
517
|
+
class QueryAnalyzer:
|
|
518
|
+
"""
|
|
519
|
+
Intelligent query analysis for smart provider routing.
|
|
520
|
+
|
|
521
|
+
Uses multi-signal analysis:
|
|
522
|
+
- Intent classification (shopping, research, discovery, local, news)
|
|
523
|
+
- Linguistic patterns (question structure, phrase patterns)
|
|
524
|
+
- Entity detection (products, brands, URLs, dates)
|
|
525
|
+
- Complexity assessment
|
|
526
|
+
"""
|
|
527
|
+
|
|
528
|
+
# Intent signal patterns with weights
|
|
529
|
+
# Higher weight = stronger signal for that provider
|
|
530
|
+
|
|
531
|
+
SHOPPING_SIGNALS = {
|
|
532
|
+
# Price patterns (very strong)
|
|
533
|
+
r'\bhow much\b': 4.0,
|
|
534
|
+
r'\bprice of\b': 4.0,
|
|
535
|
+
r'\bcost of\b': 4.0,
|
|
536
|
+
r'\bprices?\b': 3.0,
|
|
537
|
+
r'\$\d+|\d+\s*dollars?': 3.0,
|
|
538
|
+
r'€\d+|\d+\s*euros?': 3.0,
|
|
539
|
+
r'£\d+|\d+\s*pounds?': 3.0,
|
|
540
|
+
|
|
541
|
+
# German price patterns (sehr stark)
|
|
542
|
+
r'\bpreis(e)?\b': 3.5,
|
|
543
|
+
r'\bkosten\b': 3.0,
|
|
544
|
+
r'\bwieviel\b': 3.5,
|
|
545
|
+
r'\bwie viel\b': 3.5,
|
|
546
|
+
r'\bwas kostet\b': 4.0,
|
|
547
|
+
|
|
548
|
+
# Purchase intent (strong)
|
|
549
|
+
r'\bbuy\b': 3.5,
|
|
550
|
+
r'\bpurchase\b': 3.5,
|
|
551
|
+
r'\border\b(?!\s+by)': 3.0, # "order" but not "order by"
|
|
552
|
+
r'\bshopping\b': 3.5,
|
|
553
|
+
r'\bshop for\b': 3.5,
|
|
554
|
+
r'\bwhere to (buy|get|purchase)\b': 4.0,
|
|
555
|
+
|
|
556
|
+
# German purchase intent (stark)
|
|
557
|
+
r'\bkaufen\b': 3.5,
|
|
558
|
+
r'\bbestellen\b': 3.5,
|
|
559
|
+
r'\bwo kaufen\b': 4.0,
|
|
560
|
+
r'\bhändler\b': 3.0,
|
|
561
|
+
r'\bshop\b': 2.5,
|
|
562
|
+
|
|
563
|
+
# Deal/discount signals
|
|
564
|
+
r'\bdeal(s)?\b': 3.0,
|
|
565
|
+
r'\bdiscount(s)?\b': 3.0,
|
|
566
|
+
r'\bsale\b': 2.5,
|
|
567
|
+
r'\bcheap(er|est)?\b': 3.0,
|
|
568
|
+
r'\baffordable\b': 2.5,
|
|
569
|
+
r'\bbudget\b': 2.5,
|
|
570
|
+
r'\bbest price\b': 3.5,
|
|
571
|
+
r'\bcompare prices\b': 3.5,
|
|
572
|
+
r'\bcoupon\b': 3.0,
|
|
573
|
+
|
|
574
|
+
# German deal/discount signals
|
|
575
|
+
r'\bgünstig(er|ste)?\b': 3.0,
|
|
576
|
+
r'\bbillig(er|ste)?\b': 3.0,
|
|
577
|
+
r'\bangebot(e)?\b': 3.0,
|
|
578
|
+
r'\brabatt\b': 3.0,
|
|
579
|
+
r'\baktion\b': 2.5,
|
|
580
|
+
r'\bschnäppchen\b': 3.0,
|
|
581
|
+
|
|
582
|
+
# Product comparison
|
|
583
|
+
r'\bvs\.?\b': 2.0,
|
|
584
|
+
r'\bversus\b': 2.0,
|
|
585
|
+
r'\bor\b.*\bwhich\b': 2.0,
|
|
586
|
+
r'\bspecs?\b': 2.5,
|
|
587
|
+
r'\bspecifications?\b': 2.5,
|
|
588
|
+
r'\breview(s)?\b': 2.0,
|
|
589
|
+
r'\brating(s)?\b': 2.0,
|
|
590
|
+
r'\bunboxing\b': 2.5,
|
|
591
|
+
|
|
592
|
+
# German product comparison
|
|
593
|
+
r'\btest\b': 2.5,
|
|
594
|
+
r'\bbewertung(en)?\b': 2.5,
|
|
595
|
+
r'\btechnische daten\b': 3.0,
|
|
596
|
+
r'\bspezifikationen\b': 2.5,
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
RESEARCH_SIGNALS = {
|
|
600
|
+
# Explanation patterns (very strong)
|
|
601
|
+
r'\bhow does\b': 4.0,
|
|
602
|
+
r'\bhow do\b': 3.5,
|
|
603
|
+
r'\bwhy does\b': 4.0,
|
|
604
|
+
r'\bwhy do\b': 3.5,
|
|
605
|
+
r'\bwhy is\b': 3.5,
|
|
606
|
+
r'\bexplain\b': 4.0,
|
|
607
|
+
r'\bexplanation\b': 4.0,
|
|
608
|
+
r'\bwhat is\b': 3.0,
|
|
609
|
+
r'\bwhat are\b': 3.0,
|
|
610
|
+
r'\bdefine\b': 3.5,
|
|
611
|
+
r'\bdefinition of\b': 3.5,
|
|
612
|
+
r'\bmeaning of\b': 3.0,
|
|
613
|
+
|
|
614
|
+
# Analysis patterns (strong)
|
|
615
|
+
r'\banalyze\b': 3.5,
|
|
616
|
+
r'\banalysis\b': 3.5,
|
|
617
|
+
r'\bcompare\b(?!\s*prices?)': 3.0, # compare but not "compare prices"
|
|
618
|
+
r'\bcomparison\b': 3.0,
|
|
619
|
+
r'\bstatus of\b': 3.5,
|
|
620
|
+
r'\bstatus\b': 2.5,
|
|
621
|
+
r'\bwhat happened with\b': 4.0,
|
|
622
|
+
r'\bpros and cons\b': 4.0,
|
|
623
|
+
r'\badvantages?\b': 3.0,
|
|
624
|
+
r'\bdisadvantages?\b': 3.0,
|
|
625
|
+
r'\bbenefits?\b': 2.5,
|
|
626
|
+
r'\bdrawbacks?\b': 3.0,
|
|
627
|
+
r'\bdifference between\b': 3.5,
|
|
628
|
+
|
|
629
|
+
# Learning patterns
|
|
630
|
+
r'\bunderstand\b': 3.0,
|
|
631
|
+
r'\blearn(ing)?\b': 2.5,
|
|
632
|
+
r'\btutorial\b': 3.0,
|
|
633
|
+
r'\bguide\b': 2.5,
|
|
634
|
+
r'\bhow to\b': 2.0, # Lower weight - could be shopping too
|
|
635
|
+
r'\bstep by step\b': 3.0,
|
|
636
|
+
|
|
637
|
+
# Depth signals
|
|
638
|
+
r'\bin[- ]depth\b': 3.0,
|
|
639
|
+
r'\bdetailed\b': 2.5,
|
|
640
|
+
r'\bcomprehensive\b': 3.0,
|
|
641
|
+
r'\bthorough\b': 2.5,
|
|
642
|
+
r'\bdeep dive\b': 3.5,
|
|
643
|
+
r'\boverall\b': 2.0,
|
|
644
|
+
r'\bsummary\b': 2.0,
|
|
645
|
+
|
|
646
|
+
# Academic patterns
|
|
647
|
+
r'\bstudy\b': 2.5,
|
|
648
|
+
r'\bresearch shows\b': 3.5,
|
|
649
|
+
r'\baccording to\b': 2.5,
|
|
650
|
+
r'\bevidence\b': 3.0,
|
|
651
|
+
r'\bscientific\b': 3.0,
|
|
652
|
+
r'\bhistory of\b': 3.0,
|
|
653
|
+
r'\bbackground\b': 2.5,
|
|
654
|
+
r'\bcontext\b': 2.5,
|
|
655
|
+
r'\bimplications?\b': 3.0,
|
|
656
|
+
|
|
657
|
+
# German explanation patterns (sehr stark)
|
|
658
|
+
r'\bwie funktioniert\b': 4.0,
|
|
659
|
+
r'\bwarum\b': 3.5,
|
|
660
|
+
r'\berklär(en|ung)?\b': 4.0,
|
|
661
|
+
r'\bwas ist\b': 3.0,
|
|
662
|
+
r'\bwas sind\b': 3.0,
|
|
663
|
+
r'\bbedeutung\b': 3.0,
|
|
664
|
+
|
|
665
|
+
# German analysis patterns
|
|
666
|
+
r'\banalyse\b': 3.5,
|
|
667
|
+
r'\bvergleich(en)?\b': 3.0,
|
|
668
|
+
r'\bvor- und nachteile\b': 4.0,
|
|
669
|
+
r'\bvorteile\b': 3.0,
|
|
670
|
+
r'\bnachteile\b': 3.0,
|
|
671
|
+
r'\bunterschied(e)?\b': 3.5,
|
|
672
|
+
|
|
673
|
+
# German learning patterns
|
|
674
|
+
r'\bverstehen\b': 3.0,
|
|
675
|
+
r'\blernen\b': 2.5,
|
|
676
|
+
r'\banleitung\b': 3.0,
|
|
677
|
+
r'\bübersicht\b': 2.5,
|
|
678
|
+
r'\bhintergrund\b': 2.5,
|
|
679
|
+
r'\bzusammenfassung\b': 2.5,
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
DISCOVERY_SIGNALS = {
|
|
683
|
+
# Similarity patterns (very strong)
|
|
684
|
+
r'\bsimilar to\b': 5.0,
|
|
685
|
+
r'\blike\s+\w+\.com': 4.5, # "like notion.com"
|
|
686
|
+
r'\balternatives? to\b': 5.0,
|
|
687
|
+
r'\bcompetitors? (of|to)\b': 4.5,
|
|
688
|
+
r'\bcompeting with\b': 4.0,
|
|
689
|
+
r'\brivals? (of|to)\b': 4.0,
|
|
690
|
+
r'\binstead of\b': 3.0,
|
|
691
|
+
r'\breplacement for\b': 3.5,
|
|
692
|
+
|
|
693
|
+
# Company/startup patterns (strong)
|
|
694
|
+
r'\bcompanies (like|that|doing|building)\b': 4.5,
|
|
695
|
+
r'\bstartups? (like|that|doing|building)\b': 4.5,
|
|
696
|
+
r'\bwho else\b': 4.0,
|
|
697
|
+
r'\bother (companies|startups|tools|apps)\b': 3.5,
|
|
698
|
+
r'\bfind (companies|startups|tools|examples?)\b': 4.5,
|
|
699
|
+
r'\bevents? in\b': 4.0,
|
|
700
|
+
r'\bthings to do in\b': 4.5,
|
|
701
|
+
|
|
702
|
+
# Funding/business patterns
|
|
703
|
+
r'\bseries [a-d]\b': 4.0,
|
|
704
|
+
r'\byc\b|y combinator': 4.0,
|
|
705
|
+
r'\bfund(ed|ing|raise)\b': 3.5,
|
|
706
|
+
r'\bventure\b': 3.0,
|
|
707
|
+
r'\bvaluation\b': 3.0,
|
|
708
|
+
|
|
709
|
+
# Category patterns
|
|
710
|
+
r'\bresearch papers? (on|about)\b': 4.0,
|
|
711
|
+
r'\barxiv\b': 4.5,
|
|
712
|
+
r'\bgithub (projects?|repos?)\b': 4.5,
|
|
713
|
+
r'\bopen source\b.*\bprojects?\b': 4.0,
|
|
714
|
+
r'\btweets? (about|on)\b': 3.5,
|
|
715
|
+
r'\bblogs? (about|on|like)\b': 3.0,
|
|
716
|
+
|
|
717
|
+
# URL detection (very strong signal for Exa similar)
|
|
718
|
+
r'https?://[^\s]+': 5.0,
|
|
719
|
+
r'\b\w+\.(com|org|io|ai|co|dev)\b': 3.5,
|
|
720
|
+
}
|
|
721
|
+
|
|
722
|
+
LOCAL_NEWS_SIGNALS = {
|
|
723
|
+
# Local patterns → Serper
|
|
724
|
+
r'\bnear me\b': 4.0,
|
|
725
|
+
r'\bnearby\b': 3.5,
|
|
726
|
+
r'\blocal\b': 3.0,
|
|
727
|
+
r'\bin (my )?(city|area|town|neighborhood)\b': 3.5,
|
|
728
|
+
r'\brestaurants?\b': 2.5,
|
|
729
|
+
r'\bhotels?\b': 2.5,
|
|
730
|
+
r'\bcafes?\b': 2.5,
|
|
731
|
+
r'\bstores?\b': 2.0,
|
|
732
|
+
r'\bdirections? to\b': 3.5,
|
|
733
|
+
r'\bmap of\b': 3.0,
|
|
734
|
+
r'\bphone number\b': 3.0,
|
|
735
|
+
r'\baddress of\b': 3.0,
|
|
736
|
+
r'\bopen(ing)? hours\b': 3.0,
|
|
737
|
+
|
|
738
|
+
# Weather/time
|
|
739
|
+
r'\bweather\b': 4.0,
|
|
740
|
+
r'\bforecast\b': 3.5,
|
|
741
|
+
r'\btemperature\b': 3.0,
|
|
742
|
+
r'\btime in\b': 3.0,
|
|
743
|
+
|
|
744
|
+
# News/recency patterns → Serper (or Tavily for news depth)
|
|
745
|
+
r'\blatest\b': 2.5,
|
|
746
|
+
r'\brecent\b': 2.5,
|
|
747
|
+
r'\btoday\b': 2.5,
|
|
748
|
+
r'\bbreaking\b': 3.5,
|
|
749
|
+
r'\bnews\b': 2.5,
|
|
750
|
+
r'\bheadlines?\b': 3.0,
|
|
751
|
+
r'\b202[4-9]\b': 2.0, # Current year mentions
|
|
752
|
+
r'\blast (week|month|year)\b': 2.0,
|
|
753
|
+
}
|
|
754
|
+
|
|
755
|
+
# RAG/AI signals → You.com
|
|
756
|
+
# You.com excels at providing LLM-ready snippets and combined web+news
|
|
757
|
+
RAG_SIGNALS = {
|
|
758
|
+
# RAG/context patterns (strong signal for You.com)
|
|
759
|
+
r'\brag\b': 4.5,
|
|
760
|
+
r'\bcontext for\b': 4.0,
|
|
761
|
+
r'\bsummarize\b': 3.5,
|
|
762
|
+
r'\bbrief(ly)?\b': 3.0,
|
|
763
|
+
r'\bquick overview\b': 3.5,
|
|
764
|
+
r'\btl;?dr\b': 4.0,
|
|
765
|
+
r'\bkey (points|facts|info)\b': 3.5,
|
|
766
|
+
r'\bmain (points|takeaways)\b': 3.5,
|
|
767
|
+
|
|
768
|
+
# Combined web + news queries
|
|
769
|
+
r'\b(web|online)\s+and\s+news\b': 4.0,
|
|
770
|
+
r'\ball sources\b': 3.5,
|
|
771
|
+
r'\bcomprehensive (search|overview)\b': 3.5,
|
|
772
|
+
r'\blatest\s+(news|updates)\b': 3.0,
|
|
773
|
+
r'\bcurrent (events|situation|status)\b': 3.5,
|
|
774
|
+
|
|
775
|
+
# Real-time information needs
|
|
776
|
+
r'\bright now\b': 3.0,
|
|
777
|
+
r'\bas of today\b': 3.5,
|
|
778
|
+
r'\bup.to.date\b': 3.5,
|
|
779
|
+
r'\breal.time\b': 4.0,
|
|
780
|
+
r'\blive\b': 2.5,
|
|
781
|
+
|
|
782
|
+
# Information synthesis
|
|
783
|
+
r'\bwhat\'?s happening with\b': 3.5,
|
|
784
|
+
r'\bwhat\'?s the latest\b': 4.0,
|
|
785
|
+
r'\bupdates?\s+on\b': 3.5,
|
|
786
|
+
r'\bstatus of\b': 3.0,
|
|
787
|
+
r'\bsituation (in|with|around)\b': 3.5,
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
# Direct answer / synthesis signals → Perplexity via Kilo Gateway
|
|
791
|
+
DIRECT_ANSWER_SIGNALS = {
|
|
792
|
+
r'\bwhat is\b': 3.0,
|
|
793
|
+
r'\bwhat are\b': 2.5,
|
|
794
|
+
r'\bcurrent status\b': 4.0,
|
|
795
|
+
r'\bstatus of\b': 3.5,
|
|
796
|
+
r'\bstatus\b': 2.5,
|
|
797
|
+
r'\bwhat happened with\b': 4.0,
|
|
798
|
+
r"\bwhat'?s happening with\b": 4.0,
|
|
799
|
+
r'\bas of (today|now)\b': 4.0,
|
|
800
|
+
r'\bthis weekend\b': 3.5,
|
|
801
|
+
r'\bevents? in\b': 3.5,
|
|
802
|
+
r'\bthings to do in\b': 4.0,
|
|
803
|
+
r'\bnear me\b': 3.0,
|
|
804
|
+
r'\bcan you (tell me|summarize|explain)\b': 3.5,
|
|
805
|
+
}
|
|
806
|
+
|
|
807
|
+
# Privacy/Multi-source signals → SearXNG (self-hosted meta-search)
|
|
808
|
+
# SearXNG is ideal for privacy-focused queries and aggregating multiple sources
|
|
809
|
+
PRIVACY_SIGNALS = {
|
|
810
|
+
# Privacy signals (very strong)
|
|
811
|
+
r'\bprivate(ly)?\b': 4.0,
|
|
812
|
+
r'\banonymous(ly)?\b': 4.0,
|
|
813
|
+
r'\bwithout tracking\b': 4.5,
|
|
814
|
+
r'\bno track(ing)?\b': 4.5,
|
|
815
|
+
r'\bprivacy\b': 3.5,
|
|
816
|
+
r'\bprivacy.?focused\b': 4.5,
|
|
817
|
+
r'\bprivacy.?first\b': 4.5,
|
|
818
|
+
r'\bduckduckgo alternative\b': 4.5,
|
|
819
|
+
r'\bprivate search\b': 5.0,
|
|
820
|
+
|
|
821
|
+
# German privacy signals
|
|
822
|
+
r'\bprivat\b': 4.0,
|
|
823
|
+
r'\banonym\b': 4.0,
|
|
824
|
+
r'\bohne tracking\b': 4.5,
|
|
825
|
+
r'\bdatenschutz\b': 4.0,
|
|
826
|
+
|
|
827
|
+
# Multi-source aggregation signals
|
|
828
|
+
r'\baggregate results?\b': 4.0,
|
|
829
|
+
r'\bmultiple sources?\b': 4.0,
|
|
830
|
+
r'\bdiverse (results|perspectives|sources)\b': 4.0,
|
|
831
|
+
r'\bfrom (all|multiple|different) (engines?|sources?)\b': 4.5,
|
|
832
|
+
r'\bmeta.?search\b': 5.0,
|
|
833
|
+
r'\ball engines?\b': 4.0,
|
|
834
|
+
|
|
835
|
+
# German multi-source signals
|
|
836
|
+
r'\bverschiedene quellen\b': 4.0,
|
|
837
|
+
r'\baus mehreren quellen\b': 4.0,
|
|
838
|
+
r'\balle suchmaschinen\b': 4.5,
|
|
839
|
+
|
|
840
|
+
# Budget/free signals (SearXNG is self-hosted = $0 API cost)
|
|
841
|
+
r'\bfree search\b': 3.5,
|
|
842
|
+
r'\bno api cost\b': 4.0,
|
|
843
|
+
r'\bself.?hosted search\b': 5.0,
|
|
844
|
+
r'\bzero cost\b': 3.5,
|
|
845
|
+
r'\bbudget\b(?!\s*(laptop|phone|option))\b': 2.5, # "budget" alone, not "budget laptop"
|
|
846
|
+
|
|
847
|
+
# German budget signals
|
|
848
|
+
r'\bkostenlos(e)?\s+suche\b': 3.5,
|
|
849
|
+
r'\bkeine api.?kosten\b': 4.0,
|
|
850
|
+
}
|
|
851
|
+
|
|
852
|
+
# Brand/product patterns for shopping detection
|
|
853
|
+
BRAND_PATTERNS = [
|
|
854
|
+
# Tech brands
|
|
855
|
+
r'\b(apple|iphone|ipad|macbook|airpods?)\b',
|
|
856
|
+
r'\b(samsung|galaxy)\b',
|
|
857
|
+
r'\b(google|pixel)\b',
|
|
858
|
+
r'\b(microsoft|surface|xbox)\b',
|
|
859
|
+
r'\b(sony|playstation)\b',
|
|
860
|
+
r'\b(nvidia|geforce|rtx)\b',
|
|
861
|
+
r'\b(amd|ryzen|radeon)\b',
|
|
862
|
+
r'\b(intel|core i[3579])\b',
|
|
863
|
+
r'\b(dell|hp|lenovo|asus|acer)\b',
|
|
864
|
+
r'\b(lg|tcl|hisense)\b',
|
|
865
|
+
|
|
866
|
+
# Product categories
|
|
867
|
+
r'\b(laptop|phone|tablet|tv|monitor|headphones?|earbuds?)\b',
|
|
868
|
+
r'\b(camera|lens|drone)\b',
|
|
869
|
+
r'\b(watch|smartwatch|fitbit|garmin)\b',
|
|
870
|
+
r'\b(router|modem|wifi)\b',
|
|
871
|
+
r'\b(keyboard|mouse|gaming)\b',
|
|
872
|
+
]
|
|
873
|
+
|
|
874
|
+
def __init__(self, config: Dict[str, Any]):
    """Keep the full config and pick out its auto-routing section.

    When *config* has no "auto_routing" key, the "auto_routing" entry of
    the module-level DEFAULT_CONFIG is used instead.
    """
    default_routing = DEFAULT_CONFIG["auto_routing"]
    self.config = config
    self.auto_config = config.get("auto_routing", default_routing)
|
|
877
|
+
|
|
878
|
+
def _calculate_signal_score(
|
|
879
|
+
self,
|
|
880
|
+
query: str,
|
|
881
|
+
signals: Dict[str, float]
|
|
882
|
+
) -> Tuple[float, List[Dict[str, Any]]]:
|
|
883
|
+
"""
|
|
884
|
+
Calculate score for a signal category.
|
|
885
|
+
Returns (total_score, list of matched signals with details).
|
|
886
|
+
"""
|
|
887
|
+
query_lower = query.lower()
|
|
888
|
+
matches = []
|
|
889
|
+
total_score = 0.0
|
|
890
|
+
|
|
891
|
+
for pattern, weight in signals.items():
|
|
892
|
+
regex = re.compile(pattern, re.IGNORECASE)
|
|
893
|
+
found = regex.findall(query_lower)
|
|
894
|
+
if found:
|
|
895
|
+
# Normalize found matches
|
|
896
|
+
match_text = found[0] if isinstance(found[0], str) else found[0][0] if found[0] else pattern
|
|
897
|
+
matches.append({
|
|
898
|
+
"pattern": pattern,
|
|
899
|
+
"matched": match_text,
|
|
900
|
+
"weight": weight
|
|
901
|
+
})
|
|
902
|
+
total_score += weight
|
|
903
|
+
|
|
904
|
+
return total_score, matches
|
|
905
|
+
|
|
906
|
+
def _detect_product_brand_combo(self, query: str) -> float:
|
|
907
|
+
"""
|
|
908
|
+
Detect product + brand combinations which strongly indicate shopping intent.
|
|
909
|
+
Returns a bonus score.
|
|
910
|
+
"""
|
|
911
|
+
query_lower = query.lower()
|
|
912
|
+
brand_found = False
|
|
913
|
+
product_found = False
|
|
914
|
+
|
|
915
|
+
for pattern in self.BRAND_PATTERNS:
|
|
916
|
+
if re.search(pattern, query_lower, re.IGNORECASE):
|
|
917
|
+
brand_found = True
|
|
918
|
+
break
|
|
919
|
+
|
|
920
|
+
# Check for product indicators
|
|
921
|
+
product_indicators = [
|
|
922
|
+
r'\b(buy|price|specs?|review|vs|compare)\b',
|
|
923
|
+
r'\b(pro|max|plus|mini|ultra|lite)\b', # Product tier names
|
|
924
|
+
r'\b\d+\s*(gb|tb|inch|mm|hz)\b', # Specifications
|
|
925
|
+
]
|
|
926
|
+
for pattern in product_indicators:
|
|
927
|
+
if re.search(pattern, query_lower, re.IGNORECASE):
|
|
928
|
+
product_found = True
|
|
929
|
+
break
|
|
930
|
+
|
|
931
|
+
if brand_found and product_found:
|
|
932
|
+
return 3.0 # Strong shopping signal
|
|
933
|
+
elif brand_found:
|
|
934
|
+
return 1.5 # Moderate shopping signal
|
|
935
|
+
return 0.0
|
|
936
|
+
|
|
937
|
+
def _detect_url(self, query: str) -> Optional[str]:
|
|
938
|
+
"""Detect URLs in query - strong signal for Exa similar search."""
|
|
939
|
+
url_pattern = r'https?://[^\s]+'
|
|
940
|
+
match = re.search(url_pattern, query)
|
|
941
|
+
if match:
|
|
942
|
+
return match.group()
|
|
943
|
+
|
|
944
|
+
# Also check for domain-like patterns
|
|
945
|
+
domain_pattern = r'\b(\w+\.(com|org|io|ai|co|dev|net|app))\b'
|
|
946
|
+
match = re.search(domain_pattern, query, re.IGNORECASE)
|
|
947
|
+
if match:
|
|
948
|
+
return match.group()
|
|
949
|
+
|
|
950
|
+
return None
|
|
951
|
+
|
|
952
|
+
def _assess_query_complexity(self, query: str) -> Dict[str, Any]:
|
|
953
|
+
"""
|
|
954
|
+
Assess query complexity - complex queries favor Tavily.
|
|
955
|
+
"""
|
|
956
|
+
words = query.split()
|
|
957
|
+
word_count = len(words)
|
|
958
|
+
|
|
959
|
+
# Count question words
|
|
960
|
+
question_words = len(re.findall(
|
|
961
|
+
r'\b(what|why|how|when|where|which|who|whose|whom)\b',
|
|
962
|
+
query, re.IGNORECASE
|
|
963
|
+
))
|
|
964
|
+
|
|
965
|
+
# Check for multiple clauses
|
|
966
|
+
clause_markers = len(re.findall(
|
|
967
|
+
r'\b(and|but|or|because|since|while|although|if|when)\b',
|
|
968
|
+
query, re.IGNORECASE
|
|
969
|
+
))
|
|
970
|
+
|
|
971
|
+
complexity_score = 0.0
|
|
972
|
+
if word_count > 10:
|
|
973
|
+
complexity_score += 1.5
|
|
974
|
+
if word_count > 20:
|
|
975
|
+
complexity_score += 1.0
|
|
976
|
+
if question_words > 1:
|
|
977
|
+
complexity_score += 1.0
|
|
978
|
+
if clause_markers > 0:
|
|
979
|
+
complexity_score += 0.5 * clause_markers
|
|
980
|
+
|
|
981
|
+
return {
|
|
982
|
+
"word_count": word_count,
|
|
983
|
+
"question_words": question_words,
|
|
984
|
+
"clause_markers": clause_markers,
|
|
985
|
+
"complexity_score": complexity_score,
|
|
986
|
+
"is_complex": complexity_score > 2.0
|
|
987
|
+
}
|
|
988
|
+
|
|
989
|
+
def _detect_recency_intent(self, query: str) -> Tuple[bool, float]:
|
|
990
|
+
"""
|
|
991
|
+
Detect if query wants recent/timely information.
|
|
992
|
+
Returns (is_recency_focused, score).
|
|
993
|
+
"""
|
|
994
|
+
recency_patterns = [
|
|
995
|
+
(r'\b(latest|newest|recent|current)\b', 2.5),
|
|
996
|
+
(r'\b(today|yesterday|this week|this month)\b', 3.0),
|
|
997
|
+
(r'\b(202[4-9]|2030)\b', 2.0),
|
|
998
|
+
(r'\b(breaking|live|just|now)\b', 3.0),
|
|
999
|
+
(r'\blast (hour|day|week|month)\b', 2.5),
|
|
1000
|
+
]
|
|
1001
|
+
|
|
1002
|
+
total = 0.0
|
|
1003
|
+
for pattern, weight in recency_patterns:
|
|
1004
|
+
if re.search(pattern, query, re.IGNORECASE):
|
|
1005
|
+
total += weight
|
|
1006
|
+
|
|
1007
|
+
return total > 2.0, total
|
|
1008
|
+
|
|
1009
|
+
def analyze(self, query: str) -> Dict[str, Any]:
|
|
1010
|
+
"""
|
|
1011
|
+
Perform comprehensive query analysis.
|
|
1012
|
+
Returns detailed analysis with scores for each provider.
|
|
1013
|
+
"""
|
|
1014
|
+
# Calculate scores for each intent category
|
|
1015
|
+
shopping_score, shopping_matches = self._calculate_signal_score(
|
|
1016
|
+
query, self.SHOPPING_SIGNALS
|
|
1017
|
+
)
|
|
1018
|
+
research_score, research_matches = self._calculate_signal_score(
|
|
1019
|
+
query, self.RESEARCH_SIGNALS
|
|
1020
|
+
)
|
|
1021
|
+
discovery_score, discovery_matches = self._calculate_signal_score(
|
|
1022
|
+
query, self.DISCOVERY_SIGNALS
|
|
1023
|
+
)
|
|
1024
|
+
local_news_score, local_news_matches = self._calculate_signal_score(
|
|
1025
|
+
query, self.LOCAL_NEWS_SIGNALS
|
|
1026
|
+
)
|
|
1027
|
+
rag_score, rag_matches = self._calculate_signal_score(
|
|
1028
|
+
query, self.RAG_SIGNALS
|
|
1029
|
+
)
|
|
1030
|
+
privacy_score, privacy_matches = self._calculate_signal_score(
|
|
1031
|
+
query, self.PRIVACY_SIGNALS
|
|
1032
|
+
)
|
|
1033
|
+
direct_answer_score, direct_answer_matches = self._calculate_signal_score(
|
|
1034
|
+
query, self.DIRECT_ANSWER_SIGNALS
|
|
1035
|
+
)
|
|
1036
|
+
|
|
1037
|
+
# Apply product/brand bonus to shopping
|
|
1038
|
+
brand_bonus = self._detect_product_brand_combo(query)
|
|
1039
|
+
if brand_bonus > 0:
|
|
1040
|
+
shopping_score += brand_bonus
|
|
1041
|
+
shopping_matches.append({
|
|
1042
|
+
"pattern": "product_brand_combo",
|
|
1043
|
+
"matched": "brand + product detected",
|
|
1044
|
+
"weight": brand_bonus
|
|
1045
|
+
})
|
|
1046
|
+
|
|
1047
|
+
# Detect URL → strong Exa signal
|
|
1048
|
+
detected_url = self._detect_url(query)
|
|
1049
|
+
if detected_url:
|
|
1050
|
+
discovery_score += 5.0
|
|
1051
|
+
discovery_matches.append({
|
|
1052
|
+
"pattern": "url_detected",
|
|
1053
|
+
"matched": detected_url,
|
|
1054
|
+
"weight": 5.0
|
|
1055
|
+
})
|
|
1056
|
+
|
|
1057
|
+
# Assess complexity → favors Tavily
|
|
1058
|
+
complexity = self._assess_query_complexity(query)
|
|
1059
|
+
if complexity["is_complex"]:
|
|
1060
|
+
research_score += complexity["complexity_score"]
|
|
1061
|
+
research_matches.append({
|
|
1062
|
+
"pattern": "query_complexity",
|
|
1063
|
+
"matched": f"complex query ({complexity['word_count']} words)",
|
|
1064
|
+
"weight": complexity["complexity_score"]
|
|
1065
|
+
})
|
|
1066
|
+
|
|
1067
|
+
# Check recency intent
|
|
1068
|
+
is_recency, recency_score = self._detect_recency_intent(query)
|
|
1069
|
+
|
|
1070
|
+
# Map intents to providers with final scores
|
|
1071
|
+
provider_scores = {
|
|
1072
|
+
"serper": shopping_score + local_news_score + (recency_score * 0.35),
|
|
1073
|
+
"tavily": research_score + (complexity["complexity_score"] if not complexity["is_complex"] else 0) + (0.2 * recency_score),
|
|
1074
|
+
"exa": discovery_score + (1.0 if re.search(r"\b(similar|alternatives?|examples?)\b", query, re.IGNORECASE) else 0.0),
|
|
1075
|
+
"perplexity": direct_answer_score + (local_news_score * 0.4) + (recency_score * 0.55),
|
|
1076
|
+
"you": rag_score + (recency_score * 0.25), # You.com good for real-time + RAG
|
|
1077
|
+
"searxng": privacy_score, # SearXNG for privacy/multi-source queries
|
|
1078
|
+
}
|
|
1079
|
+
|
|
1080
|
+
# Build match details per provider
|
|
1081
|
+
provider_matches = {
|
|
1082
|
+
"serper": shopping_matches + local_news_matches,
|
|
1083
|
+
"tavily": research_matches,
|
|
1084
|
+
"exa": discovery_matches,
|
|
1085
|
+
"perplexity": direct_answer_matches,
|
|
1086
|
+
"you": rag_matches,
|
|
1087
|
+
"searxng": privacy_matches,
|
|
1088
|
+
}
|
|
1089
|
+
|
|
1090
|
+
return {
|
|
1091
|
+
"query": query,
|
|
1092
|
+
"provider_scores": provider_scores,
|
|
1093
|
+
"provider_matches": provider_matches,
|
|
1094
|
+
"detected_url": detected_url,
|
|
1095
|
+
"complexity": complexity,
|
|
1096
|
+
"recency_focused": is_recency,
|
|
1097
|
+
"recency_score": recency_score,
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
def route(self, query: str) -> Dict[str, Any]:
    """
    Route query to optimal provider with confidence scoring.

    Providers listed in the "disabled_providers" setting, or for which
    get_env_key() returns a falsy value, are excluded. Among the rest the
    highest analysis score wins; ties follow the "provider_priority"
    setting.

    Returns:
        When no provider is available: a dict naming the configured
        "fallback_provider" (default "serper") with confidence 0.0,
        reason "no_available_providers", the raw scores and the full
        analysis.
        Otherwise: a dict with "provider", "confidence" (0-1, rounded to
        3 places), "confidence_level", "reason", rounded "scores",
        "winning_score", up to five "top_signals", "below_threshold" and
        "analysis_summary".
    """
    analysis = self.analyze(query)
    scores = analysis["provider_scores"]

    # Filter to available providers
    disabled = set(self.auto_config.get("disabled_providers", []))
    available = {
        p: s for p, s in scores.items()
        if p not in disabled and get_env_key(p)
    }

    if not available:
        # No providers available, use fallback
        fallback = self.auto_config.get("fallback_provider", "serper")
        return {
            "provider": fallback,
            "confidence": 0.0,
            "confidence_level": "low",
            "reason": "no_available_providers",
            "scores": scores,
            "top_signals": [],
            "analysis": analysis,
        }

    # Find the winner; ties are broken by the configured priority order,
    # falling back to the first tied provider when none is prioritised.
    max_score = max(available.values())
    priority = self.auto_config.get(
        "provider_priority",
        ["tavily", "exa", "perplexity", "serper", "you", "searxng"],
    )
    winners = [p for p, s in available.items() if s == max_score]
    winner = next((p for p in priority if p in winners), winners[0])

    if max_score == 0:
        confidence = 0.0
        reason = "no_signals_matched"
    else:
        # Confidence blends absolute strength with the lead over the
        # runner-up (a lone provider has a full margin).
        ranked = sorted(available.values(), reverse=True)
        second_best = ranked[1] if len(ranked) > 1 else 0
        margin = (max_score - second_best) / max_score if max_score > 0 else 0

        # Normalize score to 0-1 range (assuming max reasonable score ~15)
        normalized_score = min(max_score / 15.0, 1.0)
        confidence = round((normalized_score * 0.6 + margin * 0.4), 3)

        if confidence >= 0.7:
            reason = "high_confidence_match"
        elif confidence >= 0.4:
            reason = "moderate_confidence_match"
        else:
            reason = "low_confidence_match"

    # Strongest signals for the winning provider.
    matches = analysis["provider_matches"].get(winner, [])
    top_signals = sorted(matches, key=lambda x: x["weight"], reverse=True)[:5]

    # NOTE: a detected URL is not a hard override towards Exa here; it only
    # influences the outcome through the +5.0 discovery bonus in analyze().

    threshold = self.auto_config.get("confidence_threshold", 0.3)

    return {
        "provider": winner,
        "confidence": confidence,
        "confidence_level": "high" if confidence >= 0.7 else "medium" if confidence >= 0.4 else "low",
        "reason": reason,
        "scores": {p: round(s, 2) for p, s in available.items()},
        "winning_score": round(max_score, 2),
        "top_signals": [
            {"matched": s["matched"], "weight": s["weight"]}
            for s in top_signals
        ],
        "below_threshold": confidence < threshold,
        "analysis_summary": {
            "query_length": len(query.split()),
            "is_complex": analysis["complexity"]["is_complex"],
            "has_url": analysis["detected_url"] is not None,
            "recency_focused": analysis["recency_focused"],
        }
    }
|
|
1204
|
+
|
|
1205
|
+
|
|
1206
|
+
def auto_route_provider(query: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Pick the best provider for *query*.

    Builds a QueryAnalyzer from *config* and returns its route() result
    unchanged.
    """
    return QueryAnalyzer(config).route(query)
|
|
1213
|
+
|
|
1214
|
+
|
|
1215
|
+
def explain_routing(query: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a debugging report for the routing decision on *query*.

    Combines QueryAnalyzer.analyze() and QueryAnalyzer.route() output: the
    decision, scores, top signals, per-provider match counts, query
    statistics, all non-empty match lists and the providers currently
    usable (key present and not disabled).
    """
    analyzer = QueryAnalyzer(config)
    analysis = analyzer.analyze(query)
    routing = analyzer.route(query)

    per_provider = analysis["provider_matches"]
    complexity = analysis["complexity"]

    decision = {
        key: routing[key]
        for key in ("provider", "confidence", "confidence_level", "reason")
    }

    intent_breakdown = {
        "shopping_signals": len(per_provider["serper"]),
        "research_signals": len(per_provider["tavily"]),
        "discovery_signals": len(per_provider["exa"]),
        "rag_signals": len(per_provider["you"]),
    }

    query_analysis = {
        "word_count": complexity["word_count"],
        "is_complex": complexity["is_complex"],
        "complexity_score": round(complexity["complexity_score"], 2),
        "has_url": analysis["detected_url"],
        "recency_focused": analysis["recency_focused"],
    }

    # Providers without any match are left out of the listing.
    all_matches = {}
    for provider, matches in per_provider.items():
        if not matches:
            continue
        all_matches[provider] = [
            {"matched": m["matched"], "weight": m["weight"]}
            for m in matches
        ]

    available_providers = [
        p for p in ["serper", "tavily", "exa", "perplexity", "you", "searxng"]
        if get_env_key(p) and p not in config.get("auto_routing", {}).get("disabled_providers", [])
    ]

    return {
        "query": query,
        "routing_decision": decision,
        "scores": routing["scores"],
        "top_signals": routing["top_signals"],
        "intent_breakdown": intent_breakdown,
        "query_analysis": query_analysis,
        "all_matches": all_matches,
        "available_providers": available_providers,
    }
|
|
1259
|
+
|
|
1260
|
+
|
|
1261
|
+
|
|
1262
|
+
|
|
1263
|
+
class ProviderRequestError(Exception):
    """Provider call failure carrying the HTTP status and a retry hint."""

    def __init__(self, message: str, status_code: Optional[int] = None, transient: bool = False):
        super().__init__(message)
        # True when the failure is considered temporary by the raiser.
        self.transient = transient
        # HTTP status of the failed response, or None when there was none.
        self.status_code = status_code
|
|
1270
|
+
|
|
1271
|
+
|
|
1272
|
+
# HTTP status codes that make_request reports as transient failures.
TRANSIENT_HTTP_CODES = {429, 503}
# Cooldown length chosen by consecutive failure count in
# mark_provider_failure; counts beyond the list reuse the last entry.
COOLDOWN_STEPS_SECONDS = [60, 300, 1500, 3600]  # 1m -> 5m -> 25m -> 1h cap
# NOTE(review): presumably the waits (seconds) between retry attempts; the
# consumer is not visible in this part of the file - confirm at the call site.
RETRY_BACKOFF_SECONDS = [1, 3, 9]
|
|
1275
|
+
|
|
1276
|
+
|
|
1277
|
+
def _ensure_parent(path: Path) -> None:
    """Create the directory containing *path*, including missing ancestors.

    Does nothing when the directory already exists.
    """
    directory = path.parent
    directory.mkdir(parents=True, exist_ok=True)
|
|
1279
|
+
|
|
1280
|
+
|
|
1281
|
+
def _load_provider_health() -> Dict[str, Any]:
    """Read the provider health state from PROVIDER_HEALTH_FILE.

    Returns:
        The parsed JSON object, or an empty dict when the file is missing,
        unreadable, not valid UTF-8/JSON, or does not hold a JSON object.
    """
    try:
        with open(PROVIDER_HEALTH_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file (no separate exists()
        # check needed). ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for a file with invalid UTF-8 bytes.
        return {}
    return data if isinstance(data, dict) else {}
|
|
1290
|
+
|
|
1291
|
+
|
|
1292
|
+
def _save_provider_health(state: Dict[str, Any]) -> None:
    """Write *state* to PROVIDER_HEALTH_FILE as indented UTF-8 JSON.

    The parent directory is created first when it does not exist yet.
    """
    PROVIDER_HEALTH_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(PROVIDER_HEALTH_FILE, "w", encoding="utf-8") as handle:
        json.dump(state, handle, ensure_ascii=False, indent=2)
|
|
1296
|
+
|
|
1297
|
+
|
|
1298
|
+
def provider_in_cooldown(provider: str) -> Tuple[bool, int]:
    """Report whether *provider* is still cooling down.

    Returns (in_cooldown, seconds_remaining); seconds_remaining is 0 when
    the provider has no recorded cooldown or it has already expired.
    """
    entry = _load_provider_health().get(provider, {})
    # A missing or null "cooldown_until" counts as 0 (no cooldown).
    until = int(entry.get("cooldown_until", 0) or 0)
    seconds_left = until - int(time.time())
    if seconds_left > 0:
        return (True, seconds_left)
    return (False, 0)
|
|
1304
|
+
|
|
1305
|
+
|
|
1306
|
+
def mark_provider_failure(provider: str, error_message: str) -> Dict[str, Any]:
    """Record one more failure for *provider* and start its cooldown.

    The cooldown length is taken from COOLDOWN_STEPS_SECONDS by failure
    count, staying at the last step once the list is exhausted. The updated
    state is saved and the provider's new entry is returned.
    """
    health = _load_provider_health()
    timestamp = int(time.time())

    previous = health.get(provider, {})
    failures = int(previous.get("failure_count", 0)) + 1
    step = min(failures, len(COOLDOWN_STEPS_SECONDS)) - 1
    wait = COOLDOWN_STEPS_SECONDS[step]

    entry = {
        "failure_count": failures,
        "cooldown_until": timestamp + wait,
        "cooldown_seconds": wait,
        "last_error": error_message,
        "last_failure_at": timestamp,
    }
    health[provider] = entry
    _save_provider_health(health)
    return entry
|
|
1321
|
+
|
|
1322
|
+
|
|
1323
|
+
def reset_provider_health(provider: str) -> None:
    """Forget any recorded failures/cooldown for *provider*.

    The health file is rewritten only when the provider had an entry.
    """
    health = _load_provider_health()
    if provider not in health:
        return
    del health[provider]
    _save_provider_health(health)
|
|
1328
|
+
|
|
1329
|
+
|
|
1330
|
+
def normalize_result_url(url: str) -> str:
    """Reduce a result URL to a canonical key for duplicate detection.

    Scheme, a leading "www.", trailing slashes in the path and the
    fragment are dropped, and the host is lower-cased. The query string is
    kept, minus common tracking parameters and with the remaining
    parameters sorted, so that pages which differ only by query (for
    example ``/watch?v=a`` vs ``/watch?v=b``) are not mistaken for
    duplicates.

    Args:
        url: URL as returned by a provider; may be empty.

    Returns:
        "host/path" or "host/path?query"; "" for empty input.
    """
    # Local import: keeps the block self-contained whatever the file header
    # imports from urllib.parse.
    from urllib.parse import parse_qsl, urlencode, urlparse

    if not url:
        return ""
    parsed = urlparse(url.strip())
    netloc = (parsed.netloc or "").lower()
    if netloc.startswith("www."):
        netloc = netloc[4:]
    path = parsed.path.rstrip("/")

    tracking_keys = {"fbclid", "gclid"}
    params = sorted(
        (key, value)
        for key, value in parse_qsl(parsed.query, keep_blank_values=True)
        if not key.lower().startswith("utm_") and key.lower() not in tracking_keys
    )
    query = urlencode(params)
    base = f"{netloc}{path}"
    return f"{base}?{query}" if query else base
|
|
1339
|
+
|
|
1340
|
+
|
|
1341
|
+
def deduplicate_results_across_providers(results_by_provider: List[Tuple[str, Dict[str, Any]]], max_results: int) -> Tuple[List[Dict[str, Any]], int]:
    """Merge provider result lists, dropping repeated URLs.

    Providers are processed in the given order, so the first provider to
    return a URL keeps it. URLs are compared via normalize_result_url();
    items whose normalized URL is empty are never treated as duplicates.
    Each kept item is a copy, tagged with its provider name unless it
    already has a "provider" key. Collection stops as soon as
    *max_results* items have been gathered.

    Returns:
        (kept_items, number_of_duplicates_dropped)
    """
    unique_items = []
    known_keys = set()
    dropped = 0

    for source, payload in results_by_provider:
        for entry in payload.get("results", []):
            key = normalize_result_url(entry.get("url", ""))
            if key:
                if key in known_keys:
                    dropped += 1
                    continue
                known_keys.add(key)
            tagged = entry.copy()
            tagged.setdefault("provider", source)
            unique_items.append(tagged)
            if len(unique_items) >= max_results:
                return unique_items, dropped

    return unique_items, dropped
|
|
1359
|
+
|
|
1360
|
+
# =============================================================================
|
|
1361
|
+
# HTTP Client
|
|
1362
|
+
# =============================================================================
|
|
1363
|
+
|
|
1364
|
+
def make_request(url: str, headers: dict, body: dict, timeout: int = 30) -> dict:
    """Make HTTP POST request and return JSON response.

    Args:
        url: Endpoint to POST to.
        headers: Request headers; a default User-Agent is added to the
            outgoing request when none is given. The dict itself is not
            modified.
        body: JSON-serialisable request payload.
        timeout: Socket timeout in seconds.

    Returns:
        The decoded JSON response body.

    Raises:
        ProviderRequestError: On HTTP error statuses, network errors and
            timeouts. ``transient`` is set for statuses in
            TRANSIENT_HTTP_CODES and for timeouts.
    """
    # Copy so the caller's dict is left untouched. A User-Agent is required
    # by some APIs (e.g. Exa behind Cloudflare).
    request_headers = dict(headers)
    request_headers.setdefault("User-Agent", "ClawdBot-WebSearchPlus/2.1")
    data = json.dumps(body).encode("utf-8")
    req = Request(url, data=data, headers=request_headers, method="POST")

    try:
        with urlopen(req, timeout=timeout) as response:
            return json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        # errors="replace": a badly encoded error page must not mask the
        # HTTP error with a UnicodeDecodeError.
        error_body = e.read().decode("utf-8", errors="replace") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
        except json.JSONDecodeError:
            error_detail = error_body[:500]
        else:
            if isinstance(error_json, dict):
                error_detail = error_json.get("error") or error_json.get("message") or error_body
            else:
                # Valid JSON but not an object (list, string, ...).
                error_detail = error_body[:500]

        error_messages = {
            401: "Invalid or expired API key. Please check your credentials.",
            403: "Access forbidden. Your API key may not have permission for this operation.",
            429: "Rate limit exceeded. Please wait a moment and try again.",
            500: "Server error. The search provider is experiencing issues.",
            503: "Service unavailable. The search provider may be down."
        }

        friendly_msg = error_messages.get(e.code, f"API error: {error_detail}")
        raise ProviderRequestError(
            f"{friendly_msg} (HTTP {e.code})",
            status_code=e.code,
            transient=e.code in TRANSIENT_HTTP_CODES,
        ) from e
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(
            f"Network error: {reason}. Check your internet connection.",
            transient=is_timeout,
        ) from e
    except TimeoutError as e:
        raise ProviderRequestError(
            f"Request timed out after {timeout}s. Try again or reduce max_results.",
            transient=True,
        ) from e
|
|
1399
|
+
|
|
1400
|
+
|
|
1401
|
+
# =============================================================================
|
|
1402
|
+
# Serper (Google Search API)
|
|
1403
|
+
# =============================================================================
|
|
1404
|
+
|
|
1405
|
+
def search_serper(
    query: str,
    api_key: str,
    max_results: int = 5,
    country: str = "us",
    language: str = "en",
    search_type: str = "search",
    time_range: Optional[str] = None,
    include_images: bool = False,
) -> dict:
    """Search using Serper (Google Search API).

    Args:
        query: Search query.
        api_key: Serper API key, sent as the X-API-KEY header.
        max_results: Maximum number of organic results to return.
        country: Value for Serper's "gl" parameter.
        language: Value for Serper's "hl" parameter.
        search_type: Path segment of the Serper endpoint to call.
        time_range: "hour", "day", "week", "month" or "year"; any other
            value (including None and "none") applies no time filter.
        include_images: Also fetch up to five image URLs (best effort).

    Returns:
        Dict with "provider", "query", "results", "images", "answer",
        "knowledge_graph" and "related_searches".

    Raises:
        ProviderRequestError: If the main search request fails.
    """
    endpoint = f"https://google.serper.dev/{search_type}"

    body = {
        "q": query,
        "gl": country,
        "hl": language,
        "num": max_results,
        "autocorrect": True,
    }

    tbs_map = {
        "hour": "qdr:h",
        "day": "qdr:d",
        "week": "qdr:w",
        "month": "qdr:m",
        "year": "qdr:y",
    }
    if time_range in tbs_map:
        body["tbs"] = tbs_map[time_range]

    headers = {
        "X-API-KEY": api_key,
        "Content-Type": "application/json",
    }

    data = make_request(endpoint, headers, body)

    results = []
    for i, item in enumerate(data.get("organic", [])[:max_results]):
        results.append({
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
            # Position-based score; clamped so results past the 11th do not
            # get a negative score.
            "score": max(0.0, round(1.0 - i * 0.1, 2)),
            "date": item.get("date"),
        })

    # "or {}" also covers a JSON null, which .get(key, {}) would pass through
    # as None and crash on the chained .get().
    answer_box = data.get("answerBox") or {}
    knowledge_graph = data.get("knowledgeGraph") or {}

    # Preference order: direct answer, answer-box snippet, knowledge-graph
    # description, then the top organic snippet.
    answer = ""
    if answer_box.get("answer"):
        answer = answer_box["answer"]
    elif answer_box.get("snippet"):
        answer = answer_box["snippet"]
    elif knowledge_graph.get("description"):
        answer = knowledge_graph["description"]
    elif results:
        answer = results[0]["snippet"]

    images = []
    if include_images:
        try:
            img_data = make_request(
                "https://google.serper.dev/images",
                headers,
                {"q": query, "gl": country, "hl": language, "num": 5},
            )
            images = [img.get("imageUrl", "") for img in img_data.get("images", [])[:5] if img.get("imageUrl")]
        except Exception:
            # Deliberate best effort: images are optional extras, so a
            # failed image lookup must not fail the whole search.
            images = []

    return {
        "provider": "serper",
        "query": query,
        "results": results,
        "images": images,
        "answer": answer,
        "knowledge_graph": data.get("knowledgeGraph"),
        "related_searches": [r.get("query") for r in data.get("relatedSearches", [])]
    }
|
|
1485
|
+
|
|
1486
|
+
|
|
1487
|
+
# =============================================================================
|
|
1488
|
+
# Tavily (Research Search)
|
|
1489
|
+
# =============================================================================
|
|
1490
|
+
|
|
1491
|
+
def search_tavily(
    query: str,
    api_key: str,
    max_results: int = 5,
    depth: str = "basic",
    topic: str = "general",
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    include_images: bool = False,
    include_raw_content: bool = False,
) -> dict:
    """Search using Tavily (AI Research Search).

    The API key travels in the JSON body. Domain filters are only sent
    when non-empty. Returns a dict with "provider", "query", "results",
    "images" and "answer"; each result carries "title", "url", "snippet",
    "score" and, when requested and present, "raw_content".
    """
    payload = {
        "api_key": api_key,
        "query": query,
        "max_results": max_results,
        "search_depth": depth,
        "topic": topic,
        "include_images": include_images,
        "include_answer": True,
        "include_raw_content": include_raw_content,
    }
    domain_filters = (
        ("include_domains", include_domains),
        ("exclude_domains", exclude_domains),
    )
    for field, domains in domain_filters:
        if domains:
            payload[field] = domains

    response = make_request(
        "https://api.tavily.com/search",
        {"Content-Type": "application/json"},
        payload,
    )

    def to_result(item: dict) -> dict:
        entry = {
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("content", ""),
            "score": round(item.get("score", 0.0), 3),
        }
        if include_raw_content and item.get("raw_content"):
            entry["raw_content"] = item["raw_content"]
        return entry

    return {
        "provider": "tavily",
        "query": query,
        "results": [to_result(item) for item in response.get("results", [])[:max_results]],
        "images": response.get("images", []),
        "answer": response.get("answer", ""),
    }
|
|
1544
|
+
|
|
1545
|
+
|
|
1546
|
+
# =============================================================================
|
|
1547
|
+
# Exa (Neural/Semantic Search)
|
|
1548
|
+
# =============================================================================
|
|
1549
|
+
|
|
1550
|
+
def search_exa(
    query: str,
    api_key: str,
    max_results: int = 5,
    search_type: str = "neural",
    category: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    similar_url: Optional[str] = None,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
) -> dict:
    """Search using Exa (Neural/Semantic Search).

    With *similar_url* the findSimilar endpoint is called for that URL;
    otherwise the regular search endpoint is called with *query* and
    *search_type*. Optional filters are only sent when set. Returns a dict
    with "provider", "query", "results", "images" (always empty) and
    "answer" (the first result's snippet, or "").
    """
    # Both endpoints request the same content extras.
    contents = {
        "text": {"maxCharacters": 1000},
        "highlights": True,
    }
    if similar_url:
        endpoint = "https://api.exa.ai/findSimilar"
        body = {
            "url": similar_url,
            "numResults": max_results,
            "contents": contents,
        }
    else:
        endpoint = "https://api.exa.ai/search"
        body = {
            "query": query,
            "numResults": max_results,
            "type": search_type,
            "contents": contents,
        }

    optional_fields = (
        ("category", category),
        ("startPublishedDate", start_date),
        ("endPublishedDate", end_date),
        ("includeDomains", include_domains),
        ("excludeDomains", exclude_domains),
    )
    for field, value in optional_fields:
        if value:
            body[field] = value

    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json",
    }

    data = make_request(endpoint, headers, body)

    results = []
    for item in data.get("results", [])[:max_results]:
        highlights = item.get("highlights", [])
        if highlights:
            snippet = highlights[0]
        else:
            # No highlight available: fall back to the start of the text.
            snippet = (item.get("text", "") or "")[:500]

        results.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": snippet,
            "score": round(item.get("score", 0.0), 3),
            "published_date": item.get("publishedDate"),
            "author": item.get("author"),
        })

    return {
        "provider": "exa",
        "query": query if not similar_url else f"Similar to: {similar_url}",
        "results": results,
        "images": [],
        "answer": results[0]["snippet"] if results else "",
    }
|
|
1626
|
+
|
|
1627
|
+
|
|
1628
|
+
# =============================================================================
|
|
1629
|
+
# Perplexity via Kilo Gateway (Synthesized Direct Answers)
|
|
1630
|
+
# =============================================================================
|
|
1631
|
+
|
|
1632
|
+
def search_perplexity(
    query: str,
    api_key: str,
    max_results: int = 5,
    model: str = "perplexity/sonar-pro",
    api_url: str = "https://api.kilo.ai/api/gateway/chat/completions",
    freshness: Optional[str] = None,
) -> dict:
    """Search/answer using Perplexity Sonar Pro via Kilo Gateway.

    Sends the query as a chat-completions request, then turns the synthesized
    answer into the unified result format: the answer itself becomes the first
    result and every distinct URL mentioned in the answer becomes a follow-up
    "Source N" result.

    Args:
        query: Search query
        api_key: Kilo Gateway API key (sent as a Bearer token)
        max_results: Maximum results to return. One slot is used by the
            synthesized answer, the remaining ``max_results - 1`` by sources.
        model: Perplexity model to use
        api_url: Kilo Gateway endpoint
        freshness: Filter by recency — 'day', 'week', 'month', 'year' (or the
            short forms 'pd', 'pw', 'pm', 'py'); mapped to the
            search_recency_filter request field. Unknown values are ignored.

    Returns:
        Dict with keys ``provider``, ``query``, ``results``, ``images``,
        ``answer`` and ``metadata`` (``model`` and the response ``usage``).

    Raises:
        Whatever ``make_request`` raises for transport/API failures.
    """
    # Map generic freshness values to Perplexity's search_recency_filter
    recency_map = {
        "day": "day", "pd": "day",
        "week": "week", "pw": "week",
        "month": "month", "pm": "month",
        "year": "year", "py": "year",
    }
    recency_filter = recency_map.get(freshness or "")

    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": "Answer with concise factual summary and include source URLs."},
            {"role": "user", "content": query},
        ],
        "temperature": 0.2,
    }
    if recency_filter:
        body["search_recency_filter"] = recency_filter

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    data = make_request(api_url, headers, body)
    choices = data.get("choices") or []
    # Tolerate an explicit null "message" in the first choice.
    message = (choices[0].get("message") or {}) if choices else {}
    answer = (message.get("content") or "").strip()

    # Pull URLs out of the answer text. The pattern stops at whitespace,
    # closing brackets and quotes, but a URL that ends a sentence still drags
    # the trailing punctuation along ("https://example.com." / "...,"), so
    # strip that before de-duplicating.
    raw_urls = re.findall(r"https?://[^\s)\]}>\"']+", answer)
    cleaned_urls = (u.rstrip(".,;:!?") for u in raw_urls)
    # dict.fromkeys de-duplicates while keeping first-seen order; the
    # partition check drops anything reduced to a bare scheme.
    unique_urls = list(dict.fromkeys(u for u in cleaned_urls if u.partition("://")[2]))

    results = []

    # Primary result: the synthesized answer itself
    if answer:
        # Clean citation markers [1][2] for the snippet
        clean_answer = re.sub(r'\[\d+\]', '', answer).strip()
        results.append({
            "title": f"Perplexity Answer: {query[:80]}",
            "url": "https://www.perplexity.ai",
            "snippet": clean_answer[:500],
            "score": 1.0,
        })

    # Additional results: extracted source URLs. Clamp the slot count so that
    # max_results <= 0 yields no sources instead of a negative slice (which
    # would return "all but the last" URL).
    source_slots = max(0, max_results - 1)
    for i, u in enumerate(unique_urls[:source_slots]):
        results.append({
            "title": f"Source {i+1}",
            "url": u,
            "snippet": "Referenced source from Perplexity answer",
            # Descending positional score, floored at 0 for long source lists.
            "score": max(0.0, round(0.9 - i * 0.1, 3)),
        })

    return {
        "provider": "perplexity",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "metadata": {
            "model": model,
            "usage": data.get("usage", {}),
        }
    }
|
|
1717
|
+
|
|
1718
|
+
|
|
1719
|
+
|
|
1720
|
+
# =============================================================================
|
|
1721
|
+
# You.com (LLM-Ready Web & News Search)
|
|
1722
|
+
# =============================================================================
|
|
1723
|
+
|
|
1724
|
+
def search_you(
    query: str,
    api_key: str,
    max_results: int = 5,
    country: str = "US",
    language: str = "en",
    freshness: Optional[str] = None,
    safesearch: str = "moderate",
    include_news: bool = True,
    livecrawl: Optional[str] = None,
) -> dict:
    """Search using You.com (LLM-Ready Web & News Search).

    You.com excels at:
    - RAG applications with pre-extracted snippets
    - Combined web + news results in one call
    - Real-time information with automatic news classification
    - Clean, structured JSON optimized for AI consumption

    Args:
        query: Search query
        api_key: You.com API key
        max_results: Maximum results to return (default 5, max 100)
        country: ISO 3166-2 country code (e.g., US, GB, DE)
        language: BCP 47 language code (e.g., en, de, fr)
        freshness: Filter by recency: day, week, month, year, or YYYY-MM-DDtoYYYY-MM-DD
        safesearch: Content filter: off, moderate (default), strict
        include_news: Include news results when relevant (default True)
        livecrawl: Fetch full page content: "web", "news", or "all"

    Returns:
        Dict with keys ``provider``, ``query``, ``results`` (normalized web
        hits), ``news`` (up to 5 news hits), ``images`` (always empty),
        ``answer`` (top snippets joined, capped at 1000 chars) and
        ``metadata`` (``search_uuid`` and ``latency`` from the response).

    Raises:
        ProviderRequestError: On HTTP errors, network failures or timeouts.
            ``transient`` is set for status codes in TRANSIENT_HTTP_CODES and
            for timeouts.
    """
    endpoint = "https://ydc-index.io/v1/search"

    # Build query parameters
    params = {
        "query": query,
        "count": max_results,
        "safesearch": safesearch,
    }

    if country:
        params["country"] = country.upper()
    if language:
        params["language"] = language.upper()
    if freshness:
        params["freshness"] = freshness
    if livecrawl:
        params["livecrawl"] = livecrawl
        params["livecrawl_formats"] = "markdown"

    # Build URL with query params (URL-encode values)
    query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
    url = f"{endpoint}?{query_string}"

    headers = {
        "X-API-KEY": api_key,
        "Accept": "application/json",
        "User-Agent": "ClawdBot-WebSearchPlus/2.4",
    }

    # Make GET request (You.com uses GET, not POST). Request/urlopen come from
    # the module-level urllib.request import.
    req = Request(url, headers=headers, method="GET")

    try:
        with urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        # Decode leniently: an undecodable error page must not replace the
        # HTTP error with a UnicodeDecodeError.
        error_body = e.read().decode("utf-8", errors="replace") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
        except json.JSONDecodeError:
            error_detail = error_body[:500]
        else:
            # Only a JSON object has "error"/"message" fields; a bare JSON
            # string or list falls back to the raw body.
            if isinstance(error_json, dict):
                error_detail = error_json.get("error") or error_json.get("message") or error_body
            else:
                error_detail = error_body[:500]

        error_messages = {
            401: "Invalid or expired API key. Get one at https://api.you.com",
            403: "Access forbidden. Check your API key permissions.",
            429: "Rate limit exceeded. Please wait and try again.",
            500: "You.com server error. Try again later.",
            503: "You.com service unavailable."
        }
        friendly_msg = error_messages.get(e.code, f"API error: {error_detail}")
        raise ProviderRequestError(
            f"{friendly_msg} (HTTP {e.code})",
            status_code=e.code,
            transient=e.code in TRANSIENT_HTTP_CODES,
        ) from e
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(
            f"Network error: {reason}. Check your internet connection.",
            transient=is_timeout,
        ) from e
    except TimeoutError as e:
        raise ProviderRequestError("You.com request timed out after 30s.", transient=True) from e

    # Parse results ("or" guards against explicit nulls in the payload)
    results_data = data.get("results") or {}
    web_results = results_data.get("web") or []
    news_results = (results_data.get("news") or []) if include_news else []
    metadata = data.get("metadata") or {}

    # Normalize web results
    results = []
    for i, item in enumerate(web_results[:max_results]):
        snippets = item.get("snippets") or []
        snippet = snippets[0] if snippets else item.get("description", "")

        result = {
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": snippet,
            # Descending positional score, floored at 0 (matters past 20 hits).
            "score": max(0.0, round(1.0 - i * 0.05, 3)),
            "date": item.get("page_age"),
            "source": "web",
        }

        # Include additional snippets if available (great for RAG)
        if len(snippets) > 1:
            result["additional_snippets"] = snippets[1:3]

        # Include thumbnail and favicon for UI display
        if item.get("thumbnail_url"):
            result["thumbnail"] = item["thumbnail_url"]
        if item.get("favicon_url"):
            result["favicon"] = item["favicon_url"]

        # Include live-crawled content if available
        if item.get("contents"):
            result["raw_content"] = item["contents"].get("markdown") or item["contents"].get("html", "")

        results.append(result)

    # Add news results (if any)
    news = [
        {
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("description", ""),
            "date": item.get("page_age"),
            "thumbnail": item.get("thumbnail_url"),
            "source": "news",
        }
        for item in news_results[:5]
    ]

    # Build answer from best snippets: combine the top three non-empty
    # snippets as LLM context.
    top_snippets = [r["snippet"] for r in results[:3] if r.get("snippet")]
    answer = " ".join(top_snippets)[:1000]

    return {
        "provider": "you",
        "query": query,
        "results": results,
        "news": news,
        "images": [],
        "answer": answer,
        "metadata": {
            "search_uuid": metadata.get("search_uuid"),
            "latency": metadata.get("latency"),
        }
    }
|
|
1885
|
+
|
|
1886
|
+
|
|
1887
|
+
# =============================================================================
|
|
1888
|
+
# SearXNG (Privacy-First Meta-Search)
|
|
1889
|
+
# =============================================================================
|
|
1890
|
+
|
|
1891
|
+
def search_searxng(
    query: str,
    instance_url: str,
    max_results: int = 5,
    categories: Optional[List[str]] = None,
    engines: Optional[List[str]] = None,
    language: str = "en",
    time_range: Optional[str] = None,
    safesearch: int = 0,
) -> dict:
    """Search using SearXNG (self-hosted privacy-first meta-search).

    SearXNG excels at:
    - Privacy-preserving search (no tracking, no profiling)
    - Multi-source aggregation (70+ upstream engines)
    - $0 API cost (self-hosted)
    - Diverse perspectives from multiple search engines

    Args:
        query: Search query
        instance_url: URL of your SearXNG instance (required)
        max_results: Maximum results to return (default 5)
        categories: Search categories (general, images, news, videos, etc.)
        engines: Specific engines to use (google, bing, duckduckgo, etc.)
        language: Language code (e.g., en, de, fr)
        time_range: Filter by recency: day, week, month, year
        safesearch: Content filter: 0=off, 1=moderate, 2=strict

    Returns:
        Dict with keys ``provider``, ``query``, ``results``, ``images``
        (always empty), ``answer``, ``suggestions``, ``corrections`` and
        ``metadata`` (``number_of_results``, sorted ``engines_used``,
        ``instance_url``).

    Raises:
        ProviderRequestError: On HTTP errors, unreachable instance or timeout.

    Note:
        Requires a self-hosted SearXNG instance with JSON format enabled.
        See: https://docs.searxng.org/admin/installation.html
    """
    # Build URL with query parameters
    params = {
        "q": query,
        "format": "json",
        "language": language,
        "safesearch": str(safesearch),
    }

    if categories:
        params["categories"] = ",".join(categories)
    if engines:
        params["engines"] = ",".join(engines)
    if time_range:
        params["time_range"] = time_range

    # Build URL — instance_url comes from operator-controlled config/env only
    # (validated by _validate_searxng_url), not from agent/LLM input
    base_url = instance_url.rstrip("/")
    query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
    url = f"{base_url}/search?{query_string}"

    headers = {
        "User-Agent": "ClawdBot-WebSearchPlus/2.5",
        "Accept": "application/json",
    }

    # Make GET request
    req = Request(url, headers=headers, method="GET")

    try:
        with urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        # Decode leniently so an undecodable error page cannot mask the HTTP
        # error with a UnicodeDecodeError.
        error_body = e.read().decode("utf-8", errors="replace") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
        except json.JSONDecodeError:
            error_detail = error_body[:500]
        else:
            # Only a JSON object carries "error"/"message"; anything else
            # (bare string, list) falls back to the raw body.
            if isinstance(error_json, dict):
                error_detail = error_json.get("error") or error_json.get("message") or error_body
            else:
                error_detail = error_body[:500]

        error_messages = {
            403: "JSON API disabled on this SearXNG instance. Enable 'json' in search.formats in settings.yml",
            404: "SearXNG instance not found. Check your instance URL.",
            500: "SearXNG server error. Check instance health.",
            503: "SearXNG service unavailable."
        }
        friendly_msg = error_messages.get(e.code, f"SearXNG error: {error_detail}")
        raise ProviderRequestError(
            f"{friendly_msg} (HTTP {e.code})",
            status_code=e.code,
            transient=e.code in TRANSIENT_HTTP_CODES,
        ) from e
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(
            f"Cannot reach SearXNG instance at {instance_url}. Error: {reason}",
            transient=is_timeout,
        ) from e
    except TimeoutError as e:
        raise ProviderRequestError(
            "SearXNG request timed out after 30s. Check instance health.",
            transient=True,
        ) from e

    # Parse results
    raw_results = data.get("results") or []

    # Normalize results to unified format
    results = []
    engines_used = set()
    for i, item in enumerate(raw_results[:max_results]):
        engine = item.get("engine", "unknown")
        engines_used.add(engine)

        # Use the instance-provided score when it is numeric; a missing or
        # null score falls back to a descending positional score instead of
        # crashing in round().
        score = item.get("score")
        if not isinstance(score, (int, float)):
            score = 1.0 - i * 0.05

        results.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("content", ""),
            "score": round(score, 3),
            "engine": engine,
            "category": item.get("category", "general"),
            "date": item.get("publishedDate"),
        })

    # Build answer from answers, infoboxes, or first result
    answer = ""
    if data.get("answers"):
        first_answer = data["answers"][0]
        # NOTE(review): some SearXNG versions appear to return answer objects
        # with an "answer" field rather than plain strings — confirm against
        # the deployed instance. Prefer that field over the dict repr.
        if isinstance(first_answer, dict) and first_answer.get("answer"):
            answer = str(first_answer["answer"])
        else:
            answer = first_answer if isinstance(first_answer, str) else str(first_answer)
    elif data.get("infoboxes"):
        infobox = data["infoboxes"][0]
        answer = infobox.get("content", "") or infobox.get("infobox", "")
    elif results:
        answer = results[0]["snippet"]

    return {
        "provider": "searxng",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "suggestions": data.get("suggestions", []),
        "corrections": data.get("corrections", []),
        "metadata": {
            "number_of_results": data.get("number_of_results"),
            # Sorted so the output is deterministic across runs.
            "engines_used": sorted(engines_used, key=str),
            "instance_url": instance_url,
        }
    }
|
|
2022
|
+
|
|
2023
|
+
|
|
2024
|
+
# =============================================================================
|
|
2025
|
+
# CLI
|
|
2026
|
+
# =============================================================================
|
|
2027
|
+
|
|
2028
|
+
def main():
|
|
2029
|
+
config = load_config()
|
|
2030
|
+
|
|
2031
|
+
parser = argparse.ArgumentParser(
|
|
2032
|
+
description="Web Search Plus — Intelligent multi-provider search with smart auto-routing",
|
|
2033
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
2034
|
+
epilog="""
|
|
2035
|
+
Intelligent Auto-Routing:
|
|
2036
|
+
The query is analyzed using multi-signal detection to find the optimal provider:
|
|
2037
|
+
|
|
2038
|
+
Shopping Intent → Serper (Google)
|
|
2039
|
+
"how much", "price of", "buy", product+brand combos, deals, specs
|
|
2040
|
+
|
|
2041
|
+
Research Intent → Tavily
|
|
2042
|
+
"how does", "explain", "what is", analysis, pros/cons, tutorials
|
|
2043
|
+
|
|
2044
|
+
Discovery Intent → Exa (Neural)
|
|
2045
|
+
"similar to", "companies like", "alternatives", URLs, startups, papers
|
|
2046
|
+
|
|
2047
|
+
Direct Answer Intent → Perplexity (via Kilo Gateway)
|
|
2048
|
+
"what is", "current status", local events, synthesized up-to-date answers
|
|
2049
|
+
|
|
2050
|
+
Examples:
|
|
2051
|
+
python3 search.py -q "iPhone 16 Pro Max price" # → Serper (shopping)
|
|
2052
|
+
python3 search.py -q "how does HTTPS encryption work" # → Tavily (research)
|
|
2053
|
+
python3 search.py -q "startups similar to Notion" # → Exa (discovery)
|
|
2054
|
+
python3 search.py --explain-routing -q "your query" # Debug routing
|
|
2055
|
+
|
|
2056
|
+
Full docs: See README.md and SKILL.md
|
|
2057
|
+
""",
|
|
2058
|
+
)
|
|
2059
|
+
|
|
2060
|
+
# Common arguments
|
|
2061
|
+
parser.add_argument(
|
|
2062
|
+
"--provider", "-p",
|
|
2063
|
+
choices=["serper", "tavily", "exa", "perplexity", "you", "searxng", "auto"],
|
|
2064
|
+
help="Search provider (auto=intelligent routing)"
|
|
2065
|
+
)
|
|
2066
|
+
parser.add_argument(
|
|
2067
|
+
"--query", "-q",
|
|
2068
|
+
help="Search query"
|
|
2069
|
+
)
|
|
2070
|
+
parser.add_argument(
|
|
2071
|
+
"--max-results", "-n",
|
|
2072
|
+
type=int,
|
|
2073
|
+
default=config.get("defaults", {}).get("max_results", 5),
|
|
2074
|
+
help="Maximum results (default: 5)"
|
|
2075
|
+
)
|
|
2076
|
+
parser.add_argument(
|
|
2077
|
+
"--images",
|
|
2078
|
+
action="store_true",
|
|
2079
|
+
help="Include images (Serper/Tavily)"
|
|
2080
|
+
)
|
|
2081
|
+
|
|
2082
|
+
# Auto-routing options
|
|
2083
|
+
parser.add_argument(
|
|
2084
|
+
"--auto", "-a",
|
|
2085
|
+
action="store_true",
|
|
2086
|
+
help="Use intelligent auto-routing (default when no provider specified)"
|
|
2087
|
+
)
|
|
2088
|
+
parser.add_argument(
|
|
2089
|
+
"--explain-routing",
|
|
2090
|
+
action="store_true",
|
|
2091
|
+
help="Show detailed routing analysis (debug mode)"
|
|
2092
|
+
)
|
|
2093
|
+
|
|
2094
|
+
# Serper-specific
|
|
2095
|
+
serper_config = config.get("serper", {})
|
|
2096
|
+
parser.add_argument("--country", default=serper_config.get("country", "us"))
|
|
2097
|
+
parser.add_argument("--language", default=serper_config.get("language", "en"))
|
|
2098
|
+
parser.add_argument(
|
|
2099
|
+
"--type",
|
|
2100
|
+
dest="search_type",
|
|
2101
|
+
default=serper_config.get("type", "search"),
|
|
2102
|
+
choices=["search", "news", "images", "videos", "places", "shopping"]
|
|
2103
|
+
)
|
|
2104
|
+
parser.add_argument(
|
|
2105
|
+
"--time-range",
|
|
2106
|
+
choices=["hour", "day", "week", "month", "year"]
|
|
2107
|
+
)
|
|
2108
|
+
|
|
2109
|
+
# Tavily-specific
|
|
2110
|
+
tavily_config = config.get("tavily", {})
|
|
2111
|
+
parser.add_argument(
|
|
2112
|
+
"--depth",
|
|
2113
|
+
default=tavily_config.get("depth", "basic"),
|
|
2114
|
+
choices=["basic", "advanced"]
|
|
2115
|
+
)
|
|
2116
|
+
parser.add_argument(
|
|
2117
|
+
"--topic",
|
|
2118
|
+
default=tavily_config.get("topic", "general"),
|
|
2119
|
+
choices=["general", "news"]
|
|
2120
|
+
)
|
|
2121
|
+
parser.add_argument("--raw-content", action="store_true")
|
|
2122
|
+
|
|
2123
|
+
# Exa-specific
|
|
2124
|
+
exa_config = config.get("exa", {})
|
|
2125
|
+
parser.add_argument(
|
|
2126
|
+
"--exa-type",
|
|
2127
|
+
default=exa_config.get("type", "neural"),
|
|
2128
|
+
choices=["neural", "keyword"]
|
|
2129
|
+
)
|
|
2130
|
+
parser.add_argument(
|
|
2131
|
+
"--category",
|
|
2132
|
+
choices=[
|
|
2133
|
+
"company", "research paper", "news", "pdf", "github",
|
|
2134
|
+
"tweet", "personal site", "linkedin profile"
|
|
2135
|
+
]
|
|
2136
|
+
)
|
|
2137
|
+
parser.add_argument("--start-date")
|
|
2138
|
+
parser.add_argument("--end-date")
|
|
2139
|
+
parser.add_argument("--similar-url")
|
|
2140
|
+
|
|
2141
|
+
# You.com-specific
|
|
2142
|
+
you_config = config.get("you", {})
|
|
2143
|
+
parser.add_argument(
|
|
2144
|
+
"--you-safesearch",
|
|
2145
|
+
default=you_config.get("safesearch", "moderate"),
|
|
2146
|
+
choices=["off", "moderate", "strict"],
|
|
2147
|
+
help="You.com SafeSearch filter"
|
|
2148
|
+
)
|
|
2149
|
+
parser.add_argument(
|
|
2150
|
+
"--freshness",
|
|
2151
|
+
choices=["day", "week", "month", "year"],
|
|
2152
|
+
help="Filter results by recency (You.com/Serper)"
|
|
2153
|
+
)
|
|
2154
|
+
parser.add_argument(
|
|
2155
|
+
"--livecrawl",
|
|
2156
|
+
choices=["web", "news", "all"],
|
|
2157
|
+
help="You.com: fetch full page content"
|
|
2158
|
+
)
|
|
2159
|
+
parser.add_argument(
|
|
2160
|
+
"--include-news",
|
|
2161
|
+
action="store_true",
|
|
2162
|
+
default=True,
|
|
2163
|
+
help="You.com: include news results (default: true)"
|
|
2164
|
+
)
|
|
2165
|
+
|
|
2166
|
+
# SearXNG-specific
|
|
2167
|
+
searxng_config = config.get("searxng", {})
|
|
2168
|
+
parser.add_argument(
|
|
2169
|
+
"--searxng-url",
|
|
2170
|
+
default=searxng_config.get("instance_url"),
|
|
2171
|
+
help="SearXNG instance URL (e.g., https://searx.example.com)"
|
|
2172
|
+
)
|
|
2173
|
+
parser.add_argument(
|
|
2174
|
+
"--searxng-safesearch",
|
|
2175
|
+
type=int,
|
|
2176
|
+
default=searxng_config.get("safesearch", 0),
|
|
2177
|
+
choices=[0, 1, 2],
|
|
2178
|
+
help="SearXNG SafeSearch: 0=off, 1=moderate, 2=strict"
|
|
2179
|
+
)
|
|
2180
|
+
parser.add_argument(
|
|
2181
|
+
"--engines",
|
|
2182
|
+
nargs="+",
|
|
2183
|
+
default=searxng_config.get("engines"),
|
|
2184
|
+
help="SearXNG: specific engines to use (e.g., google bing duckduckgo)"
|
|
2185
|
+
)
|
|
2186
|
+
parser.add_argument(
|
|
2187
|
+
"--categories",
|
|
2188
|
+
nargs="+",
|
|
2189
|
+
help="SearXNG: search categories (general, images, news, videos, etc.)"
|
|
2190
|
+
)
|
|
2191
|
+
|
|
2192
|
+
# Domain filters
|
|
2193
|
+
parser.add_argument("--include-domains", nargs="+")
|
|
2194
|
+
parser.add_argument("--exclude-domains", nargs="+")
|
|
2195
|
+
|
|
2196
|
+
# Output
|
|
2197
|
+
parser.add_argument("--compact", action="store_true")
|
|
2198
|
+
|
|
2199
|
+
# Caching options
|
|
2200
|
+
parser.add_argument(
|
|
2201
|
+
"--cache-ttl",
|
|
2202
|
+
type=int,
|
|
2203
|
+
default=DEFAULT_CACHE_TTL,
|
|
2204
|
+
help=f"Cache TTL in seconds (default: {DEFAULT_CACHE_TTL} = 1 hour)"
|
|
2205
|
+
)
|
|
2206
|
+
parser.add_argument(
|
|
2207
|
+
"--no-cache",
|
|
2208
|
+
action="store_true",
|
|
2209
|
+
help="Bypass cache (always fetch fresh results)"
|
|
2210
|
+
)
|
|
2211
|
+
parser.add_argument(
|
|
2212
|
+
"--clear-cache",
|
|
2213
|
+
action="store_true",
|
|
2214
|
+
help="Clear all cached results and exit"
|
|
2215
|
+
)
|
|
2216
|
+
parser.add_argument(
|
|
2217
|
+
"--cache-stats",
|
|
2218
|
+
action="store_true",
|
|
2219
|
+
help="Show cache statistics and exit"
|
|
2220
|
+
)
|
|
2221
|
+
|
|
2222
|
+
args = parser.parse_args()
|
|
2223
|
+
|
|
2224
|
+
# Handle cache management commands first (before query validation)
|
|
2225
|
+
if args.clear_cache:
|
|
2226
|
+
result = cache_clear()
|
|
2227
|
+
indent = None if args.compact else 2
|
|
2228
|
+
print(json.dumps(result, indent=indent, ensure_ascii=False))
|
|
2229
|
+
return
|
|
2230
|
+
|
|
2231
|
+
if args.cache_stats:
|
|
2232
|
+
result = cache_stats()
|
|
2233
|
+
indent = None if args.compact else 2
|
|
2234
|
+
print(json.dumps(result, indent=indent, ensure_ascii=False))
|
|
2235
|
+
return
|
|
2236
|
+
|
|
2237
|
+
if not args.query and not args.similar_url:
|
|
2238
|
+
parser.error("--query is required (unless using --similar-url with Exa)")
|
|
2239
|
+
|
|
2240
|
+
# Handle --explain-routing
|
|
2241
|
+
if args.explain_routing:
|
|
2242
|
+
if not args.query:
|
|
2243
|
+
parser.error("--query is required for --explain-routing")
|
|
2244
|
+
explanation = explain_routing(args.query, config)
|
|
2245
|
+
indent = None if args.compact else 2
|
|
2246
|
+
print(json.dumps(explanation, indent=indent, ensure_ascii=False))
|
|
2247
|
+
return
|
|
2248
|
+
|
|
2249
|
+
# Determine provider
|
|
2250
|
+
if args.provider == "auto" or (args.provider is None and not args.similar_url):
|
|
2251
|
+
if args.query:
|
|
2252
|
+
routing = auto_route_provider(args.query, config)
|
|
2253
|
+
provider = routing["provider"]
|
|
2254
|
+
routing_info = {
|
|
2255
|
+
"auto_routed": True,
|
|
2256
|
+
"provider": provider,
|
|
2257
|
+
"confidence": routing["confidence"],
|
|
2258
|
+
"confidence_level": routing["confidence_level"],
|
|
2259
|
+
"reason": routing["reason"],
|
|
2260
|
+
"top_signals": routing["top_signals"],
|
|
2261
|
+
"scores": routing["scores"],
|
|
2262
|
+
}
|
|
2263
|
+
else:
|
|
2264
|
+
provider = "exa"
|
|
2265
|
+
routing_info = {
|
|
2266
|
+
"auto_routed": True,
|
|
2267
|
+
"provider": "exa",
|
|
2268
|
+
"confidence": 1.0,
|
|
2269
|
+
"confidence_level": "high",
|
|
2270
|
+
"reason": "similar_url_specified",
|
|
2271
|
+
}
|
|
2272
|
+
else:
|
|
2273
|
+
provider = args.provider or "serper"
|
|
2274
|
+
routing_info = {"auto_routed": False, "provider": provider}
|
|
2275
|
+
|
|
2276
|
+
# Build provider fallback list
|
|
2277
|
+
auto_config = config.get("auto_routing", {})
|
|
2278
|
+
provider_priority = auto_config.get("provider_priority", ["tavily", "exa", "perplexity", "serper"])
|
|
2279
|
+
disabled_providers = auto_config.get("disabled_providers", [])
|
|
2280
|
+
|
|
2281
|
+
# Start with the selected provider, then try others in priority order
|
|
2282
|
+
providers_to_try = [provider]
|
|
2283
|
+
for p in provider_priority:
|
|
2284
|
+
if p not in providers_to_try and p not in disabled_providers:
|
|
2285
|
+
providers_to_try.append(p)
|
|
2286
|
+
|
|
2287
|
+
# Skip providers currently in cooldown
|
|
2288
|
+
eligible_providers = []
|
|
2289
|
+
cooldown_skips = []
|
|
2290
|
+
for p in providers_to_try:
|
|
2291
|
+
in_cd, remaining = provider_in_cooldown(p)
|
|
2292
|
+
if in_cd:
|
|
2293
|
+
cooldown_skips.append({"provider": p, "cooldown_remaining_seconds": remaining})
|
|
2294
|
+
else:
|
|
2295
|
+
eligible_providers.append(p)
|
|
2296
|
+
|
|
2297
|
+
if not eligible_providers:
|
|
2298
|
+
eligible_providers = providers_to_try[:1]
|
|
2299
|
+
|
|
2300
|
+
# Helper function to execute search for a provider
|
|
2301
|
+
def execute_search(prov: str) -> Dict[str, Any]:
|
|
2302
|
+
key = validate_api_key(prov, config)
|
|
2303
|
+
if prov == "serper":
|
|
2304
|
+
return search_serper(
|
|
2305
|
+
query=args.query,
|
|
2306
|
+
api_key=key,
|
|
2307
|
+
max_results=args.max_results,
|
|
2308
|
+
country=args.country,
|
|
2309
|
+
language=args.language,
|
|
2310
|
+
search_type=args.search_type,
|
|
2311
|
+
time_range=args.time_range,
|
|
2312
|
+
include_images=args.images,
|
|
2313
|
+
)
|
|
2314
|
+
elif prov == "tavily":
|
|
2315
|
+
return search_tavily(
|
|
2316
|
+
query=args.query,
|
|
2317
|
+
api_key=key,
|
|
2318
|
+
max_results=args.max_results,
|
|
2319
|
+
depth=args.depth,
|
|
2320
|
+
topic=args.topic,
|
|
2321
|
+
include_domains=args.include_domains,
|
|
2322
|
+
exclude_domains=args.exclude_domains,
|
|
2323
|
+
include_images=args.images,
|
|
2324
|
+
include_raw_content=args.raw_content,
|
|
2325
|
+
)
|
|
2326
|
+
elif prov == "exa":
|
|
2327
|
+
return search_exa(
|
|
2328
|
+
query=args.query or "",
|
|
2329
|
+
api_key=key,
|
|
2330
|
+
max_results=args.max_results,
|
|
2331
|
+
search_type=args.exa_type,
|
|
2332
|
+
category=args.category,
|
|
2333
|
+
start_date=args.start_date,
|
|
2334
|
+
end_date=args.end_date,
|
|
2335
|
+
similar_url=args.similar_url,
|
|
2336
|
+
include_domains=args.include_domains,
|
|
2337
|
+
exclude_domains=args.exclude_domains,
|
|
2338
|
+
)
|
|
2339
|
+
elif prov == "perplexity":
|
|
2340
|
+
perplexity_config = config.get("perplexity", {})
|
|
2341
|
+
return search_perplexity(
|
|
2342
|
+
query=args.query,
|
|
2343
|
+
api_key=key,
|
|
2344
|
+
max_results=args.max_results,
|
|
2345
|
+
model=perplexity_config.get("model", "perplexity/sonar-pro"),
|
|
2346
|
+
api_url=perplexity_config.get("api_url", "https://api.kilo.ai/api/gateway/chat/completions"),
|
|
2347
|
+
freshness=getattr(args, "freshness", None),
|
|
2348
|
+
)
|
|
2349
|
+
elif prov == "you":
|
|
2350
|
+
return search_you(
|
|
2351
|
+
query=args.query,
|
|
2352
|
+
api_key=key,
|
|
2353
|
+
max_results=args.max_results,
|
|
2354
|
+
country=args.country,
|
|
2355
|
+
language=args.language,
|
|
2356
|
+
freshness=args.freshness,
|
|
2357
|
+
safesearch=args.you_safesearch,
|
|
2358
|
+
include_news=args.include_news,
|
|
2359
|
+
livecrawl=args.livecrawl,
|
|
2360
|
+
)
|
|
2361
|
+
elif prov == "searxng":
|
|
2362
|
+
# For SearXNG, 'key' is actually the instance URL
|
|
2363
|
+
instance_url = args.searxng_url or key
|
|
2364
|
+
if instance_url:
|
|
2365
|
+
instance_url = _validate_searxng_url(instance_url)
|
|
2366
|
+
return search_searxng(
|
|
2367
|
+
query=args.query,
|
|
2368
|
+
instance_url=instance_url,
|
|
2369
|
+
max_results=args.max_results,
|
|
2370
|
+
categories=args.categories,
|
|
2371
|
+
engines=args.engines,
|
|
2372
|
+
language=args.language,
|
|
2373
|
+
time_range=args.time_range,
|
|
2374
|
+
safesearch=args.searxng_safesearch,
|
|
2375
|
+
)
|
|
2376
|
+
else:
|
|
2377
|
+
raise ValueError(f"Unknown provider: {prov}")
|
|
2378
|
+
|
|
2379
|
+
def execute_with_retry(prov: str) -> Dict[str, Any]:
    """Call ``execute_search(prov)``, retrying transient provider failures.

    At most three attempts are made. A retry happens only when the call
    raises ``ProviderRequestError`` whose ``status_code`` is not 401/403
    and whose ``transient`` flag is truthy; the pause before the next
    attempt is ``RETRY_BACKOFF_SECONDS[attempt]``. Any other exception
    stops the loop immediately.

    Returns:
        Whatever ``execute_search`` returns on the first successful attempt.

    Raises:
        The last exception recorded from ``execute_search``. A generic
        ``Exception`` is raised only as a defensive fallback when no
        truthy error object was recorded.
    """
    max_attempts = 3
    failure = None
    attempt = 0

    while attempt < max_attempts:
        try:
            return execute_search(prov)
        except ProviderRequestError as exc:
            failure = exc
            # Auth rejections (401/403) are never retried, regardless of the
            # transient flag; the status code is checked first.
            if exc.status_code in {401, 403}:
                break
            if not exc.transient:
                break
            # No sleep after the final attempt: just give up.
            if attempt >= max_attempts - 1:
                break
            time.sleep(RETRY_BACKOFF_SECONDS[attempt])
        except Exception as exc:
            # Non-provider errors are not retried.
            failure = exc
            break
        attempt += 1

    if failure:
        raise failure
    raise Exception("Unknown provider execution error")
|
|
2398
|
+
|
|
2399
|
+
cache_context = {
|
|
2400
|
+
"locale": f"{args.country}:{args.language}",
|
|
2401
|
+
"freshness": args.freshness,
|
|
2402
|
+
"time_range": args.time_range,
|
|
2403
|
+
"topic": args.topic,
|
|
2404
|
+
"search_engines": sorted(args.engines) if args.engines else None,
|
|
2405
|
+
"include_news": bool(args.include_news),
|
|
2406
|
+
"search_type": args.search_type,
|
|
2407
|
+
"exa_type": args.exa_type,
|
|
2408
|
+
"category": args.category,
|
|
2409
|
+
"similar_url": args.similar_url,
|
|
2410
|
+
}
|
|
2411
|
+
|
|
2412
|
+
# Check cache first (unless --no-cache is set)
|
|
2413
|
+
cached_result = None
|
|
2414
|
+
cache_hit = False
|
|
2415
|
+
if not args.no_cache and args.query:
|
|
2416
|
+
cached_result = cache_get(
|
|
2417
|
+
query=args.query,
|
|
2418
|
+
provider=provider,
|
|
2419
|
+
max_results=args.max_results,
|
|
2420
|
+
ttl=args.cache_ttl,
|
|
2421
|
+
params=cache_context,
|
|
2422
|
+
)
|
|
2423
|
+
if cached_result:
|
|
2424
|
+
cache_hit = True
|
|
2425
|
+
result = {k: v for k, v in cached_result.items() if not k.startswith("_cache_")}
|
|
2426
|
+
result["cached"] = True
|
|
2427
|
+
result["cache_age_seconds"] = int(time.time() - cached_result.get("_cache_timestamp", 0))
|
|
2428
|
+
|
|
2429
|
+
errors = []
|
|
2430
|
+
successful_provider = None
|
|
2431
|
+
successful_results: List[Tuple[str, Dict[str, Any]]] = []
|
|
2432
|
+
result = None if not cache_hit else result
|
|
2433
|
+
|
|
2434
|
+
for idx, current_provider in enumerate(eligible_providers):
|
|
2435
|
+
if cache_hit:
|
|
2436
|
+
successful_provider = provider
|
|
2437
|
+
break
|
|
2438
|
+
try:
|
|
2439
|
+
provider_result = execute_with_retry(current_provider)
|
|
2440
|
+
reset_provider_health(current_provider)
|
|
2441
|
+
successful_results.append((current_provider, provider_result))
|
|
2442
|
+
successful_provider = current_provider
|
|
2443
|
+
|
|
2444
|
+
# If we have enough results, stop.
|
|
2445
|
+
if len(provider_result.get("results", [])) >= args.max_results:
|
|
2446
|
+
break
|
|
2447
|
+
|
|
2448
|
+
# Only continue collecting from lower-priority providers when fallback was needed.
|
|
2449
|
+
if not errors:
|
|
2450
|
+
break
|
|
2451
|
+
except Exception as e:
|
|
2452
|
+
error_msg = str(e)
|
|
2453
|
+
cooldown_info = mark_provider_failure(current_provider, error_msg)
|
|
2454
|
+
errors.append({
|
|
2455
|
+
"provider": current_provider,
|
|
2456
|
+
"error": error_msg,
|
|
2457
|
+
"cooldown_seconds": cooldown_info.get("cooldown_seconds"),
|
|
2458
|
+
})
|
|
2459
|
+
if len(eligible_providers) > 1:
|
|
2460
|
+
remaining = eligible_providers[idx + 1:]
|
|
2461
|
+
if remaining:
|
|
2462
|
+
print(json.dumps({
|
|
2463
|
+
"fallback": True,
|
|
2464
|
+
"failed_provider": current_provider,
|
|
2465
|
+
"error": error_msg,
|
|
2466
|
+
"trying_next": remaining[0],
|
|
2467
|
+
}), file=sys.stderr)
|
|
2468
|
+
continue
|
|
2469
|
+
|
|
2470
|
+
if successful_results:
|
|
2471
|
+
if len(successful_results) == 1:
|
|
2472
|
+
result = successful_results[0][1]
|
|
2473
|
+
else:
|
|
2474
|
+
primary = successful_results[0][1].copy()
|
|
2475
|
+
deduped_results, dedup_count = deduplicate_results_across_providers(successful_results, args.max_results)
|
|
2476
|
+
primary["results"] = deduped_results
|
|
2477
|
+
primary["deduplicated"] = dedup_count > 0
|
|
2478
|
+
primary.setdefault("metadata", {})
|
|
2479
|
+
primary["metadata"]["dedup_count"] = dedup_count
|
|
2480
|
+
primary["metadata"]["providers_merged"] = [p for p, _ in successful_results]
|
|
2481
|
+
result = primary
|
|
2482
|
+
|
|
2483
|
+
if result is not None:
|
|
2484
|
+
if successful_provider != provider:
|
|
2485
|
+
routing_info["fallback_used"] = True
|
|
2486
|
+
routing_info["original_provider"] = provider
|
|
2487
|
+
routing_info["provider"] = successful_provider
|
|
2488
|
+
routing_info["fallback_errors"] = errors
|
|
2489
|
+
|
|
2490
|
+
if cooldown_skips:
|
|
2491
|
+
routing_info["cooldown_skips"] = cooldown_skips
|
|
2492
|
+
|
|
2493
|
+
result["routing"] = routing_info
|
|
2494
|
+
|
|
2495
|
+
if not cache_hit and not args.no_cache and args.query:
|
|
2496
|
+
cache_put(
|
|
2497
|
+
query=args.query,
|
|
2498
|
+
provider=successful_provider or provider,
|
|
2499
|
+
max_results=args.max_results,
|
|
2500
|
+
result=result,
|
|
2501
|
+
params=cache_context,
|
|
2502
|
+
)
|
|
2503
|
+
|
|
2504
|
+
result["cached"] = bool(cache_hit)
|
|
2505
|
+
if "deduplicated" not in result:
|
|
2506
|
+
result["deduplicated"] = False
|
|
2507
|
+
result.setdefault("metadata", {})
|
|
2508
|
+
result["metadata"].setdefault("dedup_count", 0)
|
|
2509
|
+
|
|
2510
|
+
indent = None if args.compact else 2
|
|
2511
|
+
print(json.dumps(result, indent=indent, ensure_ascii=False))
|
|
2512
|
+
else:
|
|
2513
|
+
error_result = {
|
|
2514
|
+
"error": "All providers failed",
|
|
2515
|
+
"provider": provider,
|
|
2516
|
+
"query": args.query,
|
|
2517
|
+
"routing": routing_info,
|
|
2518
|
+
"provider_errors": errors,
|
|
2519
|
+
"cooldown_skips": cooldown_skips,
|
|
2520
|
+
}
|
|
2521
|
+
print(json.dumps(error_result, indent=2), file=sys.stderr)
|
|
2522
|
+
sys.exit(1)
|
|
2523
|
+
|
|
2524
|
+
|
|
2525
|
+
# Script entry point: main() is invoked only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
|
|
2526
|
+
main()
|