web-search-plus-plugin 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2526 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Web Search Plus — Unified Multi-Provider Search with Intelligent Auto-Routing
4
+ Supports: Serper (Google), Tavily (Research), Exa (Neural), Perplexity (Direct Answers)
5
+
6
+ Smart Routing uses multi-signal analysis:
7
+ - Query intent classification (shopping, research, discovery)
8
+ - Linguistic pattern detection (how much vs how does)
9
+ - Product/brand recognition
10
+ - URL detection
11
+ - Confidence scoring
12
+
13
+ Usage:
14
+ python3 search.py --query "..." # Auto-route based on query
15
+ python3 search.py --provider [serper|tavily|exa] --query "..." [options]
16
+
17
+ Examples:
18
+ python3 search.py -q "iPhone 16 Pro price" # → Serper (shopping intent)
19
+ python3 search.py -q "how does quantum entanglement work" # → Tavily (research intent)
20
+ python3 search.py -q "startups similar to Notion" # → Exa (discovery intent)
21
+ """
22
+
23
+ import argparse
24
+ import hashlib
25
+ import json
26
+ import os
27
+ import re
28
+ import sys
29
+ import time
30
+ from pathlib import Path
31
+ from typing import Optional, List, Dict, Any, Tuple
32
+ from urllib.request import Request, urlopen
33
+ from urllib.error import HTTPError, URLError
34
+ from urllib.parse import quote, urlparse
35
+
36
+
37
+ # =============================================================================
38
+ # Result Caching
39
+ # =============================================================================
40
+
41
# Cache lives in a ".cache" folder two levels up from this script (the skill
# root) unless overridden via the WSP_CACHE_DIR environment variable.
CACHE_DIR = Path(os.environ.get("WSP_CACHE_DIR", os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), ".cache")))
# Provider-health bookkeeping shares the cache directory but is explicitly
# skipped by cache_clear() and cache_stats() so it is never evicted.
PROVIDER_HEALTH_FILE = CACHE_DIR / "provider_health.json"
DEFAULT_CACHE_TTL = 3600  # 1 hour in seconds
44
+
45
+
46
+ def _build_cache_payload(query: str, provider: str, max_results: int, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
47
+ """Build normalized payload used for cache key hashing."""
48
+ payload = {
49
+ "query": query,
50
+ "provider": provider,
51
+ "max_results": max_results,
52
+ }
53
+ if params:
54
+ payload.update(params)
55
+ return payload
56
+
57
+
58
def _get_cache_key(query: str, provider: str, max_results: int, params: Optional[Dict[str, Any]] = None) -> str:
    """Derive a stable, unique cache key from all request parameters.

    The normalized payload is serialized deterministically (sorted keys,
    compact separators) and hashed; the 32-hex-char prefix keeps filenames
    short while remaining collision-resistant for this use case.
    """
    serialized = json.dumps(
        _build_cache_payload(query, provider, max_results, params),
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
    )
    digest = hashlib.sha256(serialized.encode("utf-8"))
    return digest.hexdigest()[:32]
63
+
64
+
65
def _get_cache_path(cache_key: str) -> Path:
    """Map a cache key to its JSON file inside CACHE_DIR."""
    filename = f"{cache_key}.json"
    return CACHE_DIR / filename
68
+
69
+
70
def _ensure_cache_dir() -> None:
    """Create the cache directory (and any parents) if it is missing."""
    if not CACHE_DIR.exists():
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
73
+
74
+
75
def cache_get(query: str, provider: str, max_results: int, ttl: int = DEFAULT_CACHE_TTL, params: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
    """
    Retrieve cached search results if present and not expired.

    Args:
        query: The search query
        provider: The search provider
        max_results: Maximum results requested
        ttl: Time-to-live in seconds (default: 1 hour)
        params: Extra provider-specific parameters folded into the key

    Returns:
        Cached result dict, or None when there is no valid entry
    """
    path = _get_cache_path(_get_cache_key(query, provider, max_results, params))
    if not path.exists():
        return None

    try:
        with open(path, "r", encoding="utf-8") as fh:
            entry = json.load(fh)
    except (json.JSONDecodeError, IOError, KeyError):
        # Corrupted cache file — drop it and report a miss
        path.unlink(missing_ok=True)
        return None

    age = time.time() - entry.get("_cache_timestamp", 0)
    if age > ttl:
        # Expired entry — evict eagerly
        path.unlink(missing_ok=True)
        return None

    return entry
109
+
110
+
111
def cache_put(query: str, provider: str, max_results: int, result: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> None:
    """
    Store search results in the cache.

    Args:
        query: The search query
        provider: The search provider
        max_results: Maximum results requested
        result: The search result to cache (not mutated; a copy is written)
        params: Extra provider-specific parameters folded into the key
    """
    _ensure_cache_dir()

    key = _get_cache_key(query, provider, max_results, params)

    # Annotate a copy with cache metadata (underscore-prefixed keys)
    entry = result.copy()
    entry.update({
        "_cache_timestamp": time.time(),
        "_cache_key": key,
        "_cache_query": query,
        "_cache_provider": provider,
        "_cache_max_results": max_results,
        "_cache_params": params or {},
    })

    try:
        with open(_get_cache_path(key), "w", encoding="utf-8") as fh:
            json.dump(entry, fh, ensure_ascii=False, indent=2)
    except IOError as exc:
        # Non-fatal: caching is best-effort, so log to stderr and continue
        print(json.dumps({"cache_write_error": str(exc)}), file=sys.stderr)
141
+
142
+
143
def cache_clear() -> Dict[str, Any]:
    """
    Delete all cached search results (provider-health data is preserved).

    Returns:
        Stats about what was cleared
    """
    if not CACHE_DIR.exists():
        return {"cleared": 0, "message": "Cache directory does not exist"}

    removed = 0
    freed = 0

    for entry in CACHE_DIR.glob("*.json"):
        # Never drop the provider-health bookkeeping file
        if entry.name == PROVIDER_HEALTH_FILE.name:
            continue
        try:
            freed += entry.stat().st_size
            entry.unlink()
        except IOError:
            continue
        removed += 1

    return {
        "cleared": removed,
        "size_freed_bytes": freed,
        "size_freed_kb": round(freed / 1024, 2),
        "message": f"Cleared {removed} cached entries"
    }
172
+
173
+
174
def cache_stats() -> Dict[str, Any]:
    """
    Summarize the cache: entry count, size, per-provider counts, age extremes.

    Returns:
        Dict with cache statistics
    """
    if not CACHE_DIR.exists():
        return {
            "total_entries": 0,
            "total_size_bytes": 0,
            "total_size_kb": 0,
            "oldest": None,
            "newest": None,
            "cache_dir": str(CACHE_DIR),
            "exists": False
        }

    entries = [p for p in CACHE_DIR.glob("*.json") if p.name != PROVIDER_HEALTH_FILE.name]
    total_size = 0
    by_provider: Dict[str, int] = {}
    oldest = None  # (timestamp, query) of the earliest readable entry
    newest = None  # (timestamp, query) of the latest readable entry

    for path in entries:
        try:
            total_size += path.stat().st_size
            with open(path, "r", encoding="utf-8") as fh:
                entry = json.load(fh)
        except (json.JSONDecodeError, IOError):
            # Unreadable entry: skip it, but its size (if stat succeeded)
            # was already counted — matching the original accounting.
            continue

        ts = entry.get("_cache_timestamp", 0)
        query = entry.get("_cache_query", "unknown")
        provider = entry.get("_cache_provider", "unknown")

        by_provider[provider] = by_provider.get(provider, 0) + 1

        if oldest is None or ts < oldest[0]:
            oldest = (ts, query)
        if newest is None or ts > newest[0]:
            newest = (ts, query)

    def _summary(slot):
        # A falsy timestamp (0 / missing) yields None, as in the original.
        if not slot or not slot[0]:
            return None
        ts, query = slot
        return {
            "timestamp": ts,
            "age_seconds": int(time.time() - ts),
            "query": query
        }

    return {
        "total_entries": len(entries),
        "total_size_bytes": total_size,
        "total_size_kb": round(total_size / 1024, 2),
        "providers": by_provider,
        "oldest": _summary(oldest),
        "newest": _summary(newest),
        "cache_dir": str(CACHE_DIR),
        "exists": True
    }
241
+
242
+
243
+ # =============================================================================
244
+ # Auto-load .env from skill directory (if exists)
245
+ # =============================================================================
246
+ def _load_env_file():
247
+ """Load .env file from skill root directory if it exists."""
248
+ env_path = Path(__file__).parent.parent / ".env"
249
+ if env_path.exists():
250
+ with open(env_path) as f:
251
+ for line in f:
252
+ line = line.strip()
253
+ if line and not line.startswith("#") and "=" in line:
254
+ # Handle export VAR=value or VAR=value
255
+ if line.startswith("export "):
256
+ line = line[7:]
257
+ key, _, value = line.partition("=")
258
+ key = key.strip()
259
+ value = value.strip().strip('"').strip("'")
260
+ if key and key not in os.environ:
261
+ os.environ[key] = value
262
+
263
+ _load_env_file()
264
+
265
+
266
+ # =============================================================================
267
+ # Configuration
268
+ # =============================================================================
269
+
270
# Baseline configuration; load_config() overlays config.json on top of this,
# merging one level deep for dict-valued sections.
DEFAULT_CONFIG = {
    "defaults": {
        "provider": "serper",
        "max_results": 5
    },
    "auto_routing": {
        "enabled": True,
        "fallback_provider": "serper",
        "provider_priority": ["tavily", "exa", "perplexity", "serper", "you", "searxng"],
        "disabled_providers": [],
        "confidence_threshold": 0.3,  # Below this, note low confidence
    },
    # Per-provider request defaults follow.
    "serper": {
        "country": "us",
        "language": "en",
        "type": "search"
    },
    "tavily": {
        "depth": "basic",
        "topic": "general"
    },
    "exa": {
        "type": "neural"
    },
    "perplexity": {
        # Perplexity is reached through the Kilo gateway, not directly.
        "api_url": "https://api.kilo.ai/api/gateway/chat/completions",
        "model": "perplexity/sonar-pro"
    },
    "you": {
        "country": "us",
        "safesearch": "moderate"
    },
    "searxng": {
        "instance_url": None,  # Required - user must set their own instance
        "safesearch": 0,  # 0=off, 1=moderate, 2=strict
        "engines": None,  # Optional list of engines to use
        "language": "en"
    }
}
309
+
310
+
311
def load_config() -> Dict[str, Any]:
    """Return DEFAULT_CONFIG with config.json (if any) overlaid on top.

    Dict-valued sections that exist in the defaults are merged one level
    deep; everything else is replaced wholesale. A broken config.json is
    reported on stderr and the defaults are used.
    """
    merged = DEFAULT_CONFIG.copy()
    config_path = Path(__file__).parent.parent / "config.json"

    if not config_path.exists():
        return merged

    try:
        with open(config_path) as fh:
            overrides = json.load(fh)
    except (json.JSONDecodeError, IOError) as e:
        print(json.dumps({
            "warning": f"Could not load config.json: {e}",
            "using": "default configuration"
        }), file=sys.stderr)
        return merged

    for key, value in overrides.items():
        if isinstance(value, dict) and key in merged:
            # Shallow section merge: user keys win over defaults
            merged[key] = {**merged.get(key, {}), **value}
        else:
            merged[key] = value

    return merged
332
+
333
+
334
def get_api_key(provider: str, config: Dict[str, Any] = None) -> Optional[str]:
    """Resolve the credential for *provider*.

    Priority: config.json > .env > environment variable

    Note: SearXNG doesn't require an API key; its validated instance URL is
    returned instead.
    """
    # Special case: SearXNG uses instance_url instead of an API key
    if provider == "searxng":
        return get_searxng_instance_url(config)

    # config.json takes precedence over the environment
    provider_section = (config or {}).get(provider, {})
    if isinstance(provider_section, dict):
        configured = provider_section.get("api_key") or provider_section.get("apiKey")
        if configured:
            return configured

    env_names = {
        "serper": "SERPER_API_KEY",
        "tavily": "TAVILY_API_KEY",
        "exa": "EXA_API_KEY",
        "you": "YOU_API_KEY",
        "perplexity": "KILOCODE_API_KEY",
    }
    return os.environ.get(env_names.get(provider, ""))
362
+
363
+
364
+ def _validate_searxng_url(url: str) -> str:
365
+ """Validate and sanitize SearXNG instance URL to prevent SSRF.
366
+
367
+ Enforces http/https scheme and blocks requests to private/internal networks
368
+ including cloud metadata endpoints, loopback, link-local, and RFC1918 ranges.
369
+ """
370
+ import ipaddress
371
+ import socket
372
+ from urllib.parse import urlparse
373
+
374
+ parsed = urlparse(url)
375
+ if parsed.scheme not in ("http", "https"):
376
+ raise ValueError(f"SearXNG URL must use http or https scheme, got: {parsed.scheme}")
377
+ if not parsed.hostname:
378
+ raise ValueError("SearXNG URL must include a hostname")
379
+
380
+ hostname = parsed.hostname
381
+
382
+ # Block cloud metadata endpoints by hostname
383
+ BLOCKED_HOSTS = {
384
+ "169.254.169.254", # AWS/GCP/Azure metadata
385
+ "metadata.google.internal",
386
+ "metadata.internal",
387
+ }
388
+ if hostname in BLOCKED_HOSTS:
389
+ raise ValueError(f"SearXNG URL blocked: {hostname} is a cloud metadata endpoint")
390
+
391
+ # Resolve hostname and check for private/internal IPs
392
+ # Operators who intentionally self-host on private networks can opt out
393
+ allow_private = os.environ.get("SEARXNG_ALLOW_PRIVATE", "").strip() == "1"
394
+ if not allow_private:
395
+ try:
396
+ resolved_ips = socket.getaddrinfo(hostname, parsed.port or 80, proto=socket.IPPROTO_TCP)
397
+ for family, _type, _proto, _canonname, sockaddr in resolved_ips:
398
+ ip = ipaddress.ip_address(sockaddr[0])
399
+ if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_reserved:
400
+ raise ValueError(
401
+ f"SearXNG URL blocked: {hostname} resolves to private/internal IP {ip}. "
402
+ f"If this is intentional, set SEARXNG_ALLOW_PRIVATE=1 in your environment."
403
+ )
404
+ except socket.gaierror:
405
+ raise ValueError(f"SearXNG URL blocked: cannot resolve hostname {hostname}")
406
+
407
+ return url
408
+
409
+
410
def get_searxng_instance_url(config: Dict[str, Any] = None) -> Optional[str]:
    """Return the validated SearXNG instance URL, or None when unconfigured.

    SearXNG is self-hosted, so no API key is needed - just the instance URL.
    Priority: config.json > SEARXNG_INSTANCE_URL environment variable

    Security: every URL is passed through _validate_searxng_url, which
    enforces the scheme and blocks private/internal targets. Both sources
    (config.json, env var) are operator-controlled, not agent-controlled.
    """
    searxng_section = (config or {}).get("searxng", {})
    if isinstance(searxng_section, dict):
        configured = searxng_section.get("instance_url")
        if configured:
            return _validate_searxng_url(configured)

    from_env = os.environ.get("SEARXNG_INSTANCE_URL")
    return _validate_searxng_url(from_env) if from_env else None
433
+
434
+
435
+ # Backward compatibility alias
436
def get_env_key(provider: str) -> Optional[str]:
    """Legacy alias kept for backward compatibility; delegates to get_api_key."""
    return get_api_key(provider)
439
+
440
+
441
def validate_api_key(provider: str, config: Optional[Dict[str, Any]] = None) -> str:
    """Validate and return API key (or instance URL for SearXNG), with helpful error messages.

    On any failure this prints a JSON error object to stderr and terminates
    the process with exit code 1 — it never raises to the caller.

    Args:
        provider: One of serper/tavily/exa/you/perplexity/searxng. Other
            values fall through to a KeyError on the env_var lookup below.
        config: Optional merged configuration (see load_config).

    Returns:
        The non-empty API key, or the instance URL for SearXNG.
    """
    key = get_api_key(provider, config)

    # Special handling for SearXNG - it needs instance URL, not API key
    if provider == "searxng":
        if not key:
            error_msg = {
                "error": "Missing SearXNG instance URL",
                "env_var": "SEARXNG_INSTANCE_URL",
                "how_to_fix": [
                    "1. Set up your own SearXNG instance: https://docs.searxng.org/admin/installation.html",
                    "2. Add to config.json: \"searxng\": {\"instance_url\": \"https://your-instance.example.com\"}",
                    "3. Or set environment variable: export SEARXNG_INSTANCE_URL=\"https://your-instance.example.com\"",
                    "Note: SearXNG requires a self-hosted instance with JSON format enabled.",
                ],
                "provider": provider
            }
            print(json.dumps(error_msg, indent=2), file=sys.stderr)
            sys.exit(1)

        # Validate URL format
        if not key.startswith(("http://", "https://")):
            print(json.dumps({
                "error": "SearXNG instance URL must start with http:// or https://",
                "provided": key,
                "provider": provider
            }, indent=2), file=sys.stderr)
            sys.exit(1)

        return key

    # API-key providers: a missing key gets a provider-specific fix-it message
    if not key:
        env_var = {
            "serper": "SERPER_API_KEY",
            "tavily": "TAVILY_API_KEY",
            "exa": "EXA_API_KEY",
            "you": "YOU_API_KEY",
            "perplexity": "KILOCODE_API_KEY"
        }[provider]

        # Sign-up URL shown in the how_to_fix instructions
        urls = {
            "serper": "https://serper.dev",
            "tavily": "https://tavily.com",
            "exa": "https://exa.ai",
            "you": "https://api.you.com",
            "perplexity": "https://api.kilo.ai"
        }

        error_msg = {
            "error": f"Missing API key for {provider}",
            "env_var": env_var,
            "how_to_fix": [
                f"1. Get your API key from {urls[provider]}",
                f"2. Add to config.json: \"{provider}\": {{\"api_key\": \"your-key\"}}",
                f"3. Or set environment variable: export {env_var}=\"your-key\"",
            ],
            "provider": provider
        }
        print(json.dumps(error_msg, indent=2), file=sys.stderr)
        sys.exit(1)

    # Cheap sanity check: real provider keys are always longer than this
    if len(key) < 10:
        print(json.dumps({
            "error": f"API key for {provider} appears invalid (too short)",
            "provider": provider
        }, indent=2), file=sys.stderr)
        sys.exit(1)

    return key
511
+
512
+
513
+ # =============================================================================
514
+ # Intelligent Auto-Routing Engine
515
+ # =============================================================================
516
+
517
+ class QueryAnalyzer:
518
+ """
519
+ Intelligent query analysis for smart provider routing.
520
+
521
+ Uses multi-signal analysis:
522
+ - Intent classification (shopping, research, discovery, local, news)
523
+ - Linguistic patterns (question structure, phrase patterns)
524
+ - Entity detection (products, brands, URLs, dates)
525
+ - Complexity assessment
526
+ """
527
+
528
+ # Intent signal patterns with weights
529
+ # Higher weight = stronger signal for that provider
530
+
531
+ SHOPPING_SIGNALS = {
532
+ # Price patterns (very strong)
533
+ r'\bhow much\b': 4.0,
534
+ r'\bprice of\b': 4.0,
535
+ r'\bcost of\b': 4.0,
536
+ r'\bprices?\b': 3.0,
537
+ r'\$\d+|\d+\s*dollars?': 3.0,
538
+ r'€\d+|\d+\s*euros?': 3.0,
539
+ r'£\d+|\d+\s*pounds?': 3.0,
540
+
541
+ # German price patterns (sehr stark)
542
+ r'\bpreis(e)?\b': 3.5,
543
+ r'\bkosten\b': 3.0,
544
+ r'\bwieviel\b': 3.5,
545
+ r'\bwie viel\b': 3.5,
546
+ r'\bwas kostet\b': 4.0,
547
+
548
+ # Purchase intent (strong)
549
+ r'\bbuy\b': 3.5,
550
+ r'\bpurchase\b': 3.5,
551
+ r'\border\b(?!\s+by)': 3.0, # "order" but not "order by"
552
+ r'\bshopping\b': 3.5,
553
+ r'\bshop for\b': 3.5,
554
+ r'\bwhere to (buy|get|purchase)\b': 4.0,
555
+
556
+ # German purchase intent (stark)
557
+ r'\bkaufen\b': 3.5,
558
+ r'\bbestellen\b': 3.5,
559
+ r'\bwo kaufen\b': 4.0,
560
+ r'\bhändler\b': 3.0,
561
+ r'\bshop\b': 2.5,
562
+
563
+ # Deal/discount signals
564
+ r'\bdeal(s)?\b': 3.0,
565
+ r'\bdiscount(s)?\b': 3.0,
566
+ r'\bsale\b': 2.5,
567
+ r'\bcheap(er|est)?\b': 3.0,
568
+ r'\baffordable\b': 2.5,
569
+ r'\bbudget\b': 2.5,
570
+ r'\bbest price\b': 3.5,
571
+ r'\bcompare prices\b': 3.5,
572
+ r'\bcoupon\b': 3.0,
573
+
574
+ # German deal/discount signals
575
+ r'\bgünstig(er|ste)?\b': 3.0,
576
+ r'\bbillig(er|ste)?\b': 3.0,
577
+ r'\bangebot(e)?\b': 3.0,
578
+ r'\brabatt\b': 3.0,
579
+ r'\baktion\b': 2.5,
580
+ r'\bschnäppchen\b': 3.0,
581
+
582
+ # Product comparison
583
+ r'\bvs\.?\b': 2.0,
584
+ r'\bversus\b': 2.0,
585
+ r'\bor\b.*\bwhich\b': 2.0,
586
+ r'\bspecs?\b': 2.5,
587
+ r'\bspecifications?\b': 2.5,
588
+ r'\breview(s)?\b': 2.0,
589
+ r'\brating(s)?\b': 2.0,
590
+ r'\bunboxing\b': 2.5,
591
+
592
+ # German product comparison
593
+ r'\btest\b': 2.5,
594
+ r'\bbewertung(en)?\b': 2.5,
595
+ r'\btechnische daten\b': 3.0,
596
+ r'\bspezifikationen\b': 2.5,
597
+ }
598
+
599
+ RESEARCH_SIGNALS = {
600
+ # Explanation patterns (very strong)
601
+ r'\bhow does\b': 4.0,
602
+ r'\bhow do\b': 3.5,
603
+ r'\bwhy does\b': 4.0,
604
+ r'\bwhy do\b': 3.5,
605
+ r'\bwhy is\b': 3.5,
606
+ r'\bexplain\b': 4.0,
607
+ r'\bexplanation\b': 4.0,
608
+ r'\bwhat is\b': 3.0,
609
+ r'\bwhat are\b': 3.0,
610
+ r'\bdefine\b': 3.5,
611
+ r'\bdefinition of\b': 3.5,
612
+ r'\bmeaning of\b': 3.0,
613
+
614
+ # Analysis patterns (strong)
615
+ r'\banalyze\b': 3.5,
616
+ r'\banalysis\b': 3.5,
617
+ r'\bcompare\b(?!\s*prices?)': 3.0, # compare but not "compare prices"
618
+ r'\bcomparison\b': 3.0,
619
+ r'\bstatus of\b': 3.5,
620
+ r'\bstatus\b': 2.5,
621
+ r'\bwhat happened with\b': 4.0,
622
+ r'\bpros and cons\b': 4.0,
623
+ r'\badvantages?\b': 3.0,
624
+ r'\bdisadvantages?\b': 3.0,
625
+ r'\bbenefits?\b': 2.5,
626
+ r'\bdrawbacks?\b': 3.0,
627
+ r'\bdifference between\b': 3.5,
628
+
629
+ # Learning patterns
630
+ r'\bunderstand\b': 3.0,
631
+ r'\blearn(ing)?\b': 2.5,
632
+ r'\btutorial\b': 3.0,
633
+ r'\bguide\b': 2.5,
634
+ r'\bhow to\b': 2.0, # Lower weight - could be shopping too
635
+ r'\bstep by step\b': 3.0,
636
+
637
+ # Depth signals
638
+ r'\bin[- ]depth\b': 3.0,
639
+ r'\bdetailed\b': 2.5,
640
+ r'\bcomprehensive\b': 3.0,
641
+ r'\bthorough\b': 2.5,
642
+ r'\bdeep dive\b': 3.5,
643
+ r'\boverall\b': 2.0,
644
+ r'\bsummary\b': 2.0,
645
+
646
+ # Academic patterns
647
+ r'\bstudy\b': 2.5,
648
+ r'\bresearch shows\b': 3.5,
649
+ r'\baccording to\b': 2.5,
650
+ r'\bevidence\b': 3.0,
651
+ r'\bscientific\b': 3.0,
652
+ r'\bhistory of\b': 3.0,
653
+ r'\bbackground\b': 2.5,
654
+ r'\bcontext\b': 2.5,
655
+ r'\bimplications?\b': 3.0,
656
+
657
+ # German explanation patterns (sehr stark)
658
+ r'\bwie funktioniert\b': 4.0,
659
+ r'\bwarum\b': 3.5,
660
+ r'\berklär(en|ung)?\b': 4.0,
661
+ r'\bwas ist\b': 3.0,
662
+ r'\bwas sind\b': 3.0,
663
+ r'\bbedeutung\b': 3.0,
664
+
665
+ # German analysis patterns
666
+ r'\banalyse\b': 3.5,
667
+ r'\bvergleich(en)?\b': 3.0,
668
+ r'\bvor- und nachteile\b': 4.0,
669
+ r'\bvorteile\b': 3.0,
670
+ r'\bnachteile\b': 3.0,
671
+ r'\bunterschied(e)?\b': 3.5,
672
+
673
+ # German learning patterns
674
+ r'\bverstehen\b': 3.0,
675
+ r'\blernen\b': 2.5,
676
+ r'\banleitung\b': 3.0,
677
+ r'\bübersicht\b': 2.5,
678
+ r'\bhintergrund\b': 2.5,
679
+ r'\bzusammenfassung\b': 2.5,
680
+ }
681
+
682
+ DISCOVERY_SIGNALS = {
683
+ # Similarity patterns (very strong)
684
+ r'\bsimilar to\b': 5.0,
685
+ r'\blike\s+\w+\.com': 4.5, # "like notion.com"
686
+ r'\balternatives? to\b': 5.0,
687
+ r'\bcompetitors? (of|to)\b': 4.5,
688
+ r'\bcompeting with\b': 4.0,
689
+ r'\brivals? (of|to)\b': 4.0,
690
+ r'\binstead of\b': 3.0,
691
+ r'\breplacement for\b': 3.5,
692
+
693
+ # Company/startup patterns (strong)
694
+ r'\bcompanies (like|that|doing|building)\b': 4.5,
695
+ r'\bstartups? (like|that|doing|building)\b': 4.5,
696
+ r'\bwho else\b': 4.0,
697
+ r'\bother (companies|startups|tools|apps)\b': 3.5,
698
+ r'\bfind (companies|startups|tools|examples?)\b': 4.5,
699
+ r'\bevents? in\b': 4.0,
700
+ r'\bthings to do in\b': 4.5,
701
+
702
+ # Funding/business patterns
703
+ r'\bseries [a-d]\b': 4.0,
704
+ r'\byc\b|y combinator': 4.0,
705
+ r'\bfund(ed|ing|raise)\b': 3.5,
706
+ r'\bventure\b': 3.0,
707
+ r'\bvaluation\b': 3.0,
708
+
709
+ # Category patterns
710
+ r'\bresearch papers? (on|about)\b': 4.0,
711
+ r'\barxiv\b': 4.5,
712
+ r'\bgithub (projects?|repos?)\b': 4.5,
713
+ r'\bopen source\b.*\bprojects?\b': 4.0,
714
+ r'\btweets? (about|on)\b': 3.5,
715
+ r'\bblogs? (about|on|like)\b': 3.0,
716
+
717
+ # URL detection (very strong signal for Exa similar)
718
+ r'https?://[^\s]+': 5.0,
719
+ r'\b\w+\.(com|org|io|ai|co|dev)\b': 3.5,
720
+ }
721
+
722
+ LOCAL_NEWS_SIGNALS = {
723
+ # Local patterns → Serper
724
+ r'\bnear me\b': 4.0,
725
+ r'\bnearby\b': 3.5,
726
+ r'\blocal\b': 3.0,
727
+ r'\bin (my )?(city|area|town|neighborhood)\b': 3.5,
728
+ r'\brestaurants?\b': 2.5,
729
+ r'\bhotels?\b': 2.5,
730
+ r'\bcafes?\b': 2.5,
731
+ r'\bstores?\b': 2.0,
732
+ r'\bdirections? to\b': 3.5,
733
+ r'\bmap of\b': 3.0,
734
+ r'\bphone number\b': 3.0,
735
+ r'\baddress of\b': 3.0,
736
+ r'\bopen(ing)? hours\b': 3.0,
737
+
738
+ # Weather/time
739
+ r'\bweather\b': 4.0,
740
+ r'\bforecast\b': 3.5,
741
+ r'\btemperature\b': 3.0,
742
+ r'\btime in\b': 3.0,
743
+
744
+ # News/recency patterns → Serper (or Tavily for news depth)
745
+ r'\blatest\b': 2.5,
746
+ r'\brecent\b': 2.5,
747
+ r'\btoday\b': 2.5,
748
+ r'\bbreaking\b': 3.5,
749
+ r'\bnews\b': 2.5,
750
+ r'\bheadlines?\b': 3.0,
751
+ r'\b202[4-9]\b': 2.0, # Current year mentions
752
+ r'\blast (week|month|year)\b': 2.0,
753
+ }
754
+
755
+ # RAG/AI signals → You.com
756
+ # You.com excels at providing LLM-ready snippets and combined web+news
757
+ RAG_SIGNALS = {
758
+ # RAG/context patterns (strong signal for You.com)
759
+ r'\brag\b': 4.5,
760
+ r'\bcontext for\b': 4.0,
761
+ r'\bsummarize\b': 3.5,
762
+ r'\bbrief(ly)?\b': 3.0,
763
+ r'\bquick overview\b': 3.5,
764
+ r'\btl;?dr\b': 4.0,
765
+ r'\bkey (points|facts|info)\b': 3.5,
766
+ r'\bmain (points|takeaways)\b': 3.5,
767
+
768
+ # Combined web + news queries
769
+ r'\b(web|online)\s+and\s+news\b': 4.0,
770
+ r'\ball sources\b': 3.5,
771
+ r'\bcomprehensive (search|overview)\b': 3.5,
772
+ r'\blatest\s+(news|updates)\b': 3.0,
773
+ r'\bcurrent (events|situation|status)\b': 3.5,
774
+
775
+ # Real-time information needs
776
+ r'\bright now\b': 3.0,
777
+ r'\bas of today\b': 3.5,
778
+ r'\bup.to.date\b': 3.5,
779
+ r'\breal.time\b': 4.0,
780
+ r'\blive\b': 2.5,
781
+
782
+ # Information synthesis
783
+ r'\bwhat\'?s happening with\b': 3.5,
784
+ r'\bwhat\'?s the latest\b': 4.0,
785
+ r'\bupdates?\s+on\b': 3.5,
786
+ r'\bstatus of\b': 3.0,
787
+ r'\bsituation (in|with|around)\b': 3.5,
788
+ }
789
+
790
+ # Direct answer / synthesis signals → Perplexity via Kilo Gateway
791
+ DIRECT_ANSWER_SIGNALS = {
792
+ r'\bwhat is\b': 3.0,
793
+ r'\bwhat are\b': 2.5,
794
+ r'\bcurrent status\b': 4.0,
795
+ r'\bstatus of\b': 3.5,
796
+ r'\bstatus\b': 2.5,
797
+ r'\bwhat happened with\b': 4.0,
798
+ r"\bwhat'?s happening with\b": 4.0,
799
+ r'\bas of (today|now)\b': 4.0,
800
+ r'\bthis weekend\b': 3.5,
801
+ r'\bevents? in\b': 3.5,
802
+ r'\bthings to do in\b': 4.0,
803
+ r'\bnear me\b': 3.0,
804
+ r'\bcan you (tell me|summarize|explain)\b': 3.5,
805
+ }
806
+
807
+ # Privacy/Multi-source signals → SearXNG (self-hosted meta-search)
808
+ # SearXNG is ideal for privacy-focused queries and aggregating multiple sources
809
+ PRIVACY_SIGNALS = {
810
+ # Privacy signals (very strong)
811
+ r'\bprivate(ly)?\b': 4.0,
812
+ r'\banonymous(ly)?\b': 4.0,
813
+ r'\bwithout tracking\b': 4.5,
814
+ r'\bno track(ing)?\b': 4.5,
815
+ r'\bprivacy\b': 3.5,
816
+ r'\bprivacy.?focused\b': 4.5,
817
+ r'\bprivacy.?first\b': 4.5,
818
+ r'\bduckduckgo alternative\b': 4.5,
819
+ r'\bprivate search\b': 5.0,
820
+
821
+ # German privacy signals
822
+ r'\bprivat\b': 4.0,
823
+ r'\banonym\b': 4.0,
824
+ r'\bohne tracking\b': 4.5,
825
+ r'\bdatenschutz\b': 4.0,
826
+
827
+ # Multi-source aggregation signals
828
+ r'\baggregate results?\b': 4.0,
829
+ r'\bmultiple sources?\b': 4.0,
830
+ r'\bdiverse (results|perspectives|sources)\b': 4.0,
831
+ r'\bfrom (all|multiple|different) (engines?|sources?)\b': 4.5,
832
+ r'\bmeta.?search\b': 5.0,
833
+ r'\ball engines?\b': 4.0,
834
+
835
+ # German multi-source signals
836
+ r'\bverschiedene quellen\b': 4.0,
837
+ r'\baus mehreren quellen\b': 4.0,
838
+ r'\balle suchmaschinen\b': 4.5,
839
+
840
+ # Budget/free signals (SearXNG is self-hosted = $0 API cost)
841
+ r'\bfree search\b': 3.5,
842
+ r'\bno api cost\b': 4.0,
843
+ r'\bself.?hosted search\b': 5.0,
844
+ r'\bzero cost\b': 3.5,
845
+ r'\bbudget\b(?!\s*(laptop|phone|option))\b': 2.5, # "budget" alone, not "budget laptop"
846
+
847
+ # German budget signals
848
+ r'\bkostenlos(e)?\s+suche\b': 3.5,
849
+ r'\bkeine api.?kosten\b': 4.0,
850
+ }
851
+
852
+ # Brand/product patterns for shopping detection
853
+ BRAND_PATTERNS = [
854
+ # Tech brands
855
+ r'\b(apple|iphone|ipad|macbook|airpods?)\b',
856
+ r'\b(samsung|galaxy)\b',
857
+ r'\b(google|pixel)\b',
858
+ r'\b(microsoft|surface|xbox)\b',
859
+ r'\b(sony|playstation)\b',
860
+ r'\b(nvidia|geforce|rtx)\b',
861
+ r'\b(amd|ryzen|radeon)\b',
862
+ r'\b(intel|core i[3579])\b',
863
+ r'\b(dell|hp|lenovo|asus|acer)\b',
864
+ r'\b(lg|tcl|hisense)\b',
865
+
866
+ # Product categories
867
+ r'\b(laptop|phone|tablet|tv|monitor|headphones?|earbuds?)\b',
868
+ r'\b(camera|lens|drone)\b',
869
+ r'\b(watch|smartwatch|fitbit|garmin)\b',
870
+ r'\b(router|modem|wifi)\b',
871
+ r'\b(keyboard|mouse|gaming)\b',
872
+ ]
873
+
874
+ def __init__(self, config: Dict[str, Any]):
875
+ self.config = config
876
+ self.auto_config = config.get("auto_routing", DEFAULT_CONFIG["auto_routing"])
877
+
878
+ def _calculate_signal_score(
879
+ self,
880
+ query: str,
881
+ signals: Dict[str, float]
882
+ ) -> Tuple[float, List[Dict[str, Any]]]:
883
+ """
884
+ Calculate score for a signal category.
885
+ Returns (total_score, list of matched signals with details).
886
+ """
887
+ query_lower = query.lower()
888
+ matches = []
889
+ total_score = 0.0
890
+
891
+ for pattern, weight in signals.items():
892
+ regex = re.compile(pattern, re.IGNORECASE)
893
+ found = regex.findall(query_lower)
894
+ if found:
895
+ # Normalize found matches
896
+ match_text = found[0] if isinstance(found[0], str) else found[0][0] if found[0] else pattern
897
+ matches.append({
898
+ "pattern": pattern,
899
+ "matched": match_text,
900
+ "weight": weight
901
+ })
902
+ total_score += weight
903
+
904
+ return total_score, matches
905
+
906
+ def _detect_product_brand_combo(self, query: str) -> float:
907
+ """
908
+ Detect product + brand combinations which strongly indicate shopping intent.
909
+ Returns a bonus score.
910
+ """
911
+ query_lower = query.lower()
912
+ brand_found = False
913
+ product_found = False
914
+
915
+ for pattern in self.BRAND_PATTERNS:
916
+ if re.search(pattern, query_lower, re.IGNORECASE):
917
+ brand_found = True
918
+ break
919
+
920
+ # Check for product indicators
921
+ product_indicators = [
922
+ r'\b(buy|price|specs?|review|vs|compare)\b',
923
+ r'\b(pro|max|plus|mini|ultra|lite)\b', # Product tier names
924
+ r'\b\d+\s*(gb|tb|inch|mm|hz)\b', # Specifications
925
+ ]
926
+ for pattern in product_indicators:
927
+ if re.search(pattern, query_lower, re.IGNORECASE):
928
+ product_found = True
929
+ break
930
+
931
+ if brand_found and product_found:
932
+ return 3.0 # Strong shopping signal
933
+ elif brand_found:
934
+ return 1.5 # Moderate shopping signal
935
+ return 0.0
936
+
937
+ def _detect_url(self, query: str) -> Optional[str]:
938
+ """Detect URLs in query - strong signal for Exa similar search."""
939
+ url_pattern = r'https?://[^\s]+'
940
+ match = re.search(url_pattern, query)
941
+ if match:
942
+ return match.group()
943
+
944
+ # Also check for domain-like patterns
945
+ domain_pattern = r'\b(\w+\.(com|org|io|ai|co|dev|net|app))\b'
946
+ match = re.search(domain_pattern, query, re.IGNORECASE)
947
+ if match:
948
+ return match.group()
949
+
950
+ return None
951
+
952
+ def _assess_query_complexity(self, query: str) -> Dict[str, Any]:
953
+ """
954
+ Assess query complexity - complex queries favor Tavily.
955
+ """
956
+ words = query.split()
957
+ word_count = len(words)
958
+
959
+ # Count question words
960
+ question_words = len(re.findall(
961
+ r'\b(what|why|how|when|where|which|who|whose|whom)\b',
962
+ query, re.IGNORECASE
963
+ ))
964
+
965
+ # Check for multiple clauses
966
+ clause_markers = len(re.findall(
967
+ r'\b(and|but|or|because|since|while|although|if|when)\b',
968
+ query, re.IGNORECASE
969
+ ))
970
+
971
+ complexity_score = 0.0
972
+ if word_count > 10:
973
+ complexity_score += 1.5
974
+ if word_count > 20:
975
+ complexity_score += 1.0
976
+ if question_words > 1:
977
+ complexity_score += 1.0
978
+ if clause_markers > 0:
979
+ complexity_score += 0.5 * clause_markers
980
+
981
+ return {
982
+ "word_count": word_count,
983
+ "question_words": question_words,
984
+ "clause_markers": clause_markers,
985
+ "complexity_score": complexity_score,
986
+ "is_complex": complexity_score > 2.0
987
+ }
988
+
989
+ def _detect_recency_intent(self, query: str) -> Tuple[bool, float]:
990
+ """
991
+ Detect if query wants recent/timely information.
992
+ Returns (is_recency_focused, score).
993
+ """
994
+ recency_patterns = [
995
+ (r'\b(latest|newest|recent|current)\b', 2.5),
996
+ (r'\b(today|yesterday|this week|this month)\b', 3.0),
997
+ (r'\b(202[4-9]|2030)\b', 2.0),
998
+ (r'\b(breaking|live|just|now)\b', 3.0),
999
+ (r'\blast (hour|day|week|month)\b', 2.5),
1000
+ ]
1001
+
1002
+ total = 0.0
1003
+ for pattern, weight in recency_patterns:
1004
+ if re.search(pattern, query, re.IGNORECASE):
1005
+ total += weight
1006
+
1007
+ return total > 2.0, total
1008
+
1009
def analyze(self, query: str) -> Dict[str, Any]:
    """
    Perform comprehensive query analysis.
    Returns detailed analysis with scores for each provider.

    The result dict carries: per-provider scores, the raw pattern matches
    backing each score, any detected URL, the complexity assessment, and
    the recency assessment. Used by route() to pick a provider.
    """
    # Calculate scores for each intent category.
    # Each *_SIGNALS constant is a weighted pattern table defined on the
    # class (not visible in this chunk); _calculate_signal_score returns
    # (total_weight, list_of_match_dicts).
    shopping_score, shopping_matches = self._calculate_signal_score(
        query, self.SHOPPING_SIGNALS
    )
    research_score, research_matches = self._calculate_signal_score(
        query, self.RESEARCH_SIGNALS
    )
    discovery_score, discovery_matches = self._calculate_signal_score(
        query, self.DISCOVERY_SIGNALS
    )
    local_news_score, local_news_matches = self._calculate_signal_score(
        query, self.LOCAL_NEWS_SIGNALS
    )
    rag_score, rag_matches = self._calculate_signal_score(
        query, self.RAG_SIGNALS
    )
    privacy_score, privacy_matches = self._calculate_signal_score(
        query, self.PRIVACY_SIGNALS
    )
    direct_answer_score, direct_answer_matches = self._calculate_signal_score(
        query, self.DIRECT_ANSWER_SIGNALS
    )

    # Apply product/brand bonus to shopping (3.0 for brand+product,
    # 1.5 for brand alone — see _detect_product_brand_combo).
    brand_bonus = self._detect_product_brand_combo(query)
    if brand_bonus > 0:
        shopping_score += brand_bonus
        shopping_matches.append({
            "pattern": "product_brand_combo",
            "matched": "brand + product detected",
            "weight": brand_bonus
        })

    # Detect URL → strong Exa signal (flat +5.0 boost to discovery).
    detected_url = self._detect_url(query)
    if detected_url:
        discovery_score += 5.0
        discovery_matches.append({
            "pattern": "url_detected",
            "matched": detected_url,
            "weight": 5.0
        })

    # Assess complexity → favors Tavily. Only complex queries add the
    # full complexity score to research here; simple queries contribute
    # their (small) score directly in provider_scores below instead.
    complexity = self._assess_query_complexity(query)
    if complexity["is_complex"]:
        research_score += complexity["complexity_score"]
        research_matches.append({
            "pattern": "query_complexity",
            "matched": f"complex query ({complexity['word_count']} words)",
            "weight": complexity["complexity_score"]
        })

    # Check recency intent
    is_recency, recency_score = self._detect_recency_intent(query)

    # Map intents to providers with final scores. Recency is split
    # across providers with hand-tuned multipliers (Perplexity gets the
    # largest share at 0.55, Serper 0.35, You 0.25, Tavily 0.2).
    provider_scores = {
        "serper": shopping_score + local_news_score + (recency_score * 0.35),
        "tavily": research_score + (complexity["complexity_score"] if not complexity["is_complex"] else 0) + (0.2 * recency_score),
        "exa": discovery_score + (1.0 if re.search(r"\b(similar|alternatives?|examples?)\b", query, re.IGNORECASE) else 0.0),
        "perplexity": direct_answer_score + (local_news_score * 0.4) + (recency_score * 0.55),
        "you": rag_score + (recency_score * 0.25),  # You.com good for real-time + RAG
        "searxng": privacy_score,  # SearXNG for privacy/multi-source queries
    }

    # Build match details per provider so route()/explain_routing() can
    # surface which patterns drove each score.
    provider_matches = {
        "serper": shopping_matches + local_news_matches,
        "tavily": research_matches,
        "exa": discovery_matches,
        "perplexity": direct_answer_matches,
        "you": rag_matches,
        "searxng": privacy_matches,
    }

    return {
        "query": query,
        "provider_scores": provider_scores,
        "provider_matches": provider_matches,
        "detected_url": detected_url,
        "complexity": complexity,
        "recency_focused": is_recency,
        "recency_score": recency_score,
    }
1099
+
1100
def route(self, query: str) -> Dict[str, Any]:
    """
    Route query to optimal provider with confidence scoring.

    Returns a dict with the chosen provider, a 0-1 confidence value and
    level, the per-provider scores considered, the top matched signals,
    a below-threshold flag, and a short analysis summary.

    Fixes over the previous revision: removed an unused ``total_score``
    computation and a dead no-op branch for the detected-URL case.
    """
    analysis = self.analyze(query)
    scores = analysis["provider_scores"]

    # Filter to providers that are enabled and have credentials configured.
    disabled = set(self.auto_config.get("disabled_providers", []))
    available = {
        p: s for p, s in scores.items()
        if p not in disabled and get_env_key(p)
    }

    if not available:
        # No providers available, use configured fallback.
        fallback = self.auto_config.get("fallback_provider", "serper")
        return {
            "provider": fallback,
            "confidence": 0.0,
            "confidence_level": "low",
            "reason": "no_available_providers",
            "scores": scores,
            "top_signals": [],
            "analysis": analysis,
        }

    # Find the winner
    max_score = max(available.values())

    # Handle ties using the configured priority order; fall back to the
    # first tied provider if none of them appear in the priority list.
    priority = self.auto_config.get("provider_priority", ["tavily", "exa", "perplexity", "serper", "you", "searxng"])
    winners = [p for p, s in available.items() if s == max_score]

    if len(winners) > 1:
        for p in priority:
            if p in winners:
                winner = p
                break
        else:
            winner = winners[0]
    else:
        winner = winners[0]

    # Calculate confidence.
    # High confidence = clear winner with good margin.
    if max_score == 0:
        confidence = 0.0
        reason = "no_signals_matched"
    else:
        # Confidence blends:
        # 1. Absolute score (is the winning signal strong enough?)
        # 2. Relative margin (is there a clear winner over second place?)
        second_best = sorted(available.values(), reverse=True)[1] if len(available) > 1 else 0
        margin = (max_score - second_best) / max_score if max_score > 0 else 0

        # Normalize score to 0-1 range (assuming max reasonable score ~15)
        normalized_score = min(max_score / 15.0, 1.0)

        confidence = round((normalized_score * 0.6 + margin * 0.4), 3)

        if confidence >= 0.7:
            reason = "high_confidence_match"
        elif confidence >= 0.4:
            reason = "moderate_confidence_match"
        else:
            reason = "low_confidence_match"

    # Surface the strongest signals for the winning provider.
    matches = analysis["provider_matches"].get(winner, [])
    top_signals = sorted(matches, key=lambda x: x["weight"], reverse=True)[:5]

    # NOTE: a detected URL already adds +5.0 to Exa's score in analyze(),
    # so no additional override is applied here.

    threshold = self.auto_config.get("confidence_threshold", 0.3)

    return {
        "provider": winner,
        "confidence": confidence,
        "confidence_level": "high" if confidence >= 0.7 else "medium" if confidence >= 0.4 else "low",
        "reason": reason,
        "scores": {p: round(s, 2) for p, s in available.items()},
        "winning_score": round(max_score, 2),
        "top_signals": [
            {"matched": s["matched"], "weight": s["weight"]}
            for s in top_signals
        ],
        "below_threshold": confidence < threshold,
        "analysis_summary": {
            "query_length": len(query.split()),
            "is_complex": analysis["complexity"]["is_complex"],
            "has_url": analysis["detected_url"] is not None,
            "recency_focused": analysis["recency_focused"],
        }
    }
1204
+
1205
+
1206
def auto_route_provider(query: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """Route *query* to the best available provider.

    Thin convenience wrapper: builds a QueryAnalyzer from *config* and
    returns its detailed routing decision (provider, confidence, signals).
    """
    return QueryAnalyzer(config).route(query)
1213
+
1214
+
1215
def explain_routing(query: str, config: Dict[str, Any]) -> Dict[str, Any]:
    """Explain a routing decision for debugging.

    Runs both analyze() and route() on *query* and assembles a report:
    the decision, per-provider scores, signal counts per intent, query
    statistics, every matched pattern, and which providers are usable.
    """
    analyzer = QueryAnalyzer(config)
    analysis = analyzer.analyze(query)
    routing = analyzer.route(query)

    matches_by_provider = analysis["provider_matches"]
    complexity = analysis["complexity"]
    disabled = config.get("auto_routing", {}).get("disabled_providers", [])

    decision = {
        "provider": routing["provider"],
        "confidence": routing["confidence"],
        "confidence_level": routing["confidence_level"],
        "reason": routing["reason"],
    }

    intent_counts = {
        "shopping_signals": len(matches_by_provider["serper"]),
        "research_signals": len(matches_by_provider["tavily"]),
        "discovery_signals": len(matches_by_provider["exa"]),
        "rag_signals": len(matches_by_provider["you"]),
    }

    query_stats = {
        "word_count": complexity["word_count"],
        "is_complex": complexity["is_complex"],
        "complexity_score": round(complexity["complexity_score"], 2),
        "has_url": analysis["detected_url"],
        "recency_focused": analysis["recency_focused"],
    }

    # Only providers that matched at least one pattern appear here.
    all_matches = {}
    for provider, matches in matches_by_provider.items():
        if matches:
            all_matches[provider] = [
                {"matched": m["matched"], "weight": m["weight"]}
                for m in matches
            ]

    usable = [
        p for p in ["serper", "tavily", "exa", "perplexity", "you", "searxng"]
        if get_env_key(p) and p not in disabled
    ]

    return {
        "query": query,
        "routing_decision": decision,
        "scores": routing["scores"],
        "top_signals": routing["top_signals"],
        "intent_breakdown": intent_counts,
        "query_analysis": query_stats,
        "all_matches": all_matches,
        "available_providers": usable,
    }
1259
+
1260
+
1261
+
1262
+
1263
class ProviderRequestError(Exception):
    """Provider failure enriched with retry/cooldown metadata."""

    def __init__(self, message: str, status_code: Optional[int] = None, transient: bool = False):
        # status_code: HTTP status of the failed call, when known.
        # transient: True when the failure is retryable (rate limit/outage).
        self.status_code = status_code
        self.transient = transient
        super().__init__(message)
1270
+
1271
+
1272
# HTTP statuses treated as retryable: rate limit (429) and temporary outage (503).
TRANSIENT_HTTP_CODES = {429, 503}
# Escalating per-provider cooldowns applied after consecutive failures.
COOLDOWN_STEPS_SECONDS = [60, 300, 1500, 3600]  # 1m -> 5m -> 25m -> 1h cap
# In-process delays between retry attempts for transient errors.
RETRY_BACKOFF_SECONDS = [1, 3, 9]
1275
+
1276
+
1277
+ def _ensure_parent(path: Path) -> None:
1278
+ path.parent.mkdir(parents=True, exist_ok=True)
1279
+
1280
+
1281
def _load_provider_health() -> Dict[str, Any]:
    """Read provider health state from disk; return {} on any problem."""
    if not PROVIDER_HEALTH_FILE.exists():
        return {}
    try:
        with open(PROVIDER_HEALTH_FILE, "r", encoding="utf-8") as handle:
            loaded = json.load(handle)
    except (json.JSONDecodeError, IOError):
        # Corrupted or unreadable state is treated as empty, not fatal.
        return {}
    # Guard against a file whose top level is not a dict.
    return loaded if isinstance(loaded, dict) else {}
1290
+
1291
+
1292
def _save_provider_health(state: Dict[str, Any]) -> None:
    """Persist provider health *state* as pretty-printed UTF-8 JSON."""
    _ensure_parent(PROVIDER_HEALTH_FILE)
    payload = json.dumps(state, ensure_ascii=False, indent=2)
    with open(PROVIDER_HEALTH_FILE, "w", encoding="utf-8") as handle:
        handle.write(payload)
1296
+
1297
+
1298
def provider_in_cooldown(provider: str) -> Tuple[bool, int]:
    """Return (in_cooldown, seconds_remaining) for *provider*."""
    entry = _load_provider_health().get(provider, {})
    cooldown_until = int(entry.get("cooldown_until", 0) or 0)
    remaining = cooldown_until - int(time.time())
    if remaining > 0:
        return True, remaining
    return False, 0
1304
+
1305
+
1306
def mark_provider_failure(provider: str, error_message: str) -> Dict[str, Any]:
    """Record a failure for *provider*, escalate its cooldown, and persist.

    Returns the provider's updated health entry.
    """
    state = _load_provider_health()
    now = int(time.time())
    failures = int(state.get(provider, {}).get("failure_count", 0)) + 1
    # Clamp into the escalation ladder so repeat failures stay at the cap.
    step = min(failures - 1, len(COOLDOWN_STEPS_SECONDS) - 1)
    cooldown = COOLDOWN_STEPS_SECONDS[step]
    entry = {
        "failure_count": failures,
        "cooldown_until": now + cooldown,
        "cooldown_seconds": cooldown,
        "last_error": error_message,
        "last_failure_at": now,
    }
    state[provider] = entry
    _save_provider_health(state)
    return entry
1321
+
1322
+
1323
def reset_provider_health(provider: str) -> None:
    """Clear recorded failures/cooldown for *provider*.

    Only rewrites the health file when there was an entry to remove, so a
    no-op reset does not touch disk (same as before).
    """
    state = _load_provider_health()
    if provider in state:
        # Membership was just checked, so a plain del suffices (the
        # previous revision used a redundant pop(provider, None) here).
        del state[provider]
        _save_provider_health(state)
1328
+
1329
+
1330
def normalize_result_url(url: str) -> str:
    """Canonicalize *url* for dedup: lowercase host + path, no www/trailing slash."""
    if not url:
        return ""
    parts = urlparse(url.strip())
    host = (parts.netloc or "").lower()
    # Treat "www.example.com" and "example.com" as the same site.
    if host.startswith("www."):
        host = host[4:]
    return host + parts.path.rstrip("/")
1339
+
1340
+
1341
def deduplicate_results_across_providers(results_by_provider: List[Tuple[str, Dict[str, Any]]], max_results: int) -> Tuple[List[Dict[str, Any]], int]:
    """Merge provider result lists, dropping duplicate URLs.

    Iterates providers in order, keeps the first occurrence of each
    normalized URL (items with an empty normalization are always kept),
    tags each kept item with its provider unless already tagged, and
    stops once *max_results* items are collected.

    Returns (kept_items, number_of_duplicates_skipped).
    """
    kept: List[Dict[str, Any]] = []
    seen_urls = set()
    duplicates = 0
    for provider_name, payload in results_by_provider:
        for entry in payload.get("results", []):
            key = normalize_result_url(entry.get("url", ""))
            if key:
                if key in seen_urls:
                    duplicates += 1
                    continue
                seen_urls.add(key)
            # Copy so the tag never mutates the provider's own result dict.
            tagged = dict(entry)
            tagged.setdefault("provider", provider_name)
            kept.append(tagged)
            if len(kept) >= max_results:
                return kept, duplicates
    return kept, duplicates
1359
+
1360
+ # =============================================================================
1361
+ # HTTP Client
1362
+ # =============================================================================
1363
+
1364
def make_request(url: str, headers: dict, body: dict, timeout: int = 30) -> dict:
    """Make HTTP POST request with a JSON body and return the parsed JSON.

    Args:
        url: Endpoint to POST to.
        headers: Request headers. A default User-Agent is added when
            missing; the caller's dict is no longer mutated (the previous
            revision wrote the default back into the caller's headers).
        body: JSON-serializable request payload.
        timeout: Socket timeout in seconds.

    Raises:
        ProviderRequestError: on HTTP errors (with ``status_code`` and a
            friendly message), network errors, or timeouts. ``transient``
            is set for retryable failures (429/503/timeouts).
    """
    # Copy so the injected User-Agent does not leak into the caller's dict.
    headers = dict(headers)
    # Required by some APIs (e.g. Exa behind Cloudflare).
    headers.setdefault("User-Agent", "ClawdBot-WebSearchPlus/2.1")
    data = json.dumps(body).encode("utf-8")
    req = Request(url, data=data, headers=headers, method="POST")

    try:
        with urlopen(req, timeout=timeout) as response:
            return json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        # Prefer the provider's structured error payload when present.
        error_body = e.read().decode("utf-8") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
            error_detail = error_json.get("error") or error_json.get("message") or error_body
        except json.JSONDecodeError:
            error_detail = error_body[:500]

        error_messages = {
            401: "Invalid or expired API key. Please check your credentials.",
            403: "Access forbidden. Your API key may not have permission for this operation.",
            429: "Rate limit exceeded. Please wait a moment and try again.",
            500: "Server error. The search provider is experiencing issues.",
            503: "Service unavailable. The search provider may be down."
        }

        friendly_msg = error_messages.get(e.code, f"API error: {error_detail}")
        raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(f"Network error: {reason}. Check your internet connection.", transient=is_timeout)
    except TimeoutError:
        raise ProviderRequestError(f"Request timed out after {timeout}s. Try again or reduce max_results.", transient=True)
1399
+
1400
+
1401
+ # =============================================================================
1402
+ # Serper (Google Search API)
1403
+ # =============================================================================
1404
+
1405
def search_serper(
    query: str,
    api_key: str,
    max_results: int = 5,
    country: str = "us",
    language: str = "en",
    search_type: str = "search",
    time_range: Optional[str] = None,
    include_images: bool = False,
) -> dict:
    """Search using Serper (Google Search API).

    Args:
        query: Search query string.
        api_key: Serper API key (sent via the X-API-KEY header).
        max_results: Maximum organic results to return.
        country: Google country code, passed as ``gl``.
        language: Result language, passed as ``hl``.
        search_type: Serper endpoint suffix appended to the base URL
            (e.g. "search"); other values select other Serper endpoints.
        time_range: Optional recency filter: hour/day/week/month/year
            ("none" or unknown values are ignored).
        include_images: When True, issues a second best-effort request to
            the images endpoint; failures there are silently ignored.
    """
    endpoint = f"https://google.serper.dev/{search_type}"

    body = {
        "q": query,
        "gl": country,
        "hl": language,
        "num": max_results,
        "autocorrect": True,
    }

    if time_range and time_range != "none":
        # Map generic time ranges onto Google's "tbs=qdr:*" syntax.
        tbs_map = {
            "hour": "qdr:h",
            "day": "qdr:d",
            "week": "qdr:w",
            "month": "qdr:m",
            "year": "qdr:y",
        }
        if time_range in tbs_map:
            body["tbs"] = tbs_map[time_range]

    headers = {
        "X-API-KEY": api_key,
        "Content-Type": "application/json",
    }

    data = make_request(endpoint, headers, body)

    # Normalize organic results; rank-based score decays 1.0, 0.9, 0.8, ...
    results = []
    for i, item in enumerate(data.get("organic", [])[:max_results]):
        results.append({
            "title": item.get("title", ""),
            "url": item.get("link", ""),
            "snippet": item.get("snippet", ""),
            "score": round(1.0 - i * 0.1, 2),
            "date": item.get("date"),
        })

    # Best-available direct answer, in order of preference: answer box
    # answer, answer box snippet, knowledge-graph description, then the
    # top result's snippet.
    answer = ""
    if data.get("answerBox", {}).get("answer"):
        answer = data["answerBox"]["answer"]
    elif data.get("answerBox", {}).get("snippet"):
        answer = data["answerBox"]["snippet"]
    elif data.get("knowledgeGraph", {}).get("description"):
        answer = data["knowledgeGraph"]["description"]
    elif results:
        answer = results[0]["snippet"]

    images = []
    if include_images:
        try:
            img_data = make_request(
                "https://google.serper.dev/images",
                headers,
                {"q": query, "gl": country, "hl": language, "num": 5},
            )
            images = [img.get("imageUrl", "") for img in img_data.get("images", [])[:5] if img.get("imageUrl")]
        except Exception:
            # Best-effort: image fetch failures must not break the search.
            pass

    return {
        "provider": "serper",
        "query": query,
        "results": results,
        "images": images,
        "answer": answer,
        "knowledge_graph": data.get("knowledgeGraph"),
        "related_searches": [r.get("query") for r in data.get("relatedSearches", [])]
    }
1485
+
1486
+
1487
+ # =============================================================================
1488
+ # Tavily (Research Search)
1489
+ # =============================================================================
1490
+
1491
def search_tavily(
    query: str,
    api_key: str,
    max_results: int = 5,
    depth: str = "basic",
    topic: str = "general",
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
    include_images: bool = False,
    include_raw_content: bool = False,
) -> dict:
    """Search using Tavily (AI Research Search).

    Always requests a synthesized answer; optionally requests images and
    raw page content, and restricts/excludes domains when provided.
    """
    payload = {
        "api_key": api_key,
        "query": query,
        "max_results": max_results,
        "search_depth": depth,
        "topic": topic,
        "include_images": include_images,
        "include_answer": True,
        "include_raw_content": include_raw_content,
    }

    # Domain allow/deny lists are only sent when non-empty.
    if include_domains:
        payload["include_domains"] = include_domains
    if exclude_domains:
        payload["exclude_domains"] = exclude_domains

    data = make_request(
        "https://api.tavily.com/search",
        {"Content-Type": "application/json"},
        payload,
    )

    normalized = []
    for entry in data.get("results", [])[:max_results]:
        item = {
            "title": entry.get("title", ""),
            "url": entry.get("url", ""),
            "snippet": entry.get("content", ""),
            "score": round(entry.get("score", 0.0), 3),
        }
        raw = entry.get("raw_content")
        if include_raw_content and raw:
            item["raw_content"] = raw
        normalized.append(item)

    return {
        "provider": "tavily",
        "query": query,
        "results": normalized,
        "images": data.get("images", []),
        "answer": data.get("answer", ""),
    }
1544
+
1545
+
1546
+ # =============================================================================
1547
+ # Exa (Neural/Semantic Search)
1548
+ # =============================================================================
1549
+
1550
def search_exa(
    query: str,
    api_key: str,
    max_results: int = 5,
    search_type: str = "neural",
    category: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    similar_url: Optional[str] = None,
    include_domains: Optional[List[str]] = None,
    exclude_domains: Optional[List[str]] = None,
) -> dict:
    """Search using Exa (Neural/Semantic Search).

    When *similar_url* is given, the findSimilar endpoint is used with
    that URL as the seed; otherwise a regular (neural/keyword) search is
    performed for *query*. Both modes request text excerpts + highlights.
    """
    contents_spec = {
        "text": {"maxCharacters": 1000},
        "highlights": True,
    }
    if similar_url:
        # Find-similar mode: seed by URL instead of a text query.
        endpoint = "https://api.exa.ai/findSimilar"
        body = {
            "url": similar_url,
            "numResults": max_results,
            "contents": contents_spec,
        }
    else:
        endpoint = "https://api.exa.ai/search"
        body = {
            "query": query,
            "numResults": max_results,
            "type": search_type,
            "contents": contents_spec,
        }

    # Optional filters are only attached when truthy.
    optional_fields = (
        ("category", category),
        ("startPublishedDate", start_date),
        ("endPublishedDate", end_date),
        ("includeDomains", include_domains),
        ("excludeDomains", exclude_domains),
    )
    for field, value in optional_fields:
        if value:
            body[field] = value

    data = make_request(
        endpoint,
        {"x-api-key": api_key, "Content-Type": "application/json"},
        body,
    )

    normalized = []
    for entry in data.get("results", [])[:max_results]:
        # Prefer the first highlight as the snippet; otherwise truncate
        # the raw text excerpt.
        highlights = entry.get("highlights", [])
        if highlights:
            snippet = highlights[0]
        else:
            snippet = (entry.get("text", "") or "")[:500]

        normalized.append({
            "title": entry.get("title", ""),
            "url": entry.get("url", ""),
            "snippet": snippet,
            "score": round(entry.get("score", 0.0), 3),
            "published_date": entry.get("publishedDate"),
            "author": entry.get("author"),
        })

    return {
        "provider": "exa",
        "query": f"Similar to: {similar_url}" if similar_url else query,
        "results": normalized,
        "images": [],
        # Exa has no synthesized answer; surface the top snippet instead.
        "answer": normalized[0]["snippet"] if normalized else "",
    }
1626
+
1627
+
1628
+ # =============================================================================
1629
+ # Perplexity via Kilo Gateway (Synthesized Direct Answers)
1630
+ # =============================================================================
1631
+
1632
def search_perplexity(
    query: str,
    api_key: str,
    max_results: int = 5,
    model: str = "perplexity/sonar-pro",
    api_url: str = "https://api.kilo.ai/api/gateway/chat/completions",
    freshness: Optional[str] = None,
) -> dict:
    """Search/answer using Perplexity Sonar Pro via Kilo Gateway.

    Args:
        query: Search query
        api_key: Kilo Gateway API key
        max_results: Maximum results to return
        model: Perplexity model to use
        api_url: Kilo Gateway endpoint
        freshness: Filter by recency — 'day', 'week', 'month', 'year' (maps to
            Perplexity's search_recency_filter parameter)

    Returns:
        A provider-result dict whose first result is the synthesized
        answer itself, followed by any source URLs extracted from it.
    """
    # Map generic freshness values to Perplexity's search_recency_filter
    # (also accepts the short "pd"/"pw"/"pm"/"py" aliases).
    recency_map = {"day": "day", "pd": "day", "week": "week", "pw": "week", "month": "month", "pm": "month", "year": "year", "py": "year"}
    recency_filter = recency_map.get(freshness or "", None)

    # OpenAI-style chat completion request; low temperature keeps the
    # synthesized answer factual/consistent.
    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": "Answer with concise factual summary and include source URLs."},
            {"role": "user", "content": query},
        ],
        "temperature": 0.2,
    }
    if recency_filter:
        body["search_recency_filter"] = recency_filter

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    data = make_request(api_url, headers, body)
    # The answer is the first choice's message content (chat-completion shape).
    choices = data.get("choices", [])
    message = choices[0].get("message", {}) if choices else {}
    answer = (message.get("content") or "").strip()

    # Extract source URLs mentioned in the answer, preserving first-seen order.
    urls = re.findall(r"https?://[^\s)\]}>\"']+", answer)
    unique_urls = []
    seen = set()
    for u in urls:
        if u not in seen:
            seen.add(u)
            unique_urls.append(u)

    results = []

    # Primary result: the synthesized answer itself
    if answer:
        # Clean citation markers [1][2] for the snippet
        clean_answer = re.sub(r'\[\d+\]', '', answer).strip()
        results.append({
            "title": f"Perplexity Answer: {query[:80]}",
            "url": "https://www.perplexity.ai",
            "snippet": clean_answer[:500],
            "score": 1.0,
        })

    # Additional results: extracted source URLs, with scores decaying
    # from 0.9 by rank (the answer slot takes one of max_results).
    for i, u in enumerate(unique_urls[:max_results - 1]):
        results.append({
            "title": f"Source {i+1}",
            "url": u,
            "snippet": "Referenced source from Perplexity answer",
            "score": round(0.9 - i * 0.1, 3),
        })

    return {
        "provider": "perplexity",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "metadata": {
            "model": model,
            "usage": data.get("usage", {}),
        }
    }
1717
+
1718
+
1719
+
1720
+ # =============================================================================
1721
+ # You.com (LLM-Ready Web & News Search)
1722
+ # =============================================================================
1723
+
1724
def search_you(
    query: str,
    api_key: str,
    max_results: int = 5,
    country: str = "US",
    language: str = "en",
    freshness: Optional[str] = None,
    safesearch: str = "moderate",
    include_news: bool = True,
    livecrawl: Optional[str] = None,
) -> dict:
    """Search using You.com (LLM-Ready Web & News Search).

    You.com excels at:
    - RAG applications with pre-extracted snippets
    - Combined web + news results in one call
    - Real-time information with automatic news classification
    - Clean, structured JSON optimized for AI consumption

    Args:
        query: Search query
        api_key: You.com API key
        max_results: Maximum results to return (default 5, max 100)
        country: ISO 3166-2 country code (e.g., US, GB, DE)
        language: BCP 47 language code (e.g., en, de, fr)
        freshness: Filter by recency: day, week, month, year, or YYYY-MM-DDtoYYYY-MM-DD
        safesearch: Content filter: off, moderate (default), strict
        include_news: Include news results when relevant (default True)
        livecrawl: Fetch full page content: "web", "news", or "all"
    """
    endpoint = "https://ydc-index.io/v1/search"

    # Build query parameters
    params = {
        "query": query,
        "count": max_results,
        "safesearch": safesearch,
    }

    if country:
        params["country"] = country.upper()
    if language:
        # NOTE(review): BCP 47 language codes are conventionally lowercase
        # ("en", "de") and the docstring's examples are lowercase, but this
        # uppercases the value — confirm against the You.com API docs.
        params["language"] = language.upper()
    if freshness:
        params["freshness"] = freshness
    if livecrawl:
        params["livecrawl"] = livecrawl
        # Live-crawled pages are requested as markdown for LLM consumption.
        params["livecrawl_formats"] = "markdown"

    # Build URL with query params (URL-encode values)
    query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
    url = f"{endpoint}?{query_string}"

    headers = {
        "X-API-KEY": api_key,
        "Accept": "application/json",
        "User-Agent": "ClawdBot-WebSearchPlus/2.4",
    }

    # Make GET request (You.com uses GET, not POST).
    # This local re-import shadows the module-level Request/urlopen with the
    # same names — redundant but harmless; left as-is in this doc pass.
    from urllib.request import Request, urlopen
    req = Request(url, headers=headers, method="GET")

    try:
        with urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        # Same error-mapping shape as make_request(), with You.com-specific
        # hints in the friendly messages.
        error_body = e.read().decode("utf-8") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
            error_detail = error_json.get("error") or error_json.get("message") or error_body
        except json.JSONDecodeError:
            error_detail = error_body[:500]

        error_messages = {
            401: "Invalid or expired API key. Get one at https://api.you.com",
            403: "Access forbidden. Check your API key permissions.",
            429: "Rate limit exceeded. Please wait and try again.",
            500: "You.com server error. Try again later.",
            503: "You.com service unavailable."
        }
        friendly_msg = error_messages.get(e.code, f"API error: {error_detail}")
        raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(f"Network error: {reason}. Check your internet connection.", transient=is_timeout)
    except TimeoutError:
        raise ProviderRequestError("You.com request timed out after 30s.", transient=True)

    # Parse results: You.com nests web and news lists under "results".
    results_data = data.get("results", {})
    web_results = results_data.get("web", [])
    news_results = results_data.get("news", []) if include_news else []
    metadata = data.get("metadata", {})

    # Normalize web results
    results = []
    for i, item in enumerate(web_results[:max_results]):
        # Prefer the first pre-extracted snippet, falling back to the
        # plain description.
        snippets = item.get("snippets", [])
        snippet = snippets[0] if snippets else item.get("description", "")

        result = {
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": snippet,
            "score": round(1.0 - i * 0.05, 3),  # Assign descending score
            "date": item.get("page_age"),
            "source": "web",
        }

        # Include additional snippets if available (great for RAG)
        if len(snippets) > 1:
            result["additional_snippets"] = snippets[1:3]

        # Include thumbnail and favicon for UI display
        if item.get("thumbnail_url"):
            result["thumbnail"] = item["thumbnail_url"]
        if item.get("favicon_url"):
            result["favicon"] = item["favicon_url"]

        # Include live-crawled content if available (markdown preferred).
        if item.get("contents"):
            result["raw_content"] = item["contents"].get("markdown") or item["contents"].get("html", "")

        results.append(result)

    # Add news results (if any); capped at 5 regardless of max_results.
    news = []
    for item in news_results[:5]:
        news.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("description", ""),
            "date": item.get("page_age"),
            "thumbnail": item.get("thumbnail_url"),
            "source": "news",
        })

    # Build answer from best snippets
    answer = ""
    if results:
        # Combine top-3 snippets into a single chunk of LLM context,
        # truncated to 1000 characters.
        top_snippets = []
        for r in results[:3]:
            if r.get("snippet"):
                top_snippets.append(r["snippet"])
        answer = " ".join(top_snippets)[:1000]

    return {
        "provider": "you",
        "query": query,
        "results": results,
        "news": news,
        "images": [],
        "answer": answer,
        "metadata": {
            "search_uuid": metadata.get("search_uuid"),
            "latency": metadata.get("latency"),
        }
    }
1885
+
1886
+
1887
+ # =============================================================================
1888
+ # SearXNG (Privacy-First Meta-Search)
1889
+ # =============================================================================
1890
+
1891
def search_searxng(
    query: str,
    instance_url: str,
    max_results: int = 5,
    categories: Optional[List[str]] = None,
    engines: Optional[List[str]] = None,
    language: str = "en",
    time_range: Optional[str] = None,
    safesearch: int = 0,
) -> dict:
    """Search using SearXNG (self-hosted privacy-first meta-search).

    SearXNG excels at:
    - Privacy-preserving search (no tracking, no profiling)
    - Multi-source aggregation (70+ upstream engines)
    - $0 API cost (self-hosted)
    - Diverse perspectives from multiple search engines

    Args:
        query: Search query
        instance_url: URL of your SearXNG instance (required)
        max_results: Maximum results to return (default 5)
        categories: Search categories (general, images, news, videos, etc.)
        engines: Specific engines to use (google, bing, duckduckgo, etc.)
        language: Language code (e.g., en, de, fr)
        time_range: Filter by recency: day, week, month, year
        safesearch: Content filter: 0=off, 1=moderate, 2=strict

    Returns:
        Unified result dict with "provider", "query", "results", "images",
        "answer", "suggestions", "corrections", and "metadata" keys.

    Raises:
        ProviderRequestError: On HTTP errors, unreachable instance, or timeout.

    Note:
        Requires a self-hosted SearXNG instance with JSON format enabled.
        See: https://docs.searxng.org/admin/installation.html
    """
    # Build URL with query parameters
    params = {
        "q": query,
        "format": "json",
        "language": language,
        "safesearch": str(safesearch),
    }

    if categories:
        params["categories"] = ",".join(categories)
    if engines:
        params["engines"] = ",".join(engines)
    if time_range:
        params["time_range"] = time_range

    # Build URL — instance_url comes from operator-controlled config/env only
    # (validated by _validate_searxng_url), not from agent/LLM input.
    # safe="" fully percent-encodes each value; quote()'s default safe='/'
    # would leave slashes (and similar reserved characters) unescaped inside
    # query-string values.
    base_url = instance_url.rstrip("/")
    query_string = "&".join(f"{k}={quote(str(v), safe='')}" for k, v in params.items())
    url = f"{base_url}/search?{query_string}"

    headers = {
        "User-Agent": "ClawdBot-WebSearchPlus/2.5",
        "Accept": "application/json",
    }

    # SearXNG's JSON API is a plain GET endpoint
    req = Request(url, headers=headers, method="GET")

    try:
        with urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode("utf-8"))
    except HTTPError as e:
        error_body = e.read().decode("utf-8") if e.fp else str(e)
        try:
            error_json = json.loads(error_body)
            error_detail = error_json.get("error") or error_json.get("message") or error_body
        except json.JSONDecodeError:
            error_detail = error_body[:500]

        # Map the common failure codes to actionable operator guidance.
        error_messages = {
            403: "JSON API disabled on this SearXNG instance. Enable 'json' in search.formats in settings.yml",
            404: "SearXNG instance not found. Check your instance URL.",
            500: "SearXNG server error. Check instance health.",
            503: "SearXNG service unavailable."
        }
        friendly_msg = error_messages.get(e.code, f"SearXNG error: {error_detail}")
        raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
    except URLError as e:
        reason = str(getattr(e, "reason", e))
        is_timeout = "timed out" in reason.lower()
        raise ProviderRequestError(f"Cannot reach SearXNG instance at {instance_url}. Error: {reason}", transient=is_timeout)
    except TimeoutError:
        # Fixed: this was an f-string with no placeholders.
        raise ProviderRequestError("SearXNG request timed out after 30s. Check instance health.", transient=True)

    # Parse results
    raw_results = data.get("results", [])

    # Normalize results to unified format
    results = []
    engines_used = set()
    for i, item in enumerate(raw_results[:max_results]):
        engine = item.get("engine", "unknown")
        engines_used.add(engine)

        results.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "snippet": item.get("content", ""),
            # Prefer the instance's relevance score; otherwise fall back to a
            # descending rank-based score so downstream sorting still works.
            "score": round(item.get("score", 1.0 - i * 0.05), 3),
            "engine": engine,
            "category": item.get("category", "general"),
            "date": item.get("publishedDate"),
        })

    # Build answer from answers, infoboxes, or first result
    answer = ""
    if data.get("answers"):
        answer = data["answers"][0] if isinstance(data["answers"][0], str) else str(data["answers"][0])
    elif data.get("infoboxes"):
        infobox = data["infoboxes"][0]
        answer = infobox.get("content", "") or infobox.get("infobox", "")
    elif results:
        answer = results[0]["snippet"]

    return {
        "provider": "searxng",
        "query": query,
        "results": results,
        "images": [],
        "answer": answer,
        "suggestions": data.get("suggestions", []),
        "corrections": data.get("corrections", []),
        "metadata": {
            "number_of_results": data.get("number_of_results"),
            "engines_used": list(engines_used),
            "instance_url": instance_url,
        }
    }
2022
+
2023
+
2024
+ # =============================================================================
2025
+ # CLI
2026
+ # =============================================================================
2027
+
2028
def main() -> None:
    """CLI entry point.

    Parses arguments, picks a provider (explicit or auto-routed), checks the
    result cache, executes the search with per-provider retry and
    priority-ordered fallback, merges/deduplicates multi-provider results,
    and prints a single JSON document to stdout. Exits with status 1 (error
    JSON on stderr) when every eligible provider fails.
    """
    config = load_config()

    parser = argparse.ArgumentParser(
        description="Web Search Plus — Intelligent multi-provider search with smart auto-routing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Intelligent Auto-Routing:
  The query is analyzed using multi-signal detection to find the optimal provider:

  Shopping Intent → Serper (Google)
    "how much", "price of", "buy", product+brand combos, deals, specs

  Research Intent → Tavily
    "how does", "explain", "what is", analysis, pros/cons, tutorials

  Discovery Intent → Exa (Neural)
    "similar to", "companies like", "alternatives", URLs, startups, papers

  Direct Answer Intent → Perplexity (via Kilo Gateway)
    "what is", "current status", local events, synthesized up-to-date answers

Examples:
  python3 search.py -q "iPhone 16 Pro Max price"         # → Serper (shopping)
  python3 search.py -q "how does HTTPS encryption work"  # → Tavily (research)
  python3 search.py -q "startups similar to Notion"      # → Exa (discovery)
  python3 search.py --explain-routing -q "your query"    # Debug routing

Full docs: See README.md and SKILL.md
""",
    )

    # Common arguments
    parser.add_argument(
        "--provider", "-p",
        choices=["serper", "tavily", "exa", "perplexity", "you", "searxng", "auto"],
        help="Search provider (auto=intelligent routing)"
    )
    parser.add_argument(
        "--query", "-q",
        help="Search query"
    )
    parser.add_argument(
        "--max-results", "-n",
        type=int,
        default=config.get("defaults", {}).get("max_results", 5),
        help="Maximum results (default: 5)"
    )
    parser.add_argument(
        "--images",
        action="store_true",
        help="Include images (Serper/Tavily)"
    )

    # Auto-routing options
    parser.add_argument(
        "--auto", "-a",
        action="store_true",
        help="Use intelligent auto-routing (default when no provider specified)"
    )
    parser.add_argument(
        "--explain-routing",
        action="store_true",
        help="Show detailed routing analysis (debug mode)"
    )

    # Serper-specific (country/language are also reused by You.com and
    # SearXNG in execute_search below)
    serper_config = config.get("serper", {})
    parser.add_argument("--country", default=serper_config.get("country", "us"))
    parser.add_argument("--language", default=serper_config.get("language", "en"))
    parser.add_argument(
        "--type",
        dest="search_type",
        default=serper_config.get("type", "search"),
        choices=["search", "news", "images", "videos", "places", "shopping"]
    )
    parser.add_argument(
        "--time-range",
        choices=["hour", "day", "week", "month", "year"]
    )

    # Tavily-specific
    tavily_config = config.get("tavily", {})
    parser.add_argument(
        "--depth",
        default=tavily_config.get("depth", "basic"),
        choices=["basic", "advanced"]
    )
    parser.add_argument(
        "--topic",
        default=tavily_config.get("topic", "general"),
        choices=["general", "news"]
    )
    parser.add_argument("--raw-content", action="store_true")

    # Exa-specific
    exa_config = config.get("exa", {})
    parser.add_argument(
        "--exa-type",
        default=exa_config.get("type", "neural"),
        choices=["neural", "keyword"]
    )
    parser.add_argument(
        "--category",
        choices=[
            "company", "research paper", "news", "pdf", "github",
            "tweet", "personal site", "linkedin profile"
        ]
    )
    parser.add_argument("--start-date")
    parser.add_argument("--end-date")
    parser.add_argument("--similar-url")

    # You.com-specific
    you_config = config.get("you", {})
    parser.add_argument(
        "--you-safesearch",
        default=you_config.get("safesearch", "moderate"),
        choices=["off", "moderate", "strict"],
        help="You.com SafeSearch filter"
    )
    parser.add_argument(
        "--freshness",
        choices=["day", "week", "month", "year"],
        help="Filter results by recency (You.com/Serper)"
    )
    parser.add_argument(
        "--livecrawl",
        choices=["web", "news", "all"],
        help="You.com: fetch full page content"
    )
    # NOTE(review): store_true with default=True means this flag can never be
    # switched off from the CLI — confirm whether a --no-include-news or
    # BooleanOptionalAction was intended.
    parser.add_argument(
        "--include-news",
        action="store_true",
        default=True,
        help="You.com: include news results (default: true)"
    )

    # SearXNG-specific
    searxng_config = config.get("searxng", {})
    parser.add_argument(
        "--searxng-url",
        default=searxng_config.get("instance_url"),
        help="SearXNG instance URL (e.g., https://searx.example.com)"
    )
    parser.add_argument(
        "--searxng-safesearch",
        type=int,
        default=searxng_config.get("safesearch", 0),
        choices=[0, 1, 2],
        help="SearXNG SafeSearch: 0=off, 1=moderate, 2=strict"
    )
    parser.add_argument(
        "--engines",
        nargs="+",
        default=searxng_config.get("engines"),
        help="SearXNG: specific engines to use (e.g., google bing duckduckgo)"
    )
    parser.add_argument(
        "--categories",
        nargs="+",
        help="SearXNG: search categories (general, images, news, videos, etc.)"
    )

    # Domain filters
    parser.add_argument("--include-domains", nargs="+")
    parser.add_argument("--exclude-domains", nargs="+")

    # Output
    parser.add_argument("--compact", action="store_true")

    # Caching options
    parser.add_argument(
        "--cache-ttl",
        type=int,
        default=DEFAULT_CACHE_TTL,
        help=f"Cache TTL in seconds (default: {DEFAULT_CACHE_TTL} = 1 hour)"
    )
    parser.add_argument(
        "--no-cache",
        action="store_true",
        help="Bypass cache (always fetch fresh results)"
    )
    parser.add_argument(
        "--clear-cache",
        action="store_true",
        help="Clear all cached results and exit"
    )
    parser.add_argument(
        "--cache-stats",
        action="store_true",
        help="Show cache statistics and exit"
    )

    args = parser.parse_args()

    # Handle cache management commands first (before query validation)
    if args.clear_cache:
        result = cache_clear()
        indent = None if args.compact else 2
        print(json.dumps(result, indent=indent, ensure_ascii=False))
        return

    if args.cache_stats:
        result = cache_stats()
        indent = None if args.compact else 2
        print(json.dumps(result, indent=indent, ensure_ascii=False))
        return

    # A query is required except for Exa's find-similar-by-URL mode.
    if not args.query and not args.similar_url:
        parser.error("--query is required (unless using --similar-url with Exa)")

    # Handle --explain-routing: print the routing analysis and exit without
    # performing any search.
    if args.explain_routing:
        if not args.query:
            parser.error("--query is required for --explain-routing")
        explanation = explain_routing(args.query, config)
        indent = None if args.compact else 2
        print(json.dumps(explanation, indent=indent, ensure_ascii=False))
        return

    # Determine provider: explicit --provider wins; otherwise auto-route the
    # query, except that --similar-url alone forces Exa.
    if args.provider == "auto" or (args.provider is None and not args.similar_url):
        if args.query:
            routing = auto_route_provider(args.query, config)
            provider = routing["provider"]
            routing_info = {
                "auto_routed": True,
                "provider": provider,
                "confidence": routing["confidence"],
                "confidence_level": routing["confidence_level"],
                "reason": routing["reason"],
                "top_signals": routing["top_signals"],
                "scores": routing["scores"],
            }
        else:
            provider = "exa"
            routing_info = {
                "auto_routed": True,
                "provider": "exa",
                "confidence": 1.0,
                "confidence_level": "high",
                "reason": "similar_url_specified",
            }
    else:
        provider = args.provider or "serper"
        routing_info = {"auto_routed": False, "provider": provider}

    # Build provider fallback list: the chosen provider first, then the
    # configured priority order, minus anything explicitly disabled.
    auto_config = config.get("auto_routing", {})
    provider_priority = auto_config.get("provider_priority", ["tavily", "exa", "perplexity", "serper"])
    disabled_providers = auto_config.get("disabled_providers", [])

    # Start with the selected provider, then try others in priority order
    providers_to_try = [provider]
    for p in provider_priority:
        if p not in providers_to_try and p not in disabled_providers:
            providers_to_try.append(p)

    # Skip providers currently in cooldown
    eligible_providers = []
    cooldown_skips = []
    for p in providers_to_try:
        in_cd, remaining = provider_in_cooldown(p)
        if in_cd:
            cooldown_skips.append({"provider": p, "cooldown_remaining_seconds": remaining})
        else:
            eligible_providers.append(p)

    # If everything is in cooldown, try the primary provider anyway rather
    # than failing without a single attempt.
    if not eligible_providers:
        eligible_providers = providers_to_try[:1]

    # Helper function to execute search for a provider: validates the API key
    # (or instance URL for SearXNG) and dispatches to the provider function
    # with the relevant CLI arguments.
    def execute_search(prov: str) -> Dict[str, Any]:
        key = validate_api_key(prov, config)
        if prov == "serper":
            return search_serper(
                query=args.query,
                api_key=key,
                max_results=args.max_results,
                country=args.country,
                language=args.language,
                search_type=args.search_type,
                time_range=args.time_range,
                include_images=args.images,
            )
        elif prov == "tavily":
            return search_tavily(
                query=args.query,
                api_key=key,
                max_results=args.max_results,
                depth=args.depth,
                topic=args.topic,
                include_domains=args.include_domains,
                exclude_domains=args.exclude_domains,
                include_images=args.images,
                include_raw_content=args.raw_content,
            )
        elif prov == "exa":
            return search_exa(
                query=args.query or "",
                api_key=key,
                max_results=args.max_results,
                search_type=args.exa_type,
                category=args.category,
                start_date=args.start_date,
                end_date=args.end_date,
                similar_url=args.similar_url,
                include_domains=args.include_domains,
                exclude_domains=args.exclude_domains,
            )
        elif prov == "perplexity":
            perplexity_config = config.get("perplexity", {})
            return search_perplexity(
                query=args.query,
                api_key=key,
                max_results=args.max_results,
                model=perplexity_config.get("model", "perplexity/sonar-pro"),
                api_url=perplexity_config.get("api_url", "https://api.kilo.ai/api/gateway/chat/completions"),
                freshness=getattr(args, "freshness", None),
            )
        elif prov == "you":
            return search_you(
                query=args.query,
                api_key=key,
                max_results=args.max_results,
                country=args.country,
                language=args.language,
                freshness=args.freshness,
                safesearch=args.you_safesearch,
                include_news=args.include_news,
                livecrawl=args.livecrawl,
            )
        elif prov == "searxng":
            # For SearXNG, 'key' is actually the instance URL
            instance_url = args.searxng_url or key
            if instance_url:
                instance_url = _validate_searxng_url(instance_url)
            return search_searxng(
                query=args.query,
                instance_url=instance_url,
                max_results=args.max_results,
                categories=args.categories,
                engines=args.engines,
                language=args.language,
                time_range=args.time_range,
                safesearch=args.searxng_safesearch,
            )
        else:
            raise ValueError(f"Unknown provider: {prov}")

    # Retry wrapper: up to 3 attempts with backoff for transient
    # ProviderRequestErrors; auth failures (401/403) and non-transient errors
    # abort immediately. Re-raises the last error on exhaustion.
    def execute_with_retry(prov: str) -> Dict[str, Any]:
        last_error = None
        for attempt in range(0, 3):
            try:
                return execute_search(prov)
            except ProviderRequestError as e:
                last_error = e
                if e.status_code in {401, 403}:
                    break
                if not e.transient:
                    break
                if attempt < 2:
                    time.sleep(RETRY_BACKOFF_SECONDS[attempt])
                    continue
                break
            except Exception as e:
                last_error = e
                break
        raise last_error if last_error else Exception("Unknown provider execution error")

    # Everything that changes the meaning of a query result participates in
    # the cache key, so results for different locales/filters don't collide.
    cache_context = {
        "locale": f"{args.country}:{args.language}",
        "freshness": args.freshness,
        "time_range": args.time_range,
        "topic": args.topic,
        "search_engines": sorted(args.engines) if args.engines else None,
        "include_news": bool(args.include_news),
        "search_type": args.search_type,
        "exa_type": args.exa_type,
        "category": args.category,
        "similar_url": args.similar_url,
    }

    # Check cache first (unless --no-cache is set)
    cached_result = None
    cache_hit = False
    if not args.no_cache and args.query:
        cached_result = cache_get(
            query=args.query,
            provider=provider,
            max_results=args.max_results,
            ttl=args.cache_ttl,
            params=cache_context,
        )
        if cached_result:
            cache_hit = True
            # Strip internal _cache_* bookkeeping keys before returning.
            result = {k: v for k, v in cached_result.items() if not k.startswith("_cache_")}
            result["cached"] = True
            result["cache_age_seconds"] = int(time.time() - cached_result.get("_cache_timestamp", 0))

    errors = []
    successful_provider = None
    successful_results: List[Tuple[str, Dict[str, Any]]] = []
    # `result` was only assigned above on a cache hit; normalize it to None
    # otherwise so the post-loop check is well-defined.
    result = None if not cache_hit else result

    for idx, current_provider in enumerate(eligible_providers):
        # On a cache hit no provider is contacted; the cached result stands.
        if cache_hit:
            successful_provider = provider
            break
        try:
            provider_result = execute_with_retry(current_provider)
            reset_provider_health(current_provider)
            successful_results.append((current_provider, provider_result))
            successful_provider = current_provider

            # If we have enough results, stop.
            if len(provider_result.get("results", [])) >= args.max_results:
                break

            # Only continue collecting from lower-priority providers when fallback was needed.
            if not errors:
                break
        except Exception as e:
            error_msg = str(e)
            cooldown_info = mark_provider_failure(current_provider, error_msg)
            errors.append({
                "provider": current_provider,
                "error": error_msg,
                "cooldown_seconds": cooldown_info.get("cooldown_seconds"),
            })
            # Announce the fallback on stderr so stdout stays pure JSON.
            if len(eligible_providers) > 1:
                remaining = eligible_providers[idx + 1:]
                if remaining:
                    print(json.dumps({
                        "fallback": True,
                        "failed_provider": current_provider,
                        "error": error_msg,
                        "trying_next": remaining[0],
                    }), file=sys.stderr)
            continue

    if successful_results:
        if len(successful_results) == 1:
            result = successful_results[0][1]
        else:
            # Multiple providers contributed (fallback path): merge and
            # deduplicate, keeping the first (highest-priority) response as
            # the primary envelope.
            primary = successful_results[0][1].copy()
            deduped_results, dedup_count = deduplicate_results_across_providers(successful_results, args.max_results)
            primary["results"] = deduped_results
            primary["deduplicated"] = dedup_count > 0
            primary.setdefault("metadata", {})
            primary["metadata"]["dedup_count"] = dedup_count
            primary["metadata"]["providers_merged"] = [p for p, _ in successful_results]
            result = primary

    if result is not None:
        # Record fallback details when the answering provider differs from
        # the one originally selected.
        if successful_provider != provider:
            routing_info["fallback_used"] = True
            routing_info["original_provider"] = provider
            routing_info["provider"] = successful_provider
            routing_info["fallback_errors"] = errors

        if cooldown_skips:
            routing_info["cooldown_skips"] = cooldown_skips

        result["routing"] = routing_info

        # Store fresh (non-cached) results for subsequent runs.
        if not cache_hit and not args.no_cache and args.query:
            cache_put(
                query=args.query,
                provider=successful_provider or provider,
                max_results=args.max_results,
                result=result,
                params=cache_context,
            )

        # Guarantee a stable output schema regardless of code path.
        result["cached"] = bool(cache_hit)
        if "deduplicated" not in result:
            result["deduplicated"] = False
        result.setdefault("metadata", {})
        result["metadata"].setdefault("dedup_count", 0)

        indent = None if args.compact else 2
        print(json.dumps(result, indent=indent, ensure_ascii=False))
    else:
        # Every eligible provider failed: emit a structured error on stderr
        # and exit non-zero for scripting callers.
        error_result = {
            "error": "All providers failed",
            "provider": provider,
            "query": args.query,
            "routing": routing_info,
            "provider_errors": errors,
            "cooldown_skips": cooldown_skips,
        }
        print(json.dumps(error_result, indent=2), file=sys.stderr)
        sys.exit(1)
2523
+
2524
+
2525
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()