web-search-plus-plugin 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scripts/search.py DELETED
@@ -1,2940 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Web Search Plus — Unified Multi-Provider Search with Intelligent Auto-Routing
4
- Supports: Serper (Google), Tavily (Research), Querit (Multilingual AI Search),
5
- Exa (Neural), Perplexity (Direct Answers), You.com (RAG), SearXNG (Meta-Search)
6
-
7
- Smart Routing uses multi-signal analysis:
8
- - Query intent classification (shopping, research, discovery)
9
- - Linguistic pattern detection (how much vs how does)
10
- - Product/brand recognition
11
- - URL detection
12
- - Confidence scoring
13
-
14
- Usage:
15
- python3 search.py --query "..." # Auto-route based on query
16
- python3 search.py --provider [serper|tavily|querit|exa|perplexity|you|searxng] --query "..." [options]
17
-
18
- Examples:
19
- python3 search.py -q "iPhone 16 Pro price" # → Serper (shopping intent)
20
- python3 search.py -q "how does quantum entanglement work" # → Tavily (research intent)
21
- python3 search.py -q "startups similar to Notion" # → Exa (discovery intent)
22
- """
23
-
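
A minimal usage sketch of the auto-router defined below (the module import path
is an assumption; the decision dict mirrors what route() returns):

    from search import load_config, auto_route_provider  # import path assumed

    decision = auto_route_provider("iPhone 16 Pro price", load_config())
    print(decision["provider"], decision["confidence"], decision["confidence_level"])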
24
- import argparse
25
- from http.client import IncompleteRead
26
- import hashlib
27
- import json
28
- import os
29
- import re
30
- import sys
31
- import time
32
- from pathlib import Path
33
- from typing import Optional, List, Dict, Any, Tuple
34
- from urllib.request import Request, urlopen
35
- from urllib.error import HTTPError, URLError
36
- from urllib.parse import quote, urlparse
37
-
38
-
39
- # =============================================================================
40
- # Result Caching
41
- # =============================================================================
42
-
43
- CACHE_DIR = Path(os.environ.get("WSP_CACHE_DIR", os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), ".cache")))
44
- PROVIDER_HEALTH_FILE = CACHE_DIR / "provider_health.json"
45
- DEFAULT_CACHE_TTL = 3600 # 1 hour in seconds
46
-
47
-
48
- def _build_cache_payload(query: str, provider: str, max_results: int, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
49
- """Build normalized payload used for cache key hashing."""
50
- payload = {
51
- "query": query,
52
- "provider": provider,
53
- "max_results": max_results,
54
- }
55
- if params:
56
- payload.update(params)
57
- return payload
58
-
59
-
60
- def _get_cache_key(query: str, provider: str, max_results: int, params: Optional[Dict[str, Any]] = None) -> str:
61
- """Generate a unique cache key from all relevant query parameters."""
62
- payload = _build_cache_payload(query, provider, max_results, params)
63
- key_string = json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
64
- return hashlib.sha256(key_string.encode("utf-8")).hexdigest()[:32]
65
-
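
A worked sketch of the key derivation above, assuming a default Serper query:

    import hashlib, json

    payload = {"query": "iPhone 16 Pro price", "provider": "serper", "max_results": 5}
    key_string = json.dumps(payload, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
    cache_key = hashlib.sha256(key_string.encode("utf-8")).hexdigest()[:32]
    # the entry would be stored at CACHE_DIR/<cache_key>.json with a _cache_timestamp field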
66
-
67
- def _get_cache_path(cache_key: str) -> Path:
68
- """Get the file path for a cache entry."""
69
- return CACHE_DIR / f"{cache_key}.json"
70
-
71
-
72
- def _ensure_cache_dir() -> None:
73
- """Create cache directory if it doesn't exist."""
74
- CACHE_DIR.mkdir(parents=True, exist_ok=True)
75
-
76
-
77
- def cache_get(query: str, provider: str, max_results: int, ttl: int = DEFAULT_CACHE_TTL, params: Optional[Dict[str, Any]] = None) -> Optional[Dict[str, Any]]:
78
- """
79
- Retrieve cached search results if they exist and are not expired.
80
-
81
- Args:
82
- query: The search query
83
- provider: The search provider
84
- max_results: Maximum results requested
85
- ttl: Time-to-live in seconds (default: 1 hour)
86
-
87
- Returns:
88
- Cached result dict or None if not found/expired
89
- """
90
- cache_key = _get_cache_key(query, provider, max_results, params)
91
- cache_path = _get_cache_path(cache_key)
92
-
93
- if not cache_path.exists():
94
- return None
95
-
96
- try:
97
- with open(cache_path, "r", encoding="utf-8") as f:
98
- cached = json.load(f)
99
-
100
- cached_time = cached.get("_cache_timestamp", 0)
101
- if time.time() - cached_time > ttl:
102
- # Cache expired, remove it
103
- cache_path.unlink(missing_ok=True)
104
- return None
105
-
106
- return cached
107
- except (json.JSONDecodeError, IOError, KeyError):
108
- # Corrupted cache file, remove it
109
- cache_path.unlink(missing_ok=True)
110
- return None
111
-
112
-
113
- def cache_put(query: str, provider: str, max_results: int, result: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> None:
114
- """
115
- Store search results in cache.
116
-
117
- Args:
118
- query: The search query
119
- provider: The search provider
120
- max_results: Maximum results requested
121
- result: The search result to cache
122
- """
123
- _ensure_cache_dir()
124
-
125
- cache_key = _get_cache_key(query, provider, max_results, params)
126
- cache_path = _get_cache_path(cache_key)
127
-
128
- # Add cache metadata
129
- cached_result = result.copy()
130
- cached_result["_cache_timestamp"] = time.time()
131
- cached_result["_cache_key"] = cache_key
132
- cached_result["_cache_query"] = query
133
- cached_result["_cache_provider"] = provider
134
- cached_result["_cache_max_results"] = max_results
135
- cached_result["_cache_params"] = params or {}
136
-
137
- try:
138
- with open(cache_path, "w", encoding="utf-8") as f:
139
- json.dump(cached_result, f, ensure_ascii=False, indent=2)
140
- except IOError as e:
141
- # Non-fatal: log to stderr but don't fail
142
- print(json.dumps({"cache_write_error": str(e)}), file=sys.stderr)
143
-
144
-
145
- def cache_clear() -> Dict[str, Any]:
146
- """
147
- Clear all cached results.
148
-
149
- Returns:
150
- Stats about what was cleared
151
- """
152
- if not CACHE_DIR.exists():
153
- return {"cleared": 0, "message": "Cache directory does not exist"}
154
-
155
- count = 0
156
- size_freed = 0
157
-
158
- for cache_file in CACHE_DIR.glob("*.json"):
159
- if cache_file.name == PROVIDER_HEALTH_FILE.name:
160
- continue
161
- try:
162
- size_freed += cache_file.stat().st_size
163
- cache_file.unlink()
164
- count += 1
165
- except IOError:
166
- pass
167
-
168
- return {
169
- "cleared": count,
170
- "size_freed_bytes": size_freed,
171
- "size_freed_kb": round(size_freed / 1024, 2),
172
- "message": f"Cleared {count} cached entries"
173
- }
174
-
175
-
176
- def cache_stats() -> Dict[str, Any]:
177
- """
178
- Get statistics about the cache.
179
-
180
- Returns:
181
- Dict with cache statistics
182
- """
183
- if not CACHE_DIR.exists():
184
- return {
185
- "total_entries": 0,
186
- "total_size_bytes": 0,
187
- "total_size_kb": 0,
188
- "oldest": None,
189
- "newest": None,
190
- "cache_dir": str(CACHE_DIR),
191
- "exists": False
192
- }
193
-
194
- entries = [p for p in CACHE_DIR.glob("*.json") if p.name != PROVIDER_HEALTH_FILE.name]
195
- total_size = 0
196
- oldest_time = None
197
- newest_time = None
198
- oldest_query = None
199
- newest_query = None
200
- provider_counts = {}
201
-
202
- for cache_file in entries:
203
- try:
204
- stat = cache_file.stat()
205
- total_size += stat.st_size
206
-
207
- with open(cache_file, "r", encoding="utf-8") as f:
208
- cached = json.load(f)
209
-
210
- ts = cached.get("_cache_timestamp", 0)
211
- query = cached.get("_cache_query", "unknown")
212
- provider = cached.get("_cache_provider", "unknown")
213
-
214
- provider_counts[provider] = provider_counts.get(provider, 0) + 1
215
-
216
- if oldest_time is None or ts < oldest_time:
217
- oldest_time = ts
218
- oldest_query = query
219
- if newest_time is None or ts > newest_time:
220
- newest_time = ts
221
- newest_query = query
222
- except (json.JSONDecodeError, IOError):
223
- pass
224
-
225
- return {
226
- "total_entries": len(entries),
227
- "total_size_bytes": total_size,
228
- "total_size_kb": round(total_size / 1024, 2),
229
- "providers": provider_counts,
230
- "oldest": {
231
- "timestamp": oldest_time,
232
- "age_seconds": int(time.time() - oldest_time) if oldest_time else None,
233
- "query": oldest_query
234
- } if oldest_time else None,
235
- "newest": {
236
- "timestamp": newest_time,
237
- "age_seconds": int(time.time() - newest_time) if newest_time else None,
238
- "query": newest_query
239
- } if newest_time else None,
240
- "cache_dir": str(CACHE_DIR),
241
- "exists": True
242
- }
243
-
244
-
245
- # =============================================================================
246
- # Auto-load .env from skill directory (if exists)
247
- # =============================================================================
248
- def _load_env_file():
249
- """Load .env file from skill root directory if it exists."""
250
- env_path = Path(__file__).parent.parent / ".env"
251
- if env_path.exists():
252
- with open(env_path) as f:
253
- for line in f:
254
- line = line.strip()
255
- if line and not line.startswith("#") and "=" in line:
256
- # Handle export VAR=value or VAR=value
257
- if line.startswith("export "):
258
- line = line[7:]
259
- key, _, value = line.partition("=")
260
- key = key.strip()
261
- value = value.strip().strip('"').strip("'")
262
- if key and key not in os.environ:
263
- os.environ[key] = value
264
-
265
- _load_env_file()
266
-
267
-
268
- # =============================================================================
269
- # Configuration
270
- # =============================================================================
271
-
272
- DEFAULT_CONFIG = {
273
- "defaults": {
274
- "provider": "serper",
275
- "max_results": 5
276
- },
277
- "auto_routing": {
278
- "enabled": True,
279
- "fallback_provider": "serper",
280
- "provider_priority": ["tavily", "querit", "exa", "perplexity", "serper", "you", "searxng"],
281
- "disabled_providers": [],
282
- "confidence_threshold": 0.3, # Below this, note low confidence
283
- },
284
- "serper": {
285
- "country": "us",
286
- "language": "en",
287
- "type": "search"
288
- },
289
- "tavily": {
290
- "depth": "basic",
291
- "topic": "general"
292
- },
293
- "querit": {
294
- "base_url": "https://api.querit.ai",
295
- "base_path": "/v1/search",
296
- "timeout": 10
297
- },
298
- "exa": {
299
- "type": "neural",
300
- "depth": "normal",
301
- "verbosity": "standard"
302
- },
303
- "perplexity": {
304
- "api_url": "https://api.kilo.ai/api/gateway/chat/completions",
305
- "model": "perplexity/sonar-pro"
306
- },
307
- "you": {
308
- "country": "us",
309
- "safesearch": "moderate"
310
- },
311
- "searxng": {
312
- "instance_url": None, # Required - user must set their own instance
313
- "safesearch": 0, # 0=off, 1=moderate, 2=strict
314
- "engines": None, # Optional list of engines to use
315
- "language": "en"
316
- }
317
- }
318
-
319
-
320
- def load_config() -> Dict[str, Any]:
321
- """Load configuration from config.json if it exists, with defaults."""
322
- config = DEFAULT_CONFIG.copy()
323
- config_path = Path(__file__).parent.parent / "config.json"
324
-
325
- if config_path.exists():
326
- try:
327
- with open(config_path) as f:
328
- user_config = json.load(f)
329
- for key, value in user_config.items():
330
- if isinstance(value, dict) and key in config:
331
- config[key] = {**config.get(key, {}), **value}
332
- else:
333
- config[key] = value
334
- except (json.JSONDecodeError, IOError) as e:
335
- print(json.dumps({
336
- "warning": f"Could not load config.json: {e}",
337
- "using": "default configuration"
338
- }), file=sys.stderr)
339
-
340
- return config
341
-
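
A sketch of the merge semantics: nested sections from config.json are merged one
level deep into the defaults rather than replacing them. Assuming config.json
contains {"defaults": {"max_results": 8}}:

    config = load_config()
    # config["defaults"] == {"provider": "serper", "max_results": 8}
    # the default provider survives because only the nested keys are overwritten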
342
-
343
- def get_api_key(provider: str, config: Dict[str, Any] = None) -> Optional[str]:
344
- """Get API key for provider from config.json or environment.
345
-
346
- Priority: config.json > environment variable > .env (values loaded from .env never override variables already set in the environment)
347
-
348
- Note: SearXNG doesn't require an API key, but returns instance_url if configured.
349
- """
350
- # Special case: SearXNG uses instance_url instead of API key
351
- if provider == "searxng":
352
- return get_searxng_instance_url(config)
353
-
354
- # Check config.json first
355
- if config:
356
- provider_config = config.get(provider, {})
357
- if isinstance(provider_config, dict):
358
- key = provider_config.get("api_key") or provider_config.get("apiKey")
359
- if key:
360
- return key
361
-
362
- # Then check environment
363
- if provider == "perplexity":
364
- return os.environ.get("PERPLEXITY_API_KEY") or os.environ.get("KILOCODE_API_KEY")
365
- key_map = {
366
- "serper": "SERPER_API_KEY",
367
- "tavily": "TAVILY_API_KEY",
368
- "querit": "QUERIT_API_KEY",
369
- "exa": "EXA_API_KEY",
370
- "you": "YOU_API_KEY",
371
- }
372
- return os.environ.get(key_map.get(provider, ""))
373
-
374
-
375
- def _validate_searxng_url(url: str) -> str:
376
- """Validate and sanitize SearXNG instance URL to prevent SSRF.
377
-
378
- Enforces http/https scheme and blocks requests to private/internal networks
379
- including cloud metadata endpoints, loopback, link-local, and RFC1918 ranges.
380
- """
381
- import ipaddress
382
- import socket
383
- from urllib.parse import urlparse
384
-
385
- parsed = urlparse(url)
386
- if parsed.scheme not in ("http", "https"):
387
- raise ValueError(f"SearXNG URL must use http or https scheme, got: {parsed.scheme}")
388
- if not parsed.hostname:
389
- raise ValueError("SearXNG URL must include a hostname")
390
-
391
- hostname = parsed.hostname
392
-
393
- # Block cloud metadata endpoints by hostname
394
- BLOCKED_HOSTS = {
395
- "169.254.169.254", # AWS/GCP/Azure metadata
396
- "metadata.google.internal",
397
- "metadata.internal",
398
- }
399
- if hostname in BLOCKED_HOSTS:
400
- raise ValueError(f"SearXNG URL blocked: {hostname} is a cloud metadata endpoint")
401
-
402
- # Resolve hostname and check for private/internal IPs
403
- # Operators who intentionally self-host on private networks can opt out
404
- allow_private = os.environ.get("SEARXNG_ALLOW_PRIVATE", "").strip() == "1"
405
- if not allow_private:
406
- try:
407
- resolved_ips = socket.getaddrinfo(hostname, parsed.port or 80, proto=socket.IPPROTO_TCP)
408
- for family, _type, _proto, _canonname, sockaddr in resolved_ips:
409
- ip = ipaddress.ip_address(sockaddr[0])
410
- if ip.is_loopback or ip.is_private or ip.is_link_local or ip.is_reserved:
411
- raise ValueError(
412
- f"SearXNG URL blocked: {hostname} resolves to private/internal IP {ip}. "
413
- f"If this is intentional, set SEARXNG_ALLOW_PRIVATE=1 in your environment."
414
- )
415
- except socket.gaierror:
416
- raise ValueError(f"SearXNG URL blocked: cannot resolve hostname {hostname}")
417
-
418
- return url
419
-
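
Illustrative calls against the validator above (outcomes assume the public
hostname actually resolves to a public IP):

    _validate_searxng_url("https://searx.example.org")   # returned unchanged
    _validate_searxng_url("ftp://searx.example.org")     # ValueError: bad scheme
    _validate_searxng_url("http://169.254.169.254")      # ValueError: metadata endpoint
    _validate_searxng_url("http://127.0.0.1:8080")       # ValueError unless SEARXNG_ALLOW_PRIVATE=1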
420
-
421
- def get_searxng_instance_url(config: Dict[str, Any] = None) -> Optional[str]:
422
- """Get SearXNG instance URL from config or environment.
423
-
424
- SearXNG is self-hosted, so no API key needed - just the instance URL.
425
- Priority: config.json > SEARXNG_INSTANCE_URL environment variable
426
-
427
- Security: URL is validated to prevent SSRF via scheme enforcement.
428
- Both config sources (config.json, env var) are operator-controlled,
429
- not agent-controlled, so private IPs like localhost are permitted.
430
- """
431
- # Check config.json first
432
- if config:
433
- searxng_config = config.get("searxng", {})
434
- if isinstance(searxng_config, dict):
435
- url = searxng_config.get("instance_url")
436
- if url:
437
- return _validate_searxng_url(url)
438
-
439
- # Then check environment
440
- env_url = os.environ.get("SEARXNG_INSTANCE_URL")
441
- if env_url:
442
- return _validate_searxng_url(env_url)
443
- return None
444
-
445
-
446
- # Backward compatibility alias
447
- def get_env_key(provider: str) -> Optional[str]:
448
- """Get API key for provider from environment (legacy function)."""
449
- return get_api_key(provider)
450
-
451
-
452
- def validate_api_key(provider: str, config: Dict[str, Any] = None) -> str:
453
- """Validate and return API key (or instance URL for SearXNG), with helpful error messages."""
454
- key = get_api_key(provider, config)
455
-
456
- # Special handling for SearXNG - it needs instance URL, not API key
457
- if provider == "searxng":
458
- if not key:
459
- error_msg = {
460
- "error": "Missing SearXNG instance URL",
461
- "env_var": "SEARXNG_INSTANCE_URL",
462
- "how_to_fix": [
463
- "1. Set up your own SearXNG instance: https://docs.searxng.org/admin/installation.html",
464
- "2. Add to config.json: \"searxng\": {\"instance_url\": \"https://your-instance.example.com\"}",
465
- "3. Or set environment variable: export SEARXNG_INSTANCE_URL=\"https://your-instance.example.com\"",
466
- "Note: SearXNG requires a self-hosted instance with JSON format enabled.",
467
- ],
468
- "provider": provider
469
- }
470
- raise ProviderConfigError(json.dumps(error_msg))
471
-
472
- # Validate URL format
473
- if not key.startswith(("http://", "https://")):
474
- raise ProviderConfigError(json.dumps({
475
- "error": "SearXNG instance URL must start with http:// or https://",
476
- "provided": key,
477
- "provider": provider
478
- }))
479
-
480
- return key
481
-
482
- if not key:
483
- env_var = {
484
- "serper": "SERPER_API_KEY",
485
- "tavily": "TAVILY_API_KEY",
486
- "querit": "QUERIT_API_KEY",
487
- "exa": "EXA_API_KEY",
488
- "you": "YOU_API_KEY",
489
- "perplexity": "KILOCODE_API_KEY"
490
- }[provider]
491
-
492
- urls = {
493
- "serper": "https://serper.dev",
494
- "tavily": "https://tavily.com",
495
- "querit": "https://querit.ai",
496
- "exa": "https://exa.ai",
497
- "you": "https://api.you.com",
498
- "perplexity": "https://api.kilo.ai"
499
- }
500
-
501
- error_msg = {
502
- "error": f"Missing API key for {provider}",
503
- "env_var": env_var,
504
- "how_to_fix": [
505
- f"1. Get your API key from {urls[provider]}",
506
- f"2. Add to config.json: \"{provider}\": {{\"api_key\": \"your-key\"}}",
507
- f"3. Or set environment variable: export {env_var}=\"your-key\"",
508
- ],
509
- "provider": provider
510
- }
511
- raise ProviderConfigError(json.dumps(error_msg))
512
-
513
- if len(key) < 10:
514
- raise ProviderConfigError(json.dumps({
515
- "error": f"API key for {provider} appears invalid (too short)",
516
- "provider": provider
517
- }))
518
-
519
- return key
520
-
521
-
522
- # =============================================================================
523
- # Intelligent Auto-Routing Engine
524
- # =============================================================================
525
-
526
- class QueryAnalyzer:
527
- """
528
- Intelligent query analysis for smart provider routing.
529
-
530
- Uses multi-signal analysis:
531
- - Intent classification (shopping, research, discovery, local, news)
532
- - Linguistic patterns (question structure, phrase patterns)
533
- - Entity detection (products, brands, URLs, dates)
534
- - Complexity assessment
535
- """
536
-
537
- # Intent signal patterns with weights
538
- # Higher weight = stronger signal for that provider
539
-
540
- SHOPPING_SIGNALS = {
541
- # Price patterns (very strong)
542
- r'\bhow much\b': 4.0,
543
- r'\bprice of\b': 4.0,
544
- r'\bcost of\b': 4.0,
545
- r'\bprices?\b': 3.0,
546
- r'\$\d+|\d+\s*dollars?': 3.0,
547
- r'€\d+|\d+\s*euros?': 3.0,
548
- r'£\d+|\d+\s*pounds?': 3.0,
549
-
550
- # German price patterns (very strong)
551
- r'\bpreis(e)?\b': 3.5,
552
- r'\bkosten\b': 3.0,
553
- r'\bwieviel\b': 3.5,
554
- r'\bwie viel\b': 3.5,
555
- r'\bwas kostet\b': 4.0,
556
-
557
- # Purchase intent (strong)
558
- r'\bbuy\b': 3.5,
559
- r'\bpurchase\b': 3.5,
560
- r'\border\b(?!\s+by)': 3.0, # "order" but not "order by"
561
- r'\bshopping\b': 3.5,
562
- r'\bshop for\b': 3.5,
563
- r'\bwhere to (buy|get|purchase)\b': 4.0,
564
-
565
- # German purchase intent (strong)
566
- r'\bkaufen\b': 3.5,
567
- r'\bbestellen\b': 3.5,
568
- r'\bwo kaufen\b': 4.0,
569
- r'\bhändler\b': 3.0,
570
- r'\bshop\b': 2.5,
571
-
572
- # Deal/discount signals
573
- r'\bdeal(s)?\b': 3.0,
574
- r'\bdiscount(s)?\b': 3.0,
575
- r'\bsale\b': 2.5,
576
- r'\bcheap(er|est)?\b': 3.0,
577
- r'\baffordable\b': 2.5,
578
- r'\bbudget\b': 2.5,
579
- r'\bbest price\b': 3.5,
580
- r'\bcompare prices\b': 3.5,
581
- r'\bcoupon\b': 3.0,
582
-
583
- # German deal/discount signals
584
- r'\bgünstig(er|ste)?\b': 3.0,
585
- r'\bbillig(er|ste)?\b': 3.0,
586
- r'\bangebot(e)?\b': 3.0,
587
- r'\brabatt\b': 3.0,
588
- r'\baktion\b': 2.5,
589
- r'\bschnäppchen\b': 3.0,
590
-
591
- # Product comparison
592
- r'\bvs\.?\b': 2.0,
593
- r'\bversus\b': 2.0,
594
- r'\bor\b.*\bwhich\b': 2.0,
595
- r'\bspecs?\b': 2.5,
596
- r'\bspecifications?\b': 2.5,
597
- r'\breview(s)?\b': 2.0,
598
- r'\brating(s)?\b': 2.0,
599
- r'\bunboxing\b': 2.5,
600
-
601
- # German product comparison
602
- r'\btest\b': 2.5,
603
- r'\bbewertung(en)?\b': 2.5,
604
- r'\btechnische daten\b': 3.0,
605
- r'\bspezifikationen\b': 2.5,
606
- }
607
-
608
- RESEARCH_SIGNALS = {
609
- # Explanation patterns (very strong)
610
- r'\bhow does\b': 4.0,
611
- r'\bhow do\b': 3.5,
612
- r'\bwhy does\b': 4.0,
613
- r'\bwhy do\b': 3.5,
614
- r'\bwhy is\b': 3.5,
615
- r'\bexplain\b': 4.0,
616
- r'\bexplanation\b': 4.0,
617
- r'\bwhat is\b': 3.0,
618
- r'\bwhat are\b': 3.0,
619
- r'\bdefine\b': 3.5,
620
- r'\bdefinition of\b': 3.5,
621
- r'\bmeaning of\b': 3.0,
622
-
623
- # Analysis patterns (strong)
624
- r'\banalyze\b': 3.5,
625
- r'\banalysis\b': 3.5,
626
- r'\bcompare\b(?!\s*prices?)': 3.0, # compare but not "compare prices"
627
- r'\bcomparison\b': 3.0,
628
- r'\bstatus of\b': 3.5,
629
- r'\bstatus\b': 2.5,
630
- r'\bwhat happened with\b': 4.0,
631
- r'\bpros and cons\b': 4.0,
632
- r'\badvantages?\b': 3.0,
633
- r'\bdisadvantages?\b': 3.0,
634
- r'\bbenefits?\b': 2.5,
635
- r'\bdrawbacks?\b': 3.0,
636
- r'\bdifference between\b': 3.5,
637
-
638
- # Learning patterns
639
- r'\bunderstand\b': 3.0,
640
- r'\blearn(ing)?\b': 2.5,
641
- r'\btutorial\b': 3.0,
642
- r'\bguide\b': 2.5,
643
- r'\bhow to\b': 2.0, # Lower weight - could be shopping too
644
- r'\bstep by step\b': 3.0,
645
-
646
- # Depth signals
647
- r'\bin[- ]depth\b': 3.0,
648
- r'\bdetailed\b': 2.5,
649
- r'\bcomprehensive\b': 3.0,
650
- r'\bthorough\b': 2.5,
651
- r'\bdeep dive\b': 3.5,
652
- r'\boverall\b': 2.0,
653
- r'\bsummary\b': 2.0,
654
-
655
- # Academic patterns
656
- r'\bstudy\b': 2.5,
657
- r'\bresearch shows\b': 3.5,
658
- r'\baccording to\b': 2.5,
659
- r'\bevidence\b': 3.0,
660
- r'\bscientific\b': 3.0,
661
- r'\bhistory of\b': 3.0,
662
- r'\bbackground\b': 2.5,
663
- r'\bcontext\b': 2.5,
664
- r'\bimplications?\b': 3.0,
665
-
666
- # German explanation patterns (very strong)
667
- r'\bwie funktioniert\b': 4.0,
668
- r'\bwarum\b': 3.5,
669
- r'\berklär(en|ung)?\b': 4.0,
670
- r'\bwas ist\b': 3.0,
671
- r'\bwas sind\b': 3.0,
672
- r'\bbedeutung\b': 3.0,
673
-
674
- # German analysis patterns
675
- r'\banalyse\b': 3.5,
676
- r'\bvergleich(en)?\b': 3.0,
677
- r'\bvor- und nachteile\b': 4.0,
678
- r'\bvorteile\b': 3.0,
679
- r'\bnachteile\b': 3.0,
680
- r'\bunterschied(e)?\b': 3.5,
681
-
682
- # German learning patterns
683
- r'\bverstehen\b': 3.0,
684
- r'\blernen\b': 2.5,
685
- r'\banleitung\b': 3.0,
686
- r'\bübersicht\b': 2.5,
687
- r'\bhintergrund\b': 2.5,
688
- r'\bzusammenfassung\b': 2.5,
689
- }
690
-
691
- DISCOVERY_SIGNALS = {
692
- # Similarity patterns (very strong)
693
- r'\bsimilar to\b': 5.0,
694
- r'\blike\s+\w+\.com': 4.5, # "like notion.com"
695
- r'\balternatives? to\b': 5.0,
696
- r'\bcompetitors? (of|to)\b': 4.5,
697
- r'\bcompeting with\b': 4.0,
698
- r'\brivals? (of|to)\b': 4.0,
699
- r'\binstead of\b': 3.0,
700
- r'\breplacement for\b': 3.5,
701
-
702
- # Company/startup patterns (strong)
703
- r'\bcompanies (like|that|doing|building)\b': 4.5,
704
- r'\bstartups? (like|that|doing|building)\b': 4.5,
705
- r'\bwho else\b': 4.0,
706
- r'\bother (companies|startups|tools|apps)\b': 3.5,
707
- r'\bfind (companies|startups|tools|examples?)\b': 4.5,
708
- r'\bevents? in\b': 4.0,
709
- r'\bthings to do in\b': 4.5,
710
-
711
- # Funding/business patterns
712
- r'\bseries [a-d]\b': 4.0,
713
- r'\byc\b|y combinator': 4.0,
714
- r'\bfund(ed|ing|raise)\b': 3.5,
715
- r'\bventure\b': 3.0,
716
- r'\bvaluation\b': 3.0,
717
-
718
- # Category patterns
719
- r'\bresearch papers? (on|about)\b': 4.0,
720
- r'\barxiv\b': 4.5,
721
- r'\bgithub (projects?|repos?)\b': 4.5,
722
- r'\bopen source\b.*\bprojects?\b': 4.0,
723
- r'\btweets? (about|on)\b': 3.5,
724
- r'\bblogs? (about|on|like)\b': 3.0,
725
-
726
- # URL detection (very strong signal for Exa similar)
727
- r'https?://[^\s]+': 5.0,
728
- r'\b\w+\.(com|org|io|ai|co|dev)\b': 3.5,
729
- }
730
-
731
- LOCAL_NEWS_SIGNALS = {
732
- # Local patterns → Serper
733
- r'\bnear me\b': 4.0,
734
- r'\bnearby\b': 3.5,
735
- r'\blocal\b': 3.0,
736
- r'\bin (my )?(city|area|town|neighborhood)\b': 3.5,
737
- r'\brestaurants?\b': 2.5,
738
- r'\bhotels?\b': 2.5,
739
- r'\bcafes?\b': 2.5,
740
- r'\bstores?\b': 2.0,
741
- r'\bdirections? to\b': 3.5,
742
- r'\bmap of\b': 3.0,
743
- r'\bphone number\b': 3.0,
744
- r'\baddress of\b': 3.0,
745
- r'\bopen(ing)? hours\b': 3.0,
746
-
747
- # Weather/time
748
- r'\bweather\b': 4.0,
749
- r'\bforecast\b': 3.5,
750
- r'\btemperature\b': 3.0,
751
- r'\btime in\b': 3.0,
752
-
753
- # News/recency patterns → Serper (or Tavily for news depth)
754
- r'\blatest\b': 2.5,
755
- r'\brecent\b': 2.5,
756
- r'\btoday\b': 2.5,
757
- r'\bbreaking\b': 3.5,
758
- r'\bnews\b': 2.5,
759
- r'\bheadlines?\b': 3.0,
760
- r'\b202[4-9]\b': 2.0, # Current year mentions
761
- r'\blast (week|month|year)\b': 2.0,
762
-
763
- # German local patterns
764
- r'\bin der nähe\b': 4.0,
765
- r'\bin meiner nähe\b': 4.0,
766
- r'\böffnungszeiten\b': 3.0,
767
- r'\badresse von\b': 3.0,
768
- r'\bweg(beschreibung)? nach\b': 3.5,
769
-
770
- # German news/recency patterns
771
- r'\bheute\b': 2.5,
772
- r'\bmorgen\b': 2.0,
773
- r'\baktuell\b': 2.5,
774
- r'\bnachrichten\b': 3.0,
775
- }
776
-
777
- # RAG/AI signals → You.com
778
- # You.com excels at providing LLM-ready snippets and combined web+news
779
- RAG_SIGNALS = {
780
- # RAG/context patterns (strong signal for You.com)
781
- r'\brag\b': 4.5,
782
- r'\bcontext for\b': 4.0,
783
- r'\bsummarize\b': 3.5,
784
- r'\bbrief(ly)?\b': 3.0,
785
- r'\bquick overview\b': 3.5,
786
- r'\btl;?dr\b': 4.0,
787
- r'\bkey (points|facts|info)\b': 3.5,
788
- r'\bmain (points|takeaways)\b': 3.5,
789
-
790
- # Combined web + news queries
791
- r'\b(web|online)\s+and\s+news\b': 4.0,
792
- r'\ball sources\b': 3.5,
793
- r'\bcomprehensive (search|overview)\b': 3.5,
794
- r'\blatest\s+(news|updates)\b': 3.0,
795
- r'\bcurrent (events|situation|status)\b': 3.5,
796
-
797
- # Real-time information needs
798
- r'\bright now\b': 3.0,
799
- r'\bas of today\b': 3.5,
800
- r'\bup.to.date\b': 3.5,
801
- r'\breal.time\b': 4.0,
802
- r'\blive\b': 2.5,
803
-
804
- # Information synthesis
805
- r'\bwhat\'?s happening with\b': 3.5,
806
- r'\bwhat\'?s the latest\b': 4.0,
807
- r'\bupdates?\s+on\b': 3.5,
808
- r'\bstatus of\b': 3.0,
809
- r'\bsituation (in|with|around)\b': 3.5,
810
- }
811
-
812
- # Direct answer / synthesis signals → Perplexity via Kilo Gateway
813
- DIRECT_ANSWER_SIGNALS = {
814
- r'\bwhat is\b': 3.0,
815
- r'\bwhat are\b': 2.5,
816
- r'\bcurrent status\b': 4.0,
817
- r'\bstatus of\b': 3.5,
818
- r'\bstatus\b': 2.5,
819
- r'\bwhat happened with\b': 4.0,
820
- r"\bwhat'?s happening with\b": 4.0,
821
- r'\bas of (today|now)\b': 4.0,
822
- r'\bthis weekend\b': 3.5,
823
- r'\bevents? in\b': 3.5,
824
- r'\bthings to do in\b': 4.0,
825
- r'\bnear me\b': 3.0,
826
- r'\bcan you (tell me|summarize|explain)\b': 3.5,
827
- # German
828
- r'\bwann\b': 3.0,
829
- r'\bwer\b': 3.0,
830
- r'\bwo\b': 2.5,
831
- r'\bwie viele\b': 3.0,
832
- }
833
-
834
- # Privacy/Multi-source signals → SearXNG (self-hosted meta-search)
835
- # SearXNG is ideal for privacy-focused queries and aggregating multiple sources
836
- PRIVACY_SIGNALS = {
837
- # Privacy signals (very strong)
838
- r'\bprivate(ly)?\b': 4.0,
839
- r'\banonymous(ly)?\b': 4.0,
840
- r'\bwithout tracking\b': 4.5,
841
- r'\bno track(ing)?\b': 4.5,
842
- r'\bprivacy\b': 3.5,
843
- r'\bprivacy.?focused\b': 4.5,
844
- r'\bprivacy.?first\b': 4.5,
845
- r'\bduckduckgo alternative\b': 4.5,
846
- r'\bprivate search\b': 5.0,
847
-
848
- # German privacy signals
849
- r'\bprivat\b': 4.0,
850
- r'\banonym\b': 4.0,
851
- r'\bohne tracking\b': 4.5,
852
- r'\bdatenschutz\b': 4.0,
853
-
854
- # Multi-source aggregation signals
855
- r'\baggregate results?\b': 4.0,
856
- r'\bmultiple sources?\b': 4.0,
857
- r'\bdiverse (results|perspectives|sources)\b': 4.0,
858
- r'\bfrom (all|multiple|different) (engines?|sources?)\b': 4.5,
859
- r'\bmeta.?search\b': 5.0,
860
- r'\ball engines?\b': 4.0,
861
-
862
- # German multi-source signals
863
- r'\bverschiedene quellen\b': 4.0,
864
- r'\baus mehreren quellen\b': 4.0,
865
- r'\balle suchmaschinen\b': 4.5,
866
-
867
- # Budget/free signals (SearXNG is self-hosted = $0 API cost)
868
- r'\bfree search\b': 3.5,
869
- r'\bno api cost\b': 4.0,
870
- r'\bself.?hosted search\b': 5.0,
871
- r'\bzero cost\b': 3.5,
872
- r'\bbudget\b(?!\s*(laptop|phone|option))': 2.5, # "budget" alone, not "budget laptop"
873
-
874
- # German budget signals
875
- r'\bkostenlos(e)?\s+suche\b': 3.5,
876
- r'\bkeine api.?kosten\b': 4.0,
877
- }
878
-
879
- # Exa Deep Search signals → deep multi-source synthesis
880
- EXA_DEEP_SIGNALS = {
881
- r'\bsynthesi[sz]e\b': 5.0,
882
- r'\bdeep research\b': 5.0,
883
- r'\bcomprehensive (analysis|report|overview|survey)\b': 4.5,
884
- r'\bacross (multiple|many|several) (sources|documents|papers)\b': 4.5,
885
- r'\baggregat(e|ing) (information|data|results)\b': 4.0,
886
- r'\bcross.?referenc': 4.5,
887
- r'\bsec filings?\b': 4.5,
888
- r'\bannual reports?\b': 4.0,
889
- r'\bearnings (call|report|transcript)\b': 4.5,
890
- r'\bfinancial analysis\b': 4.0,
891
- r'\bliterature (review|survey)\b': 5.0,
892
- r'\bacademic literature\b': 4.5,
893
- r'\bstate of the (art|field|industry)\b': 4.0,
894
- r'\bcompile (a |the )?(report|findings|results)\b': 4.5,
895
- r'\bsummariz(e|ing) (research|papers|studies)\b': 4.0,
896
- r'\bmultiple documents?\b': 4.0,
897
- r'\bdossier\b': 4.5,
898
- r'\bdue diligence\b': 4.5,
899
- r'\bstructured (output|data|report)\b': 4.0,
900
- r'\bmarket research\b': 4.0,
901
- r'\bindustry (report|analysis|overview)\b': 4.0,
902
- r'\bresearch (on|about|into)\b': 4.0,
903
- r'\bwhitepaper\b': 4.5,
904
- r'\btechnical report\b': 4.0,
905
- r'\bsurvey of\b': 4.5,
906
- r'\bmeta.?analysis\b': 5.0,
907
- r'\bsystematic review\b': 5.0,
908
- r'\bcase study\b': 3.5,
909
- r'\bbenchmark(s|ing)?\b': 3.5,
910
- # German
911
- r'\btiefenrecherche\b': 5.0,
912
- r'\bumfassende (analyse|übersicht|recherche)\b': 4.5,
913
- r'\baus mehreren quellen zusammenfassen\b': 4.5,
914
- r'\bmarktforschung\b': 4.0,
915
- }
916
-
917
- # Exa Deep Reasoning signals → complex cross-reference analysis
918
- EXA_DEEP_REASONING_SIGNALS = {
919
- r'\bdeep.?reasoning\b': 6.0,
920
- r'\bcomplex (analysis|reasoning|research)\b': 4.5,
921
- r'\bcontradictions?\b': 4.5,
922
- r'\breconcil(e|ing)\b': 5.0,
923
- r'\bcritical(ly)? analyz': 4.5,
924
- r'\bweigh(ing)? (the )?evidence\b': 4.5,
925
- r'\bcompeting (claims|theories|perspectives)\b': 4.5,
926
- r'\bcomplex financial\b': 4.5,
927
- r'\bregulatory (analysis|compliance|landscape)\b': 4.5,
928
- r'\blegal analysis\b': 4.5,
929
- r'\bcomprehensive (due diligence|investigation)\b': 5.0,
930
- r'\bpatent (landscape|analysis|search)\b': 4.5,
931
- r'\bmarket intelligence\b': 4.5,
932
- r'\bcompetitive (intelligence|landscape)\b': 4.5,
933
- r'\btrade.?offs?\b': 4.0,
934
- r'\bpros and cons of\b': 4.0,
935
- r'\bshould I (use|choose|pick)\b': 3.5,
936
- r'\bwhich is better\b': 4.0,
937
- # German
938
- r'\bkomplexe analyse\b': 4.5,
939
- r'\bwidersprüche\b': 4.5,
940
- r'\bquellen abwägen\b': 4.5,
941
- r'\brechtliche analyse\b': 4.5,
942
- r'\bvergleich(e|en)?\b': 3.5,
943
- }
944
-
945
-
946
- # Brand/product patterns for shopping detection
947
- BRAND_PATTERNS = [
948
- # Tech brands
949
- r'\b(apple|iphone|ipad|macbook|airpods?)\b',
950
- r'\b(samsung|galaxy)\b',
951
- r'\b(google|pixel)\b',
952
- r'\b(microsoft|surface|xbox)\b',
953
- r'\b(sony|playstation)\b',
954
- r'\b(nvidia|geforce|rtx)\b',
955
- r'\b(amd|ryzen|radeon)\b',
956
- r'\b(intel|core i[3579])\b',
957
- r'\b(dell|hp|lenovo|asus|acer)\b',
958
- r'\b(lg|tcl|hisense)\b',
959
-
960
- # Product categories
961
- r'\b(laptop|phone|tablet|tv|monitor|headphones?|earbuds?)\b',
962
- r'\b(camera|lens|drone)\b',
963
- r'\b(watch|smartwatch|fitbit|garmin)\b',
964
- r'\b(router|modem|wifi)\b',
965
- r'\b(keyboard|mouse|gaming)\b',
966
- ]
967
-
968
- def __init__(self, config: Dict[str, Any]):
969
- self.config = config
970
- self.auto_config = config.get("auto_routing", DEFAULT_CONFIG["auto_routing"])
971
-
972
- def _calculate_signal_score(
973
- self,
974
- query: str,
975
- signals: Dict[str, float]
976
- ) -> Tuple[float, List[Dict[str, Any]]]:
977
- """
978
- Calculate score for a signal category.
979
- Returns (total_score, list of matched signals with details).
980
- """
981
- query_lower = query.lower()
982
- matches = []
983
- total_score = 0.0
984
-
985
- for pattern, weight in signals.items():
986
- regex = re.compile(pattern, re.IGNORECASE)
987
- found = regex.findall(query_lower)
988
- if found:
989
- # Normalize found matches
990
- match_text = found[0] if isinstance(found[0], str) else (found[0][0] if found[0] else pattern)
991
- matches.append({
992
- "pattern": pattern,
993
- "matched": match_text,
994
- "weight": weight
995
- })
996
- total_score += weight
997
-
998
- return total_score, matches
999
-
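
A worked example of the scorer above: for "iPhone 16 Pro price" against
SHOPPING_SIGNALS, only \bprices?\b fires, so the method returns

    # (3.0, [{"pattern": r"\bprices?\b", "matched": "price", "weight": 3.0}])
    # the brand/product bonus (+3.0 here: "iphone" brand plus "price"/"pro" indicators)
    # is added separately in analyze() via _detect_product_brand_combo()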
1000
- def _detect_product_brand_combo(self, query: str) -> float:
1001
- """
1002
- Detect product + brand combinations which strongly indicate shopping intent.
1003
- Returns a bonus score.
1004
- """
1005
- query_lower = query.lower()
1006
- brand_found = False
1007
- product_found = False
1008
-
1009
- for pattern in self.BRAND_PATTERNS:
1010
- if re.search(pattern, query_lower, re.IGNORECASE):
1011
- brand_found = True
1012
- break
1013
-
1014
- # Check for product indicators
1015
- product_indicators = [
1016
- r'\b(buy|price|specs?|review|vs|compare)\b',
1017
- r'\b(pro|max|plus|mini|ultra|lite)\b', # Product tier names
1018
- r'\b\d+\s*(gb|tb|inch|mm|hz)\b', # Specifications
1019
- ]
1020
- for pattern in product_indicators:
1021
- if re.search(pattern, query_lower, re.IGNORECASE):
1022
- product_found = True
1023
- break
1024
-
1025
- if brand_found and product_found:
1026
- return 3.0 # Strong shopping signal
1027
- elif brand_found:
1028
- return 1.5 # Moderate shopping signal
1029
- return 0.0
1030
-
1031
- def _detect_url(self, query: str) -> Optional[str]:
1032
- """Detect URLs in query - strong signal for Exa similar search."""
1033
- url_pattern = r'https?://[^\s]+'
1034
- match = re.search(url_pattern, query)
1035
- if match:
1036
- return match.group()
1037
-
1038
- # Also check for domain-like patterns
1039
- domain_pattern = r'\b(\w+\.(com|org|io|ai|co|dev|net|app))\b'
1040
- match = re.search(domain_pattern, query, re.IGNORECASE)
1041
- if match:
1042
- return match.group()
1043
-
1044
- return None
1045
-
1046
- def _assess_query_complexity(self, query: str) -> Dict[str, Any]:
1047
- """
1048
- Assess query complexity - complex queries favor Tavily.
1049
- """
1050
- words = query.split()
1051
- word_count = len(words)
1052
-
1053
- # Count question words
1054
- question_words = len(re.findall(
1055
- r'\b(what|why|how|when|where|which|who|whose|whom)\b',
1056
- query, re.IGNORECASE
1057
- ))
1058
-
1059
- # Check for multiple clauses
1060
- clause_markers = len(re.findall(
1061
- r'\b(and|but|or|because|since|while|although|if|when)\b',
1062
- query, re.IGNORECASE
1063
- ))
1064
-
1065
- complexity_score = 0.0
1066
- if word_count > 10:
1067
- complexity_score += 1.5
1068
- if word_count > 20:
1069
- complexity_score += 1.0
1070
- if question_words > 1:
1071
- complexity_score += 1.0
1072
- if clause_markers > 0:
1073
- complexity_score += 0.5 * clause_markers
1074
-
1075
- return {
1076
- "word_count": word_count,
1077
- "question_words": question_words,
1078
- "clause_markers": clause_markers,
1079
- "complexity_score": complexity_score,
1080
- "is_complex": complexity_score > 2.0
1081
- }
1082
-
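
Worked arithmetic for the thresholds above: a 12-word query with two question
words and one clause marker scores

    # word_count > 10 -> +1.5, question_words > 1 -> +1.0, one clause marker -> +0.5
    # complexity_score = 3.0 > 2.0 -> is_complex = True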
1083
- def _detect_recency_intent(self, query: str) -> Tuple[bool, float]:
1084
- """
1085
- Detect if query wants recent/timely information.
1086
- Returns (is_recency_focused, score).
1087
- """
1088
- recency_patterns = [
1089
- (r'\b(latest|newest|recent|current)\b', 2.5),
1090
- (r'\b(today|yesterday|this week|this month)\b', 3.0),
1091
- (r'\b(202[4-9]|2030)\b', 2.0),
1092
- (r'\b(breaking|live|just|now)\b', 3.0),
1093
- (r'\blast (hour|day|week|month)\b', 2.5),
1094
- ]
1095
-
1096
- total = 0.0
1097
- for pattern, weight in recency_patterns:
1098
- if re.search(pattern, query, re.IGNORECASE):
1099
- total += weight
1100
-
1101
- return total > 2.0, total
1102
-
1103
- def analyze(self, query: str) -> Dict[str, Any]:
1104
- """
1105
- Perform comprehensive query analysis.
1106
- Returns detailed analysis with scores for each provider.
1107
- """
1108
- # Calculate scores for each intent category
1109
- shopping_score, shopping_matches = self._calculate_signal_score(
1110
- query, self.SHOPPING_SIGNALS
1111
- )
1112
- research_score, research_matches = self._calculate_signal_score(
1113
- query, self.RESEARCH_SIGNALS
1114
- )
1115
- discovery_score, discovery_matches = self._calculate_signal_score(
1116
- query, self.DISCOVERY_SIGNALS
1117
- )
1118
- local_news_score, local_news_matches = self._calculate_signal_score(
1119
- query, self.LOCAL_NEWS_SIGNALS
1120
- )
1121
- rag_score, rag_matches = self._calculate_signal_score(
1122
- query, self.RAG_SIGNALS
1123
- )
1124
- privacy_score, privacy_matches = self._calculate_signal_score(
1125
- query, self.PRIVACY_SIGNALS
1126
- )
1127
- direct_answer_score, direct_answer_matches = self._calculate_signal_score(
1128
- query, self.DIRECT_ANSWER_SIGNALS
1129
- )
1130
- exa_deep_score, exa_deep_matches = self._calculate_signal_score(
1131
- query, self.EXA_DEEP_SIGNALS
1132
- )
1133
- exa_deep_reasoning_score, exa_deep_reasoning_matches = self._calculate_signal_score(
1134
- query, self.EXA_DEEP_REASONING_SIGNALS
1135
- )
1136
-
1137
- # Apply product/brand bonus to shopping
1138
- brand_bonus = self._detect_product_brand_combo(query)
1139
- if brand_bonus > 0:
1140
- shopping_score += brand_bonus
1141
- shopping_matches.append({
1142
- "pattern": "product_brand_combo",
1143
- "matched": "brand + product detected",
1144
- "weight": brand_bonus
1145
- })
1146
-
1147
- # Detect URL → strong Exa signal
1148
- detected_url = self._detect_url(query)
1149
- if detected_url:
1150
- discovery_score += 5.0
1151
- discovery_matches.append({
1152
- "pattern": "url_detected",
1153
- "matched": detected_url,
1154
- "weight": 5.0
1155
- })
1156
-
1157
- # Assess complexity → favors Tavily
1158
- complexity = self._assess_query_complexity(query)
1159
- if complexity["is_complex"]:
1160
- research_score += complexity["complexity_score"]
1161
- research_matches.append({
1162
- "pattern": "query_complexity",
1163
- "matched": f"complex query ({complexity['word_count']} words)",
1164
- "weight": complexity["complexity_score"]
1165
- })
1166
-
1167
- # Check recency intent
1168
- is_recency, recency_score = self._detect_recency_intent(query)
1169
-
1170
- # Map intents to providers with final scores
1171
- provider_scores = {
1172
- "serper": shopping_score + local_news_score + (recency_score * 0.35),
1173
- "tavily": research_score + (complexity["complexity_score"] if not complexity["is_complex"] else 0) + (0.2 * recency_score),
1174
- "querit": (research_score * 0.65) + (rag_score * 0.35) + (recency_score * 0.45),
1175
- "exa": discovery_score + (1.0 if re.search(r"\b(similar|alternatives?|examples?)\b", query, re.IGNORECASE) else 0.0) + (exa_deep_score * 0.5) + (exa_deep_reasoning_score * 0.5),
1176
- "perplexity": direct_answer_score + (local_news_score * 0.4) + (recency_score * 0.55),
1177
- "you": rag_score + (recency_score * 0.25), # You.com good for real-time + RAG
1178
- "searxng": privacy_score, # SearXNG for privacy/multi-source queries
1179
- }
1180
-
1181
- # Build match details per provider
1182
- provider_matches = {
1183
- "serper": shopping_matches + local_news_matches,
1184
- "tavily": research_matches,
1185
- "querit": research_matches,
1186
- "exa": discovery_matches + exa_deep_matches + exa_deep_reasoning_matches,
1187
- "perplexity": direct_answer_matches,
1188
- "you": rag_matches,
1189
- "searxng": privacy_matches,
1190
- }
1191
-
1192
- return {
1193
- "query": query,
1194
- "provider_scores": provider_scores,
1195
- "provider_matches": provider_matches,
1196
- "detected_url": detected_url,
1197
- "complexity": complexity,
1198
- "recency_focused": is_recency,
1199
- "recency_score": recency_score,
1200
- "exa_deep_score": exa_deep_score,
1201
- "exa_deep_reasoning_score": exa_deep_reasoning_score,
1202
- }
1203
-
1204
- def route(self, query: str) -> Dict[str, Any]:
1205
- """
1206
- Route query to optimal provider with confidence scoring.
1207
- """
1208
- analysis = self.analyze(query)
1209
- scores = analysis["provider_scores"]
1210
-
1211
- # Filter to available providers
1212
- disabled = set(self.auto_config.get("disabled_providers", []))
1213
- available = {
1214
- p: s for p, s in scores.items()
1215
- if p not in disabled and get_api_key(p, self.config)
1216
- }
1217
-
1218
- if not available:
1219
- # No providers available, use fallback
1220
- fallback = self.auto_config.get("fallback_provider", "serper")
1221
- return {
1222
- "provider": fallback,
1223
- "confidence": 0.0,
1224
- "confidence_level": "low",
1225
- "reason": "no_available_providers",
1226
- "scores": scores,
1227
- "top_signals": [],
1228
- "analysis": analysis,
1229
- }
1230
-
1231
- # Find the winner
1232
- max_score = max(available.values())
1233
- total_score = sum(available.values()) or 1.0
1234
-
1235
- # Handle ties using priority
1236
- priority = self.auto_config.get("provider_priority", ["tavily", "querit", "exa", "perplexity", "serper", "you", "searxng"])
1237
- winners = [p for p, s in available.items() if s == max_score]
1238
-
1239
- if len(winners) > 1:
1240
- # Use priority to break tie
1241
- for p in priority:
1242
- if p in winners:
1243
- winner = p
1244
- break
1245
- else:
1246
- winner = winners[0]
1247
- else:
1248
- winner = winners[0]
1249
-
1250
- # Calculate confidence
1251
- # High confidence = clear winner with good margin
1252
- if max_score == 0:
1253
- confidence = 0.0
1254
- reason = "no_signals_matched"
1255
- else:
1256
- # Confidence based on:
1257
- # 1. Absolute score (is it strong enough?)
1258
- # 2. Relative margin (is there a clear winner?)
1259
- second_best = sorted(available.values(), reverse=True)[1] if len(available) > 1 else 0
1260
- margin = (max_score - second_best) / max_score if max_score > 0 else 0
1261
-
1262
- # Normalize score to 0-1 range (assuming max reasonable score ~15)
1263
- normalized_score = min(max_score / 15.0, 1.0)
1264
-
1265
- # Confidence is combination of absolute strength and relative margin
1266
- confidence = round((normalized_score * 0.6 + margin * 0.4), 3)
1267
-
1268
- if confidence >= 0.7:
1269
- reason = "high_confidence_match"
1270
- elif confidence >= 0.4:
1271
- reason = "moderate_confidence_match"
1272
- else:
1273
- reason = "low_confidence_match"
1274
-
1275
- # Get top signals for the winning provider
1276
- matches = analysis["provider_matches"].get(winner, [])
1277
- top_signals = sorted(matches, key=lambda x: x["weight"], reverse=True)[:5]
1278
-
1279
- # Special case: URL detected and Exa available → strong recommendation
1280
- if analysis["detected_url"] and "exa" in available:
1281
- if winner != "exa":
1282
- # A URL hints at a similar-site search, but we deliberately keep the
1283
- # scored winner instead of overriding it here.
1284
- pass  # keep current winner
1285
-
1286
- # Determine Exa search depth when routed to Exa
1287
- exa_depth = "normal"
1288
- if winner == "exa":
1289
- deep_r_score = analysis.get("exa_deep_reasoning_score", 0)
1290
- deep_score = analysis.get("exa_deep_score", 0)
1291
- if deep_r_score >= 4.0:
1292
- exa_depth = "deep-reasoning"
1293
- elif deep_score >= 4.0:
1294
- exa_depth = "deep"
1295
-
1296
- # Build detailed routing result
1297
- threshold = self.auto_config.get("confidence_threshold", 0.3)
1298
-
1299
- return {
1300
- "provider": winner,
1301
- "confidence": confidence,
1302
- "confidence_level": "high" if confidence >= 0.7 else "medium" if confidence >= 0.4 else "low",
1303
- "reason": reason,
1304
- "exa_depth": exa_depth,
1305
- "scores": {p: round(s, 2) for p, s in available.items()},
1306
- "winning_score": round(max_score, 2),
1307
- "top_signals": [
1308
- {"matched": s["matched"], "weight": s["weight"]}
1309
- for s in top_signals
1310
- ],
1311
- "below_threshold": confidence < threshold,
1312
- "analysis_summary": {
1313
- "query_length": len(query.split()),
1314
- "is_complex": analysis["complexity"]["is_complex"],
1315
- "has_url": analysis["detected_url"] is not None,
1316
- "recency_focused": analysis["recency_focused"],
1317
- }
1318
- }
1319
-
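
Worked confidence arithmetic for the formula above, assuming available scores
{"serper": 9.0, "tavily": 3.0}:

    # margin           = (9.0 - 3.0) / 9.0        = 0.667
    # normalized_score = min(9.0 / 15.0, 1.0)     = 0.6
    # confidence       = 0.6 * 0.6 + 0.667 * 0.4  = 0.627 -> "medium", moderate_confidence_match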
1320
-
1321
- def auto_route_provider(query: str, config: Dict[str, Any]) -> Dict[str, Any]:
1322
- """
1323
- Intelligently route query to the best provider.
1324
- Returns detailed routing decision with confidence.
1325
- """
1326
- analyzer = QueryAnalyzer(config)
1327
- return analyzer.route(query)
1328
-
1329
-
1330
- def explain_routing(query: str, config: Dict[str, Any]) -> Dict[str, Any]:
1331
- """
1332
- Provide detailed explanation of routing decision for debugging.
1333
- """
1334
- analyzer = QueryAnalyzer(config)
1335
- analysis = analyzer.analyze(query)
1336
- routing = analyzer.route(query)
1337
-
1338
- return {
1339
- "query": query,
1340
- "routing_decision": {
1341
- "provider": routing["provider"],
1342
- "confidence": routing["confidence"],
1343
- "confidence_level": routing["confidence_level"],
1344
- "reason": routing["reason"],
1345
- "exa_depth": routing.get("exa_depth", "normal"),
1346
- },
1347
- "scores": routing["scores"],
1348
- "top_signals": routing["top_signals"],
1349
- "intent_breakdown": {
1350
- "shopping_signals": len(analysis["provider_matches"]["serper"]),
1351
- "research_signals": len(analysis["provider_matches"]["tavily"]),
1352
- "querit_signals": len(analysis["provider_matches"]["querit"]),
1353
- "discovery_signals": len(analysis["provider_matches"]["exa"]),
1354
- "rag_signals": len(analysis["provider_matches"]["you"]),
1355
- "exa_deep_score": round(analysis.get("exa_deep_score", 0), 2),
1356
- "exa_deep_reasoning_score": round(analysis.get("exa_deep_reasoning_score", 0), 2),
1357
- },
1358
- "query_analysis": {
1359
- "word_count": analysis["complexity"]["word_count"],
1360
- "is_complex": analysis["complexity"]["is_complex"],
1361
- "complexity_score": round(analysis["complexity"]["complexity_score"], 2),
1362
- "has_url": analysis["detected_url"],
1363
- "recency_focused": analysis["recency_focused"],
1364
- },
1365
- "all_matches": {
1366
- provider: [
1367
- {"matched": m["matched"], "weight": m["weight"]}
1368
- for m in matches
1369
- ]
1370
- for provider, matches in analysis["provider_matches"].items()
1371
- if matches
1372
- },
1373
- "available_providers": [
1374
- p for p in ["serper", "tavily", "querit", "exa", "perplexity", "you", "searxng"]
1375
- if get_api_key(p, config) and p not in config.get("auto_routing", {}).get("disabled_providers", [])
1376
- ]
1377
- }
1378
-
1379
-
1380
-
1381
-
1382
- class ProviderConfigError(Exception):
1383
- """Raised when a provider is missing or has an invalid API key/config."""
1384
- pass
1385
-
1386
-
1387
- class ProviderRequestError(Exception):
1388
- """Structured provider error with retry/cooldown metadata."""
1389
-
1390
- def __init__(self, message: str, status_code: Optional[int] = None, transient: bool = False):
1391
- super().__init__(message)
1392
- self.status_code = status_code
1393
- self.transient = transient
1394
-
1395
-
1396
- TRANSIENT_HTTP_CODES = {429, 503}
1397
- COOLDOWN_STEPS_SECONDS = [60, 300, 1500, 3600] # 1m -> 5m -> 25m -> 1h cap
1398
- RETRY_BACKOFF_SECONDS = [1, 3, 9]
1399
-
1400
-
1401
- def _ensure_parent(path: Path) -> None:
1402
- path.parent.mkdir(parents=True, exist_ok=True)
1403
-
1404
-
1405
- def _load_provider_health() -> Dict[str, Any]:
1406
- if not PROVIDER_HEALTH_FILE.exists():
1407
- return {}
1408
- try:
1409
- with open(PROVIDER_HEALTH_FILE, "r", encoding="utf-8") as f:
1410
- data = json.load(f)
1411
- return data if isinstance(data, dict) else {}
1412
- except (json.JSONDecodeError, IOError):
1413
- return {}
1414
-
1415
-
1416
- def _save_provider_health(state: Dict[str, Any]) -> None:
1417
- _ensure_parent(PROVIDER_HEALTH_FILE)
1418
- with open(PROVIDER_HEALTH_FILE, "w", encoding="utf-8") as f:
1419
- json.dump(state, f, ensure_ascii=False, indent=2)
1420
-
1421
-
1422
- def provider_in_cooldown(provider: str) -> Tuple[bool, int]:
1423
- state = _load_provider_health()
1424
- pstate = state.get(provider, {})
1425
- cooldown_until = int(pstate.get("cooldown_until", 0) or 0)
1426
- remaining = cooldown_until - int(time.time())
1427
- return (remaining > 0, max(0, remaining))
1428
-
1429
-
1430
- def mark_provider_failure(provider: str, error_message: str) -> Dict[str, Any]:
1431
- state = _load_provider_health()
1432
- now = int(time.time())
1433
- pstate = state.get(provider, {})
1434
- fail_count = int(pstate.get("failure_count", 0)) + 1
1435
- cooldown_seconds = COOLDOWN_STEPS_SECONDS[min(fail_count - 1, len(COOLDOWN_STEPS_SECONDS) - 1)]
1436
- state[provider] = {
1437
- "failure_count": fail_count,
1438
- "cooldown_until": now + cooldown_seconds,
1439
- "cooldown_seconds": cooldown_seconds,
1440
- "last_error": error_message,
1441
- "last_failure_at": now,
1442
- }
1443
- _save_provider_health(state)
1444
- return state[provider]
1445
-
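
A sketch of the cooldown escalation defined by COOLDOWN_STEPS_SECONDS:

    mark_provider_failure("tavily", "HTTP 503")              # 1st failure -> 60s cooldown
    in_cooldown, remaining = provider_in_cooldown("tavily")  # (True, <= 60)
    # escalation per consecutive failure: 1 -> 60s, 2 -> 300s, 3 -> 1500s, 4+ -> 3600s (capped)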
1446
-
1447
- def reset_provider_health(provider: str) -> None:
1448
- state = _load_provider_health()
1449
- if provider in state:
1450
- state.pop(provider, None)
1451
- _save_provider_health(state)
1452
-
1453
-
1454
- def _title_from_url(url: str) -> str:
1455
- """Derive a readable title from a URL when none is provided."""
1456
- try:
1457
- parsed = urlparse(url)
1458
- domain = parsed.netloc.replace("www.", "")
1459
- # Use last meaningful path segment as context
1460
- segments = [s for s in parsed.path.strip("/").split("/") if s]
1461
- if segments:
1462
- last = segments[-1].replace("-", " ").replace("_", " ")
1463
- # Strip file extensions
1464
- last = re.sub(r'\.\w{2,4}$', '', last)
1465
- if last:
1466
- return f"{domain} — {last[:80]}"
1467
- return domain
1468
- except Exception:
1469
- return url[:60]
1470
-
1471
-
1472
- def normalize_result_url(url: str) -> str:
1473
- if not url:
1474
- return ""
1475
- parsed = urlparse(url.strip())
1476
- netloc = (parsed.netloc or "").lower()
1477
- if netloc.startswith("www."):
1478
- netloc = netloc[4:]
1479
- path = parsed.path.rstrip("/")
1480
- return f"{netloc}{path}"
1481
-
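
Examples of the normalization above (scheme, "www." prefix, host case, query
strings, and trailing slashes are all ignored):

    assert normalize_result_url("https://www.Example.com/post/") == "example.com/post"
    assert normalize_result_url("http://example.com/post") == "example.com/post"  # same key -> deduplicated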
1482
-
1483
- def deduplicate_results_across_providers(results_by_provider: List[Tuple[str, Dict[str, Any]]], max_results: int) -> Tuple[List[Dict[str, Any]], int]:
1484
- deduped = []
1485
- seen = set()
1486
- dedup_count = 0
1487
- for provider_name, data in results_by_provider:
1488
- for item in data.get("results", []):
1489
- norm = normalize_result_url(item.get("url", ""))
1490
- if norm and norm in seen:
1491
- dedup_count += 1
1492
- continue
1493
- if norm:
1494
- seen.add(norm)
1495
- item = item.copy()
1496
- item.setdefault("provider", provider_name)
1497
- deduped.append(item)
1498
- if len(deduped) >= max_results:
1499
- return deduped, dedup_count
1500
- return deduped, dedup_count
1501
-
1502
- # =============================================================================
1503
- # HTTP Client
1504
- # =============================================================================
1505
-
1506
- def make_request(url: str, headers: dict, body: dict, timeout: int = 30) -> dict:
1507
- """Make HTTP POST request and return JSON response."""
1508
- # Ensure User-Agent is set (required by some APIs like Exa/Cloudflare)
1509
- if "User-Agent" not in headers:
1510
- headers["User-Agent"] = "ClawdBot-WebSearchPlus/2.1"
1511
- data = json.dumps(body).encode("utf-8")
1512
- req = Request(url, data=data, headers=headers, method="POST")
1513
-
1514
- try:
1515
- with urlopen(req, timeout=timeout) as response:
1516
- return json.loads(response.read().decode("utf-8"))
1517
- except HTTPError as e:
1518
- error_body = e.read().decode("utf-8") if e.fp else str(e)
1519
- try:
1520
- error_json = json.loads(error_body)
1521
- error_detail = error_json.get("error") or error_json.get("message") or error_body
1522
- except json.JSONDecodeError:
1523
- error_detail = error_body[:500]
1524
-
1525
- error_messages = {
1526
- 401: "Invalid or expired API key. Please check your credentials.",
1527
- 403: "Access forbidden. Your API key may not have permission for this operation.",
1528
- 429: "Rate limit exceeded. Please wait a moment and try again.",
1529
- 500: "Server error. The search provider is experiencing issues.",
1530
- 503: "Service unavailable. The search provider may be down."
1531
- }
1532
-
1533
- friendly_msg = error_messages.get(e.code, f"API error: {error_detail}")
1534
- raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
1535
- except URLError as e:
1536
- reason = str(getattr(e, "reason", e))
1537
- is_timeout = "timed out" in reason.lower()
1538
- raise ProviderRequestError(f"Network error: {reason}. Check your internet connection.", transient=is_timeout)
1539
- except IncompleteRead as e:
1540
- partial_len = len(getattr(e, "partial", b"") or b"")
1541
- raise ProviderRequestError(
1542
- f"Connection interrupted while reading response ({partial_len} bytes received). Please retry.",
1543
- transient=True,
1544
- )
1545
- except TimeoutError:
1546
- raise ProviderRequestError(f"Request timed out after {timeout}s. Try again or reduce max_results.", transient=True)
1547
-
1548
-
1549
- # =============================================================================
1550
- # Serper (Google Search API)
1551
- # =============================================================================
1552
-
1553
- def search_serper(
1554
- query: str,
1555
- api_key: str,
1556
- max_results: int = 5,
1557
- country: str = "us",
1558
- language: str = "en",
1559
- search_type: str = "search",
1560
- time_range: Optional[str] = None,
1561
- include_images: bool = False,
1562
- ) -> dict:
1563
- """Search using Serper (Google Search API)."""
1564
- endpoint = f"https://google.serper.dev/{search_type}"
1565
-
1566
- body = {
1567
- "q": query,
1568
- "gl": country,
1569
- "hl": language,
1570
- "num": max_results,
1571
- "autocorrect": True,
1572
- }
1573
-
1574
- if time_range and time_range != "none":
1575
- tbs_map = {
1576
- "hour": "qdr:h",
1577
- "day": "qdr:d",
1578
- "week": "qdr:w",
1579
- "month": "qdr:m",
1580
- "year": "qdr:y",
1581
- }
1582
- if time_range in tbs_map:
1583
- body["tbs"] = tbs_map[time_range]
1584
-
1585
- headers = {
1586
- "X-API-KEY": api_key,
1587
- "Content-Type": "application/json",
1588
- }
1589
-
1590
- data = make_request(endpoint, headers, body)
1591
-
1592
- results = []
1593
- for i, item in enumerate(data.get("organic", [])[:max_results]):
1594
- results.append({
1595
- "title": item.get("title", ""),
1596
- "url": item.get("link", ""),
1597
- "snippet": item.get("snippet", ""),
1598
- "score": round(1.0 - i * 0.1, 2),
1599
- "date": item.get("date"),
1600
- })
1601
-
1602
- answer = ""
1603
- if data.get("answerBox", {}).get("answer"):
1604
- answer = data["answerBox"]["answer"]
1605
- elif data.get("answerBox", {}).get("snippet"):
1606
- answer = data["answerBox"]["snippet"]
1607
- elif data.get("knowledgeGraph", {}).get("description"):
1608
- answer = data["knowledgeGraph"]["description"]
1609
- elif results:
1610
- answer = results[0]["snippet"]
1611
-
1612
- images = []
1613
- if include_images:
1614
- try:
1615
- img_data = make_request(
1616
- "https://google.serper.dev/images",
1617
- headers,
1618
- {"q": query, "gl": country, "hl": language, "num": 5},
1619
- )
1620
- images = [img.get("imageUrl", "") for img in img_data.get("images", [])[:5] if img.get("imageUrl")]
1621
- except Exception:
1622
- pass
1623
-
1624
- return {
1625
- "provider": "serper",
1626
- "query": query,
1627
- "results": results,
1628
- "images": images,
1629
- "answer": answer,
1630
- "knowledge_graph": data.get("knowledgeGraph"),
1631
- "related_searches": [r.get("query") for r in data.get("relatedSearches", [])]
1632
- }
1633
-
1634
-
1635
- # =============================================================================
1636
- # Tavily (Research Search)
1637
- # =============================================================================
1638
-
1639
- def search_tavily(
1640
- query: str,
1641
- api_key: str,
1642
- max_results: int = 5,
1643
- depth: str = "basic",
1644
- topic: str = "general",
1645
- include_domains: Optional[List[str]] = None,
1646
- exclude_domains: Optional[List[str]] = None,
1647
- include_images: bool = False,
1648
- include_raw_content: bool = False,
1649
- ) -> dict:
1650
- """Search using Tavily (AI Research Search)."""
1651
- endpoint = "https://api.tavily.com/search"
1652
-
1653
- body = {
1654
- "api_key": api_key,
1655
- "query": query,
1656
- "max_results": max_results,
1657
- "search_depth": depth,
1658
- "topic": topic,
1659
- "include_images": include_images,
1660
- "include_answer": True,
1661
- "include_raw_content": include_raw_content,
1662
- }
1663
-
1664
- if include_domains:
1665
- body["include_domains"] = include_domains
1666
- if exclude_domains:
1667
- body["exclude_domains"] = exclude_domains
1668
-
1669
- headers = {"Content-Type": "application/json"}
1670
-
1671
- data = make_request(endpoint, headers, body)
1672
-
1673
- results = []
1674
- for item in data.get("results", [])[:max_results]:
1675
- result = {
1676
- "title": item.get("title", ""),
1677
- "url": item.get("url", ""),
1678
- "snippet": item.get("content", ""),
1679
- "score": round(item.get("score", 0.0), 3),
1680
- }
1681
- if include_raw_content and item.get("raw_content"):
1682
- result["raw_content"] = item["raw_content"]
1683
- results.append(result)
1684
-
1685
- return {
1686
- "provider": "tavily",
1687
- "query": query,
1688
- "results": results,
1689
- "images": data.get("images", []),
1690
- "answer": data.get("answer", ""),
1691
- }
1692
-
1693
-
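A sketch of an advanced-depth research query pinned to two domains, returning raw page text for downstream summarization (TAVILY_API_KEY assumed set):

import os

out = search_tavily(
    query="how does HTTPS encryption work",
    api_key=os.environ["TAVILY_API_KEY"],
    depth="advanced",
    include_domains=["wikipedia.org", "cloudflare.com"],
    include_raw_content=True,
)
print(out["answer"])  # Tavily's own synthesized answer
pages = [r.get("raw_content", "") for r in out["results"]]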
1694
- # =============================================================================
1695
- # Querit (Multi-lingual search API for AI, with rich metadata and real-time information)
1696
- # =============================================================================
1697
-
1698
- def _map_querit_time_range(time_range: Optional[str]) -> Optional[str]:
1699
- """Map generic time ranges to Querit's compact date filter format."""
1700
- if not time_range:
1701
- return None
1702
- return {
1703
- "day": "d1",
1704
- "week": "w1",
1705
- "month": "m1",
1706
- "year": "y1",
1707
- }.get(time_range, time_range)
1708
-
1709
-
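Illustrative mappings; values outside the table, including Querit-native codes such as "d7", pass through unchanged:

assert _map_querit_time_range("week") == "w1"
assert _map_querit_time_range("d7") == "d7"   # already Querit-native
assert _map_querit_time_range(None) is None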
1710
- def search_querit(
1711
- query: str,
1712
- api_key: str,
1713
- max_results: int = 5,
1714
- language: str = "en",
1715
- country: str = "us",
1716
- time_range: Optional[str] = None,
1717
- include_domains: Optional[List[str]] = None,
1718
- exclude_domains: Optional[List[str]] = None,
1719
- base_url: str = "https://api.querit.ai",
1720
- base_path: str = "/v1/search",
1721
- timeout: int = 30,
1722
- ) -> dict:
1723
- """Search using Querit.
1724
-
1725
- Mirrors the Querit Python SDK payload shape:
1726
- - query
1727
- - count
1728
- - optional filters: languages, geo, sites, timeRange
1729
- """
1730
- endpoint = base_url.rstrip("/") + base_path
1731
-
1732
- filters: Dict[str, Any] = {}
1733
- if language:
1734
- filters["languages"] = {"include": [language.lower()]}
1735
- if country:
1736
- filters["geo"] = {"countries": {"include": [country.upper()]}}
1737
- if include_domains or exclude_domains:
1738
- sites: Dict[str, List[str]] = {}
1739
- if include_domains:
1740
- sites["include"] = include_domains
1741
- if exclude_domains:
1742
- sites["exclude"] = exclude_domains
1743
- filters["sites"] = sites
1744
-
1745
- querit_time_range = _map_querit_time_range(time_range)
1746
- if querit_time_range:
1747
- filters["timeRange"] = {"date": querit_time_range}
1748
-
1749
- body: Dict[str, Any] = {
1750
- "query": query,
1751
- "count": max_results,
1752
- }
1753
- if filters:
1754
- body["filters"] = filters
1755
-
1756
- headers = {
1757
- "Authorization": f"Bearer {api_key}",
1758
- "Content-Type": "application/json",
1759
- }
1760
-
1761
- data = make_request(endpoint, headers, body, timeout=timeout)
1762
-
1763
- error_code = data.get("error_code")
1764
- error_msg = data.get("error_msg")
1765
- if error_msg or (error_code not in (None, 0, 200)):
1766
- message = error_msg or f"Querit request failed with error_code={error_code}"
1767
- raise ProviderRequestError(message)
1768
-
1769
- raw_results = ((data.get("results") or {}).get("result")) or []
1770
- results = []
1771
- for i, item in enumerate(raw_results[:max_results]):
1772
- snippet = item.get("snippet") or item.get("page_age") or ""
1773
- result = {
1774
- "title": item.get("title") or _title_from_url(item.get("url", "")),
1775
- "url": item.get("url", ""),
1776
- "snippet": snippet,
1777
- "score": round(1.0 - i * 0.05, 3),
1778
- }
1779
- if item.get("page_time") is not None:
1780
- result["page_time"] = item["page_time"]
1781
- if item.get("page_age"):
1782
- result["date"] = item["page_age"]
1783
- if item.get("language") is not None:
1784
- result["language"] = item["language"]
1785
- results.append(result)
1786
-
1787
- answer = results[0]["snippet"] if results else ""
1788
-
1789
- return {
1790
- "provider": "querit",
1791
- "query": query,
1792
- "results": results,
1793
- "images": [],
1794
- "answer": answer,
1795
- "metadata": {
1796
- "search_id": data.get("search_id"),
1797
- "time_range": querit_time_range,
1798
- }
1799
- }
1800
-
1801
-
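A sketch of a France-scoped, last-24-hours query (QUERIT_API_KEY is an assumed environment variable); the trailing comment shows the request body the filter logic above produces:

import os

out = search_querit(
    query="dernières avancées en IA",
    api_key=os.environ["QUERIT_API_KEY"],
    language="fr",
    country="fr",
    time_range="day",
)
# Body sent: {"query": "...", "count": 5, "filters": {
#     "languages": {"include": ["fr"]},
#     "geo": {"countries": {"include": ["FR"]}},
#     "timeRange": {"date": "d1"}}}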
1802
- # =============================================================================
1803
- # Exa (Neural/Semantic/Deep Search)
1804
- # =============================================================================
1805
-
1806
- def search_exa(
1807
- query: str,
1808
- api_key: str,
1809
- max_results: int = 5,
1810
- search_type: str = "neural",
1811
- exa_depth: str = "normal",
1812
- category: Optional[str] = None,
1813
- start_date: Optional[str] = None,
1814
- end_date: Optional[str] = None,
1815
- similar_url: Optional[str] = None,
1816
- include_domains: Optional[List[str]] = None,
1817
- exclude_domains: Optional[List[str]] = None,
1818
- text_verbosity: str = "standard",
1819
- ) -> dict:
1820
- """Search using Exa (Neural/Semantic/Deep Search).
1821
-
1822
- exa_depth controls synthesis level:
1823
- - "normal": standard search (neural/fast/auto/keyword/instant)
1824
- - "deep": multi-source synthesis with grounding (4-12s, $12/1k)
1825
- - "deep-reasoning": cross-reference reasoning with grounding (12-50s, $15/1k)
1826
- """
1827
- is_deep = exa_depth in ("deep", "deep-reasoning")
1828
-
1829
- if similar_url:
1830
- # findSimilar does not support deep search types
1831
- endpoint = "https://api.exa.ai/findSimilar"
1832
- body: Dict[str, Any] = {
1833
- "url": similar_url,
1834
- "numResults": max_results,
1835
- "contents": {
1836
- "text": {"maxCharacters": 2000, "verbosity": text_verbosity},
1837
- "highlights": {"numSentences": 3, "highlightsPerUrl": 2},
1838
- },
1839
- }
1840
- elif is_deep:
1841
- endpoint = "https://api.exa.ai/search"
1842
- body = {
1843
- "query": query,
1844
- "numResults": max_results,
1845
- "type": exa_depth,
1846
- "contents": {
1847
- "text": {"maxCharacters": 5000, "verbosity": "full"},
1848
- },
1849
- }
1850
- else:
1851
- endpoint = "https://api.exa.ai/search"
1852
- body = {
1853
- "query": query,
1854
- "numResults": max_results,
1855
- "type": search_type,
1856
- "contents": {
1857
- "text": {"maxCharacters": 2000, "verbosity": text_verbosity},
1858
- "highlights": {"numSentences": 3, "highlightsPerUrl": 2},
1859
- },
1860
- }
1861
-
1862
- if category:
1863
- body["category"] = category
1864
- if start_date:
1865
- body["startPublishedDate"] = start_date
1866
- if end_date:
1867
- body["endPublishedDate"] = end_date
1868
- if include_domains:
1869
- body["includeDomains"] = include_domains
1870
- if exclude_domains:
1871
- body["excludeDomains"] = exclude_domains
1872
-
1873
- headers = {
1874
- "x-api-key": api_key,
1875
- "Content-Type": "application/json",
1876
- }
1877
-
1878
- timeout = 55 if is_deep else 30
1879
- data = make_request(endpoint, headers, body, timeout=timeout)
1880
-
1881
- results = []
1882
-
1883
- # Deep search: primary content in output field with grounding citations
1884
- if is_deep:
1885
- deep_output = data.get("output", {})
1886
- synthesized_text = ""
1887
- grounding_citations: List[Dict[str, Any]] = []
1888
-
1889
- if isinstance(deep_output.get("content"), str):
1890
- synthesized_text = deep_output["content"]
1891
- elif isinstance(deep_output.get("content"), dict):
1892
- synthesized_text = json.dumps(deep_output["content"], ensure_ascii=False)
1893
-
1894
- for field_citation in deep_output.get("grounding", []):
1895
- for cite in field_citation.get("citations", []):
1896
- grounding_citations.append({
1897
- "url": cite.get("url", ""),
1898
- "title": cite.get("title", ""),
1899
- "confidence": field_citation.get("confidence", ""),
1900
- "field": field_citation.get("field", ""),
1901
- })
1902
-
1903
- # Primary synthesized result
1904
- if synthesized_text:
1905
- results.append({
1906
- "title": f"Exa {exa_depth.replace('-', ' ').title()} Synthesis",
1907
- "url": "",
1908
- "snippet": synthesized_text,
1909
- "full_synthesis": synthesized_text,
1910
- "score": 1.0,
1911
- "grounding": grounding_citations[:10],
1912
- "type": "synthesis",
1913
- })
1914
-
1915
- # Supporting source documents
1916
- for item in data.get("results", [])[:max_results]:
1917
- text_content = item.get("text", "") or ""
1918
- highlights = item.get("highlights", [])
1919
- snippet = text_content[:800] if text_content else (highlights[0] if highlights else "")
1920
- results.append({
1921
- "title": item.get("title", ""),
1922
- "url": item.get("url", ""),
1923
- "snippet": snippet,
1924
- "score": round(item.get("score", 0.0), 3),
1925
- "published_date": item.get("publishedDate"),
1926
- "author": item.get("author"),
1927
- "type": "source",
1928
- })
1929
-
1930
- # No synthesis text means no synthesis entry was appended, so sources start at index 0.
- answer = synthesized_text or (results[0]["snippet"] if results else "")
1931
-
1932
- return {
1933
- "provider": "exa",
1934
- "query": query,
1935
- "exa_depth": exa_depth,
1936
- "results": results,
1937
- "images": [],
1938
- "answer": answer,
1939
- "grounding": grounding_citations,
1940
- "metadata": {
1941
- "synthesis_length": len(synthesized_text),
1942
- "source_count": len(data.get("results", [])),
1943
- },
1944
- }
1945
-
1946
- # Standard search result parsing
1947
- for item in data.get("results", [])[:max_results]:
1948
- text_content = item.get("text", "") or ""
1949
- highlights = item.get("highlights", [])
1950
- if text_content:
1951
- snippet = text_content[:800]
1952
- elif highlights:
1953
- snippet = " ... ".join(highlights[:2])
1954
- else:
1955
- snippet = ""
1956
-
1957
- results.append({
1958
- "title": item.get("title", ""),
1959
- "url": item.get("url", ""),
1960
- "snippet": snippet,
1961
- "score": round(item.get("score", 0.0), 3),
1962
- "published_date": item.get("publishedDate"),
1963
- "author": item.get("author"),
1964
- })
1965
-
1966
- answer = results[0]["snippet"] if results else ""
1967
-
1968
- return {
1969
- "provider": "exa",
1970
- "query": query if not similar_url else f"Similar to: {similar_url}",
1971
- "results": results,
1972
- "images": [],
1973
- "answer": answer,
1974
- }
1975
-
1976
-
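Two usage sketches, assuming EXA_API_KEY is set: a findSimilar discovery call, and a deep call whose result list holds one type="synthesis" entry followed by its type="source" documents:

import os

key = os.environ["EXA_API_KEY"]

# Discovery: pages similar to a seed URL (routed to the findSimilar endpoint).
similar = search_exa(query="", api_key=key, similar_url="https://www.notion.so")

# Synthesis: deep mode prepends a synthesized entry, then supporting sources.
deep = search_exa(query="solid-state battery commercialization timeline",
                  api_key=key, exa_depth="deep")
synthesis = next((r for r in deep["results"] if r.get("type") == "synthesis"), None)
sources = [r for r in deep["results"] if r.get("type") == "source"]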
1977
- # =============================================================================
1978
- # Perplexity via Kilo Gateway (Synthesized Direct Answers)
1979
- # =============================================================================
1980
-
1981
- def search_perplexity(
1982
- query: str,
1983
- api_key: str,
1984
- max_results: int = 5,
1985
- model: str = "perplexity/sonar-pro",
1986
- api_url: str = "https://api.kilo.ai/api/gateway/chat/completions",
1987
- freshness: Optional[str] = None,
1988
- ) -> dict:
1989
- """Search/answer using Perplexity Sonar Pro via Kilo Gateway.
1990
-
1991
- Args:
1992
- query: Search query
1993
- api_key: Kilo Gateway API key
1994
- max_results: Maximum results to return
1995
- model: Perplexity model to use
1996
- api_url: Kilo Gateway endpoint
1997
- freshness: Filter by recency — 'day', 'week', 'month', 'year' (maps to
1998
- Perplexity's search_recency_filter parameter)
1999
- """
2000
- # Map generic freshness values to Perplexity's search_recency_filter
2001
- recency_map = {"day": "day", "pd": "day", "week": "week", "pw": "week", "month": "month", "pm": "month", "year": "year", "py": "year"}
2002
- recency_filter = recency_map.get(freshness or "", None)
2003
-
2004
- body = {
2005
- "model": model,
2006
- "messages": [
2007
- {"role": "system", "content": "Answer with concise factual summary and include source URLs."},
2008
- {"role": "user", "content": query},
2009
- ],
2010
- "temperature": 0.2,
2011
- }
2012
- if recency_filter:
2013
- body["search_recency_filter"] = recency_filter
2014
-
2015
- headers = {
2016
- "Authorization": f"Bearer {api_key}",
2017
- "Content-Type": "application/json",
2018
- }
2019
-
2020
- data = make_request(api_url, headers, body)
2021
- choices = data.get("choices", [])
2022
- message = choices[0].get("message", {}) if choices else {}
2023
- answer = (message.get("content") or "").strip()
2024
-
2025
- # Prefer the structured citations array from Perplexity API response
2026
- api_citations = data.get("citations", [])
2027
-
2028
- # Fallback: extract URLs from answer text if API doesn't provide citations
2029
- if not api_citations:
2030
- api_citations = []
2031
- seen = set()
2032
- for u in re.findall(r"https?://[^\s)\]}>\"']+", answer):
2033
- if u not in seen:
2034
- seen.add(u)
2035
- api_citations.append(u)
2036
-
2037
- results = []
2038
-
2039
- # Primary result: the synthesized answer itself
2040
- if answer:
2041
- # Clean citation markers [1][2] for the snippet
2042
- clean_answer = re.sub(r'\[\d+\]', '', answer).strip()
2043
- results.append({
2044
- "title": f"Perplexity Answer: {query[:80]}",
2045
- "url": "https://www.perplexity.ai",
2046
- "snippet": clean_answer[:500],
2047
- "score": 1.0,
2048
- })
2049
-
2050
- # Source results from citations
2051
- for i, citation in enumerate(api_citations[:max_results - 1]):
2052
- # citations can be plain URL strings or dicts with url/title
2053
- if isinstance(citation, str):
2054
- url = citation
2055
- title = _title_from_url(url)
2056
- else:
2057
- url = citation.get("url", "")
2058
- title = citation.get("title") or _title_from_url(url)
2059
- results.append({
2060
- "title": title,
2061
- "url": url,
2062
- "snippet": f"Source cited in Perplexity answer [citation {i+1}]",
2063
- "score": round(0.9 - i * 0.1, 3),
2064
- })
2065
-
2066
- return {
2067
- "provider": "perplexity",
2068
- "query": query,
2069
- "results": results,
2070
- "images": [],
2071
- "answer": answer,
2072
- "metadata": {
2073
- "model": model,
2074
- "usage": data.get("usage", {}),
2075
- }
2076
- }
2077
-
2078
-
2079
-
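A sketch of a recency-filtered call; KILO_API_KEY is an assumed variable name, and results[0] holds the synthesized answer whenever one came back, with cited sources after it:

import os

out = search_perplexity(
    query="current status of the Artemis program",
    api_key=os.environ["KILO_API_KEY"],
    freshness="week",  # forwarded as search_recency_filter
)
print(out["answer"])            # full answer text, including [n] markers
for src in out["results"][1:]:  # citation entries
    print(" -", src["url"])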
2080
- # =============================================================================
2081
- # You.com (LLM-Ready Web & News Search)
2082
- # =============================================================================
2083
-
2084
- def search_you(
2085
- query: str,
2086
- api_key: str,
2087
- max_results: int = 5,
2088
- country: str = "US",
2089
- language: str = "en",
2090
- freshness: Optional[str] = None,
2091
- safesearch: str = "moderate",
2092
- include_news: bool = True,
2093
- livecrawl: Optional[str] = None,
2094
- ) -> dict:
2095
- """Search using You.com (LLM-Ready Web & News Search).
2096
-
2097
- You.com excels at:
2098
- - RAG applications with pre-extracted snippets
2099
- - Combined web + news results in one call
2100
- - Real-time information with automatic news classification
2101
- - Clean, structured JSON optimized for AI consumption
2102
-
2103
- Args:
2104
- query: Search query
2105
- api_key: You.com API key
2106
- max_results: Maximum results to return (default 5, max 100)
2107
- country: ISO 3166-1 alpha-2 country code (e.g., US, GB, DE)
2108
- language: BCP 47 language code (e.g., en, de, fr)
2109
- freshness: Filter by recency: day, week, month, year, or YYYY-MM-DDtoYYYY-MM-DD
2110
- safesearch: Content filter: off, moderate (default), strict
2111
- include_news: Include news results when relevant (default True)
2112
- livecrawl: Fetch full page content: "web", "news", or "all"
2113
- """
2114
- endpoint = "https://ydc-index.io/v1/search"
2115
-
2116
- # Build query parameters
2117
- params = {
2118
- "query": query,
2119
- "count": max_results,
2120
- "safesearch": safesearch,
2121
- }
2122
-
2123
- if country:
2124
- params["country"] = country.upper()
2125
- if language:
2126
- params["language"] = language.upper()
2127
- if freshness:
2128
- params["freshness"] = freshness
2129
- if livecrawl:
2130
- params["livecrawl"] = livecrawl
2131
- params["livecrawl_formats"] = "markdown"
2132
-
2133
- # Build URL with query params (URL-encode values)
2134
- query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
2135
- url = f"{endpoint}?{query_string}"
2136
-
2137
- headers = {
2138
- "X-API-KEY": api_key,
2139
- "Accept": "application/json",
2140
- "User-Agent": "ClawdBot-WebSearchPlus/2.4",
2141
- }
2142
-
2143
- # Make GET request (You.com uses GET, not POST)
2144
- req = Request(url, headers=headers, method="GET")
2146
-
2147
- try:
2148
- with urlopen(req, timeout=30) as response:
2149
- data = json.loads(response.read().decode("utf-8"))
2150
- except HTTPError as e:
2151
- error_body = e.read().decode("utf-8") if e.fp else str(e)
2152
- try:
2153
- error_json = json.loads(error_body)
2154
- error_detail = error_json.get("error") or error_json.get("message") or error_body
2155
- except json.JSONDecodeError:
2156
- error_detail = error_body[:500]
2157
-
2158
- error_messages = {
2159
- 401: "Invalid or expired API key. Get one at https://api.you.com",
2160
- 403: "Access forbidden. Check your API key permissions.",
2161
- 429: "Rate limit exceeded. Please wait and try again.",
2162
- 500: "You.com server error. Try again later.",
2163
- 503: "You.com service unavailable."
2164
- }
2165
- friendly_msg = error_messages.get(e.code, f"API error: {error_detail}")
2166
- raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
2167
- except URLError as e:
2168
- reason = str(getattr(e, "reason", e))
2169
- is_timeout = "timed out" in reason.lower()
2170
- raise ProviderRequestError(f"Network error: {reason}. Check your internet connection.", transient=is_timeout)
2171
- except TimeoutError:
2172
- raise ProviderRequestError("You.com request timed out after 30s.", transient=True)
2173
-
2174
- # Parse results
2175
- results_data = data.get("results", {})
2176
- web_results = results_data.get("web", [])
2177
- news_results = results_data.get("news", []) if include_news else []
2178
- metadata = data.get("metadata", {})
2179
-
2180
- # Normalize web results
2181
- results = []
2182
- for i, item in enumerate(web_results[:max_results]):
2183
- snippets = item.get("snippets", [])
2184
- snippet = snippets[0] if snippets else item.get("description", "")
2185
-
2186
- result = {
2187
- "title": item.get("title", ""),
2188
- "url": item.get("url", ""),
2189
- "snippet": snippet,
2190
- "score": round(1.0 - i * 0.05, 3), # Assign descending score
2191
- "date": item.get("page_age"),
2192
- "source": "web",
2193
- }
2194
-
2195
- # Include additional snippets if available (great for RAG)
2196
- if len(snippets) > 1:
2197
- result["additional_snippets"] = snippets[1:3]
2198
-
2199
- # Include thumbnail and favicon for UI display
2200
- if item.get("thumbnail_url"):
2201
- result["thumbnail"] = item["thumbnail_url"]
2202
- if item.get("favicon_url"):
2203
- result["favicon"] = item["favicon_url"]
2204
-
2205
- # Include live-crawled content if available
2206
- if item.get("contents"):
2207
- result["raw_content"] = item["contents"].get("markdown") or item["contents"].get("html", "")
2208
-
2209
- results.append(result)
2210
-
2211
- # Add news results (if any)
2212
- news = []
2213
- for item in news_results[:5]:
2214
- news.append({
2215
- "title": item.get("title", ""),
2216
- "url": item.get("url", ""),
2217
- "snippet": item.get("description", ""),
2218
- "date": item.get("page_age"),
2219
- "thumbnail": item.get("thumbnail_url"),
2220
- "source": "news",
2221
- })
2222
-
2223
- # Build answer from best snippets
2224
- answer = ""
2225
- if results:
2226
- # Combine top snippets for LLM context
2227
- top_snippets = []
2228
- for r in results[:3]:
2229
- if r.get("snippet"):
2230
- top_snippets.append(r["snippet"])
2231
- answer = " ".join(top_snippets)[:1000]
2232
-
2233
- return {
2234
- "provider": "you",
2235
- "query": query,
2236
- "results": results,
2237
- "news": news,
2238
- "images": [],
2239
- "answer": answer,
2240
- "metadata": {
2241
- "search_uuid": metadata.get("search_uuid"),
2242
- "latency": metadata.get("latency"),
2243
- }
2244
- }
2245
-
2246
-
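A RAG-oriented sketch (YOU_API_KEY assumed set): livecrawl="web" also requests markdown page content via livecrawl_formats, so raw_content is preferred over the snippet when present:

import os

out = search_you(
    query="EU AI Act enforcement timeline",
    api_key=os.environ["YOU_API_KEY"],
    freshness="month",
    livecrawl="web",
)
chunks = [r.get("raw_content") or r["snippet"] for r in out["results"]]
headlines = [(n["title"], n["url"]) for n in out["news"]]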
2247
- # =============================================================================
2248
- # SearXNG (Privacy-First Meta-Search)
2249
- # =============================================================================
2250
-
2251
- def search_searxng(
2252
- query: str,
2253
- instance_url: str,
2254
- max_results: int = 5,
2255
- categories: Optional[List[str]] = None,
2256
- engines: Optional[List[str]] = None,
2257
- language: str = "en",
2258
- time_range: Optional[str] = None,
2259
- safesearch: int = 0,
2260
- ) -> dict:
2261
- """Search using SearXNG (self-hosted privacy-first meta-search).
2262
-
2263
- SearXNG excels at:
2264
- - Privacy-preserving search (no tracking, no profiling)
2265
- - Multi-source aggregation (70+ upstream engines)
2266
- - $0 API cost (self-hosted)
2267
- - Diverse perspectives from multiple search engines
2268
-
2269
- Args:
2270
- query: Search query
2271
- instance_url: URL of your SearXNG instance (required)
2272
- max_results: Maximum results to return (default 5)
2273
- categories: Search categories (general, images, news, videos, etc.)
2274
- engines: Specific engines to use (google, bing, duckduckgo, etc.)
2275
- language: Language code (e.g., en, de, fr)
2276
- time_range: Filter by recency: day, week, month, year
2277
- safesearch: Content filter: 0=off, 1=moderate, 2=strict
2278
-
2279
- Note:
2280
- Requires a self-hosted SearXNG instance with JSON format enabled.
2281
- See: https://docs.searxng.org/admin/installation.html
2282
- """
2283
- # Build URL with query parameters
2284
- params = {
2285
- "q": query,
2286
- "format": "json",
2287
- "language": language,
2288
- "safesearch": str(safesearch),
2289
- }
2290
-
2291
- if categories:
2292
- params["categories"] = ",".join(categories)
2293
- if engines:
2294
- params["engines"] = ",".join(engines)
2295
- if time_range:
2296
- params["time_range"] = time_range
2297
-
2298
- # Build URL — instance_url comes from operator-controlled config/env only
2299
- # (validated by _validate_searxng_url), not from agent/LLM input
2300
- base_url = instance_url.rstrip("/")
2301
- query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
2302
- url = f"{base_url}/search?{query_string}"
2303
-
2304
- headers = {
2305
- "User-Agent": "ClawdBot-WebSearchPlus/2.5",
2306
- "Accept": "application/json",
2307
- }
2308
-
2309
- # Make GET request
2310
- req = Request(url, headers=headers, method="GET")
2311
-
2312
- try:
2313
- with urlopen(req, timeout=30) as response:
2314
- data = json.loads(response.read().decode("utf-8"))
2315
- except HTTPError as e:
2316
- error_body = e.read().decode("utf-8") if e.fp else str(e)
2317
- try:
2318
- error_json = json.loads(error_body)
2319
- error_detail = error_json.get("error") or error_json.get("message") or error_body
2320
- except json.JSONDecodeError:
2321
- error_detail = error_body[:500]
2322
-
2323
- error_messages = {
2324
- 403: "JSON API disabled on this SearXNG instance. Enable 'json' in search.formats in settings.yml",
2325
- 404: "SearXNG instance not found. Check your instance URL.",
2326
- 500: "SearXNG server error. Check instance health.",
2327
- 503: "SearXNG service unavailable."
2328
- }
2329
- friendly_msg = error_messages.get(e.code, f"SearXNG error: {error_detail}")
2330
- raise ProviderRequestError(f"{friendly_msg} (HTTP {e.code})", status_code=e.code, transient=e.code in TRANSIENT_HTTP_CODES)
2331
- except URLError as e:
2332
- reason = str(getattr(e, "reason", e))
2333
- is_timeout = "timed out" in reason.lower()
2334
- raise ProviderRequestError(f"Cannot reach SearXNG instance at {instance_url}. Error: {reason}", transient=is_timeout)
2335
- except TimeoutError:
2336
- raise ProviderRequestError(f"SearXNG request timed out after 30s. Check instance health.", transient=True)
2337
-
2338
- # Parse results
2339
- raw_results = data.get("results", [])
2340
-
2341
- # Normalize results to unified format
2342
- results = []
2343
- engines_used = set()
2344
- for i, item in enumerate(raw_results[:max_results]):
2345
- engine = item.get("engine", "unknown")
2346
- engines_used.add(engine)
2347
-
2348
- results.append({
2349
- "title": item.get("title", ""),
2350
- "url": item.get("url", ""),
2351
- "snippet": item.get("content", ""),
2352
- "score": round(item.get("score", 1.0 - i * 0.05), 3),
2353
- "engine": engine,
2354
- "category": item.get("category", "general"),
2355
- "date": item.get("publishedDate"),
2356
- })
2357
-
2358
- # Build answer from answers, infoboxes, or first result
2359
- answer = ""
2360
- if data.get("answers"):
2361
- answer = data["answers"][0] if isinstance(data["answers"][0], str) else str(data["answers"][0])
2362
- elif data.get("infoboxes"):
2363
- infobox = data["infoboxes"][0]
2364
- answer = infobox.get("content", "") or infobox.get("infobox", "")
2365
- elif results:
2366
- answer = results[0]["snippet"]
2367
-
2368
- return {
2369
- "provider": "searxng",
2370
- "query": query,
2371
- "results": results,
2372
- "images": [],
2373
- "answer": answer,
2374
- "suggestions": data.get("suggestions", []),
2375
- "corrections": data.get("corrections", []),
2376
- "metadata": {
2377
- "number_of_results": data.get("number_of_results"),
2378
- "engines_used": list(engines_used),
2379
- "instance_url": instance_url,
2380
- }
2381
- }
2382
-
2383
-
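A sketch against a hypothetical self-hosted instance; the URL is operator-owned configuration, never model input, per the validation note above:

out = search_searxng(
    query="rust async runtime comparison",
    instance_url="https://searx.example.com",
    engines=["google", "duckduckgo"],
    time_range="month",
)
print(out["metadata"]["engines_used"])  # which upstream engines answered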
2384
- # =============================================================================
2385
- # CLI
2386
- # =============================================================================
2387
-
2388
- def main():
2389
- config = load_config()
2390
-
2391
- parser = argparse.ArgumentParser(
2392
- description="Web Search Plus — Intelligent multi-provider search with smart auto-routing",
2393
- formatter_class=argparse.RawDescriptionHelpFormatter,
2394
- epilog="""
2395
- Intelligent Auto-Routing:
2396
- The query is analyzed using multi-signal detection to find the optimal provider:
2397
-
2398
- Shopping Intent → Serper (Google)
2399
- "how much", "price of", "buy", product+brand combos, deals, specs
2400
-
2401
- Research Intent → Tavily
2402
- "how does", "explain", "what is", analysis, pros/cons, tutorials
2403
-
2404
- Multilingual + Real-Time AI Search → Querit
2405
- multilingual search, metadata-rich results, current information for AI workflows
2406
-
2407
- Discovery Intent → Exa (Neural)
2408
- "similar to", "companies like", "alternatives", URLs, startups, papers
2409
-
2410
- Direct Answer Intent → Perplexity (via Kilo Gateway)
2411
- "what is", "current status", local events, synthesized up-to-date answers
2412
-
2413
- Examples:
2414
- python3 search.py -q "iPhone 16 Pro Max price" # → Serper (shopping)
2415
- python3 search.py -q "how does HTTPS encryption work" # → Tavily (research)
2416
- python3 search.py -q "startups similar to Notion" # → Exa (discovery)
2417
- python3 search.py --explain-routing -q "your query" # Debug routing
2418
-
2419
- Full docs: See README.md and SKILL.md
2420
- """,
2421
- )
2422
-
2423
- # Common arguments
2424
- parser.add_argument(
2425
- "--provider", "-p",
2426
- choices=["serper", "tavily", "querit", "exa", "perplexity", "you", "searxng", "auto"],
2427
- help="Search provider (auto=intelligent routing)"
2428
- )
2429
- parser.add_argument(
2430
- "--query", "-q",
2431
- help="Search query"
2432
- )
2433
- parser.add_argument(
2434
- "--max-results", "-n",
2435
- type=int,
2436
- default=config.get("defaults", {}).get("max_results", 5),
2437
- help="Maximum results (default: 5)"
2438
- )
2439
- parser.add_argument(
2440
- "--images",
2441
- action="store_true",
2442
- help="Include images (Serper/Tavily)"
2443
- )
2444
-
2445
- # Auto-routing options
2446
- parser.add_argument(
2447
- "--auto", "-a",
2448
- action="store_true",
2449
- help="Use intelligent auto-routing (default when no provider specified)"
2450
- )
2451
- parser.add_argument(
2452
- "--explain-routing",
2453
- action="store_true",
2454
- help="Show detailed routing analysis (debug mode)"
2455
- )
2456
-
2457
- # Serper-specific
2458
- serper_config = config.get("serper", {})
2459
- parser.add_argument("--country", default=serper_config.get("country", "us"))
2460
- parser.add_argument("--language", default=serper_config.get("language", "en"))
2461
- parser.add_argument(
2462
- "--type",
2463
- dest="search_type",
2464
- default=serper_config.get("type", "search"),
2465
- choices=["search", "news", "images", "videos", "places", "shopping"]
2466
- )
2467
- parser.add_argument(
2468
- "--time-range",
2469
- choices=["hour", "day", "week", "month", "year"]
2470
- )
2471
-
2472
- # Tavily-specific
2473
- tavily_config = config.get("tavily", {})
2474
- parser.add_argument(
2475
- "--depth",
2476
- default=tavily_config.get("depth", "basic"),
2477
- choices=["basic", "advanced"]
2478
- )
2479
- parser.add_argument(
2480
- "--topic",
2481
- default=tavily_config.get("topic", "general"),
2482
- choices=["general", "news"]
2483
- )
2484
- parser.add_argument("--raw-content", action="store_true")
2485
-
2486
- # Querit-specific
2487
- querit_config = config.get("querit", {})
2488
- parser.add_argument(
2489
- "--querit-base-url",
2490
- default=querit_config.get("base_url", "https://api.querit.ai"),
2491
- help="Querit API base URL"
2492
- )
2493
- parser.add_argument(
2494
- "--querit-base-path",
2495
- default=querit_config.get("base_path", "/v1/search"),
2496
- help="Querit API path"
2497
- )
2498
-
2499
- # Exa-specific
2500
- exa_config = config.get("exa", {})
2501
- parser.add_argument(
2502
- "--exa-type",
2503
- default=exa_config.get("type", "neural"),
2504
- choices=["neural", "fast", "auto", "keyword", "instant"],
2505
- help="Exa search type (for standard search, ignored when --exa-depth is set)"
2506
- )
2507
- parser.add_argument(
2508
- "--exa-depth",
2509
- default=exa_config.get("depth", "normal"),
2510
- choices=["normal", "deep", "deep-reasoning"],
2511
- help="Exa search depth: deep (synthesized, 4-12s), deep-reasoning (cross-reference, 12-50s)"
2512
- )
2513
- parser.add_argument(
2514
- "--exa-verbosity",
2515
- default=exa_config.get("verbosity", "standard"),
2516
- choices=["compact", "standard", "full"],
2517
- help="Exa text verbosity for content extraction"
2518
- )
2519
- parser.add_argument(
2520
- "--category",
2521
- choices=[
2522
- "company", "research paper", "news", "pdf", "github",
2523
- "tweet", "personal site", "linkedin profile"
2524
- ]
2525
- )
2526
- parser.add_argument("--start-date")
2527
- parser.add_argument("--end-date")
2528
- parser.add_argument("--similar-url")
2529
-
2530
- # You.com-specific
2531
- you_config = config.get("you", {})
2532
- parser.add_argument(
2533
- "--you-safesearch",
2534
- default=you_config.get("safesearch", "moderate"),
2535
- choices=["off", "moderate", "strict"],
2536
- help="You.com SafeSearch filter"
2537
- )
2538
- parser.add_argument(
2539
- "--freshness",
2540
- choices=["day", "week", "month", "year"],
2541
- help="Filter results by recency (You.com/Serper)"
2542
- )
2543
- parser.add_argument(
2544
- "--livecrawl",
2545
- choices=["web", "news", "all"],
2546
- help="You.com: fetch full page content"
2547
- )
2548
- parser.add_argument(
2549
- "--no-news",
2550
- action="store_true",
2551
- help="You.com: exclude news results (included by default)"
2552
- )
2553
-
2554
- # SearXNG-specific
2555
- searxng_config = config.get("searxng", {})
2556
- parser.add_argument(
2557
- "--searxng-url",
2558
- default=searxng_config.get("instance_url"),
2559
- help="SearXNG instance URL (e.g., https://searx.example.com)"
2560
- )
2561
- parser.add_argument(
2562
- "--searxng-safesearch",
2563
- type=int,
2564
- default=searxng_config.get("safesearch", 0),
2565
- choices=[0, 1, 2],
2566
- help="SearXNG SafeSearch: 0=off, 1=moderate, 2=strict"
2567
- )
2568
- parser.add_argument(
2569
- "--engines",
2570
- nargs="+",
2571
- default=searxng_config.get("engines"),
2572
- help="SearXNG: specific engines to use (e.g., google bing duckduckgo)"
2573
- )
2574
- parser.add_argument(
2575
- "--categories",
2576
- nargs="+",
2577
- help="SearXNG: search categories (general, images, news, videos, etc.)"
2578
- )
2579
-
2580
- # Domain filters
2581
- parser.add_argument("--include-domains", nargs="+")
2582
- parser.add_argument("--exclude-domains", nargs="+")
2583
-
2584
- # Output
2585
- parser.add_argument("--compact", action="store_true")
2586
-
2587
- # Caching options
2588
- parser.add_argument(
2589
- "--cache-ttl",
2590
- type=int,
2591
- default=DEFAULT_CACHE_TTL,
2592
- help=f"Cache TTL in seconds (default: {DEFAULT_CACHE_TTL} = 1 hour)"
2593
- )
2594
- parser.add_argument(
2595
- "--no-cache",
2596
- action="store_true",
2597
- help="Bypass cache (always fetch fresh results)"
2598
- )
2599
- parser.add_argument(
2600
- "--clear-cache",
2601
- action="store_true",
2602
- help="Clear all cached results and exit"
2603
- )
2604
- parser.add_argument(
2605
- "--cache-stats",
2606
- action="store_true",
2607
- help="Show cache statistics and exit"
2608
- )
2609
-
2610
- args = parser.parse_args()
2611
-
2612
- # Handle cache management commands first (before query validation)
2613
- if args.clear_cache:
2614
- result = cache_clear()
2615
- indent = None if args.compact else 2
2616
- print(json.dumps(result, indent=indent, ensure_ascii=False))
2617
- return
2618
-
2619
- if args.cache_stats:
2620
- result = cache_stats()
2621
- indent = None if args.compact else 2
2622
- print(json.dumps(result, indent=indent, ensure_ascii=False))
2623
- return
2624
-
2625
- if not args.query and not args.similar_url:
2626
- parser.error("--query is required (unless using --similar-url with Exa)")
2627
-
2628
- # Handle --explain-routing
2629
- if args.explain_routing:
2630
- if not args.query:
2631
- parser.error("--query is required for --explain-routing")
2632
- explanation = explain_routing(args.query, config)
2633
- indent = None if args.compact else 2
2634
- print(json.dumps(explanation, indent=indent, ensure_ascii=False))
2635
- return
2636
-
2637
- # Determine provider
2638
- if args.provider == "auto" or (args.provider is None and not args.similar_url):
2639
- if args.query:
2640
- routing = auto_route_provider(args.query, config)
2641
- provider = routing["provider"]
2642
- routing_info = {
2643
- "auto_routed": True,
2644
- "provider": provider,
2645
- "confidence": routing["confidence"],
2646
- "confidence_level": routing["confidence_level"],
2647
- "reason": routing["reason"],
2648
- "top_signals": routing["top_signals"],
2649
- "scores": routing["scores"],
2650
- }
2651
- else:
2652
- provider = "exa"
2653
- routing_info = {
2654
- "auto_routed": True,
2655
- "provider": "exa",
2656
- "confidence": 1.0,
2657
- "confidence_level": "high",
2658
- "reason": "similar_url_specified",
2659
- }
2660
- else:
2661
- provider = args.provider or "serper"
2662
- routing_info = {"auto_routed": False, "provider": provider}
2663
-
2664
- # Build provider fallback list
2665
- auto_config = config.get("auto_routing", {})
2666
- provider_priority = auto_config.get("provider_priority", ["tavily", "querit", "exa", "perplexity", "serper", "you", "searxng"])
2667
- disabled_providers = auto_config.get("disabled_providers", [])
2668
-
2669
- # Start with the selected provider, then try others in priority order
2670
- # Only include providers that have a configured API key (except the primary,
2671
- # which gets a clear error if unconfigured and no fallback succeeds)
2672
- providers_to_try = [provider]
2673
- for p in provider_priority:
2674
- if p not in providers_to_try and p not in disabled_providers and get_api_key(p, config):
2675
- providers_to_try.append(p)
2676
-
2677
- # Skip providers currently in cooldown
2678
- eligible_providers = []
2679
- cooldown_skips = []
2680
- for p in providers_to_try:
2681
- in_cd, remaining = provider_in_cooldown(p)
2682
- if in_cd:
2683
- cooldown_skips.append({"provider": p, "cooldown_remaining_seconds": remaining})
2684
- else:
2685
- eligible_providers.append(p)
2686
-
2687
- if not eligible_providers:
2688
- eligible_providers = providers_to_try[:1]
2689
-
2690
- # Helper function to execute search for a provider
2691
- def execute_search(prov: str) -> Dict[str, Any]:
2692
- key = validate_api_key(prov, config)
2693
- if prov == "serper":
2694
- return search_serper(
2695
- query=args.query,
2696
- api_key=key,
2697
- max_results=args.max_results,
2698
- country=args.country,
2699
- language=args.language,
2700
- search_type=args.search_type,
2701
- time_range=args.time_range,
2702
- include_images=args.images,
2703
- )
2704
- elif prov == "tavily":
2705
- return search_tavily(
2706
- query=args.query,
2707
- api_key=key,
2708
- max_results=args.max_results,
2709
- depth=args.depth,
2710
- topic=args.topic,
2711
- include_domains=args.include_domains,
2712
- exclude_domains=args.exclude_domains,
2713
- include_images=args.images,
2714
- include_raw_content=args.raw_content,
2715
- )
2716
- elif prov == "querit":
2717
- return search_querit(
2718
- query=args.query,
2719
- api_key=key,
2720
- max_results=args.max_results,
2721
- language=args.language,
2722
- country=args.country,
2723
- time_range=args.time_range or args.freshness,
2724
- include_domains=args.include_domains,
2725
- exclude_domains=args.exclude_domains,
2726
- base_url=args.querit_base_url,
2727
- base_path=args.querit_base_path,
2728
- timeout=int(querit_config.get("timeout", 30)),
2729
- )
2730
- elif prov == "exa":
2731
- # CLI --exa-depth overrides; fallback to auto-routing suggestion
2732
- exa_depth = args.exa_depth
2733
- if exa_depth == "normal" and routing_info.get("exa_depth") in ("deep", "deep-reasoning"):
2734
- exa_depth = routing_info["exa_depth"]
2735
- return search_exa(
2736
- query=args.query or "",
2737
- api_key=key,
2738
- max_results=args.max_results,
2739
- search_type=args.exa_type,
2740
- exa_depth=exa_depth,
2741
- category=args.category,
2742
- start_date=args.start_date,
2743
- end_date=args.end_date,
2744
- similar_url=args.similar_url,
2745
- include_domains=args.include_domains,
2746
- exclude_domains=args.exclude_domains,
2747
- text_verbosity=args.exa_verbosity,
2748
- )
2749
- elif prov == "perplexity":
2750
- perplexity_config = config.get("perplexity", {})
2751
- return search_perplexity(
2752
- query=args.query,
2753
- api_key=key,
2754
- max_results=args.max_results,
2755
- model=perplexity_config.get("model", "perplexity/sonar-pro"),
2756
- api_url=perplexity_config.get("api_url", "https://api.kilo.ai/api/gateway/chat/completions"),
2757
- freshness=getattr(args, "freshness", None),
2758
- )
2759
- elif prov == "you":
2760
- return search_you(
2761
- query=args.query,
2762
- api_key=key,
2763
- max_results=args.max_results,
2764
- country=args.country,
2765
- language=args.language,
2766
- freshness=args.freshness,
2767
- safesearch=args.you_safesearch,
2768
- include_news=not args.no_news,
2769
- livecrawl=args.livecrawl,
2770
- )
2771
- elif prov == "searxng":
2772
- # For SearXNG, 'key' is actually the instance URL
2773
- instance_url = args.searxng_url or key
2774
- if instance_url:
2775
- instance_url = _validate_searxng_url(instance_url)
2776
- return search_searxng(
2777
- query=args.query,
2778
- instance_url=instance_url,
2779
- max_results=args.max_results,
2780
- categories=args.categories,
2781
- engines=args.engines,
2782
- language=args.language,
2783
- time_range=args.time_range,
2784
- safesearch=args.searxng_safesearch,
2785
- )
2786
- else:
2787
- raise ValueError(f"Unknown provider: {prov}")
2788
-
2789
- def execute_with_retry(prov: str) -> Dict[str, Any]:
2790
- last_error = None
2791
- for attempt in range(0, 3):
2792
- try:
2793
- return execute_search(prov)
2794
- except ProviderRequestError as e:
2795
- last_error = e
2796
- if e.status_code in {401, 403}:
2797
- break
2798
- if not e.transient:
2799
- break
2800
- if attempt < 2:
2801
- time.sleep(RETRY_BACKOFF_SECONDS[attempt])
2802
- continue
2803
- break
2804
- except Exception as e:
2805
- last_error = e
2806
- break
2807
- raise last_error if last_error else Exception("Unknown provider execution error")
2808
-
2809
- cache_context = {
2810
- "locale": f"{args.country}:{args.language}",
2811
- "freshness": args.freshness,
2812
- "time_range": args.time_range,
2813
- "include_domains": sorted(args.include_domains) if args.include_domains else None,
2814
- "exclude_domains": sorted(args.exclude_domains) if args.exclude_domains else None,
2815
- "topic": args.topic,
2816
- "search_engines": sorted(args.engines) if args.engines else None,
2817
- "include_news": not args.no_news,
2818
- "search_type": args.search_type,
2819
- "exa_type": args.exa_type,
2820
- "exa_depth": args.exa_depth,
2821
- "exa_verbosity": args.exa_verbosity,
2822
- "category": args.category,
2823
- "similar_url": args.similar_url,
2824
- }
2825
-
2826
- # Check cache first (unless --no-cache is set)
2827
- cached_result = None
2828
- cache_hit = False
2829
- if not args.no_cache and args.query:
2830
- cached_result = cache_get(
2831
- query=args.query,
2832
- provider=provider,
2833
- max_results=args.max_results,
2834
- ttl=args.cache_ttl,
2835
- params=cache_context,
2836
- )
2837
- if cached_result:
2838
- cache_hit = True
2839
- result = {k: v for k, v in cached_result.items() if not k.startswith("_cache_")}
2840
- result["cached"] = True
2841
- result["cache_age_seconds"] = int(time.time() - cached_result.get("_cache_timestamp", 0))
2842
-
2843
- errors = []
2844
- successful_provider = None
2845
- successful_results: List[Tuple[str, Dict[str, Any]]] = []
2846
- result = None if not cache_hit else result
2847
-
2848
- for idx, current_provider in enumerate(eligible_providers):
2849
- if cache_hit:
2850
- successful_provider = provider
2851
- break
2852
- try:
2853
- provider_result = execute_with_retry(current_provider)
2854
- reset_provider_health(current_provider)
2855
- successful_results.append((current_provider, provider_result))
2856
- successful_provider = current_provider
2857
-
2858
- # If we have enough results, stop.
2859
- if len(provider_result.get("results", [])) >= args.max_results:
2860
- break
2861
-
2862
- # Only continue collecting from lower-priority providers when fallback was needed.
2863
- if not errors:
2864
- break
2865
- except Exception as e:
2866
- error_msg = str(e)
2867
- cooldown_info = mark_provider_failure(current_provider, error_msg)
2868
- errors.append({
2869
- "provider": current_provider,
2870
- "error": error_msg,
2871
- "cooldown_seconds": cooldown_info.get("cooldown_seconds"),
2872
- })
2873
- if len(eligible_providers) > 1:
2874
- remaining = eligible_providers[idx + 1:]
2875
- if remaining:
2876
- print(json.dumps({
2877
- "fallback": True,
2878
- "failed_provider": current_provider,
2879
- "error": error_msg,
2880
- "trying_next": remaining[0],
2881
- }), file=sys.stderr)
2882
- continue
2883
-
2884
- if successful_results:
2885
- if len(successful_results) == 1:
2886
- result = successful_results[0][1]
2887
- else:
2888
- primary = successful_results[0][1].copy()
2889
- deduped_results, dedup_count = deduplicate_results_across_providers(successful_results, args.max_results)
2890
- primary["results"] = deduped_results
2891
- primary["deduplicated"] = dedup_count > 0
2892
- primary.setdefault("metadata", {})
2893
- primary["metadata"]["dedup_count"] = dedup_count
2894
- primary["metadata"]["providers_merged"] = [p for p, _ in successful_results]
2895
- result = primary
2896
-
2897
- if result is not None:
2898
- if successful_provider != provider:
2899
- routing_info["fallback_used"] = True
2900
- routing_info["original_provider"] = provider
2901
- routing_info["provider"] = successful_provider
2902
- routing_info["fallback_errors"] = errors
2903
-
2904
- if cooldown_skips:
2905
- routing_info["cooldown_skips"] = cooldown_skips
2906
-
2907
- result["routing"] = routing_info
2908
-
2909
- if not cache_hit and not args.no_cache and args.query:
2910
- cache_put(
2911
- query=args.query,
2912
- provider=successful_provider or provider,
2913
- max_results=args.max_results,
2914
- result=result,
2915
- params=cache_context,
2916
- )
2917
-
2918
- result["cached"] = bool(cache_hit)
2919
- if "deduplicated" not in result:
2920
- result["deduplicated"] = False
2921
- result.setdefault("metadata", {})
2922
- result["metadata"].setdefault("dedup_count", 0)
2923
-
2924
- indent = None if args.compact else 2
2925
- print(json.dumps(result, indent=indent, ensure_ascii=False))
2926
- else:
2927
- error_result = {
2928
- "error": "All providers failed",
2929
- "provider": provider,
2930
- "query": args.query,
2931
- "routing": routing_info,
2932
- "provider_errors": errors,
2933
- "cooldown_skips": cooldown_skips,
2934
- }
2935
- print(json.dumps(error_result, indent=None if args.compact else 2, ensure_ascii=False), file=sys.stderr)
2936
- sys.exit(1)
2937
-
2938
-
2939
- if __name__ == "__main__":
2940
- main()
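One detail worth calling out in main's cache_context: list-valued filters are sorted before hashing because the cache-key helper sorts dict keys but preserves list order. A sketch of why that matters:

a = _get_cache_key("rust", "tavily", 5, params={"include_domains": ["b.com", "a.com"]})
b = _get_cache_key("rust", "tavily", 5, params={"include_domains": ["a.com", "b.com"]})
assert a != b  # raw list order changes the hash
assert b == _get_cache_key("rust", "tavily", 5,
                           params={"include_domains": sorted(["b.com", "a.com"])})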