iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. documentation_search_enhanced/__init__.py +14 -0
  2. documentation_search_enhanced/__main__.py +6 -0
  3. documentation_search_enhanced/config.json +1674 -0
  4. documentation_search_enhanced/config_manager.py +233 -0
  5. documentation_search_enhanced/config_validator.py +79 -0
  6. documentation_search_enhanced/content_enhancer.py +578 -0
  7. documentation_search_enhanced/docker_manager.py +87 -0
  8. documentation_search_enhanced/logger.py +179 -0
  9. documentation_search_enhanced/main.py +2170 -0
  10. documentation_search_enhanced/project_generator.py +260 -0
  11. documentation_search_enhanced/project_scanner.py +85 -0
  12. documentation_search_enhanced/reranker.py +230 -0
  13. documentation_search_enhanced/site_index_builder.py +274 -0
  14. documentation_search_enhanced/site_index_downloader.py +222 -0
  15. documentation_search_enhanced/site_search.py +1325 -0
  16. documentation_search_enhanced/smart_search.py +473 -0
  17. documentation_search_enhanced/snyk_integration.py +657 -0
  18. documentation_search_enhanced/vector_search.py +303 -0
  19. documentation_search_enhanced/version_resolver.py +189 -0
  20. documentation_search_enhanced/vulnerability_scanner.py +545 -0
  21. documentation_search_enhanced/web_scraper.py +117 -0
  22. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
  23. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
  24. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
  25. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
  26. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
documentation_search_enhanced/site_search.py
@@ -0,0 +1,1325 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Serper-free documentation site search.
4
+
5
+ This module provides a Serper-free fallback for `site:` queries by:
6
+
7
+ 1) Preferring docs-native search indexes when available (MkDocs / Sphinx)
8
+ 2) Falling back to sitemap discovery (robots.txt + sitemap.xml)
9
+ 3) Optionally using a Playwright-backed fetcher to score/snippet page content
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import asyncio
15
+ import gzip
16
+ import json
17
+ import os
18
+ import re
19
+ import xml.etree.ElementTree as ET
20
+ from dataclasses import dataclass
21
+ from datetime import datetime, timedelta, timezone
22
+ from typing import (
23
+ Any,
24
+ Awaitable,
25
+ Callable,
26
+ Dict,
27
+ Iterable,
28
+ List,
29
+ Optional,
30
+ Sequence,
31
+ Tuple,
32
+ )
33
+ from urllib.parse import urljoin, urlparse
34
+
35
+ import httpx
36
+ from bs4 import BeautifulSoup
37
+
38
+
39
+ _SITEMAP_CACHE_TTL = timedelta(hours=24)
40
+ _INDEX_CACHE_TTL = timedelta(hours=24)
41
+ _MAX_SITEMAP_URLS = 50_000
42
+ _MAX_SITEMAPS_TO_FETCH = 12
43
+ _MAX_INDEX_BYTES = 10_000_000
44
+ _MAX_INDEX_DOC_TEXT_CHARS = 5_000
45
+ _DEFAULT_CONTENT_FETCH_CONCURRENCY = 3
46
+
47
+
48
+ @dataclass(frozen=True)
49
+ class _SitemapCacheEntry:
50
+ fetched_at: datetime
51
+ urls: Tuple[str, ...]
52
+
53
+
54
+ _sitemap_cache: Dict[str, _SitemapCacheEntry] = {}
55
+ _sitemap_locks: Dict[str, asyncio.Lock] = {}
56
+
57
+
58
+ @dataclass(frozen=True)
59
+ class _IndexCacheEntry:
60
+ fetched_at: datetime
61
+ kind: str
62
+ payload: Any
63
+
64
+
65
+ _index_cache: Dict[str, _IndexCacheEntry] = {}
66
+ _index_locks: Dict[str, asyncio.Lock] = {}
67
+
68
+
69
+ def _parse_iso_datetime(value: Any) -> Optional[datetime]:
70
+ if not isinstance(value, str):
71
+ return None
72
+ raw = value.strip()
73
+ if not raw:
74
+ return None
75
+ try:
76
+ parsed = datetime.fromisoformat(raw)
77
+ except ValueError:
78
+ return None
79
+ if parsed.tzinfo is not None:
80
+ try:
81
+ parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
82
+ except Exception:
83
+ parsed = parsed.replace(tzinfo=None)
84
+ return parsed
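[Editor's note: a small sketch of the normalization this helper performs, assuming the package is importable; the timestamps are illustrative. Timezone-aware ISO strings are converted to naive UTC, non-ISO input yields None.]

    from datetime import datetime

    # Private helper imported purely for illustration.
    from documentation_search_enhanced.site_search import _parse_iso_datetime

    assert _parse_iso_datetime("2024-05-01T12:00:00+02:00") == datetime(2024, 5, 1, 10, 0, 0)
    assert _parse_iso_datetime("2024-05-01T10:00:00") == datetime(2024, 5, 1, 10, 0, 0)
    assert _parse_iso_datetime("not-a-date") is None
    assert _parse_iso_datetime(42) is None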
85
+
86
+
87
+ def export_preindexed_state() -> Dict[str, Any]:
88
+ """Export in-memory sitemap/index caches for persistence."""
89
+ now = datetime.now()
90
+ sitemaps: Dict[str, Dict[str, Any]] = {}
91
+ for origin, entry in _sitemap_cache.items():
92
+ sitemaps[origin] = {
93
+ "fetched_at": entry.fetched_at.isoformat(),
94
+ "urls": list(entry.urls),
95
+ }
96
+
97
+ indexes: Dict[str, Dict[str, Any]] = {}
98
+ for index_url, entry in _index_cache.items():
99
+ payload = entry.payload
100
+ if isinstance(payload, tuple):
101
+ payload = list(payload)
102
+
103
+ indexes[index_url] = {
104
+ "fetched_at": entry.fetched_at.isoformat(),
105
+ "kind": entry.kind,
106
+ "payload": payload,
107
+ }
108
+
109
+ return {
110
+ "schema_version": 1,
111
+ "generated_at": now.isoformat(),
112
+ "sitemaps": sitemaps,
113
+ "indexes": indexes,
114
+ }
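[Editor's note: the exported state is plain JSON-serializable data. A hedged sketch of its shape, with made-up hostnames and values; the keys follow export_preindexed_state/import_preindexed_state above.]

    # Illustrative shape of export_preindexed_state() output (values made up).
    example_state = {
        "schema_version": 1,
        "generated_at": "2024-05-01T10:00:00",
        "sitemaps": {
            "https://docs.example.com": {
                "fetched_at": "2024-05-01T09:00:00",
                "urls": ["https://docs.example.com/guide/", "https://docs.example.com/api/"],
            }
        },
        "indexes": {
            "https://docs.example.com/search/search_index.json": {
                "fetched_at": "2024-05-01T09:00:00",
                "kind": "mkdocs",
                "payload": [{"location": "guide/", "title": "Guide", "text": "..."}],
            }
        },
    }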
115
+
116
+
117
+ def import_preindexed_state(state: Any) -> None:
118
+ """Import previously persisted sitemap/index caches."""
119
+ if not isinstance(state, dict):
120
+ return
121
+
122
+ imported_at = datetime.now()
123
+ max_future_skew = timedelta(days=1)
124
+
125
+ sitemaps = state.get("sitemaps")
126
+ if isinstance(sitemaps, dict):
127
+ for origin, entry in sitemaps.items():
128
+ if not isinstance(origin, str) or not isinstance(entry, dict):
129
+ continue
130
+ urls_raw = entry.get("urls")
131
+ if not isinstance(urls_raw, list):
132
+ continue
133
+ urls = tuple(str(url).strip() for url in urls_raw if str(url).strip())
134
+ if not urls:
135
+ continue
136
+ fetched_at = _parse_iso_datetime(entry.get("fetched_at"))
137
+ if fetched_at is None or fetched_at > imported_at + max_future_skew:
138
+ fetched_at = imported_at
139
+ _sitemap_cache[origin] = _SitemapCacheEntry(
140
+ fetched_at=fetched_at, urls=urls
141
+ )
142
+
143
+ indexes = state.get("indexes")
144
+ if isinstance(indexes, dict):
145
+ for index_url, entry in indexes.items():
146
+ if not isinstance(index_url, str) or not isinstance(entry, dict):
147
+ continue
148
+ kind = entry.get("kind")
149
+ payload_raw = entry.get("payload")
150
+ if not isinstance(kind, str):
151
+ continue
152
+
153
+ if kind == "mkdocs":
154
+ if not isinstance(payload_raw, list):
155
+ continue
156
+ prepared = []
157
+ for doc in payload_raw:
158
+ if not isinstance(doc, dict):
159
+ continue
160
+ location = str(doc.get("location") or "").strip()
161
+ if not location:
162
+ continue
163
+ title = str(doc.get("title") or "").strip()
164
+ text = str(doc.get("text") or "").strip()
165
+ if len(text) > _MAX_INDEX_DOC_TEXT_CHARS:
166
+ text = text[:_MAX_INDEX_DOC_TEXT_CHARS]
167
+ prepared.append(
168
+ {"location": location, "title": title, "text": text}
169
+ )
170
+ if not prepared:
171
+ continue
172
+ payload: Any = tuple(prepared)
173
+ elif kind == "sphinx":
174
+ if not isinstance(payload_raw, dict):
175
+ continue
176
+ payload = payload_raw
177
+ else:
178
+ continue
179
+
180
+ fetched_at = _parse_iso_datetime(entry.get("fetched_at"))
181
+ if fetched_at is None or fetched_at > imported_at + max_future_skew:
182
+ fetched_at = imported_at
183
+
184
+ _index_cache[index_url] = _IndexCacheEntry(
185
+ fetched_at=fetched_at, kind=kind, payload=payload
186
+ )
187
+
188
+
189
+ def load_preindexed_state(path: str) -> bool:
190
+ """Load a persisted index cache from disk into memory."""
191
+ if not path:
192
+ return False
193
+ if not os.path.exists(path):
194
+ return False
195
+ try:
196
+ with open(path, "r", encoding="utf-8") as fh:
197
+ raw = json.load(fh)
198
+ except Exception:
199
+ return False
200
+ import_preindexed_state(raw)
201
+ return True
202
+
203
+
204
+ def save_preindexed_state(path: str) -> None:
205
+ """Persist current in-memory sitemap/index caches to disk."""
206
+ if not path:
207
+ raise ValueError("persist path must be non-empty")
208
+ state = export_preindexed_state()
209
+ tmp_path = f"{path}.tmp"
210
+ with open(tmp_path, "w", encoding="utf-8") as fh:
211
+ json.dump(state, fh)
212
+ os.replace(tmp_path, path)
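[Editor's note: a round-trip sketch of the persistence helpers, assuming the package is installed; the cache path is illustrative. save_preindexed_state writes to "<path>.tmp" and atomically replaces the target; load_preindexed_state tolerates a missing or unreadable file by returning False.]

    import os
    import tempfile

    from documentation_search_enhanced.site_search import (
        load_preindexed_state,
        save_preindexed_state,
    )

    cache_path = os.path.join(tempfile.gettempdir(), "site_search_cache.json")
    save_preindexed_state(cache_path)          # persists current in-memory caches
    assert load_preindexed_state(cache_path)   # True once the file exists and parses
    assert not load_preindexed_state("/nonexistent/cache.json")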
213
+
214
+
215
+ def _get_cached_index_from_memory(index_url: str, *, kind: str) -> Optional[Any]:
216
+ cache_entry = _index_cache.get(index_url)
217
+ if cache_entry and cache_entry.kind == kind:
218
+ return cache_entry.payload
219
+ return None
220
+
221
+
222
+ def _get_cached_sitemap_urls_from_memory(
223
+ origin: str, *, allow_stale: bool
224
+ ) -> Optional[List[str]]:
225
+ cache_entry = _sitemap_cache.get(origin)
226
+ if not cache_entry:
227
+ return None
228
+ if not cache_entry.urls:
229
+ return None
230
+ if allow_stale:
231
+ return list(cache_entry.urls)
232
+ if datetime.now() - cache_entry.fetched_at <= _SITEMAP_CACHE_TTL:
233
+ return list(cache_entry.urls)
234
+ return None
235
+
236
+
237
+ async def preindex_site(
238
+ site_url: str,
239
+ client: httpx.AsyncClient,
240
+ *,
241
+ user_agent: str,
242
+ include_sitemap: bool = False,
243
+ ) -> Dict[str, Any]:
244
+ """Fetch and cache on-site search indexes for a docs site."""
245
+ parsed = urlparse(site_url)
246
+ if not parsed.scheme or not parsed.netloc:
247
+ return {"site_url": site_url, "status": "invalid_url"}
248
+
249
+ origin = f"{parsed.scheme}://{parsed.netloc}"
250
+ results: Dict[str, Any] = {
251
+ "site_url": site_url,
252
+ "origin": origin,
253
+ "mkdocs_index": None,
254
+ "sphinx_index": None,
255
+ "sitemap": None,
256
+ "errors": [],
257
+ }
258
+
259
+ for index_url in _mkdocs_index_candidates(site_url):
260
+ try:
261
+ docs = await _get_cached_index(
262
+ client,
263
+ index_url,
264
+ user_agent=user_agent,
265
+ kind="mkdocs",
266
+ timeout_seconds=20.0,
267
+ )
268
+ except Exception as e:
269
+ results["errors"].append(f"mkdocs:{index_url}: {e}")
270
+ continue
271
+ if docs:
272
+ results["mkdocs_index"] = {"index_url": index_url, "documents": len(docs)}
273
+ break
274
+
275
+ for index_url in _sphinx_index_candidates(site_url):
276
+ try:
277
+ index = await _get_cached_index(
278
+ client,
279
+ index_url,
280
+ user_agent=user_agent,
281
+ kind="sphinx",
282
+ timeout_seconds=20.0,
283
+ )
284
+ except Exception as e:
285
+ results["errors"].append(f"sphinx:{index_url}: {e}")
286
+ continue
287
+ if isinstance(index, dict):
288
+ filenames = index.get("filenames")
289
+ results["sphinx_index"] = {
290
+ "index_url": index_url,
291
+ "documents": len(filenames) if isinstance(filenames, list) else None,
292
+ }
293
+ break
294
+
295
+ if include_sitemap:
296
+ try:
297
+ urls = await _load_site_sitemap_urls(
298
+ client, site_url, user_agent=user_agent
299
+ )
300
+ if urls:
301
+ _sitemap_cache[origin] = _SitemapCacheEntry(
302
+ fetched_at=datetime.now(), urls=tuple(urls)
303
+ )
304
+ results["sitemap"] = {"urls": len(urls)}
305
+ except Exception as e:
306
+ results["errors"].append(f"sitemap:{origin}: {e}")
307
+
308
+ results["status"] = (
309
+ "ok"
310
+ if results.get("mkdocs_index")
311
+ or results.get("sphinx_index")
312
+ or results.get("sitemap")
313
+ else "no_index_found"
314
+ )
315
+ return results
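[Editor's note: a hedged sketch of warming the caches for one docs site and inspecting the summary dict built above. The site URL and user agent are illustrative, not taken from the package.]

    import asyncio

    import httpx

    from documentation_search_enhanced.site_search import preindex_site


    async def warm_cache() -> None:
        async with httpx.AsyncClient() as client:
            result = await preindex_site(
                "https://www.mkdocs.org/",          # illustrative docs site
                client,
                user_agent="docs-search-example/1.0",
                include_sitemap=True,
            )
            # Keys per the code above: status, origin, mkdocs_index, sphinx_index, sitemap, errors.
            print(result["status"], result.get("mkdocs_index"), result.get("sitemap"))


    asyncio.run(warm_cache())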
316
+
317
+
318
+ def _parse_site_query(query: str) -> Tuple[Optional[str], str]:
319
+ match = re.search(r"\bsite:(\S+)", query)
320
+ if not match:
321
+ return None, query.strip()
322
+
323
+ site_token = match.group(1).strip().strip('"').strip("'")
324
+ remaining = (query[: match.start()] + query[match.end() :]).strip()
325
+ return site_token, remaining
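[Editor's note: a tiny sketch of how the `site:` token is split out; the query text is illustrative.]

    from documentation_search_enhanced.site_search import _parse_site_query

    # The site: token is stripped from the query; the rest becomes the search terms.
    site, terms = _parse_site_query("site:https://fastapi.tiangolo.com dependency injection")
    assert site == "https://fastapi.tiangolo.com"
    assert terms == "dependency injection"

    # Without a site: token the whole query is returned unchanged.
    assert _parse_site_query("dependency injection") == (None, "dependency injection")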
326
+
327
+
328
+ _STOPWORDS = {
329
+ "a",
330
+ "an",
331
+ "and",
332
+ "are",
333
+ "as",
334
+ "at",
335
+ "be",
336
+ "for",
337
+ "from",
338
+ "how",
339
+ "in",
340
+ "into",
341
+ "is",
342
+ "it",
343
+ "of",
344
+ "on",
345
+ "or",
346
+ "that",
347
+ "the",
348
+ "these",
349
+ "this",
350
+ "to",
351
+ "using",
352
+ "what",
353
+ "when",
354
+ "where",
355
+ "why",
356
+ "with",
357
+ }
358
+
359
+
360
+ def _tokenize_query(text: str) -> List[str]:
361
+ tokens = [t for t in re.findall(r"[a-z0-9]+", text.lower()) if t]
362
+ filtered: List[str] = []
363
+ for token in tokens:
364
+ if token in _STOPWORDS:
365
+ continue
366
+ if len(token) <= 1:
367
+ continue
368
+ if token not in filtered:
369
+ filtered.append(token)
370
+ return filtered[:12]
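[Editor's note: an illustrative tokenization, matching the rules above: lowercased, stop words and single characters dropped, duplicates removed, capped at 12 tokens.]

    from documentation_search_enhanced.site_search import _tokenize_query

    assert _tokenize_query("How to use Dependency Injection in FastAPI") == [
        "use",
        "dependency",
        "injection",
        "fastapi",
    ]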
371
+
372
+
373
+ def _matches_site_prefix(candidate_url: str, site_url: str) -> bool:
374
+ try:
375
+ candidate = urlparse(candidate_url)
376
+ site = urlparse(site_url)
377
+ except Exception:
378
+ return False
379
+
380
+ if not candidate.netloc or candidate.netloc.lower() != site.netloc.lower():
381
+ return False
382
+
383
+ site_path = site.path or "/"
384
+ candidate_path = candidate.path or "/"
385
+
386
+ site_path_norm = site_path.rstrip("/")
387
+ candidate_path_norm = candidate_path.rstrip("/")
388
+
389
+ if site_path_norm in ("", "/"):
390
+ return True
391
+
392
+ return candidate_path_norm == site_path_norm or candidate_path_norm.startswith(
393
+ f"{site_path_norm}/"
394
+ )
395
+
396
+
397
+ def _sitemap_candidates(site_url: str) -> List[str]:
398
+ parsed = urlparse(site_url)
399
+ if not parsed.scheme or not parsed.netloc:
400
+ return []
401
+
402
+ origin = f"{parsed.scheme}://{parsed.netloc}"
403
+ site_base = site_url.rstrip("/")
404
+
405
+ candidates = [
406
+ f"{origin}/sitemap.xml",
407
+ f"{origin}/sitemap_index.xml",
408
+ f"{origin}/sitemap-index.xml",
409
+ ]
410
+
411
+ if site_base != origin:
412
+ candidates.extend(
413
+ [
414
+ f"{site_base}/sitemap.xml",
415
+ f"{site_base}/sitemap_index.xml",
416
+ ]
417
+ )
418
+
419
+ # Deduplicate while preserving order.
420
+ seen = set()
421
+ unique: List[str] = []
422
+ for item in candidates:
423
+ if item not in seen:
424
+ unique.append(item)
425
+ seen.add(item)
426
+ return unique
427
+
428
+
429
+ def _mkdocs_index_candidates(site_url: str) -> List[str]:
430
+ parsed = urlparse(site_url)
431
+ if not parsed.scheme or not parsed.netloc:
432
+ return []
433
+
434
+ origin = f"{parsed.scheme}://{parsed.netloc}/"
435
+ base = site_url.rstrip("/") + "/"
436
+
437
+ candidates = [
438
+ urljoin(base, "search/search_index.json"),
439
+ urljoin(base, "search_index.json"),
440
+ ]
441
+ if base != origin:
442
+ candidates.extend(
443
+ [
444
+ urljoin(origin, "search/search_index.json"),
445
+ urljoin(origin, "search_index.json"),
446
+ ]
447
+ )
448
+
449
+ seen: set[str] = set()
450
+ unique: List[str] = []
451
+ for item in candidates:
452
+ if item not in seen:
453
+ unique.append(item)
454
+ seen.add(item)
455
+ return unique
456
+
457
+
458
+ def _mkdocs_base_from_index_url(index_url: str) -> str:
459
+ suffixes = ("/search/search_index.json", "/search_index.json")
460
+ for suffix in suffixes:
461
+ if index_url.endswith(suffix):
462
+ return index_url[: -len(suffix)] + "/"
463
+ return urljoin(index_url, "./")
464
+
465
+
466
+ def _sphinx_index_candidates(site_url: str) -> List[str]:
467
+ parsed = urlparse(site_url)
468
+ if not parsed.scheme or not parsed.netloc:
469
+ return []
470
+
471
+ origin = f"{parsed.scheme}://{parsed.netloc}/"
472
+ base = site_url.rstrip("/") + "/"
473
+
474
+ candidates = [urljoin(base, "searchindex.js")]
475
+ if base != origin:
476
+ candidates.append(urljoin(origin, "searchindex.js"))
477
+
478
+ seen: set[str] = set()
479
+ unique: List[str] = []
480
+ for item in candidates:
481
+ if item not in seen:
482
+ unique.append(item)
483
+ seen.add(item)
484
+ return unique
485
+
486
+
487
+ def _sphinx_base_from_index_url(index_url: str) -> str:
488
+ if index_url.endswith("/searchindex.js"):
489
+ return index_url[: -len("/searchindex.js")] + "/"
490
+ if index_url.endswith("searchindex.js"):
491
+ return index_url[: -len("searchindex.js")]
492
+ return urljoin(index_url, "./")
493
+
494
+
495
+ async def _fetch_bytes(
496
+ client: httpx.AsyncClient, url: str, *, user_agent: str, timeout_seconds: float
497
+ ) -> Optional[bytes]:
498
+ try:
499
+ response = await client.get(
500
+ url,
501
+ headers={"User-Agent": user_agent},
502
+ timeout=httpx.Timeout(timeout_seconds),
503
+ follow_redirects=True,
504
+ )
505
+ if response.status_code >= 400:
506
+ return None
507
+ return response.content
508
+ except Exception:
509
+ return None
510
+
511
+
512
+ def _maybe_decompress_gzip(blob: bytes) -> bytes:
513
+ # Some sitemaps are served as *.gz without Content-Encoding headers.
514
+ if len(blob) >= 2 and blob[0] == 0x1F and blob[1] == 0x8B:
515
+ try:
516
+ return gzip.decompress(blob)
517
+ except Exception:
518
+ return blob
519
+ return blob
520
+
521
+
522
+ def _xml_root_tag(root: ET.Element) -> str:
523
+ if "}" in root.tag:
524
+ return root.tag.split("}", 1)[1]
525
+ return root.tag
526
+
527
+
528
+ def _parse_sitemap_xml(blob: bytes) -> Tuple[List[str], List[str]]:
529
+ """
530
+ Returns (urls, child_sitemaps).
531
+ """
532
+ try:
533
+ root = ET.fromstring(blob)
534
+ except Exception:
535
+ return [], []
536
+
537
+ tag = _xml_root_tag(root)
538
+ if tag == "urlset":
539
+ urls = [
540
+ (loc.text or "").strip()
541
+ for loc in root.findall(".//{*}url/{*}loc")
542
+ if (loc.text or "").strip()
543
+ ]
544
+ return urls, []
545
+
546
+ if tag == "sitemapindex":
547
+ sitemaps = [
548
+ (loc.text or "").strip()
549
+ for loc in root.findall(".//{*}sitemap/{*}loc")
550
+ if (loc.text or "").strip()
551
+ ]
552
+ return [], sitemaps
553
+
554
+ return [], []
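[Editor's note: a minimal sketch of the two sitemap shapes this parser distinguishes; the URLs are made up, the xmlns is the standard sitemap namespace.]

    from documentation_search_enhanced.site_search import _parse_sitemap_xml

    urlset = b"""<?xml version="1.0" encoding="UTF-8"?>
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <url><loc>https://docs.example.com/guide/</loc></url>
    </urlset>"""
    # A <urlset> yields page URLs and no child sitemaps.
    assert _parse_sitemap_xml(urlset) == (["https://docs.example.com/guide/"], [])

    index = b"""<?xml version="1.0" encoding="UTF-8"?>
    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <sitemap><loc>https://docs.example.com/sitemap-docs.xml</loc></sitemap>
    </sitemapindex>"""
    # A <sitemapindex> yields child sitemaps to crawl instead.
    assert _parse_sitemap_xml(index) == ([], ["https://docs.example.com/sitemap-docs.xml"])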
555
+
556
+
557
+ async def _discover_sitemaps_from_robots(
558
+ client: httpx.AsyncClient, site_url: str, *, user_agent: str
559
+ ) -> List[str]:
560
+ parsed = urlparse(site_url)
561
+ if not parsed.scheme or not parsed.netloc:
562
+ return []
563
+ robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
564
+ try:
565
+ response = await client.get(
566
+ robots_url,
567
+ headers={"User-Agent": user_agent},
568
+ timeout=httpx.Timeout(10.0),
569
+ follow_redirects=True,
570
+ )
571
+ if response.status_code >= 400:
572
+ return []
573
+ sitemaps = []
574
+ for line in response.text.splitlines():
575
+ if line.lower().startswith("sitemap:"):
576
+ sitemap = line.split(":", 1)[1].strip()
577
+ if sitemap:
578
+ sitemaps.append(sitemap)
579
+ return sitemaps
580
+ except Exception:
581
+ return []
582
+
583
+
584
+ async def _load_site_sitemap_urls(
585
+ client: httpx.AsyncClient,
586
+ site_url: str,
587
+ *,
588
+ user_agent: str,
589
+ allow_html_fallback: bool = True,
590
+ ) -> List[str]:
591
+ sitemap_urls = await _discover_sitemaps_from_robots(
592
+ client, site_url, user_agent=user_agent
593
+ )
594
+ sitemap_urls.extend(_sitemap_candidates(site_url))
595
+
596
+ visited_sitemaps = set()
597
+ sitemap_queue = []
598
+ for sitemap_url in sitemap_urls:
599
+ if sitemap_url and sitemap_url not in visited_sitemaps:
600
+ sitemap_queue.append(sitemap_url)
601
+ visited_sitemaps.add(sitemap_url)
602
+
603
+ discovered_urls: List[str] = []
604
+ seen_urls = set()
605
+
606
+ while sitemap_queue and len(visited_sitemaps) <= _MAX_SITEMAPS_TO_FETCH:
607
+ sitemap_url = sitemap_queue.pop(0)
608
+
609
+ blob = await _fetch_bytes(
610
+ client, sitemap_url, user_agent=user_agent, timeout_seconds=15.0
611
+ )
612
+ if not blob:
613
+ continue
614
+
615
+ blob = _maybe_decompress_gzip(blob)
616
+ urls, child_sitemaps = _parse_sitemap_xml(blob)
617
+
618
+ for url in urls:
619
+ if url in seen_urls:
620
+ continue
621
+ seen_urls.add(url)
622
+ discovered_urls.append(url)
623
+ if len(discovered_urls) >= _MAX_SITEMAP_URLS:
624
+ return discovered_urls
625
+
626
+ for child in child_sitemaps:
627
+ if child in visited_sitemaps:
628
+ continue
629
+ if len(visited_sitemaps) >= _MAX_SITEMAPS_TO_FETCH:
630
+ break
631
+ visited_sitemaps.add(child)
632
+ sitemap_queue.append(child)
633
+
634
+ if discovered_urls:
635
+ return discovered_urls
636
+
637
+ if not allow_html_fallback:
638
+ return []
639
+
640
+ return await _discover_urls_from_html_links(client, site_url, user_agent=user_agent)
641
+
642
+
643
+ async def _discover_urls_from_html_links(
644
+ client: httpx.AsyncClient, site_url: str, *, user_agent: str
645
+ ) -> List[str]:
646
+ """Discover internal links from the site's HTML when no sitemap is available."""
647
+ parsed = urlparse(site_url)
648
+ if not parsed.scheme or not parsed.netloc:
649
+ return []
650
+ origin = f"{parsed.scheme}://{parsed.netloc}"
651
+ base_for_join = site_url.rstrip("/") + "/"
652
+
653
+ html_blob = await _fetch_bytes(
654
+ client, site_url, user_agent=user_agent, timeout_seconds=15.0
655
+ )
656
+ if not html_blob:
657
+ return []
658
+
659
+ try:
660
+ html_text = html_blob.decode("utf-8", errors="ignore")
661
+ except Exception:
662
+ return []
663
+
664
+ soup = BeautifulSoup(html_text, "html.parser")
665
+ discovered_from_html: List[str] = []
666
+ seen_html_urls: set[str] = set()
667
+
668
+ def _is_asset_path(path: str) -> bool:
669
+ return bool(
670
+ re.search(
671
+ r"\.(?:png|jpe?g|gif|svg|webp|css|js|map|ico|woff2?|ttf|otf|eot|pdf|zip|gz)$",
672
+ path.lower(),
673
+ )
674
+ )
675
+
676
+ for anchor in soup.find_all("a", href=True):
677
+ href = str(anchor.get("href") or "").strip()
678
+ if not href:
679
+ continue
680
+ lower = href.lower()
681
+ if lower.startswith(("#", "mailto:", "javascript:", "tel:")):
682
+ continue
683
+
684
+ absolute = urljoin(base_for_join, href)
685
+ parsed_link = urlparse(absolute)
686
+ if parsed_link.scheme not in {"http", "https"}:
687
+ continue
688
+ if parsed_link.netloc.lower() != parsed.netloc.lower():
689
+ continue
690
+ if _is_asset_path(parsed_link.path or ""):
691
+ continue
692
+
693
+ sanitized = parsed_link._replace(query="", fragment="").geturl()
694
+ if not sanitized.startswith(origin):
695
+ continue
696
+ if sanitized in seen_html_urls:
697
+ continue
698
+ seen_html_urls.add(sanitized)
699
+ discovered_from_html.append(sanitized)
700
+ if len(discovered_from_html) >= _MAX_SITEMAP_URLS:
701
+ break
702
+
703
+ return discovered_from_html
704
+
705
+
706
+ def _extract_text_snippet(text: str, tokens: Sequence[str]) -> str:
707
+ cleaned = re.sub(r"\s+", " ", text).strip()
708
+ if not cleaned:
709
+ return ""
710
+
711
+ if not tokens:
712
+ return cleaned[:240]
713
+
714
+ lower = cleaned.lower()
715
+ best_idx: Optional[int] = None
716
+ for token in tokens:
717
+ idx = lower.find(token)
718
+ if idx == -1:
719
+ continue
720
+ if best_idx is None or idx < best_idx:
721
+ best_idx = idx
722
+
723
+ if best_idx is None:
724
+ return cleaned[:240]
725
+
726
+ start = max(0, best_idx - 80)
727
+ end = min(len(cleaned), best_idx + 160)
728
+ return cleaned[start:end].strip()[:240]
729
+
730
+
731
+ def _score_urls(urls: Iterable[str], tokens: Sequence[str]) -> List[Tuple[int, str]]:
732
+ scored: List[Tuple[int, str]] = []
733
+ if not tokens:
734
+ return [(1, url) for url in urls]
735
+
736
+ for url in urls:
737
+ url_lower = url.lower()
738
+ score = 0
739
+ for token in tokens:
740
+ if token not in url_lower:
741
+ continue
742
+ score += 1
743
+ # Boost for segment-level matches.
744
+ path = urlparse(url).path.lower()
745
+ segments = [seg for seg in re.split(r"[/._-]+", path) if seg]
746
+ if token in segments:
747
+ score += 6
748
+ else:
749
+ score += 2
750
+ if score > 0:
751
+ scored.append((score, url))
752
+ scored.sort(key=lambda item: (-item[0], len(item[1])))
753
+ return scored
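[Editor's note: URL scoring is purely lexical; a match on a whole path segment earns a larger boost than a substring match, and non-matching URLs are dropped. URLs below are illustrative.]

    from documentation_search_enhanced.site_search import _score_urls

    urls = [
        "https://docs.example.com/guide/injection/",      # "injection" is a path segment: 1 + 6
        "https://docs.example.com/blog/injections-faq/",  # substring match only: 1 + 2
        "https://docs.example.com/about/",                # no match: dropped
    ]
    assert _score_urls(urls, ["injection"]) == [
        (7, "https://docs.example.com/guide/injection/"),
        (3, "https://docs.example.com/blog/injections-faq/"),
    ]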
754
+
755
+
756
+ def _fallback_title_from_url(url: str) -> str:
757
+ parsed = urlparse(url)
758
+ segment = (parsed.path or "/").rstrip("/").split("/")[-1]
759
+ segment = re.sub(r"\.[a-z0-9]+$", "", segment, flags=re.IGNORECASE)
760
+ segment = segment.replace("-", " ").replace("_", " ").strip()
761
+ if not segment:
762
+ return url
763
+ return segment[:1].upper() + segment[1:]
764
+
765
+
766
+ def _extract_page_snippet(soup: BeautifulSoup, tokens: Sequence[str]) -> str:
767
+ for meta_name in ("description", "og:description"):
768
+ meta = soup.find("meta", attrs={"name": meta_name}) or soup.find(
769
+ "meta", attrs={"property": meta_name}
770
+ )
771
+ if meta and meta.get("content"):
772
+ return str(meta["content"]).strip()[:240]
773
+
774
+ text = soup.get_text(" ", strip=True)
775
+ if not text:
776
+ return ""
777
+
778
+ if not tokens:
779
+ return text[:240]
780
+
781
+ text_lower = text.lower()
782
+ for token in tokens:
783
+ idx = text_lower.find(token)
784
+ if idx == -1:
785
+ continue
786
+ start = max(0, idx - 80)
787
+ end = min(len(text), idx + 160)
788
+ snippet = text[start:end].strip()
789
+ return snippet[:240]
790
+
791
+ return text[:240]
792
+
793
+
794
+ async def _fetch_result_metadata(
795
+ client: httpx.AsyncClient, url: str, *, user_agent: str, tokens: Sequence[str]
796
+ ) -> Dict[str, str]:
797
+ try:
798
+ response = await client.get(
799
+ url,
800
+ headers={"User-Agent": user_agent},
801
+ timeout=httpx.Timeout(12.0),
802
+ follow_redirects=True,
803
+ )
804
+ if response.status_code >= 400:
805
+ return {"title": _fallback_title_from_url(url), "snippet": ""}
806
+ soup = BeautifulSoup(response.text, "html.parser")
807
+ title = (soup.title.string or "").strip() if soup.title else ""
808
+ if not title:
809
+ title = _fallback_title_from_url(url)
810
+ snippet = _extract_page_snippet(soup, tokens)
811
+ return {"title": title, "snippet": snippet}
812
+ except Exception:
813
+ return {"title": _fallback_title_from_url(url), "snippet": ""}
814
+
815
+
816
+ async def _get_cached_index(
817
+ client: httpx.AsyncClient,
818
+ index_url: str,
819
+ *,
820
+ user_agent: str,
821
+ kind: str,
822
+ timeout_seconds: float,
823
+ ) -> Optional[Any]:
824
+ now = datetime.now()
825
+ cache_entry = _index_cache.get(index_url)
826
+ stale_payload: Optional[Any] = None
827
+ if cache_entry and cache_entry.kind == kind:
828
+ stale_payload = cache_entry.payload
829
+ if (
830
+ cache_entry
831
+ and cache_entry.kind == kind
832
+ and now - cache_entry.fetched_at <= _INDEX_CACHE_TTL
833
+ ):
834
+ return cache_entry.payload
835
+
836
+ lock = _index_locks.setdefault(index_url, asyncio.Lock())
837
+ async with lock:
838
+ cache_entry = _index_cache.get(index_url)
839
+ if (
840
+ cache_entry
841
+ and cache_entry.kind == kind
842
+ and now - cache_entry.fetched_at <= _INDEX_CACHE_TTL
843
+ ):
844
+ return cache_entry.payload
845
+ if cache_entry and cache_entry.kind == kind:
846
+ stale_payload = cache_entry.payload
847
+
848
+ blob = await _fetch_bytes(
849
+ client, index_url, user_agent=user_agent, timeout_seconds=timeout_seconds
850
+ )
851
+ if (not blob or len(blob) > _MAX_INDEX_BYTES) and stale_payload is not None:
852
+ _index_cache[index_url] = _IndexCacheEntry(
853
+ fetched_at=datetime.now(), kind=kind, payload=stale_payload
854
+ )
855
+ return stale_payload
856
+ if not blob or len(blob) > _MAX_INDEX_BYTES:
857
+ return None
858
+
859
+ payload: Any
860
+ if kind == "mkdocs":
861
+ try:
862
+ raw = json.loads(blob.decode("utf-8"))
863
+ except Exception:
864
+ if stale_payload is not None:
865
+ _index_cache[index_url] = _IndexCacheEntry(
866
+ fetched_at=datetime.now(), kind=kind, payload=stale_payload
867
+ )
868
+ return stale_payload
869
+ docs = raw.get("docs")
870
+ if not isinstance(docs, list):
871
+ if stale_payload is not None:
872
+ _index_cache[index_url] = _IndexCacheEntry(
873
+ fetched_at=datetime.now(), kind=kind, payload=stale_payload
874
+ )
875
+ return stale_payload
876
+
877
+ prepared = []
878
+ for doc in docs:
879
+ if not isinstance(doc, dict):
880
+ continue
881
+ location = str(doc.get("location") or "").strip()
882
+ title = str(doc.get("title") or "").strip()
883
+ text = str(doc.get("text") or "").strip()
884
+ if len(text) > _MAX_INDEX_DOC_TEXT_CHARS:
885
+ text = text[:_MAX_INDEX_DOC_TEXT_CHARS]
886
+ if not location:
887
+ continue
888
+ prepared.append({"location": location, "title": title, "text": text})
889
+
890
+ payload = tuple(prepared)
891
+ elif kind == "sphinx":
892
+ try:
893
+ text = blob.decode("utf-8", errors="ignore")
894
+ except Exception:
895
+ if stale_payload is not None:
896
+ _index_cache[index_url] = _IndexCacheEntry(
897
+ fetched_at=datetime.now(), kind=kind, payload=stale_payload
898
+ )
899
+ return stale_payload
900
+
901
+ marker = "Search.setIndex("
902
+ idx = text.find(marker)
903
+ if idx == -1:
904
+ if stale_payload is not None:
905
+ _index_cache[index_url] = _IndexCacheEntry(
906
+ fetched_at=datetime.now(), kind=kind, payload=stale_payload
907
+ )
908
+ return stale_payload
909
+ start = text.find("{", idx)
910
+ end = text.rfind("}")
911
+ if start == -1 or end == -1 or end <= start:
912
+ if stale_payload is not None:
913
+ _index_cache[index_url] = _IndexCacheEntry(
914
+ fetched_at=datetime.now(), kind=kind, payload=stale_payload
915
+ )
916
+ return stale_payload
917
+ json_text = text[start : end + 1]
918
+ try:
919
+ payload = json.loads(json_text)
920
+ except Exception:
921
+ if stale_payload is not None:
922
+ _index_cache[index_url] = _IndexCacheEntry(
923
+ fetched_at=datetime.now(), kind=kind, payload=stale_payload
924
+ )
925
+ return stale_payload
926
+ else:
927
+ return None
928
+
929
+ _index_cache[index_url] = _IndexCacheEntry(
930
+ fetched_at=datetime.now(), kind=kind, payload=payload
931
+ )
932
+ return payload
933
+
934
+
935
+ def _score_document(url: str, title: str, text: str, tokens: Sequence[str]) -> int:
936
+ if not tokens:
937
+ return 1
938
+
939
+ url_lower = url.lower()
940
+ title_lower = title.lower()
941
+ text_lower = text.lower()
942
+
943
+ score = 0
944
+ for token in tokens:
945
+ if token in title_lower:
946
+ score += 25
947
+ if token in url_lower:
948
+ score += 6
949
+ occurrences = text_lower.count(token)
950
+ if occurrences:
951
+ score += 8 + min(occurrences, 20)
952
+
953
+ return score
954
+
955
+
956
+ async def _gather_with_limit(
957
+ coros: Sequence[Awaitable[Any]], *, concurrency: int
958
+ ) -> List[Any]:
959
+ if concurrency <= 1:
960
+ results: List[Any] = []
961
+ for coro in coros:
962
+ results.append(await coro)
963
+ return results
964
+
965
+ semaphore = asyncio.Semaphore(concurrency)
966
+
967
+ async def _runner(coro: Awaitable[Any]) -> Any:
968
+ async with semaphore:
969
+ return await coro
970
+
971
+ return await asyncio.gather(
972
+ *[_runner(coro) for coro in coros], return_exceptions=True
973
+ )
974
+
975
+
976
+ async def _search_via_mkdocs_index(
977
+ site_url: str,
978
+ tokens: Sequence[str],
979
+ client: httpx.AsyncClient,
980
+ *,
981
+ user_agent: str,
982
+ num_results: int,
983
+ allow_network: bool,
984
+ ) -> Optional[List[Dict[str, str]]]:
985
+ for index_url in _mkdocs_index_candidates(site_url):
986
+ if allow_network:
987
+ docs = await _get_cached_index(
988
+ client,
989
+ index_url,
990
+ user_agent=user_agent,
991
+ kind="mkdocs",
992
+ timeout_seconds=20.0,
993
+ )
994
+ else:
995
+ docs = _get_cached_index_from_memory(index_url, kind="mkdocs")
996
+ if not docs:
997
+ continue
998
+
999
+ base_url = _mkdocs_base_from_index_url(index_url)
1000
+ scored: List[Tuple[int, Dict[str, str]]] = []
1001
+ for doc in docs:
1002
+ location = str(doc.get("location") or "")
1003
+ url = urljoin(base_url, location)
1004
+ if not _matches_site_prefix(url, site_url):
1005
+ continue
1006
+ title = str(doc.get("title") or "") or _fallback_title_from_url(url)
1007
+ text = str(doc.get("text") or "")
1008
+ score = _score_document(url, title, text, tokens)
1009
+ if score <= 0:
1010
+ continue
1011
+ snippet = _extract_text_snippet(text, tokens)
1012
+ scored.append((score, {"link": url, "title": title, "snippet": snippet}))
1013
+
1014
+ scored.sort(key=lambda item: (-item[0], len(item[1]["link"])))
1015
+ organic = [item[1] for item in scored[: max(num_results, 1)]]
1016
+ if organic:
1017
+ return organic
1018
+
1019
+ return None
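[Editor's note: the index consumed above appears to be the search_index.json emitted by MkDocs' built-in search plugin; only the fields this code reads are sketched here, with illustrative page content.]

    # Illustrative subset of search/search_index.json as consumed by
    # _search_via_mkdocs_index (only "docs" and its location/title/text are used).
    example_mkdocs_index = {
        "docs": [
            {
                "location": "tutorial/dependencies/",
                "title": "Dependencies",
                "text": "Declare dependencies and let the framework inject them...",
            },
            {
                "location": "tutorial/dependencies/#what-is-dependency-injection",
                "title": "What is Dependency Injection",
                "text": "...",
            },
        ]
    }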
1020
+
1021
+
1022
+ def _coerce_sphinx_doc_hits(entry: Any) -> List[Tuple[int, int]]:
1023
+ """Return a list of (doc_id, weight) pairs."""
1024
+ if not isinstance(entry, list):
1025
+ return []
1026
+ if not entry:
1027
+ return []
1028
+
1029
+ if all(isinstance(item, int) for item in entry):
1030
+ return [(item, 1) for item in entry]
1031
+
1032
+ hits: List[Tuple[int, int]] = []
1033
+ for item in entry:
1034
+ if isinstance(item, int):
1035
+ hits.append((item, 1))
1036
+ continue
1037
+ if isinstance(item, (list, tuple)) and item and isinstance(item[0], int):
1038
+ weight = 1
1039
+ if len(item) > 1 and isinstance(item[1], int):
1040
+ weight = max(item[1], 1)
1041
+ hits.append((item[0], weight))
1042
+ return hits
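[Editor's note: Sphinx searchindex.js term tables come in more than one shape; this helper normalizes the ones it recognizes to (doc_id, weight) pairs. Doc ids below are illustrative.]

    from documentation_search_enhanced.site_search import _coerce_sphinx_doc_hits

    # A plain list of document ids gets weight 1.
    assert _coerce_sphinx_doc_hits([3, 7]) == [(3, 1), (7, 1)]

    # Mixed entries: nested [doc_id, weight] pairs keep their weight (minimum 1).
    assert _coerce_sphinx_doc_hits([[2, 5], 9]) == [(2, 5), (9, 1)]

    # Anything that is not a list is ignored.
    assert _coerce_sphinx_doc_hits(4) == []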
1043
+
1044
+
1045
+ async def _search_via_sphinx_index(
1046
+ site_url: str,
1047
+ tokens: Sequence[str],
1048
+ client: httpx.AsyncClient,
1049
+ *,
1050
+ user_agent: str,
1051
+ num_results: int,
1052
+ fetch_text: Optional[Callable[[str], Awaitable[str]]],
1053
+ fetch_text_concurrency: int,
1054
+ allow_network: bool,
1055
+ ) -> Optional[List[Dict[str, str]]]:
1056
+ for index_url in _sphinx_index_candidates(site_url):
1057
+ if allow_network:
1058
+ index = await _get_cached_index(
1059
+ client,
1060
+ index_url,
1061
+ user_agent=user_agent,
1062
+ kind="sphinx",
1063
+ timeout_seconds=20.0,
1064
+ )
1065
+ else:
1066
+ index = _get_cached_index_from_memory(index_url, kind="sphinx")
1067
+ if not isinstance(index, dict):
1068
+ continue
1069
+
1070
+ filenames = index.get("filenames")
1071
+ titles = index.get("titles")
1072
+ terms = index.get("terms")
1073
+ titleterms = index.get("titleterms")
1074
+ if not (
1075
+ isinstance(filenames, list)
1076
+ and isinstance(titles, list)
1077
+ and isinstance(terms, dict)
1078
+ ):
1079
+ continue
1080
+
1081
+ base_url = _sphinx_base_from_index_url(index_url)
1082
+ scores: Dict[int, int] = {}
1083
+ for token in tokens:
1084
+ for doc_id, weight in _coerce_sphinx_doc_hits(terms.get(token)):
1085
+ scores[doc_id] = scores.get(doc_id, 0) + (10 * weight)
1086
+ if isinstance(titleterms, dict):
1087
+ for doc_id, weight in _coerce_sphinx_doc_hits(titleterms.get(token)):
1088
+ scores[doc_id] = scores.get(doc_id, 0) + (20 * weight)
1089
+
1090
+ ranked_doc_ids = sorted(scores.items(), key=lambda item: -item[1])
1091
+ hits: List[Tuple[int, str]] = []
1092
+ for doc_id, _ in ranked_doc_ids:
1093
+ if not isinstance(doc_id, int) or doc_id < 0 or doc_id >= len(filenames):
1094
+ continue
1095
+ url = urljoin(base_url, str(filenames[doc_id]))
1096
+ if not _matches_site_prefix(url, site_url):
1097
+ continue
1098
+ hits.append((doc_id, url))
1099
+ if len(hits) >= max(num_results, 1):
1100
+ break
1101
+
1102
+ if not hits:
1103
+ continue
1104
+
1105
+ urls = [url for _, url in hits]
1106
+ snippets_by_url: Dict[str, str] = {url: "" for url in urls}
1107
+ if allow_network and fetch_text and tokens:
1108
+ texts = await _gather_with_limit(
1109
+ [fetch_text(url) for url in urls], concurrency=fetch_text_concurrency
1110
+ )
1111
+ for url, text in zip(urls, texts):
1112
+ if isinstance(text, Exception):
1113
+ continue
1114
+ snippets_by_url[url] = _extract_text_snippet(str(text), tokens)
1115
+ elif allow_network and tokens:
1116
+ metadatas = await asyncio.gather(
1117
+ *[
1118
+ _fetch_result_metadata(
1119
+ client, url, user_agent=user_agent, tokens=tokens
1120
+ )
1121
+ for url in urls
1122
+ ],
1123
+ return_exceptions=True,
1124
+ )
1125
+ for url, metadata in zip(urls, metadatas):
1126
+ if isinstance(metadata, Exception):
1127
+ continue
1128
+ snippets_by_url[url] = metadata.get("snippet", "")
1129
+
1130
+ organic: List[Dict[str, str]] = []
1131
+ for doc_id, url in hits:
1132
+ title = _fallback_title_from_url(url)
1133
+ if doc_id < len(titles) and titles[doc_id]:
1134
+ title = str(titles[doc_id])
1135
+ organic.append(
1136
+ {
1137
+ "link": url,
1138
+ "title": title,
1139
+ "snippet": snippets_by_url.get(url, ""),
1140
+ }
1141
+ )
1142
+
1143
+ if organic:
1144
+ return organic
1145
+
1146
+ return None
1147
+
1148
+
1149
+ async def search_site_via_sitemap(
1150
+ query: str,
1151
+ client: httpx.AsyncClient,
1152
+ *,
1153
+ user_agent: str,
1154
+ num_results: int = 5,
1155
+ fetch_text: Optional[Callable[[str], Awaitable[str]]] = None,
1156
+ fetch_text_concurrency: int = _DEFAULT_CONTENT_FETCH_CONCURRENCY,
1157
+ allow_network: bool = True,
1158
+ ) -> Dict[str, Any]:
1159
+ """
1160
+ Perform a Serper-free search for `site:` queries.
1161
+
1162
+ Returns a Serper-like payload: {"organic": [{"link","title","snippet"}, ...]}.
1163
+ """
1164
+ site_url, terms = _parse_site_query(query)
1165
+ if not site_url:
1166
+ return {"organic": []}
1167
+
1168
+ parsed = urlparse(site_url)
1169
+ if not parsed.scheme or not parsed.netloc:
1170
+ return {"organic": []}
1171
+
1172
+ origin = f"{parsed.scheme}://{parsed.netloc}"
1173
+
1174
+ tokens = _tokenize_query(terms)
1175
+
1176
+ # 1) Prefer docs-native search indexes when present.
1177
+ organic = await _search_via_mkdocs_index(
1178
+ site_url,
1179
+ tokens,
1180
+ client,
1181
+ user_agent=user_agent,
1182
+ num_results=num_results,
1183
+ allow_network=allow_network,
1184
+ )
1185
+ if not organic:
1186
+ organic = await _search_via_sphinx_index(
1187
+ site_url,
1188
+ tokens,
1189
+ client,
1190
+ user_agent=user_agent,
1191
+ num_results=num_results,
1192
+ fetch_text=fetch_text,
1193
+ fetch_text_concurrency=fetch_text_concurrency,
1194
+ allow_network=allow_network,
1195
+ )
1196
+ if organic:
1197
+ return {"organic": organic}
1198
+
1199
+ # 2) Fallback: sitemap discovery + ranking.
1200
+ cached_urls = _get_cached_sitemap_urls_from_memory(origin, allow_stale=False)
1201
+ if cached_urls is not None:
1202
+ all_urls = cached_urls
1203
+ else:
1204
+ if not allow_network:
1205
+ stale_cached = _get_cached_sitemap_urls_from_memory(
1206
+ origin, allow_stale=True
1207
+ )
1208
+ if stale_cached is None:
1209
+ return {"organic": []}
1210
+ all_urls = stale_cached
1211
+ else:
1212
+ lock = _sitemap_locks.setdefault(origin, asyncio.Lock())
1213
+ async with lock:
1214
+ cached_urls = _get_cached_sitemap_urls_from_memory(
1215
+ origin, allow_stale=False
1216
+ )
1217
+ if cached_urls is not None:
1218
+ all_urls = cached_urls
1219
+ else:
1220
+ loaded = await _load_site_sitemap_urls(
1221
+ client,
1222
+ site_url,
1223
+ user_agent=user_agent,
1224
+ allow_html_fallback=False,
1225
+ )
1226
+ if loaded:
1227
+ _sitemap_cache[origin] = _SitemapCacheEntry(
1228
+ fetched_at=datetime.now(), urls=tuple(loaded)
1229
+ )
1230
+ all_urls = loaded
1231
+ else:
1232
+ stale_cached = _get_cached_sitemap_urls_from_memory(
1233
+ origin, allow_stale=True
1234
+ )
1235
+ if stale_cached is not None:
1236
+ existing = _sitemap_cache.get(origin)
1237
+ if existing and existing.urls:
1238
+ _sitemap_cache[origin] = _SitemapCacheEntry(
1239
+ fetched_at=datetime.now(), urls=existing.urls
1240
+ )
1241
+ all_urls = stale_cached
1242
+ else:
1243
+ discovered = await _discover_urls_from_html_links(
1244
+ client, site_url, user_agent=user_agent
1245
+ )
1246
+ if not discovered:
1247
+ return {"organic": []}
1248
+ _sitemap_cache[origin] = _SitemapCacheEntry(
1249
+ fetched_at=datetime.now(), urls=tuple(discovered)
1250
+ )
1251
+ all_urls = discovered
1252
+
1253
+ candidates = [u for u in all_urls if _matches_site_prefix(u, site_url)]
1254
+ scored = _score_urls(candidates, tokens)
1255
+
1256
+ # Preselect candidates (URL-based), then optionally rescore using page text.
1257
+ preselect_limit = min(12, max(6, max(num_results, 1) * 2))
1258
+ if scored:
1259
+ preselect_urls = [url for _, url in scored[:preselect_limit]]
1260
+ url_scores = {url: score for score, url in scored[:preselect_limit]}
1261
+ else:
1262
+ preselect_urls = sorted(candidates, key=len)[:preselect_limit]
1263
+ url_scores = {url: 0 for url in preselect_urls}
1264
+
1265
+ if fetch_text and tokens and preselect_urls:
1266
+ texts = await _gather_with_limit(
1267
+ [fetch_text(url) for url in preselect_urls],
1268
+ concurrency=fetch_text_concurrency,
1269
+ )
1270
+ rescored: List[Tuple[int, str, str, str]] = []
1271
+ for url, text in zip(preselect_urls, texts):
1272
+ title = _fallback_title_from_url(url)
1273
+ if isinstance(text, Exception):
1274
+ rescored.append((url_scores.get(url, 0), url, title, ""))
1275
+ continue
1276
+ snippet = _extract_text_snippet(str(text), tokens)
1277
+ content_score = _score_document(url, title, str(text), tokens)
1278
+ total = url_scores.get(url, 0) + content_score
1279
+ rescored.append((total, url, title, snippet))
1280
+
1281
+ rescored.sort(key=lambda item: (-item[0], len(item[1])))
1282
+ organic = [
1283
+ {"link": url, "title": title, "snippet": snippet}
1284
+ for _, url, title, snippet in rescored[: max(num_results, 1)]
1285
+ ]
1286
+ return {"organic": organic}
1287
+
1288
+ top_urls = preselect_urls[: max(num_results, 1)]
1289
+ if not top_urls:
1290
+ return {"organic": []}
1291
+
1292
+ if not allow_network:
1293
+ return {
1294
+ "organic": [
1295
+ {"link": url, "title": _fallback_title_from_url(url), "snippet": ""}
1296
+ for url in top_urls
1297
+ ]
1298
+ }
1299
+
1300
+ tasks = [
1301
+ _fetch_result_metadata(client, url, user_agent=user_agent, tokens=tokens)
1302
+ for url in top_urls
1303
+ ]
1304
+ metadatas = await asyncio.gather(*tasks, return_exceptions=True)
1305
+
1306
+ organic: List[Dict[str, str]] = []
1307
+ for url, metadata in zip(top_urls, metadatas):
1308
+ if isinstance(metadata, Exception):
1309
+ organic.append(
1310
+ {
1311
+ "link": url,
1312
+ "title": _fallback_title_from_url(url),
1313
+ "snippet": "",
1314
+ }
1315
+ )
1316
+ else:
1317
+ organic.append(
1318
+ {
1319
+ "link": url,
1320
+ "title": metadata.get("title", _fallback_title_from_url(url)),
1321
+ "snippet": metadata.get("snippet", ""),
1322
+ }
1323
+ )
1324
+
1325
+ return {"organic": organic}