iflow-mcp_anton-prosterity-documentation-search-enhanced 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- documentation_search_enhanced/__init__.py +14 -0
- documentation_search_enhanced/__main__.py +6 -0
- documentation_search_enhanced/config.json +1674 -0
- documentation_search_enhanced/config_manager.py +233 -0
- documentation_search_enhanced/config_validator.py +79 -0
- documentation_search_enhanced/content_enhancer.py +578 -0
- documentation_search_enhanced/docker_manager.py +87 -0
- documentation_search_enhanced/logger.py +179 -0
- documentation_search_enhanced/main.py +2170 -0
- documentation_search_enhanced/project_generator.py +260 -0
- documentation_search_enhanced/project_scanner.py +85 -0
- documentation_search_enhanced/reranker.py +230 -0
- documentation_search_enhanced/site_index_builder.py +274 -0
- documentation_search_enhanced/site_index_downloader.py +222 -0
- documentation_search_enhanced/site_search.py +1325 -0
- documentation_search_enhanced/smart_search.py +473 -0
- documentation_search_enhanced/snyk_integration.py +657 -0
- documentation_search_enhanced/vector_search.py +303 -0
- documentation_search_enhanced/version_resolver.py +189 -0
- documentation_search_enhanced/vulnerability_scanner.py +545 -0
- documentation_search_enhanced/web_scraper.py +117 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
documentation_search_enhanced/site_search.py
@@ -0,0 +1,1325 @@
+#!/usr/bin/env python3
+"""
+Serper-free documentation site search.
+
+This module provides a Serper-free fallback for `site:` queries by:
+
+1) Preferring docs-native search indexes when available (MkDocs / Sphinx)
+2) Falling back to sitemap discovery (robots.txt + sitemap.xml)
+3) Optionally using a Playwright-backed fetcher to score/snippet page content
+"""
+
+from __future__ import annotations
+
+import asyncio
+import gzip
+import json
+import os
+import re
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from typing import (
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+)
+from urllib.parse import urljoin, urlparse
+
+import httpx
+from bs4 import BeautifulSoup
+
+
+_SITEMAP_CACHE_TTL = timedelta(hours=24)
+_INDEX_CACHE_TTL = timedelta(hours=24)
+_MAX_SITEMAP_URLS = 50_000
+_MAX_SITEMAPS_TO_FETCH = 12
+_MAX_INDEX_BYTES = 10_000_000
+_MAX_INDEX_DOC_TEXT_CHARS = 5_000
+_DEFAULT_CONTENT_FETCH_CONCURRENCY = 3
+
+
+@dataclass(frozen=True)
+class _SitemapCacheEntry:
+    fetched_at: datetime
+    urls: Tuple[str, ...]
+
+
+_sitemap_cache: Dict[str, _SitemapCacheEntry] = {}
+_sitemap_locks: Dict[str, asyncio.Lock] = {}
+
+
+@dataclass(frozen=True)
+class _IndexCacheEntry:
+    fetched_at: datetime
+    kind: str
+    payload: Any
+
+
+_index_cache: Dict[str, _IndexCacheEntry] = {}
+_index_locks: Dict[str, asyncio.Lock] = {}
+
+
+def _parse_iso_datetime(value: Any) -> Optional[datetime]:
+    if not isinstance(value, str):
+        return None
+    raw = value.strip()
+    if not raw:
+        return None
+    try:
+        parsed = datetime.fromisoformat(raw)
+    except ValueError:
+        return None
+    if parsed.tzinfo is not None:
+        try:
+            parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
+        except Exception:
+            parsed = parsed.replace(tzinfo=None)
+    return parsed
+
+
+def export_preindexed_state() -> Dict[str, Any]:
+    """Export in-memory sitemap/index caches for persistence."""
+    now = datetime.now()
+    sitemaps: Dict[str, Dict[str, Any]] = {}
+    for origin, entry in _sitemap_cache.items():
+        sitemaps[origin] = {
+            "fetched_at": entry.fetched_at.isoformat(),
+            "urls": list(entry.urls),
+        }
+
+    indexes: Dict[str, Dict[str, Any]] = {}
+    for index_url, entry in _index_cache.items():
+        payload = entry.payload
+        if isinstance(payload, tuple):
+            payload = list(payload)
+
+        indexes[index_url] = {
+            "fetched_at": entry.fetched_at.isoformat(),
+            "kind": entry.kind,
+            "payload": payload,
+        }
+
+    return {
+        "schema_version": 1,
+        "generated_at": now.isoformat(),
+        "sitemaps": sitemaps,
+        "indexes": indexes,
+    }
+
+
+def import_preindexed_state(state: Any) -> None:
+    """Import previously persisted sitemap/index caches."""
+    if not isinstance(state, dict):
+        return
+
+    imported_at = datetime.now()
+    max_future_skew = timedelta(days=1)
+
+    sitemaps = state.get("sitemaps")
+    if isinstance(sitemaps, dict):
+        for origin, entry in sitemaps.items():
+            if not isinstance(origin, str) or not isinstance(entry, dict):
+                continue
+            urls_raw = entry.get("urls")
+            if not isinstance(urls_raw, list):
+                continue
+            urls = tuple(str(url).strip() for url in urls_raw if str(url).strip())
+            if not urls:
+                continue
+            fetched_at = _parse_iso_datetime(entry.get("fetched_at"))
+            if fetched_at is None or fetched_at > imported_at + max_future_skew:
+                fetched_at = imported_at
+            _sitemap_cache[origin] = _SitemapCacheEntry(
+                fetched_at=fetched_at, urls=urls
+            )
+
+    indexes = state.get("indexes")
+    if isinstance(indexes, dict):
+        for index_url, entry in indexes.items():
+            if not isinstance(index_url, str) or not isinstance(entry, dict):
+                continue
+            kind = entry.get("kind")
+            payload_raw = entry.get("payload")
+            if not isinstance(kind, str):
+                continue
+
+            if kind == "mkdocs":
+                if not isinstance(payload_raw, list):
+                    continue
+                prepared = []
+                for doc in payload_raw:
+                    if not isinstance(doc, dict):
+                        continue
+                    location = str(doc.get("location") or "").strip()
+                    if not location:
+                        continue
+                    title = str(doc.get("title") or "").strip()
+                    text = str(doc.get("text") or "").strip()
+                    if len(text) > _MAX_INDEX_DOC_TEXT_CHARS:
+                        text = text[:_MAX_INDEX_DOC_TEXT_CHARS]
+                    prepared.append(
+                        {"location": location, "title": title, "text": text}
+                    )
+                if not prepared:
+                    continue
+                payload: Any = tuple(prepared)
+            elif kind == "sphinx":
+                if not isinstance(payload_raw, dict):
+                    continue
+                payload = payload_raw
+            else:
+                continue
+
+            fetched_at = _parse_iso_datetime(entry.get("fetched_at"))
+            if fetched_at is None or fetched_at > imported_at + max_future_skew:
+                fetched_at = imported_at
+
+            _index_cache[index_url] = _IndexCacheEntry(
+                fetched_at=fetched_at, kind=kind, payload=payload
+            )
+
+
+def load_preindexed_state(path: str) -> bool:
+    """Load a persisted index cache from disk into memory."""
+    if not path:
+        return False
+    if not os.path.exists(path):
+        return False
+    try:
+        with open(path, "r", encoding="utf-8") as fh:
+            raw = json.load(fh)
+    except Exception:
+        return False
+    import_preindexed_state(raw)
+    return True
+
+
+def save_preindexed_state(path: str) -> None:
+    """Persist current in-memory sitemap/index caches to disk."""
+    if not path:
+        raise ValueError("persist path must be non-empty")
+    state = export_preindexed_state()
+    tmp_path = f"{path}.tmp"
+    with open(tmp_path, "w", encoding="utf-8") as fh:
+        json.dump(state, fh)
+    os.replace(tmp_path, path)
+
+
+def _get_cached_index_from_memory(index_url: str, *, kind: str) -> Optional[Any]:
+    cache_entry = _index_cache.get(index_url)
+    if cache_entry and cache_entry.kind == kind:
+        return cache_entry.payload
+    return None
+
+
+def _get_cached_sitemap_urls_from_memory(
+    origin: str, *, allow_stale: bool
+) -> Optional[List[str]]:
+    cache_entry = _sitemap_cache.get(origin)
+    if not cache_entry:
+        return None
+    if not cache_entry.urls:
+        return None
+    if allow_stale:
+        return list(cache_entry.urls)
+    if datetime.now() - cache_entry.fetched_at <= _SITEMAP_CACHE_TTL:
+        return list(cache_entry.urls)
+    return None
+
+
+async def preindex_site(
+    site_url: str,
+    client: httpx.AsyncClient,
+    *,
+    user_agent: str,
+    include_sitemap: bool = False,
+) -> Dict[str, Any]:
+    """Fetch and cache on-site search indexes for a docs site."""
+    parsed = urlparse(site_url)
+    if not parsed.scheme or not parsed.netloc:
+        return {"site_url": site_url, "status": "invalid_url"}
+
+    origin = f"{parsed.scheme}://{parsed.netloc}"
+    results: Dict[str, Any] = {
+        "site_url": site_url,
+        "origin": origin,
+        "mkdocs_index": None,
+        "sphinx_index": None,
+        "sitemap": None,
+        "errors": [],
+    }
+
+    for index_url in _mkdocs_index_candidates(site_url):
+        try:
+            docs = await _get_cached_index(
+                client,
+                index_url,
+                user_agent=user_agent,
+                kind="mkdocs",
+                timeout_seconds=20.0,
+            )
+        except Exception as e:
+            results["errors"].append(f"mkdocs:{index_url}: {e}")
+            continue
+        if docs:
+            results["mkdocs_index"] = {"index_url": index_url, "documents": len(docs)}
+            break
+
+    for index_url in _sphinx_index_candidates(site_url):
+        try:
+            index = await _get_cached_index(
+                client,
+                index_url,
+                user_agent=user_agent,
+                kind="sphinx",
+                timeout_seconds=20.0,
+            )
+        except Exception as e:
+            results["errors"].append(f"sphinx:{index_url}: {e}")
+            continue
+        if isinstance(index, dict):
+            filenames = index.get("filenames")
+            results["sphinx_index"] = {
+                "index_url": index_url,
+                "documents": len(filenames) if isinstance(filenames, list) else None,
+            }
+            break
+
+    if include_sitemap:
+        try:
+            urls = await _load_site_sitemap_urls(
+                client, site_url, user_agent=user_agent
+            )
+            if urls:
+                _sitemap_cache[origin] = _SitemapCacheEntry(
+                    fetched_at=datetime.now(), urls=tuple(urls)
+                )
+                results["sitemap"] = {"urls": len(urls)}
+        except Exception as e:
+            results["errors"].append(f"sitemap:{origin}: {e}")
+
+    results["status"] = (
+        "ok"
+        if results.get("mkdocs_index")
+        or results.get("sphinx_index")
+        or results.get("sitemap")
+        else "no_index_found"
+    )
+    return results
+
+
+def _parse_site_query(query: str) -> Tuple[Optional[str], str]:
+    match = re.search(r"\bsite:(\S+)", query)
+    if not match:
+        return None, query.strip()
+
+    site_token = match.group(1).strip().strip('"').strip("'")
+    remaining = (query[: match.start()] + query[match.end() :]).strip()
+    return site_token, remaining
+
+
+_STOPWORDS = {
+    "a",
+    "an",
+    "and",
+    "are",
+    "as",
+    "at",
+    "be",
+    "for",
+    "from",
+    "how",
+    "in",
+    "into",
+    "is",
+    "it",
+    "of",
+    "on",
+    "or",
+    "that",
+    "the",
+    "these",
+    "this",
+    "to",
+    "using",
+    "what",
+    "when",
+    "where",
+    "why",
+    "with",
+}
+
+
+def _tokenize_query(text: str) -> List[str]:
+    tokens = [t for t in re.findall(r"[a-z0-9]+", text.lower()) if t]
+    filtered: List[str] = []
+    for token in tokens:
+        if token in _STOPWORDS:
+            continue
+        if len(token) <= 1:
+            continue
+        if token not in filtered:
+            filtered.append(token)
+    return filtered[:12]
+
+
+def _matches_site_prefix(candidate_url: str, site_url: str) -> bool:
+    try:
+        candidate = urlparse(candidate_url)
+        site = urlparse(site_url)
+    except Exception:
+        return False
+
+    if not candidate.netloc or candidate.netloc.lower() != site.netloc.lower():
+        return False
+
+    site_path = site.path or "/"
+    candidate_path = candidate.path or "/"
+
+    site_path_norm = site_path.rstrip("/")
+    candidate_path_norm = candidate_path.rstrip("/")
+
+    if site_path_norm in ("", "/"):
+        return True
+
+    return candidate_path_norm == site_path_norm or candidate_path_norm.startswith(
+        f"{site_path_norm}/"
+    )
+
+
+def _sitemap_candidates(site_url: str) -> List[str]:
+    parsed = urlparse(site_url)
+    if not parsed.scheme or not parsed.netloc:
+        return []
+
+    origin = f"{parsed.scheme}://{parsed.netloc}"
+    site_base = site_url.rstrip("/")
+
+    candidates = [
+        f"{origin}/sitemap.xml",
+        f"{origin}/sitemap_index.xml",
+        f"{origin}/sitemap-index.xml",
+    ]
+
+    if site_base != origin:
+        candidates.extend(
+            [
+                f"{site_base}/sitemap.xml",
+                f"{site_base}/sitemap_index.xml",
+            ]
+        )
+
+    # Deduplicate while preserving order.
+    seen = set()
+    unique: List[str] = []
+    for item in candidates:
+        if item not in seen:
+            unique.append(item)
+            seen.add(item)
+    return unique
+
+
+def _mkdocs_index_candidates(site_url: str) -> List[str]:
+    parsed = urlparse(site_url)
+    if not parsed.scheme or not parsed.netloc:
+        return []
+
+    origin = f"{parsed.scheme}://{parsed.netloc}/"
+    base = site_url.rstrip("/") + "/"
+
+    candidates = [
+        urljoin(base, "search/search_index.json"),
+        urljoin(base, "search_index.json"),
+    ]
+    if base != origin:
+        candidates.extend(
+            [
+                urljoin(origin, "search/search_index.json"),
+                urljoin(origin, "search_index.json"),
+            ]
+        )
+
+    seen: set[str] = set()
+    unique: List[str] = []
+    for item in candidates:
+        if item not in seen:
+            unique.append(item)
+            seen.add(item)
+    return unique
+
+
+def _mkdocs_base_from_index_url(index_url: str) -> str:
+    suffixes = ("/search/search_index.json", "/search_index.json")
+    for suffix in suffixes:
+        if index_url.endswith(suffix):
+            return index_url[: -len(suffix)] + "/"
+    return urljoin(index_url, "./")
+
+
+def _sphinx_index_candidates(site_url: str) -> List[str]:
+    parsed = urlparse(site_url)
+    if not parsed.scheme or not parsed.netloc:
+        return []
+
+    origin = f"{parsed.scheme}://{parsed.netloc}/"
+    base = site_url.rstrip("/") + "/"
+
+    candidates = [urljoin(base, "searchindex.js")]
+    if base != origin:
+        candidates.append(urljoin(origin, "searchindex.js"))
+
+    seen: set[str] = set()
+    unique: List[str] = []
+    for item in candidates:
+        if item not in seen:
+            unique.append(item)
+            seen.add(item)
+    return unique
+
+
+def _sphinx_base_from_index_url(index_url: str) -> str:
+    if index_url.endswith("/searchindex.js"):
+        return index_url[: -len("/searchindex.js")] + "/"
+    if index_url.endswith("searchindex.js"):
+        return index_url[: -len("searchindex.js")]
+    return urljoin(index_url, "./")
+
+
+async def _fetch_bytes(
+    client: httpx.AsyncClient, url: str, *, user_agent: str, timeout_seconds: float
+) -> Optional[bytes]:
+    try:
+        response = await client.get(
+            url,
+            headers={"User-Agent": user_agent},
+            timeout=httpx.Timeout(timeout_seconds),
+            follow_redirects=True,
+        )
+        if response.status_code >= 400:
+            return None
+        return response.content
+    except Exception:
+        return None
+
+
+def _maybe_decompress_gzip(blob: bytes) -> bytes:
+    # Some sitemaps are served as *.gz without Content-Encoding headers.
+    if len(blob) >= 2 and blob[0] == 0x1F and blob[1] == 0x8B:
+        try:
+            return gzip.decompress(blob)
+        except Exception:
+            return blob
+    return blob
+
+
+def _xml_root_tag(root: ET.Element) -> str:
+    if "}" in root.tag:
+        return root.tag.split("}", 1)[1]
+    return root.tag
+
+
+def _parse_sitemap_xml(blob: bytes) -> Tuple[List[str], List[str]]:
+    """
+    Returns (urls, child_sitemaps).
+    """
+    try:
+        root = ET.fromstring(blob)
+    except Exception:
+        return [], []
+
+    tag = _xml_root_tag(root)
+    if tag == "urlset":
+        urls = [
+            (loc.text or "").strip()
+            for loc in root.findall(".//{*}url/{*}loc")
+            if (loc.text or "").strip()
+        ]
+        return urls, []
+
+    if tag == "sitemapindex":
+        sitemaps = [
+            (loc.text or "").strip()
+            for loc in root.findall(".//{*}sitemap/{*}loc")
+            if (loc.text or "").strip()
+        ]
+        return [], sitemaps
+
+    return [], []
+
+
+async def _discover_sitemaps_from_robots(
+    client: httpx.AsyncClient, site_url: str, *, user_agent: str
+) -> List[str]:
+    parsed = urlparse(site_url)
+    if not parsed.scheme or not parsed.netloc:
+        return []
+    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+    try:
+        response = await client.get(
+            robots_url,
+            headers={"User-Agent": user_agent},
+            timeout=httpx.Timeout(10.0),
+            follow_redirects=True,
+        )
+        if response.status_code >= 400:
+            return []
+        sitemaps = []
+        for line in response.text.splitlines():
+            if line.lower().startswith("sitemap:"):
+                sitemap = line.split(":", 1)[1].strip()
+                if sitemap:
+                    sitemaps.append(sitemap)
+        return sitemaps
+    except Exception:
+        return []
+
+
+async def _load_site_sitemap_urls(
+    client: httpx.AsyncClient,
+    site_url: str,
+    *,
+    user_agent: str,
+    allow_html_fallback: bool = True,
+) -> List[str]:
+    sitemap_urls = await _discover_sitemaps_from_robots(
+        client, site_url, user_agent=user_agent
+    )
+    sitemap_urls.extend(_sitemap_candidates(site_url))
+
+    visited_sitemaps = set()
+    sitemap_queue = []
+    for sitemap_url in sitemap_urls:
+        if sitemap_url and sitemap_url not in visited_sitemaps:
+            sitemap_queue.append(sitemap_url)
+            visited_sitemaps.add(sitemap_url)
+
+    discovered_urls: List[str] = []
+    seen_urls = set()
+
+    while sitemap_queue and len(visited_sitemaps) <= _MAX_SITEMAPS_TO_FETCH:
+        sitemap_url = sitemap_queue.pop(0)
+
+        blob = await _fetch_bytes(
+            client, sitemap_url, user_agent=user_agent, timeout_seconds=15.0
+        )
+        if not blob:
+            continue
+
+        blob = _maybe_decompress_gzip(blob)
+        urls, child_sitemaps = _parse_sitemap_xml(blob)
+
+        for url in urls:
+            if url in seen_urls:
+                continue
+            seen_urls.add(url)
+            discovered_urls.append(url)
+            if len(discovered_urls) >= _MAX_SITEMAP_URLS:
+                return discovered_urls
+
+        for child in child_sitemaps:
+            if child in visited_sitemaps:
+                continue
+            if len(visited_sitemaps) >= _MAX_SITEMAPS_TO_FETCH:
+                break
+            visited_sitemaps.add(child)
+            sitemap_queue.append(child)
+
+    if discovered_urls:
+        return discovered_urls
+
+    if not allow_html_fallback:
+        return []
+
+    return await _discover_urls_from_html_links(client, site_url, user_agent=user_agent)
+
+
+async def _discover_urls_from_html_links(
+    client: httpx.AsyncClient, site_url: str, *, user_agent: str
+) -> List[str]:
+    """Discover internal links from the site's HTML when no sitemap is available."""
+    parsed = urlparse(site_url)
+    if not parsed.scheme or not parsed.netloc:
+        return []
+    origin = f"{parsed.scheme}://{parsed.netloc}"
+    base_for_join = site_url.rstrip("/") + "/"
+
+    html_blob = await _fetch_bytes(
+        client, site_url, user_agent=user_agent, timeout_seconds=15.0
+    )
+    if not html_blob:
+        return []
+
+    try:
+        html_text = html_blob.decode("utf-8", errors="ignore")
+    except Exception:
+        return []
+
+    soup = BeautifulSoup(html_text, "html.parser")
+    discovered_from_html: List[str] = []
+    seen_html_urls: set[str] = set()
+
+    def _is_asset_path(path: str) -> bool:
+        return bool(
+            re.search(
+                r"\.(?:png|jpe?g|gif|svg|webp|css|js|map|ico|woff2?|ttf|otf|eot|pdf|zip|gz)$",
+                path.lower(),
+            )
+        )
+
+    for anchor in soup.find_all("a", href=True):
+        href = str(anchor.get("href") or "").strip()
+        if not href:
+            continue
+        lower = href.lower()
+        if lower.startswith(("#", "mailto:", "javascript:", "tel:")):
+            continue
+
+        absolute = urljoin(base_for_join, href)
+        parsed_link = urlparse(absolute)
+        if parsed_link.scheme not in {"http", "https"}:
+            continue
+        if parsed_link.netloc.lower() != parsed.netloc.lower():
+            continue
+        if _is_asset_path(parsed_link.path or ""):
+            continue
+
+        sanitized = parsed_link._replace(query="", fragment="").geturl()
+        if not sanitized.startswith(origin):
+            continue
+        if sanitized in seen_html_urls:
+            continue
+        seen_html_urls.add(sanitized)
+        discovered_from_html.append(sanitized)
+        if len(discovered_from_html) >= _MAX_SITEMAP_URLS:
+            break
+
+    return discovered_from_html
+
+
+def _extract_text_snippet(text: str, tokens: Sequence[str]) -> str:
+    cleaned = re.sub(r"\s+", " ", text).strip()
+    if not cleaned:
+        return ""
+
+    if not tokens:
+        return cleaned[:240]
+
+    lower = cleaned.lower()
+    best_idx: Optional[int] = None
+    for token in tokens:
+        idx = lower.find(token)
+        if idx == -1:
+            continue
+        if best_idx is None or idx < best_idx:
+            best_idx = idx
+
+    if best_idx is None:
+        return cleaned[:240]
+
+    start = max(0, best_idx - 80)
+    end = min(len(cleaned), best_idx + 160)
+    return cleaned[start:end].strip()[:240]
+
+
+def _score_urls(urls: Iterable[str], tokens: Sequence[str]) -> List[Tuple[int, str]]:
+    scored: List[Tuple[int, str]] = []
+    if not tokens:
+        return [(1, url) for url in urls]
+
+    for url in urls:
+        url_lower = url.lower()
+        score = 0
+        for token in tokens:
+            if token not in url_lower:
+                continue
+            score += 1
+            # Boost for segment-level matches.
+            path = urlparse(url).path.lower()
+            segments = [seg for seg in re.split(r"[/._-]+", path) if seg]
+            if token in segments:
+                score += 6
+            else:
+                score += 2
+        if score > 0:
+            scored.append((score, url))
+    scored.sort(key=lambda item: (-item[0], len(item[1])))
+    return scored
+
+
+def _fallback_title_from_url(url: str) -> str:
+    parsed = urlparse(url)
+    segment = (parsed.path or "/").rstrip("/").split("/")[-1]
+    segment = re.sub(r"\.[a-z0-9]+$", "", segment, flags=re.IGNORECASE)
+    segment = segment.replace("-", " ").replace("_", " ").strip()
+    if not segment:
+        return url
+    return segment[:1].upper() + segment[1:]
+
+
+def _extract_page_snippet(soup: BeautifulSoup, tokens: Sequence[str]) -> str:
+    for meta_name in ("description", "og:description"):
+        meta = soup.find("meta", attrs={"name": meta_name}) or soup.find(
+            "meta", attrs={"property": meta_name}
+        )
+        if meta and meta.get("content"):
+            return str(meta["content"]).strip()[:240]
+
+    text = soup.get_text(" ", strip=True)
+    if not text:
+        return ""
+
+    if not tokens:
+        return text[:240]
+
+    text_lower = text.lower()
+    for token in tokens:
+        idx = text_lower.find(token)
+        if idx == -1:
+            continue
+        start = max(0, idx - 80)
+        end = min(len(text), idx + 160)
+        snippet = text[start:end].strip()
+        return snippet[:240]
+
+    return text[:240]
+
+
+async def _fetch_result_metadata(
+    client: httpx.AsyncClient, url: str, *, user_agent: str, tokens: Sequence[str]
+) -> Dict[str, str]:
+    try:
+        response = await client.get(
+            url,
+            headers={"User-Agent": user_agent},
+            timeout=httpx.Timeout(12.0),
+            follow_redirects=True,
+        )
+        if response.status_code >= 400:
+            return {"title": _fallback_title_from_url(url), "snippet": ""}
+        soup = BeautifulSoup(response.text, "html.parser")
+        title = (soup.title.string or "").strip() if soup.title else ""
+        if not title:
+            title = _fallback_title_from_url(url)
+        snippet = _extract_page_snippet(soup, tokens)
+        return {"title": title, "snippet": snippet}
+    except Exception:
+        return {"title": _fallback_title_from_url(url), "snippet": ""}
+
+
+async def _get_cached_index(
+    client: httpx.AsyncClient,
+    index_url: str,
+    *,
+    user_agent: str,
+    kind: str,
+    timeout_seconds: float,
+) -> Optional[Any]:
+    now = datetime.now()
+    cache_entry = _index_cache.get(index_url)
+    stale_payload: Optional[Any] = None
+    if cache_entry and cache_entry.kind == kind:
+        stale_payload = cache_entry.payload
+    if (
+        cache_entry
+        and cache_entry.kind == kind
+        and now - cache_entry.fetched_at <= _INDEX_CACHE_TTL
+    ):
+        return cache_entry.payload
+
+    lock = _index_locks.setdefault(index_url, asyncio.Lock())
+    async with lock:
+        cache_entry = _index_cache.get(index_url)
+        if (
+            cache_entry
+            and cache_entry.kind == kind
+            and now - cache_entry.fetched_at <= _INDEX_CACHE_TTL
+        ):
+            return cache_entry.payload
+        if cache_entry and cache_entry.kind == kind:
+            stale_payload = cache_entry.payload
+
+        blob = await _fetch_bytes(
+            client, index_url, user_agent=user_agent, timeout_seconds=timeout_seconds
+        )
+        if (not blob or len(blob) > _MAX_INDEX_BYTES) and stale_payload is not None:
+            _index_cache[index_url] = _IndexCacheEntry(
+                fetched_at=datetime.now(), kind=kind, payload=stale_payload
+            )
+            return stale_payload
+        if not blob or len(blob) > _MAX_INDEX_BYTES:
+            return None
+
+        payload: Any
+        if kind == "mkdocs":
+            try:
+                raw = json.loads(blob.decode("utf-8"))
+            except Exception:
+                if stale_payload is not None:
+                    _index_cache[index_url] = _IndexCacheEntry(
+                        fetched_at=datetime.now(), kind=kind, payload=stale_payload
+                    )
+                return stale_payload
+            docs = raw.get("docs")
+            if not isinstance(docs, list):
+                if stale_payload is not None:
+                    _index_cache[index_url] = _IndexCacheEntry(
+                        fetched_at=datetime.now(), kind=kind, payload=stale_payload
+                    )
+                return stale_payload
+
+            prepared = []
+            for doc in docs:
+                if not isinstance(doc, dict):
+                    continue
+                location = str(doc.get("location") or "").strip()
+                title = str(doc.get("title") or "").strip()
+                text = str(doc.get("text") or "").strip()
+                if len(text) > _MAX_INDEX_DOC_TEXT_CHARS:
+                    text = text[:_MAX_INDEX_DOC_TEXT_CHARS]
+                if not location:
+                    continue
+                prepared.append({"location": location, "title": title, "text": text})
+
+            payload = tuple(prepared)
+        elif kind == "sphinx":
+            try:
+                text = blob.decode("utf-8", errors="ignore")
+            except Exception:
+                if stale_payload is not None:
+                    _index_cache[index_url] = _IndexCacheEntry(
+                        fetched_at=datetime.now(), kind=kind, payload=stale_payload
+                    )
+                return stale_payload
+
+            marker = "Search.setIndex("
+            idx = text.find(marker)
+            if idx == -1:
+                if stale_payload is not None:
+                    _index_cache[index_url] = _IndexCacheEntry(
+                        fetched_at=datetime.now(), kind=kind, payload=stale_payload
+                    )
+                return stale_payload
+            start = text.find("{", idx)
+            end = text.rfind("}")
+            if start == -1 or end == -1 or end <= start:
+                if stale_payload is not None:
+                    _index_cache[index_url] = _IndexCacheEntry(
+                        fetched_at=datetime.now(), kind=kind, payload=stale_payload
+                    )
+                return stale_payload
+            json_text = text[start : end + 1]
+            try:
+                payload = json.loads(json_text)
+            except Exception:
+                if stale_payload is not None:
+                    _index_cache[index_url] = _IndexCacheEntry(
+                        fetched_at=datetime.now(), kind=kind, payload=stale_payload
+                    )
+                return stale_payload
+        else:
+            return None
+
+        _index_cache[index_url] = _IndexCacheEntry(
+            fetched_at=datetime.now(), kind=kind, payload=payload
+        )
+        return payload
+
+
+def _score_document(url: str, title: str, text: str, tokens: Sequence[str]) -> int:
+    if not tokens:
+        return 1
+
+    url_lower = url.lower()
+    title_lower = title.lower()
+    text_lower = text.lower()
+
+    score = 0
+    for token in tokens:
+        if token in title_lower:
+            score += 25
+        if token in url_lower:
+            score += 6
+        occurrences = text_lower.count(token)
+        if occurrences:
+            score += 8 + min(occurrences, 20)
+
+    return score
+
+
+async def _gather_with_limit(
+    coros: Sequence[Awaitable[Any]], *, concurrency: int
+) -> List[Any]:
+    if concurrency <= 1:
+        results: List[Any] = []
+        for coro in coros:
+            results.append(await coro)
+        return results
+
+    semaphore = asyncio.Semaphore(concurrency)
+
+    async def _runner(coro: Awaitable[Any]) -> Any:
+        async with semaphore:
+            return await coro
+
+    return await asyncio.gather(
+        *[_runner(coro) for coro in coros], return_exceptions=True
+    )
+
+
+async def _search_via_mkdocs_index(
+    site_url: str,
+    tokens: Sequence[str],
+    client: httpx.AsyncClient,
+    *,
+    user_agent: str,
+    num_results: int,
+    allow_network: bool,
+) -> Optional[List[Dict[str, str]]]:
+    for index_url in _mkdocs_index_candidates(site_url):
+        if allow_network:
+            docs = await _get_cached_index(
+                client,
+                index_url,
+                user_agent=user_agent,
+                kind="mkdocs",
+                timeout_seconds=20.0,
+            )
+        else:
+            docs = _get_cached_index_from_memory(index_url, kind="mkdocs")
+        if not docs:
+            continue
+
+        base_url = _mkdocs_base_from_index_url(index_url)
+        scored: List[Tuple[int, Dict[str, str]]] = []
+        for doc in docs:
+            location = str(doc.get("location") or "")
+            url = urljoin(base_url, location)
+            if not _matches_site_prefix(url, site_url):
+                continue
+            title = str(doc.get("title") or "") or _fallback_title_from_url(url)
+            text = str(doc.get("text") or "")
+            score = _score_document(url, title, text, tokens)
+            if score <= 0:
+                continue
+            snippet = _extract_text_snippet(text, tokens)
+            scored.append((score, {"link": url, "title": title, "snippet": snippet}))
+
+        scored.sort(key=lambda item: (-item[0], len(item[1]["link"])))
+        organic = [item[1] for item in scored[: max(num_results, 1)]]
+        if organic:
+            return organic
+
+    return None
+
+
+def _coerce_sphinx_doc_hits(entry: Any) -> List[Tuple[int, int]]:
+    """Return a list of (doc_id, weight) pairs."""
+    if not isinstance(entry, list):
+        return []
+    if not entry:
+        return []
+
+    if all(isinstance(item, int) for item in entry):
+        return [(item, 1) for item in entry]
+
+    hits: List[Tuple[int, int]] = []
+    for item in entry:
+        if isinstance(item, int):
+            hits.append((item, 1))
+            continue
+        if isinstance(item, (list, tuple)) and item and isinstance(item[0], int):
+            weight = 1
+            if len(item) > 1 and isinstance(item[1], int):
+                weight = max(item[1], 1)
+            hits.append((item[0], weight))
+    return hits
+
+
+async def _search_via_sphinx_index(
+    site_url: str,
+    tokens: Sequence[str],
+    client: httpx.AsyncClient,
+    *,
+    user_agent: str,
+    num_results: int,
+    fetch_text: Optional[Callable[[str], Awaitable[str]]],
+    fetch_text_concurrency: int,
+    allow_network: bool,
+) -> Optional[List[Dict[str, str]]]:
+    for index_url in _sphinx_index_candidates(site_url):
+        if allow_network:
+            index = await _get_cached_index(
+                client,
+                index_url,
+                user_agent=user_agent,
+                kind="sphinx",
+                timeout_seconds=20.0,
+            )
+        else:
+            index = _get_cached_index_from_memory(index_url, kind="sphinx")
+        if not isinstance(index, dict):
+            continue
+
+        filenames = index.get("filenames")
+        titles = index.get("titles")
+        terms = index.get("terms")
+        titleterms = index.get("titleterms")
+        if not (
+            isinstance(filenames, list)
+            and isinstance(titles, list)
+            and isinstance(terms, dict)
+        ):
+            continue
+
+        base_url = _sphinx_base_from_index_url(index_url)
+        scores: Dict[int, int] = {}
+        for token in tokens:
+            for doc_id, weight in _coerce_sphinx_doc_hits(terms.get(token)):
+                scores[doc_id] = scores.get(doc_id, 0) + (10 * weight)
+            if isinstance(titleterms, dict):
+                for doc_id, weight in _coerce_sphinx_doc_hits(titleterms.get(token)):
+                    scores[doc_id] = scores.get(doc_id, 0) + (20 * weight)
+
+        ranked_doc_ids = sorted(scores.items(), key=lambda item: -item[1])
+        hits: List[Tuple[int, str]] = []
+        for doc_id, _ in ranked_doc_ids:
+            if not isinstance(doc_id, int) or doc_id < 0 or doc_id >= len(filenames):
+                continue
+            url = urljoin(base_url, str(filenames[doc_id]))
+            if not _matches_site_prefix(url, site_url):
+                continue
+            hits.append((doc_id, url))
+            if len(hits) >= max(num_results, 1):
+                break
+
+        if not hits:
+            continue
+
+        urls = [url for _, url in hits]
+        snippets_by_url: Dict[str, str] = {url: "" for url in urls}
+        if allow_network and fetch_text and tokens:
+            texts = await _gather_with_limit(
+                [fetch_text(url) for url in urls], concurrency=fetch_text_concurrency
+            )
+            for url, text in zip(urls, texts):
+                if isinstance(text, Exception):
+                    continue
+                snippets_by_url[url] = _extract_text_snippet(str(text), tokens)
+        elif allow_network and tokens:
+            metadatas = await asyncio.gather(
+                *[
+                    _fetch_result_metadata(
+                        client, url, user_agent=user_agent, tokens=tokens
+                    )
+                    for url in urls
+                ],
+                return_exceptions=True,
+            )
+            for url, metadata in zip(urls, metadatas):
+                if isinstance(metadata, Exception):
+                    continue
+                snippets_by_url[url] = metadata.get("snippet", "")
+
+        organic: List[Dict[str, str]] = []
+        for doc_id, url in hits:
+            title = _fallback_title_from_url(url)
+            if doc_id < len(titles) and titles[doc_id]:
+                title = str(titles[doc_id])
+            organic.append(
+                {
+                    "link": url,
+                    "title": title,
+                    "snippet": snippets_by_url.get(url, ""),
+                }
+            )
+
+        if organic:
+            return organic
+
+    return None
+
+
+async def search_site_via_sitemap(
+    query: str,
+    client: httpx.AsyncClient,
+    *,
+    user_agent: str,
+    num_results: int = 5,
+    fetch_text: Optional[Callable[[str], Awaitable[str]]] = None,
+    fetch_text_concurrency: int = _DEFAULT_CONTENT_FETCH_CONCURRENCY,
+    allow_network: bool = True,
+) -> Dict[str, Any]:
+    """
+    Perform a Serper-free search for `site:` queries.
+
+    Returns a Serper-like payload: {"organic": [{"link","title","snippet"}, ...]}.
+    """
+    site_url, terms = _parse_site_query(query)
+    if not site_url:
+        return {"organic": []}
+
+    parsed = urlparse(site_url)
+    if not parsed.scheme or not parsed.netloc:
+        return {"organic": []}
+
+    origin = f"{parsed.scheme}://{parsed.netloc}"
+
+    tokens = _tokenize_query(terms)
+
+    # 1) Prefer docs-native search indexes when present.
+    organic = await _search_via_mkdocs_index(
+        site_url,
+        tokens,
+        client,
+        user_agent=user_agent,
+        num_results=num_results,
+        allow_network=allow_network,
+    )
+    if not organic:
+        organic = await _search_via_sphinx_index(
+            site_url,
+            tokens,
+            client,
+            user_agent=user_agent,
+            num_results=num_results,
+            fetch_text=fetch_text,
+            fetch_text_concurrency=fetch_text_concurrency,
+            allow_network=allow_network,
+        )
+    if organic:
+        return {"organic": organic}
+
+    # 2) Fallback: sitemap discovery + ranking.
+    cached_urls = _get_cached_sitemap_urls_from_memory(origin, allow_stale=False)
+    if cached_urls is not None:
+        all_urls = cached_urls
+    else:
+        if not allow_network:
+            stale_cached = _get_cached_sitemap_urls_from_memory(
+                origin, allow_stale=True
+            )
+            if stale_cached is None:
+                return {"organic": []}
+            all_urls = stale_cached
+        else:
+            lock = _sitemap_locks.setdefault(origin, asyncio.Lock())
+            async with lock:
+                cached_urls = _get_cached_sitemap_urls_from_memory(
+                    origin, allow_stale=False
+                )
+                if cached_urls is not None:
+                    all_urls = cached_urls
+                else:
+                    loaded = await _load_site_sitemap_urls(
+                        client,
+                        site_url,
+                        user_agent=user_agent,
+                        allow_html_fallback=False,
+                    )
+                    if loaded:
+                        _sitemap_cache[origin] = _SitemapCacheEntry(
+                            fetched_at=datetime.now(), urls=tuple(loaded)
+                        )
+                        all_urls = loaded
+                    else:
+                        stale_cached = _get_cached_sitemap_urls_from_memory(
+                            origin, allow_stale=True
+                        )
+                        if stale_cached is not None:
+                            existing = _sitemap_cache.get(origin)
+                            if existing and existing.urls:
+                                _sitemap_cache[origin] = _SitemapCacheEntry(
+                                    fetched_at=datetime.now(), urls=existing.urls
+                                )
+                            all_urls = stale_cached
+                        else:
+                            discovered = await _discover_urls_from_html_links(
+                                client, site_url, user_agent=user_agent
+                            )
+                            if not discovered:
+                                return {"organic": []}
+                            _sitemap_cache[origin] = _SitemapCacheEntry(
+                                fetched_at=datetime.now(), urls=tuple(discovered)
+                            )
+                            all_urls = discovered
+
+    candidates = [u for u in all_urls if _matches_site_prefix(u, site_url)]
+    scored = _score_urls(candidates, tokens)
+
+    # Preselect candidates (URL-based), then optionally rescore using page text.
+    preselect_limit = min(12, max(6, max(num_results, 1) * 2))
+    if scored:
+        preselect_urls = [url for _, url in scored[:preselect_limit]]
+        url_scores = {url: score for score, url in scored[:preselect_limit]}
+    else:
+        preselect_urls = sorted(candidates, key=len)[:preselect_limit]
+        url_scores = {url: 0 for url in preselect_urls}
+
+    if fetch_text and tokens and preselect_urls:
+        texts = await _gather_with_limit(
+            [fetch_text(url) for url in preselect_urls],
+            concurrency=fetch_text_concurrency,
+        )
+        rescored: List[Tuple[int, str, str, str]] = []
+        for url, text in zip(preselect_urls, texts):
+            title = _fallback_title_from_url(url)
+            if isinstance(text, Exception):
+                rescored.append((url_scores.get(url, 0), url, title, ""))
+                continue
+            snippet = _extract_text_snippet(str(text), tokens)
+            content_score = _score_document(url, title, str(text), tokens)
+            total = url_scores.get(url, 0) + content_score
+            rescored.append((total, url, title, snippet))
+
+        rescored.sort(key=lambda item: (-item[0], len(item[1])))
+        organic = [
+            {"link": url, "title": title, "snippet": snippet}
+            for _, url, title, snippet in rescored[: max(num_results, 1)]
+        ]
+        return {"organic": organic}
+
+    top_urls = preselect_urls[: max(num_results, 1)]
+    if not top_urls:
+        return {"organic": []}
+
+    if not allow_network:
+        return {
+            "organic": [
+                {"link": url, "title": _fallback_title_from_url(url), "snippet": ""}
+                for url in top_urls
+            ]
+        }
+
+    tasks = [
+        _fetch_result_metadata(client, url, user_agent=user_agent, tokens=tokens)
+        for url in top_urls
+    ]
+    metadatas = await asyncio.gather(*tasks, return_exceptions=True)
+
+    organic: List[Dict[str, str]] = []
+    for url, metadata in zip(top_urls, metadatas):
+        if isinstance(metadata, Exception):
+            organic.append(
+                {
+                    "link": url,
+                    "title": _fallback_title_from_url(url),
+                    "snippet": "",
+                }
+            )
+        else:
+            organic.append(
+                {
+                    "link": url,
+                    "title": metadata.get("title", _fallback_title_from_url(url)),
+                    "snippet": metadata.get("snippet", ""),
+                }
+            )
+
+    return {"organic": organic}