iflow-mcp_anton-prosterity-documentation-search-enhanced 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. documentation_search_enhanced/__init__.py +14 -0
  2. documentation_search_enhanced/__main__.py +6 -0
  3. documentation_search_enhanced/config.json +1674 -0
  4. documentation_search_enhanced/config_manager.py +233 -0
  5. documentation_search_enhanced/config_validator.py +79 -0
  6. documentation_search_enhanced/content_enhancer.py +578 -0
  7. documentation_search_enhanced/docker_manager.py +87 -0
  8. documentation_search_enhanced/logger.py +179 -0
  9. documentation_search_enhanced/main.py +2170 -0
  10. documentation_search_enhanced/project_generator.py +260 -0
  11. documentation_search_enhanced/project_scanner.py +85 -0
  12. documentation_search_enhanced/reranker.py +230 -0
  13. documentation_search_enhanced/site_index_builder.py +274 -0
  14. documentation_search_enhanced/site_index_downloader.py +222 -0
  15. documentation_search_enhanced/site_search.py +1325 -0
  16. documentation_search_enhanced/smart_search.py +473 -0
  17. documentation_search_enhanced/snyk_integration.py +657 -0
  18. documentation_search_enhanced/vector_search.py +303 -0
  19. documentation_search_enhanced/version_resolver.py +189 -0
  20. documentation_search_enhanced/vulnerability_scanner.py +545 -0
  21. documentation_search_enhanced/web_scraper.py +117 -0
  22. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
  23. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
  24. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
  25. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
  26. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
documentation_search_enhanced/site_index_builder.py
@@ -0,0 +1,274 @@
+ #!/usr/bin/env python3
+ """Build a preindexed docs site index for Serper-free search.
+
+ This module is used by CI to generate `docs_site_index.json` (+ optional `.gz`) assets
+ that are published to GitHub Releases and auto-downloaded by the server at startup.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import argparse
+ import gzip
+ import json
+ import os
+ import sys
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import Any, Dict, Iterable, Mapping, Optional
+ from urllib.parse import urlparse
+
+ import httpx
+
+ from . import site_search
+
+
+ @dataclass(frozen=True)
+ class SiteIndexBuildSettings:
+     user_agent: str
+     max_concurrent_sites: int = 5
+     sitemap_mode: str = "missing"  # none|missing|all
+     max_sitemap_urls: int = 5_000
+     timeout_seconds: float = 60.0
+
+
+ def load_docs_urls_from_config(config_path: str) -> Dict[str, str]:
+     if not config_path:
+         raise ValueError("config_path must be non-empty")
+     with open(config_path, "r", encoding="utf-8") as fh:
+         data = json.load(fh)
+
+     raw = data.get("docs_urls", {})
+     if not isinstance(raw, dict):
+         return {}
+
+     docs_urls: Dict[str, str] = {}
+     for name, value in raw.items():
+         if not isinstance(name, str):
+             continue
+         if isinstance(value, dict):
+             url = str(value.get("url") or "").strip()
+         else:
+             url = str(value or "").strip()
+         if url:
+             docs_urls[name] = url
+     return docs_urls
+
+
+ def _parse_libraries(value: Optional[str]) -> Optional[list[str]]:
+     if not value:
+         return None
+     parts = [part.strip() for part in value.split(",")]
+     libraries = [part for part in parts if part]
+     return libraries or None
+
+
+ def _origin_from_url(url: str) -> Optional[str]:
+     parsed = urlparse(url)
+     if not parsed.scheme or not parsed.netloc:
+         return None
+     return f"{parsed.scheme}://{parsed.netloc}"
+
+
+ async def build_site_index_file(
+     docs_urls: Mapping[str, str],
+     *,
+     output_path: str,
+     gzip_output: bool,
+     settings: SiteIndexBuildSettings,
+     client: Optional[httpx.AsyncClient] = None,
+ ) -> Dict[str, Any]:
+     if not output_path:
+         raise ValueError("output_path must be non-empty")
+
+     if settings.sitemap_mode not in {"none", "missing", "all"}:
+         raise ValueError("sitemap_mode must be one of: none, missing, all")
+
+     site_search._sitemap_cache.clear()
+     site_search._sitemap_locks.clear()
+     site_search._index_cache.clear()
+     site_search._index_locks.clear()
+
+     original_max_sitemap_urls = getattr(site_search, "_MAX_SITEMAP_URLS", None)
+     if settings.max_sitemap_urls and settings.max_sitemap_urls > 0:
+         site_search._MAX_SITEMAP_URLS = int(settings.max_sitemap_urls)
+
+     created_client = client is None
+     if client is None:
+         client = httpx.AsyncClient(timeout=httpx.Timeout(settings.timeout_seconds))
+
+     try:
+         concurrency = max(1, min(int(settings.max_concurrent_sites), 20))
+         semaphore = asyncio.Semaphore(concurrency)
+
+         results: list[dict[str, Any]] = []
+
+         async def run_one(library: str, url: str) -> None:
+             async with semaphore:
+                 summary = await site_search.preindex_site(
+                     url,
+                     client,
+                     user_agent=settings.user_agent,
+                     include_sitemap=False,
+                 )
+
+                 has_index = bool(
+                     summary.get("mkdocs_index") or summary.get("sphinx_index")
+                 )
+                 include_sitemap = settings.sitemap_mode == "all" or (
+                     settings.sitemap_mode == "missing" and not has_index
+                 )
+                 if include_sitemap:
+                     origin = summary.get("origin") or _origin_from_url(url)
+                     try:
+                         urls = await site_search._load_site_sitemap_urls(  # type: ignore[attr-defined]
+                             client,
+                             url,
+                             user_agent=settings.user_agent,
+                         )
+                     except Exception as e:
+                         summary.setdefault("errors", []).append(f"sitemap:{e}")
+                     else:
+                         if origin and urls:
+                             site_search._sitemap_cache[origin] = (
+                                 site_search._SitemapCacheEntry(  # type: ignore[attr-defined]
+                                     fetched_at=datetime.now(),
+                                     urls=tuple(urls),
+                                 )
+                             )
+                             summary["sitemap"] = {"urls": len(urls)}
+
+                 summary["library"] = library
+                 results.append(summary)
+
+         tasks = [
+             run_one(library, url)
+             for library, url in sorted(docs_urls.items(), key=lambda item: item[0])
+             if url
+         ]
+         if tasks:
+             await asyncio.gather(*tasks)
+
+         site_search.save_preindexed_state(output_path)
+
+         gz_path: Optional[str] = None
+         if gzip_output:
+             gz_path = f"{output_path}.gz"
+             with open(output_path, "rb") as fh:
+                 blob = fh.read()
+             with gzip.open(gz_path, "wb", compresslevel=9) as fh:
+                 fh.write(blob)
+
+         indexed_sites = sum(
+             1
+             for summary in results
+             if summary.get("mkdocs_index")
+             or summary.get("sphinx_index")
+             or summary.get("sitemap")
+         )
+
+         return {
+             "status": "ok" if indexed_sites else "error",
+             "output_path": output_path,
+             "gzip_path": gz_path,
+             "total_libraries": len(docs_urls),
+             "indexed_libraries": indexed_sites,
+             "results": sorted(results, key=lambda item: str(item.get("library") or "")),
+         }
+     finally:
+         if original_max_sitemap_urls is not None:
+             site_search._MAX_SITEMAP_URLS = original_max_sitemap_urls
+         if created_client:
+             await client.aclose()
+
+
+ def _build_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(
+         description="Build docs_site_index.json for releases."
+     )
+     parser.add_argument(
+         "--config",
+         default=os.path.join("src", "documentation_search_enhanced", "config.json"),
+         help="Path to config.json (default: bundled src config).",
+     )
+     parser.add_argument(
+         "--output",
+         default="docs_site_index.json",
+         help="Path to write the index JSON (default: docs_site_index.json).",
+     )
+     parser.add_argument(
+         "--gzip",
+         action="store_true",
+         help="Also write a .gz next to the JSON file.",
+     )
+     parser.add_argument(
+         "--libraries",
+         default=None,
+         help="Comma-separated list of libraries to index (default: all).",
+     )
+     parser.add_argument(
+         "--sitemap",
+         choices=("none", "missing", "all"),
+         default="missing",
+         help="Whether to include sitemap URLs: none, missing, or all (default: missing).",
+     )
+     parser.add_argument(
+         "--max-sitemap-urls",
+         type=int,
+         default=5_000,
+         help="Max sitemap URLs per site when included (default: 5000).",
+     )
+     parser.add_argument(
+         "--max-concurrency",
+         type=int,
+         default=5,
+         help="Max concurrent sites to index (default: 5).",
+     )
+     parser.add_argument(
+         "--timeout-seconds",
+         type=float,
+         default=60.0,
+         help="HTTP timeout (default: 60s).",
+     )
+     parser.add_argument(
+         "--user-agent",
+         default="documentation-search-enhanced/index-builder",
+         help="User-Agent header to use for fetches.",
+     )
+     return parser
+
+
+ def main(argv: Optional[Iterable[str]] = None) -> int:
+     args = _build_parser().parse_args(list(argv) if argv is not None else None)
+
+     docs_urls = load_docs_urls_from_config(args.config)
+     libraries = _parse_libraries(args.libraries)
+     if libraries:
+         docs_urls = {lib: docs_urls[lib] for lib in libraries if lib in docs_urls}
+
+     settings = SiteIndexBuildSettings(
+         user_agent=args.user_agent,
+         max_concurrent_sites=args.max_concurrency,
+         sitemap_mode=args.sitemap,
+         max_sitemap_urls=args.max_sitemap_urls,
+         timeout_seconds=args.timeout_seconds,
+     )
+
+     if not docs_urls:
+         print("No documentation sources found to index.", file=sys.stderr)
+         return 2
+
+     result = asyncio.run(
+         build_site_index_file(
+             docs_urls,
+             output_path=args.output,
+             gzip_output=bool(args.gzip),
+             settings=settings,
+         )
+     )
+     print(json.dumps(result, indent=2, sort_keys=True), file=sys.stderr)
+     return 0 if result.get("status") == "ok" else 1
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
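
For orientation, here is a minimal sketch of how the builder above could be driven programmatically instead of through its CLI. It uses only names defined in site_index_builder.py (load_docs_urls_from_config, SiteIndexBuildSettings, build_site_index_file); the config path and the User-Agent string are illustrative assumptions, not values required by the package.

import asyncio

from documentation_search_enhanced.site_index_builder import (
    SiteIndexBuildSettings,
    build_site_index_file,
    load_docs_urls_from_config,
)


async def demo_build() -> None:
    # Assumed location of config.json; adjust to your checkout or installation.
    docs_urls = load_docs_urls_from_config(
        "src/documentation_search_enhanced/config.json"
    )
    settings = SiteIndexBuildSettings(
        user_agent="docs-index-builder-demo",  # illustrative User-Agent
        max_concurrent_sites=3,
        sitemap_mode="missing",  # fall back to sitemaps only when no native index exists
    )
    result = await build_site_index_file(
        docs_urls,
        output_path="docs_site_index.json",
        gzip_output=True,
        settings=settings,
    )
    print(f"{result['indexed_libraries']}/{result['total_libraries']} sites indexed")


if __name__ == "__main__":
    asyncio.run(demo_build())

In CI the same behaviour is reachable through main() and the argparse flags defined above (--config, --output, --gzip, --libraries, --sitemap, --max-sitemap-urls, --max-concurrency, --timeout-seconds, --user-agent).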
documentation_search_enhanced/site_index_downloader.py
@@ -0,0 +1,222 @@
+ #!/usr/bin/env python3
+ """Download and manage the prebuilt docs site index.
+
+ The server can operate without Serper by using docs-native search indexes (MkDocs/Sphinx)
+ and/or a prebuilt index file. This module implements an optional auto-download flow
+ from GitHub Releases to keep that prebuilt index up to date without requiring users
+ to run any indexing commands.
+ """
+
+ from __future__ import annotations
+
+ import gzip
+ import json
+ import os
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import Any, Dict, Optional, Sequence, Tuple
+
+ import httpx
+
+
+ DEFAULT_RELEASE_REPO = "anton-prosterity/documentation-search-mcp"
+ DEFAULT_RELEASE_ASSET_BASENAME = "docs_site_index.json"
+ DEFAULT_RELEASE_URLS = (
+     f"https://github.com/{DEFAULT_RELEASE_REPO}/releases/latest/download/{DEFAULT_RELEASE_ASSET_BASENAME}.gz",
+     f"https://github.com/{DEFAULT_RELEASE_REPO}/releases/latest/download/{DEFAULT_RELEASE_ASSET_BASENAME}",
+ )
+
+
+ @dataclass(frozen=True)
+ class SiteIndexDownloadSettings:
+     path: str
+     urls: Tuple[str, ...] = DEFAULT_RELEASE_URLS
+     auto_download: bool = True
+     max_age_hours: int = 24 * 7
+
+
+ def _parse_bool(value: Optional[str], *, default: bool) -> bool:
+     if value is None:
+         return default
+     value = value.strip().lower()
+     if value in {"1", "true", "yes", "y", "on"}:
+         return True
+     if value in {"0", "false", "no", "n", "off"}:
+         return False
+     return default
+
+
+ def _parse_int(value: Optional[str], *, default: int) -> int:
+     if value is None:
+         return default
+     try:
+         return int(value.strip())
+     except ValueError:
+         return default
+
+
+ def _split_urls(value: Optional[str]) -> Tuple[str, ...]:
+     if not value:
+         return ()
+     parts = [part.strip() for part in value.split(",")]
+     return tuple(part for part in parts if part)
+
+
+ def _default_cache_dir() -> Path:
+     xdg_cache = os.getenv("XDG_CACHE_HOME")
+     if xdg_cache:
+         return Path(xdg_cache)
+     home = os.path.expanduser("~")
+     if home and home != "~":
+         return Path(home) / ".cache"
+     return Path(os.getcwd())
+
+
+ def default_site_index_path(*, cwd: Optional[str] = None) -> str:
+     """Choose a reasonable default path for the prebuilt index file."""
+     cwd_path = Path(cwd or os.getcwd()) / ".docs_site_index.json"
+     if cwd_path.exists():
+         return str(cwd_path)
+
+     cache_dir = _default_cache_dir() / "documentation-search-enhanced"
+     return str(cache_dir / "docs_site_index.json")
+
+
+ def load_site_index_settings_from_env(
+     *, cwd: Optional[str] = None
+ ) -> SiteIndexDownloadSettings:
+     """Load site index settings from environment variables."""
+     path = os.getenv("DOCS_SITE_INDEX_PATH") or default_site_index_path(cwd=cwd)
+
+     urls = _split_urls(os.getenv("DOCS_SITE_INDEX_URLS"))
+     if not urls:
+         url = os.getenv("DOCS_SITE_INDEX_URL")
+         urls = (url.strip(),) if url and url.strip() else DEFAULT_RELEASE_URLS
+
+     auto_download = _parse_bool(
+         os.getenv("DOCS_SITE_INDEX_AUTO_DOWNLOAD"), default=True
+     )
+     max_age_hours = _parse_int(
+         os.getenv("DOCS_SITE_INDEX_MAX_AGE_HOURS"), default=24 * 7
+     )
+
+     return SiteIndexDownloadSettings(
+         path=path,
+         urls=urls,
+         auto_download=auto_download,
+         max_age_hours=max_age_hours,
+     )
+
+
+ def should_download_site_index(path: str, *, max_age_hours: int) -> bool:
+     """Return True when the on-disk index is missing or older than max_age_hours."""
+     if not path:
+         return False
+     target = Path(path)
+     if not target.exists():
+         return True
+
+     if max_age_hours <= 0:
+         return True
+
+     try:
+         mtime = datetime.fromtimestamp(target.stat().st_mtime)
+     except Exception:
+         return True
+     return datetime.now() - mtime > timedelta(hours=max_age_hours)
+
+
+ def _maybe_decompress_gzip(blob: bytes) -> bytes:
+     if len(blob) >= 2 and blob[0] == 0x1F and blob[1] == 0x8B:
+         return gzip.decompress(blob)
+     return blob
+
+
+ def _validate_index_payload(payload: Any) -> Dict[str, Any]:
+     if not isinstance(payload, dict):
+         raise ValueError("Index file must be a JSON object")
+     schema_version = payload.get("schema_version")
+     if schema_version != 1:
+         raise ValueError(f"Unsupported index schema_version: {schema_version!r}")
+     if "indexes" not in payload and "sitemaps" not in payload:
+         raise ValueError("Index file missing expected keys")
+     return payload
+
+
+ async def download_site_index(
+     client: httpx.AsyncClient,
+     *,
+     urls: Sequence[str],
+     dest_path: str,
+     user_agent: str,
+     timeout_seconds: float = 60.0,
+ ) -> Dict[str, Any]:
+     """Download the latest index from the first working URL and save it to dest_path."""
+     if not dest_path:
+         return {"status": "error", "error": "dest_path is empty"}
+
+     errors: list[str] = []
+     headers = {"User-Agent": user_agent}
+
+     for url in urls:
+         try:
+             response = await client.get(
+                 url,
+                 headers=headers,
+                 timeout=httpx.Timeout(timeout_seconds),
+                 follow_redirects=True,
+             )
+             if response.status_code == 404:
+                 errors.append(f"{url}: 404")
+                 continue
+             response.raise_for_status()
+
+             blob = _maybe_decompress_gzip(response.content)
+             payload = json.loads(blob.decode("utf-8"))
+             _validate_index_payload(payload)
+
+             target = Path(dest_path)
+             target.parent.mkdir(parents=True, exist_ok=True)
+             tmp_path = target.with_suffix(target.suffix + ".tmp")
+             tmp_path.write_bytes(blob)
+             tmp_path.replace(target)
+
+             return {
+                 "status": "downloaded",
+                 "url": url,
+                 "bytes": len(blob),
+             }
+         except Exception as e:
+             errors.append(f"{url}: {e}")
+
+     return {"status": "error", "errors": errors}
+
+
+ async def ensure_site_index_file(
+     client: httpx.AsyncClient,
+     *,
+     settings: SiteIndexDownloadSettings,
+     user_agent: str,
+ ) -> Dict[str, Any]:
+     """Ensure a reasonably fresh site index exists on disk (download if needed)."""
+     if not settings.auto_download:
+         return {
+             "status": "skipped",
+             "reason": "auto_download disabled",
+             "path": settings.path,
+         }
+
+     if not should_download_site_index(
+         settings.path, max_age_hours=settings.max_age_hours
+     ):
+         return {"status": "ok", "reason": "up_to_date", "path": settings.path}
+
+     result = await download_site_index(
+         client,
+         urls=settings.urls,
+         dest_path=settings.path,
+         user_agent=user_agent,
+     )
+     result["path"] = settings.path
+     return result
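
To tie the two files together, here is a minimal sketch of the consumer side, assuming only what the downloader above defines: settings come from the DOCS_SITE_INDEX_* environment variables via load_site_index_settings_from_env, and ensure_site_index_file downloads the release asset only when the on-disk copy is missing or stale. The User-Agent string is an illustrative placeholder.

import asyncio

import httpx

from documentation_search_enhanced.site_index_downloader import (
    ensure_site_index_file,
    load_site_index_settings_from_env,
)


async def refresh_index() -> None:
    # Unset DOCS_SITE_INDEX_* variables fall back to the GitHub Releases URLs
    # and the one-week max age defined above.
    settings = load_site_index_settings_from_env()
    async with httpx.AsyncClient() as client:
        result = await ensure_site_index_file(
            client,
            settings=settings,
            user_agent="documentation-search-enhanced/demo",  # illustrative UA
        )
    print(result)  # e.g. {"status": "ok", "reason": "up_to_date", "path": ...}


if __name__ == "__main__":
    asyncio.run(refresh_index())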