iflow-mcp_anton-prosterity-documentation-search-enhanced 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. documentation_search_enhanced/__init__.py +14 -0
  2. documentation_search_enhanced/__main__.py +6 -0
  3. documentation_search_enhanced/config.json +1674 -0
  4. documentation_search_enhanced/config_manager.py +233 -0
  5. documentation_search_enhanced/config_validator.py +79 -0
  6. documentation_search_enhanced/content_enhancer.py +578 -0
  7. documentation_search_enhanced/docker_manager.py +87 -0
  8. documentation_search_enhanced/logger.py +179 -0
  9. documentation_search_enhanced/main.py +2170 -0
  10. documentation_search_enhanced/project_generator.py +260 -0
  11. documentation_search_enhanced/project_scanner.py +85 -0
  12. documentation_search_enhanced/reranker.py +230 -0
  13. documentation_search_enhanced/site_index_builder.py +274 -0
  14. documentation_search_enhanced/site_index_downloader.py +222 -0
  15. documentation_search_enhanced/site_search.py +1325 -0
  16. documentation_search_enhanced/smart_search.py +473 -0
  17. documentation_search_enhanced/snyk_integration.py +657 -0
  18. documentation_search_enhanced/vector_search.py +303 -0
  19. documentation_search_enhanced/version_resolver.py +189 -0
  20. documentation_search_enhanced/vulnerability_scanner.py +545 -0
  21. documentation_search_enhanced/web_scraper.py +117 -0
  22. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
  23. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
  24. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
  25. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
  26. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
documentation_search_enhanced/site_index_builder.py
@@ -0,0 +1,274 @@
+ #!/usr/bin/env python3
+ """Build a preindexed docs site index for Serper-free search.
+
+ This module is used by CI to generate `docs_site_index.json` (+ optional `.gz`) assets
+ that are published to GitHub Releases and auto-downloaded by the server at startup.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import argparse
+ import gzip
+ import json
+ import os
+ import sys
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import Any, Dict, Iterable, Mapping, Optional
+ from urllib.parse import urlparse
+
+ import httpx
+
+ from . import site_search
+
+
+ @dataclass(frozen=True)
+ class SiteIndexBuildSettings:
+     user_agent: str
+     max_concurrent_sites: int = 5
+     sitemap_mode: str = "missing"  # none|missing|all
+     max_sitemap_urls: int = 5_000
+     timeout_seconds: float = 60.0
+
+
+ def load_docs_urls_from_config(config_path: str) -> Dict[str, str]:
+     if not config_path:
+         raise ValueError("config_path must be non-empty")
+     with open(config_path, "r", encoding="utf-8") as fh:
+         data = json.load(fh)
+
+     raw = data.get("docs_urls", {})
+     if not isinstance(raw, dict):
+         return {}
+
+     docs_urls: Dict[str, str] = {}
+     for name, value in raw.items():
+         if not isinstance(name, str):
+             continue
+         if isinstance(value, dict):
+             url = str(value.get("url") or "").strip()
+         else:
+             url = str(value or "").strip()
+         if url:
+             docs_urls[name] = url
+     return docs_urls
+
+
+ def _parse_libraries(value: Optional[str]) -> Optional[list[str]]:
+     if not value:
+         return None
+     parts = [part.strip() for part in value.split(",")]
+     libraries = [part for part in parts if part]
+     return libraries or None
+
+
+ def _origin_from_url(url: str) -> Optional[str]:
+     parsed = urlparse(url)
+     if not parsed.scheme or not parsed.netloc:
+         return None
+     return f"{parsed.scheme}://{parsed.netloc}"
+
+
+ async def build_site_index_file(
+     docs_urls: Mapping[str, str],
+     *,
+     output_path: str,
+     gzip_output: bool,
+     settings: SiteIndexBuildSettings,
+     client: Optional[httpx.AsyncClient] = None,
+ ) -> Dict[str, Any]:
+     if not output_path:
+         raise ValueError("output_path must be non-empty")
+
+     if settings.sitemap_mode not in {"none", "missing", "all"}:
+         raise ValueError("sitemap_mode must be one of: none, missing, all")
+
+     site_search._sitemap_cache.clear()
+     site_search._sitemap_locks.clear()
+     site_search._index_cache.clear()
+     site_search._index_locks.clear()
+
+     original_max_sitemap_urls = getattr(site_search, "_MAX_SITEMAP_URLS", None)
+     if settings.max_sitemap_urls and settings.max_sitemap_urls > 0:
+         site_search._MAX_SITEMAP_URLS = int(settings.max_sitemap_urls)
+
+     created_client = client is None
+     if client is None:
+         client = httpx.AsyncClient(timeout=httpx.Timeout(settings.timeout_seconds))
+
+     try:
+         concurrency = max(1, min(int(settings.max_concurrent_sites), 20))
+         semaphore = asyncio.Semaphore(concurrency)
+
+         results: list[dict[str, Any]] = []
+
+         async def run_one(library: str, url: str) -> None:
+             async with semaphore:
+                 summary = await site_search.preindex_site(
+                     url,
+                     client,
+                     user_agent=settings.user_agent,
+                     include_sitemap=False,
+                 )
+
+                 has_index = bool(
+                     summary.get("mkdocs_index") or summary.get("sphinx_index")
+                 )
+                 include_sitemap = settings.sitemap_mode == "all" or (
+                     settings.sitemap_mode == "missing" and not has_index
+                 )
+                 if include_sitemap:
+                     origin = summary.get("origin") or _origin_from_url(url)
+                     try:
+                         urls = await site_search._load_site_sitemap_urls(  # type: ignore[attr-defined]
+                             client,
+                             url,
+                             user_agent=settings.user_agent,
+                         )
+                     except Exception as e:
+                         summary.setdefault("errors", []).append(f"sitemap:{e}")
+                     else:
+                         if origin and urls:
+                             site_search._sitemap_cache[origin] = (
+                                 site_search._SitemapCacheEntry(  # type: ignore[attr-defined]
+                                     fetched_at=datetime.now(),
+                                     urls=tuple(urls),
+                                 )
+                             )
+                             summary["sitemap"] = {"urls": len(urls)}
+
+                 summary["library"] = library
+                 results.append(summary)
+
+         tasks = [
+             run_one(library, url)
+             for library, url in sorted(docs_urls.items(), key=lambda item: item[0])
+             if url
+         ]
+         if tasks:
+             await asyncio.gather(*tasks)
+
+         site_search.save_preindexed_state(output_path)
+
+         gz_path: Optional[str] = None
+         if gzip_output:
+             gz_path = f"{output_path}.gz"
+             with open(output_path, "rb") as fh:
+                 blob = fh.read()
+             with gzip.open(gz_path, "wb", compresslevel=9) as fh:
+                 fh.write(blob)
+
+         indexed_sites = sum(
+             1
+             for summary in results
+             if summary.get("mkdocs_index")
+             or summary.get("sphinx_index")
+             or summary.get("sitemap")
+         )
+
+         return {
+             "status": "ok" if indexed_sites else "error",
+             "output_path": output_path,
+             "gzip_path": gz_path,
+             "total_libraries": len(docs_urls),
+             "indexed_libraries": indexed_sites,
+             "results": sorted(results, key=lambda item: str(item.get("library") or "")),
+         }
+     finally:
+         if original_max_sitemap_urls is not None:
+             site_search._MAX_SITEMAP_URLS = original_max_sitemap_urls
+         if created_client:
+             await client.aclose()
+
+
+ def _build_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(
+         description="Build docs_site_index.json for releases."
+     )
+     parser.add_argument(
+         "--config",
+         default=os.path.join("src", "documentation_search_enhanced", "config.json"),
+         help="Path to config.json (default: bundled src config).",
+     )
+     parser.add_argument(
+         "--output",
+         default="docs_site_index.json",
+         help="Path to write the index JSON (default: docs_site_index.json).",
+     )
+     parser.add_argument(
+         "--gzip",
+         action="store_true",
+         help="Also write a .gz next to the JSON file.",
+     )
+     parser.add_argument(
+         "--libraries",
+         default=None,
+         help="Comma-separated list of libraries to index (default: all).",
+     )
+     parser.add_argument(
+         "--sitemap",
+         choices=("none", "missing", "all"),
+         default="missing",
+         help="Whether to include sitemap URLs: none, missing, or all (default: missing).",
+     )
+     parser.add_argument(
+         "--max-sitemap-urls",
+         type=int,
+         default=5_000,
+         help="Max sitemap URLs per site when included (default: 5000).",
+     )
+     parser.add_argument(
+         "--max-concurrency",
+         type=int,
+         default=5,
+         help="Max concurrent sites to index (default: 5).",
+     )
+     parser.add_argument(
+         "--timeout-seconds",
+         type=float,
+         default=60.0,
+         help="HTTP timeout (default: 60s).",
+     )
+     parser.add_argument(
+         "--user-agent",
+         default="documentation-search-enhanced/index-builder",
+         help="User-Agent header to use for fetches.",
+     )
+     return parser
+
+
+ def main(argv: Optional[Iterable[str]] = None) -> int:
+     args = _build_parser().parse_args(list(argv) if argv is not None else None)
+
+     docs_urls = load_docs_urls_from_config(args.config)
+     libraries = _parse_libraries(args.libraries)
+     if libraries:
+         docs_urls = {lib: docs_urls[lib] for lib in libraries if lib in docs_urls}
+
+     settings = SiteIndexBuildSettings(
+         user_agent=args.user_agent,
+         max_concurrent_sites=args.max_concurrency,
+         sitemap_mode=args.sitemap,
+         max_sitemap_urls=args.max_sitemap_urls,
+         timeout_seconds=args.timeout_seconds,
+     )
+
+     if not docs_urls:
+         print("No documentation sources found to index.", file=sys.stderr)
+         return 2
+
+     result = asyncio.run(
+         build_site_index_file(
+             docs_urls,
+             output_path=args.output,
+             gzip_output=bool(args.gzip),
+             settings=settings,
+         )
+     )
+     print(json.dumps(result, indent=2, sort_keys=True), file=sys.stderr)
+     return 0 if result.get("status") == "ok" else 1
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
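
For orientation, here is a minimal sketch of how the builder above could be driven programmatically instead of through its CLI. It uses only names defined in site_index_builder.py (load_docs_urls_from_config, SiteIndexBuildSettings, build_site_index_file); the config path and the User-Agent string are illustrative assumptions, not values required by the package.

import asyncio

from documentation_search_enhanced.site_index_builder import (
    SiteIndexBuildSettings,
    build_site_index_file,
    load_docs_urls_from_config,
)


async def demo_build() -> None:
    # Assumed location of config.json; adjust to your checkout or installation.
    docs_urls = load_docs_urls_from_config(
        "src/documentation_search_enhanced/config.json"
    )
    settings = SiteIndexBuildSettings(
        user_agent="docs-index-builder-demo",  # illustrative User-Agent
        max_concurrent_sites=3,
        sitemap_mode="missing",  # fall back to sitemaps only when no native index exists
    )
    result = await build_site_index_file(
        docs_urls,
        output_path="docs_site_index.json",
        gzip_output=True,
        settings=settings,
    )
    print(f"{result['indexed_libraries']}/{result['total_libraries']} sites indexed")


if __name__ == "__main__":
    asyncio.run(demo_build())

In CI the same behaviour is reachable through main() and the argparse flags defined above (--config, --output, --gzip, --libraries, --sitemap, --max-sitemap-urls, --max-concurrency, --timeout-seconds, --user-agent).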
documentation_search_enhanced/site_index_downloader.py
@@ -0,0 +1,222 @@
+ #!/usr/bin/env python3
+ """Download and manage the prebuilt docs site index.
+
+ The server can operate without Serper by using docs-native search indexes (MkDocs/Sphinx)
+ and/or a prebuilt index file. This module implements an optional auto-download flow
+ from GitHub Releases to keep that prebuilt index up to date without requiring users
+ to run any indexing commands.
+ """
+
+ from __future__ import annotations
+
+ import gzip
+ import json
+ import os
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import Any, Dict, Optional, Sequence, Tuple
+
+ import httpx
+
+
+ DEFAULT_RELEASE_REPO = "anton-prosterity/documentation-search-mcp"
+ DEFAULT_RELEASE_ASSET_BASENAME = "docs_site_index.json"
+ DEFAULT_RELEASE_URLS = (
+     f"https://github.com/{DEFAULT_RELEASE_REPO}/releases/latest/download/{DEFAULT_RELEASE_ASSET_BASENAME}.gz",
+     f"https://github.com/{DEFAULT_RELEASE_REPO}/releases/latest/download/{DEFAULT_RELEASE_ASSET_BASENAME}",
+ )
+
+
+ @dataclass(frozen=True)
+ class SiteIndexDownloadSettings:
+     path: str
+     urls: Tuple[str, ...] = DEFAULT_RELEASE_URLS
+     auto_download: bool = True
+     max_age_hours: int = 24 * 7
+
+
+ def _parse_bool(value: Optional[str], *, default: bool) -> bool:
+     if value is None:
+         return default
+     value = value.strip().lower()
+     if value in {"1", "true", "yes", "y", "on"}:
+         return True
+     if value in {"0", "false", "no", "n", "off"}:
+         return False
+     return default
+
+
+ def _parse_int(value: Optional[str], *, default: int) -> int:
+     if value is None:
+         return default
+     try:
+         return int(value.strip())
+     except ValueError:
+         return default
+
+
+ def _split_urls(value: Optional[str]) -> Tuple[str, ...]:
+     if not value:
+         return ()
+     parts = [part.strip() for part in value.split(",")]
+     return tuple(part for part in parts if part)
+
+
+ def _default_cache_dir() -> Path:
+     xdg_cache = os.getenv("XDG_CACHE_HOME")
+     if xdg_cache:
+         return Path(xdg_cache)
+     home = os.path.expanduser("~")
+     if home and home != "~":
+         return Path(home) / ".cache"
+     return Path(os.getcwd())
+
+
+ def default_site_index_path(*, cwd: Optional[str] = None) -> str:
+     """Choose a reasonable default path for the prebuilt index file."""
+     cwd_path = Path(cwd or os.getcwd()) / ".docs_site_index.json"
+     if cwd_path.exists():
+         return str(cwd_path)
+
+     cache_dir = _default_cache_dir() / "documentation-search-enhanced"
+     return str(cache_dir / "docs_site_index.json")
+
+
+ def load_site_index_settings_from_env(
+     *, cwd: Optional[str] = None
+ ) -> SiteIndexDownloadSettings:
+     """Load site index settings from environment variables."""
+     path = os.getenv("DOCS_SITE_INDEX_PATH") or default_site_index_path(cwd=cwd)
+
+     urls = _split_urls(os.getenv("DOCS_SITE_INDEX_URLS"))
+     if not urls:
+         url = os.getenv("DOCS_SITE_INDEX_URL")
+         urls = (url.strip(),) if url and url.strip() else DEFAULT_RELEASE_URLS
+
+     auto_download = _parse_bool(
+         os.getenv("DOCS_SITE_INDEX_AUTO_DOWNLOAD"), default=True
+     )
+     max_age_hours = _parse_int(
+         os.getenv("DOCS_SITE_INDEX_MAX_AGE_HOURS"), default=24 * 7
+     )
+
+     return SiteIndexDownloadSettings(
+         path=path,
+         urls=urls,
+         auto_download=auto_download,
+         max_age_hours=max_age_hours,
+     )
+
+
+ def should_download_site_index(path: str, *, max_age_hours: int) -> bool:
+     """Return True when the on-disk index is missing or older than max_age_hours."""
+     if not path:
+         return False
+     target = Path(path)
+     if not target.exists():
+         return True
+
+     if max_age_hours <= 0:
+         return True
+
+     try:
+         mtime = datetime.fromtimestamp(target.stat().st_mtime)
+     except Exception:
+         return True
+     return datetime.now() - mtime > timedelta(hours=max_age_hours)
+
+
+ def _maybe_decompress_gzip(blob: bytes) -> bytes:
+     if len(blob) >= 2 and blob[0] == 0x1F and blob[1] == 0x8B:
+         return gzip.decompress(blob)
+     return blob
+
+
+ def _validate_index_payload(payload: Any) -> Dict[str, Any]:
+     if not isinstance(payload, dict):
+         raise ValueError("Index file must be a JSON object")
+     schema_version = payload.get("schema_version")
+     if schema_version != 1:
+         raise ValueError(f"Unsupported index schema_version: {schema_version!r}")
+     if "indexes" not in payload and "sitemaps" not in payload:
+         raise ValueError("Index file missing expected keys")
+     return payload
+
+
+ async def download_site_index(
+     client: httpx.AsyncClient,
+     *,
+     urls: Sequence[str],
+     dest_path: str,
+     user_agent: str,
+     timeout_seconds: float = 60.0,
+ ) -> Dict[str, Any]:
+     """Download the latest index from the first working URL and save it to dest_path."""
+     if not dest_path:
+         return {"status": "error", "error": "dest_path is empty"}
+
+     errors: list[str] = []
+     headers = {"User-Agent": user_agent}
+
+     for url in urls:
+         try:
+             response = await client.get(
+                 url,
+                 headers=headers,
+                 timeout=httpx.Timeout(timeout_seconds),
+                 follow_redirects=True,
+             )
+             if response.status_code == 404:
+                 errors.append(f"{url}: 404")
+                 continue
+             response.raise_for_status()
+
+             blob = _maybe_decompress_gzip(response.content)
+             payload = json.loads(blob.decode("utf-8"))
+             _validate_index_payload(payload)
+
+             target = Path(dest_path)
+             target.parent.mkdir(parents=True, exist_ok=True)
+             tmp_path = target.with_suffix(target.suffix + ".tmp")
+             tmp_path.write_bytes(blob)
+             tmp_path.replace(target)
+
+             return {
+                 "status": "downloaded",
+                 "url": url,
+                 "bytes": len(blob),
+             }
+         except Exception as e:
+             errors.append(f"{url}: {e}")
+
+     return {"status": "error", "errors": errors}
+
+
+ async def ensure_site_index_file(
+     client: httpx.AsyncClient,
+     *,
+     settings: SiteIndexDownloadSettings,
+     user_agent: str,
+ ) -> Dict[str, Any]:
+     """Ensure a reasonably fresh site index exists on disk (download if needed)."""
+     if not settings.auto_download:
+         return {
+             "status": "skipped",
+             "reason": "auto_download disabled",
+             "path": settings.path,
+         }
+
+     if not should_download_site_index(
+         settings.path, max_age_hours=settings.max_age_hours
+     ):
+         return {"status": "ok", "reason": "up_to_date", "path": settings.path}
+
+     result = await download_site_index(
+         client,
+         urls=settings.urls,
+         dest_path=settings.path,
+         user_agent=user_agent,
+     )
+     result["path"] = settings.path
+     return result
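
To tie the two files together, here is a minimal sketch of the consumer side, assuming only what the downloader above defines: settings come from the DOCS_SITE_INDEX_* environment variables via load_site_index_settings_from_env, and ensure_site_index_file downloads the release asset only when the on-disk copy is missing or stale. The User-Agent string is an illustrative placeholder.

import asyncio

import httpx

from documentation_search_enhanced.site_index_downloader import (
    ensure_site_index_file,
    load_site_index_settings_from_env,
)


async def refresh_index() -> None:
    # Unset DOCS_SITE_INDEX_* variables fall back to the GitHub Releases URLs
    # and the one-week max age defined above.
    settings = load_site_index_settings_from_env()
    async with httpx.AsyncClient() as client:
        result = await ensure_site_index_file(
            client,
            settings=settings,
            user_agent="documentation-search-enhanced/demo",  # illustrative UA
        )
    print(result)  # e.g. {"status": "ok", "reason": "up_to_date", "path": ...}


if __name__ == "__main__":
    asyncio.run(refresh_index())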