iflow-mcp_anton-prosterity-documentation-search-enhanced 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. documentation_search_enhanced/__init__.py +14 -0
  2. documentation_search_enhanced/__main__.py +6 -0
  3. documentation_search_enhanced/config.json +1674 -0
  4. documentation_search_enhanced/config_manager.py +233 -0
  5. documentation_search_enhanced/config_validator.py +79 -0
  6. documentation_search_enhanced/content_enhancer.py +578 -0
  7. documentation_search_enhanced/docker_manager.py +87 -0
  8. documentation_search_enhanced/logger.py +179 -0
  9. documentation_search_enhanced/main.py +2170 -0
  10. documentation_search_enhanced/project_generator.py +260 -0
  11. documentation_search_enhanced/project_scanner.py +85 -0
  12. documentation_search_enhanced/reranker.py +230 -0
  13. documentation_search_enhanced/site_index_builder.py +274 -0
  14. documentation_search_enhanced/site_index_downloader.py +222 -0
  15. documentation_search_enhanced/site_search.py +1325 -0
  16. documentation_search_enhanced/smart_search.py +473 -0
  17. documentation_search_enhanced/snyk_integration.py +657 -0
  18. documentation_search_enhanced/vector_search.py +303 -0
  19. documentation_search_enhanced/version_resolver.py +189 -0
  20. documentation_search_enhanced/vulnerability_scanner.py +545 -0
  21. documentation_search_enhanced/web_scraper.py +117 -0
  22. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
  23. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
  24. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
  25. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
  26. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
documentation_search_enhanced/vector_search.py
@@ -0,0 +1,303 @@
+"""Vector search engine for semantic documentation search using sentence transformers and FAISS."""
+
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+if TYPE_CHECKING:
+    import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# Try to import vector search dependencies (optional)
+try:
+    import faiss
+    import numpy as np
+    from sentence_transformers import SentenceTransformer
+
+    VECTOR_SEARCH_AVAILABLE = True
+except ImportError as e:
+    VECTOR_SEARCH_AVAILABLE = False
+    logger.warning(
+        f"Vector search dependencies not available: {e}. "
+        "Install with: pip install documentation-search-enhanced[vector]"
+    )
+
+
+class SearchResult:
+    """Container for search results with score and metadata."""
+
+    def __init__(
+        self,
+        doc_id: str,
+        content: str,
+        score: float,
+        metadata: Optional[Dict] = None,
+    ):
+        self.doc_id = doc_id
+        self.content = content
+        self.score = score
+        self.metadata = metadata or {}
+
+    def to_dict(self) -> Dict:
+        """Convert to dictionary representation."""
+        return {
+            "doc_id": self.doc_id,
+            "content": self.content,
+            "score": self.score,
+            "metadata": self.metadata,
+        }
+
+
+class VectorSearchEngine:
+    """
+    Semantic search engine using sentence transformers for embeddings and FAISS for vector similarity.
+
+    Uses the all-MiniLM-L6-v2 model which provides:
+    - 384-dimensional embeddings
+    - Good balance between speed and quality
+    - ~120MB model size
+    - Optimized for semantic search
+    """
+
+    def __init__(
+        self,
+        model_name: str = "all-MiniLM-L6-v2",
+        index_path: Optional[Path] = None,
+    ):
+        """
+        Initialize the vector search engine.
+
+        Args:
+            model_name: Name of the sentence-transformers model to use
+            index_path: Optional path to save/load FAISS index
+        """
+        if not VECTOR_SEARCH_AVAILABLE:
+            raise ImportError(
+                "Vector search dependencies not installed. "
+                "Install with: pip install documentation-search-enhanced[vector]"
+            )
+
+        self.model_name = model_name
+        self.index_path = index_path
+        self.dimension = 384  # all-MiniLM-L6-v2 embedding dimension
+
+        logger.info(f"Loading sentence transformer model: {model_name}")
+        self.model = SentenceTransformer(model_name)
+
+        # Initialize FAISS index (L2 distance for cosine similarity)
+        self.index = faiss.IndexFlatL2(self.dimension)
+
+        # Document store: maps index position to document data
+        self.doc_store: Dict[int, Dict] = {}
+        self.next_id = 0
+
+        # Load existing index if path provided
+        if index_path and index_path.exists():
+            self.load_index(index_path)
+
+    def embed_documents(self, documents: List[str]) -> "np.ndarray":
+        """
+        Generate embeddings for a list of documents.
+
+        Args:
+            documents: List of text documents to embed
+
+        Returns:
+            numpy array of shape (n_documents, embedding_dimension)
+        """
+        logger.debug(f"Embedding {len(documents)} documents")
+        embeddings = self.model.encode(
+            documents,
+            convert_to_numpy=True,
+            show_progress_bar=len(documents) > 100,
+        )
+        return embeddings
+
+    def add_documents(
+        self,
+        documents: List[str],
+        metadata: Optional[List[Dict]] = None,
+        doc_ids: Optional[List[str]] = None,
+    ) -> List[int]:
+        """
+        Add documents to the vector index.
+
+        Args:
+            documents: List of text documents to index
+            metadata: Optional list of metadata dicts for each document
+            doc_ids: Optional list of custom document IDs
+
+        Returns:
+            List of internal index IDs for the added documents
+        """
+        if not documents:
+            return []
+
+        # Generate embeddings
+        embeddings = self.embed_documents(documents)
+
+        # Normalize embeddings for cosine similarity
+        faiss.normalize_L2(embeddings)
+
+        # Add to FAISS index
+        start_id = self.next_id
+        self.index.add(embeddings)
+
+        # Store document data
+        metadata = metadata or [{} for _ in documents]
+        doc_ids = doc_ids or [f"doc_{start_id + i}" for i in range(len(documents))]
+
+        index_ids = []
+        for i, (doc, meta, doc_id) in enumerate(zip(documents, metadata, doc_ids)):
+            internal_id = start_id + i
+            self.doc_store[internal_id] = {
+                "doc_id": doc_id,
+                "content": doc,
+                "metadata": meta,
+            }
+            index_ids.append(internal_id)
+
+        self.next_id += len(documents)
+        logger.info(
+            f"Added {len(documents)} documents to index (total: {self.next_id})"
+        )
+
+        return index_ids
+
+    def search(
+        self,
+        query: str,
+        top_k: int = 10,
+        score_threshold: Optional[float] = None,
+    ) -> List[SearchResult]:
+        """
+        Perform semantic search for similar documents.
+
+        Args:
+            query: Search query text
+            top_k: Number of top results to return
+            score_threshold: Optional minimum similarity score (0-1, higher is more similar)
+
+        Returns:
+            List of SearchResult objects sorted by relevance
+        """
+        if self.index.ntotal == 0:
+            logger.warning("No documents in index")
+            return []
+
+        # Generate query embedding
+        query_embedding = self.model.encode([query], convert_to_numpy=True)
+        faiss.normalize_L2(query_embedding)
+
+        # Search FAISS index
+        k = min(top_k, self.index.ntotal)
+        distances, indices = self.index.search(query_embedding, k)
+
+        # Convert to SearchResult objects
+        results = []
+        for distance, idx in zip(distances[0], indices[0]):
+            if idx == -1:  # FAISS returns -1 for empty slots
+                continue
+
+            doc_data = self.doc_store.get(int(idx))
+            if not doc_data:
+                continue
+
+            # Convert L2 distance to similarity score (0-1, higher is better)
+            # For normalized vectors: L2 distance = sqrt(2 - 2*cosine_similarity)
+            # So: similarity = 1 - (distance^2 / 2)
+            similarity = 1 - (distance**2 / 2)
+
+            # Apply score threshold if provided
+            if score_threshold is not None and similarity < score_threshold:
+                continue

+            results.append(
+                SearchResult(
+                    doc_id=doc_data["doc_id"],
+                    content=doc_data["content"],
+                    score=float(similarity),
+                    metadata=doc_data["metadata"],
+                )
+            )
+
+        logger.debug(f"Found {len(results)} results for query: {query[:50]}...")
+        return results
+
+    def save_index(self, path: Optional[Path] = None):
+        """
+        Save FAISS index and document store to disk.
+
+        Args:
+            path: Path to save index (uses self.index_path if not provided)
+        """
+        save_path = path or self.index_path
+        if not save_path:
+            raise ValueError("No index path provided")
+
+        save_path = Path(save_path)
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save FAISS index
+        faiss.write_index(self.index, str(save_path))
+
+        # Save document store
+        import pickle
+
+        doc_store_path = save_path.with_suffix(".docstore")
+        with open(doc_store_path, "wb") as f:
+            pickle.dump(
+                {"doc_store": self.doc_store, "next_id": self.next_id},
+                f,
+            )
+
+        logger.info(f"Saved index to {save_path}")
+
+    def load_index(self, path: Path):
+        """
+        Load FAISS index and document store from disk.
+
+        Args:
+            path: Path to load index from
+        """
+        path = Path(path)
+        if not path.exists():
+            raise FileNotFoundError(f"Index not found at {path}")
+
+        # Load FAISS index
+        self.index = faiss.read_index(str(path))
+
+        # Load document store
+        import pickle
+
+        doc_store_path = path.with_suffix(".docstore")
+        with open(doc_store_path, "rb") as f:
+            data = pickle.load(f)
+            self.doc_store = data["doc_store"]
+            self.next_id = data["next_id"]
+
+        logger.info(f"Loaded index from {path} ({self.index.ntotal} documents)")
+
+    def clear(self):
+        """Clear all documents from the index."""
+        self.index = faiss.IndexFlatL2(self.dimension)
+        self.doc_store = {}
+        self.next_id = 0
+        logger.info("Cleared vector index")
+
+    def __len__(self) -> int:
+        """Return number of documents in index."""
+        return self.index.ntotal
+
+
+# Global instance for reuse
+_vector_engine: Optional[VectorSearchEngine] = None
+
+
+def get_vector_engine() -> VectorSearchEngine:
+    """Get or create the global vector search engine instance."""
+    global _vector_engine
+    if _vector_engine is None:
+        _vector_engine = VectorSearchEngine()
+    return _vector_engine
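For orientation, a minimal usage sketch of the VectorSearchEngine and SearchResult API defined in this file, assuming the optional vector extra (faiss, numpy, sentence-transformers) is installed; the documents, metadata, IDs, and index path below are made-up illustrations, not values shipped with the package:

from pathlib import Path

from documentation_search_enhanced.vector_search import VectorSearchEngine

# Hypothetical on-disk location for the FAISS index and its pickled doc store.
engine = VectorSearchEngine(index_path=Path("/tmp/docs.index"))

# Index a couple of documentation snippets with metadata and custom IDs.
engine.add_documents(
    documents=[
        "FastAPI path parameters are declared with Python type hints.",
        "FAISS IndexFlatL2 performs exact L2 nearest-neighbor search.",
    ],
    metadata=[{"library": "fastapi"}, {"library": "faiss"}],
    doc_ids=["fastapi-params", "faiss-flat"],
)

# Semantic search returns SearchResult objects with a 0-1 similarity score.
for result in engine.search("how do I declare URL parameters?", top_k=5, score_threshold=0.3):
    print(result.doc_id, round(result.score, 3))

engine.save_index()  # writes the FAISS index plus the .docstore pickle

Because search() converts L2 distance on normalized vectors via similarity = 1 - d^2 / 2, the score_threshold of 0.3 here keeps only results whose cosine similarity to the query is at least 0.3.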
documentation_search_enhanced/version_resolver.py
@@ -0,0 +1,189 @@
+"""Version resolution for detecting installed package versions."""
+
+import asyncio
+import json
+import re
+from typing import Optional, Dict
+from pathlib import Path
+import sys
+
+
+class VersionResolver:
+    """Resolves library versions from installed packages and project files."""
+
+    def __init__(self):
+        self._cache: Dict[str, str] = {}
+        self._timeout = 5
+
+    async def resolve_version(
+        self,
+        library: str,
+        requested_version: str,
+        auto_detect: bool = True,
+        project_path: str = ".",
+    ) -> str:
+        """Resolve final version to use for documentation search.
+
+        Priority: explicit version > auto-detected > "latest"
+        """
+        if requested_version != "latest":
+            return requested_version
+
+        if auto_detect:
+            cache_key = f"{library}:{project_path}"
+            if cache_key in self._cache:
+                return self._cache[cache_key]
+
+            installed_version = await self.detect_installed_version(library)
+            if installed_version:
+                self._cache[cache_key] = installed_version
+                return installed_version
+
+            project_version = await self.detect_from_project(library, project_path)
+            if project_version:
+                self._cache[cache_key] = project_version
+                return project_version
+
+        return "latest"
+
+    async def detect_installed_version(self, library: str) -> Optional[str]:
+        """Detect version from pip, npm, or Python import."""
+        if pip_version := await self._try_pip_show(library):
+            return pip_version
+        if npm_version := await self._try_npm_list(library):
+            return npm_version
+        if py_version := await self._try_python_import(library):
+            return py_version
+        return None
+
+    async def _run_subprocess(
+        self, *cmd: str, timeout: Optional[int] = None
+    ) -> Optional[str]:
+        """Run subprocess with timeout handling."""
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
+            stdout, _ = await asyncio.wait_for(
+                proc.communicate(), timeout=timeout or self._timeout
+            )
+            if proc.returncode == 0:
+                return stdout.decode().strip()
+        except (asyncio.TimeoutError, Exception):
+            pass
+        return None
+
+    def _to_major_minor(self, version: str) -> str:
+        """Convert version to major.minor format."""
+        parts = version.split(".")
+        if len(parts) >= 2:
+            return f"{parts[0]}.{parts[1]}"
+        return version
+
+    async def _try_pip_show(self, package: str) -> Optional[str]:
+        """Get version via pip show."""
+        output = await self._run_subprocess(
+            sys.executable, "-m", "pip", "show", package
+        )
+        if output:
+            if match := re.search(r"Version:\s*(\S+)", output):
+                return self._to_major_minor(match.group(1))
+        return None
+
+    async def _try_npm_list(self, package: str) -> Optional[str]:
+        """Get version via npm list."""
+        output = await self._run_subprocess(
+            "npm", "list", package, "--depth=0", "--json"
+        )
+        if output:
+            try:
+                data = json.loads(output)
+                if package in data.get("dependencies", {}):
+                    version = (
+                        data["dependencies"][package].get("version", "").lstrip("^~")
+                    )
+                    return self._to_major_minor(version)
+            except json.JSONDecodeError:
+                pass
+        return None
+
+    async def _try_python_import(self, package: str) -> Optional[str]:
+        """Get version via Python import."""
+        output = await self._run_subprocess(
+            sys.executable,
+            "-c",
+            f"import {package}; print(getattr({package}, '__version__', ''))",
+        )
+        if output:
+            return self._to_major_minor(output)
+        return None
+
+    async def detect_from_project(
+        self, library: str, project_path: str
+    ) -> Optional[str]:
+        """Parse project dependency files for version."""
+        project = Path(project_path)
+
+        if (pyproject := project / "pyproject.toml").exists():
+            if version := await self._parse_pyproject(pyproject, library):
+                return version
+
+        if (requirements := project / "requirements.txt").exists():
+            if version := await self._parse_requirements(requirements, library):
+                return version
+
+        if (package_json := project / "package.json").exists():
+            if version := await self._parse_package_json(package_json, library):
+                return version
+
+        return None
+
+    async def _parse_pyproject(self, path: Path, library: str) -> Optional[str]:
+        """Parse pyproject.toml for library version."""
+        try:
+            import tomllib
+
+            with open(path, "rb") as f:
+                data = tomllib.load(f)
+
+            deps = data.get("project", {}).get("dependencies", [])
+            for dep in deps:
+                if library.lower() in dep.lower():
+                    if match := re.search(r">=?(\d+\.\d+)", dep):
+                        return match.group(1)
+        except Exception:
+            pass
+        return None
+
+    async def _parse_requirements(self, path: Path, library: str) -> Optional[str]:
+        """Parse requirements.txt for library version."""
+        try:
+            with open(path, "r") as f:
+                for line in f:
+                    if library.lower() in line.strip().lower():
+                        if match := re.search(r">=?(\d+\.\d+)", line):
+                            return match.group(1)
+        except Exception:
+            pass
+        return None
+
+    async def _parse_package_json(self, path: Path, library: str) -> Optional[str]:
+        """Parse package.json for library version."""
+        try:
+            with open(path, "r") as f:
+                data = json.load(f)
+
+            for dep_type in ["dependencies", "devDependencies"]:
+                if library in data.get(dep_type, {}):
+                    version = data[dep_type][library].lstrip("^~")
+                    return self._to_major_minor(version)
+        except Exception:
+            pass
+        return None
+
+    def clear_cache(self):
+        """Clear version resolution cache."""
+        self._cache.clear()
+
+
+version_resolver = VersionResolver()
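And a brief sketch of how the module-level version_resolver singleton above might be driven from async code; the library name and the example output are illustrative assumptions only:

import asyncio

from documentation_search_enhanced.version_resolver import version_resolver


async def main() -> None:
    # A requested_version of "latest" with auto_detect=True falls back to detection:
    # pip show, npm list, a bare Python import, then pyproject.toml,
    # requirements.txt and package.json under project_path.
    version = await version_resolver.resolve_version(
        library="fastapi",
        requested_version="latest",
        auto_detect=True,
        project_path=".",
    )
    print(version)  # e.g. "0.115" (major.minor), or "latest" if nothing was detected


asyncio.run(main())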