iflow-mcp_anton-prosterity-documentation-search-enhanced 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. documentation_search_enhanced/__init__.py +14 -0
  2. documentation_search_enhanced/__main__.py +6 -0
  3. documentation_search_enhanced/config.json +1674 -0
  4. documentation_search_enhanced/config_manager.py +233 -0
  5. documentation_search_enhanced/config_validator.py +79 -0
  6. documentation_search_enhanced/content_enhancer.py +578 -0
  7. documentation_search_enhanced/docker_manager.py +87 -0
  8. documentation_search_enhanced/logger.py +179 -0
  9. documentation_search_enhanced/main.py +2170 -0
  10. documentation_search_enhanced/project_generator.py +260 -0
  11. documentation_search_enhanced/project_scanner.py +85 -0
  12. documentation_search_enhanced/reranker.py +230 -0
  13. documentation_search_enhanced/site_index_builder.py +274 -0
  14. documentation_search_enhanced/site_index_downloader.py +222 -0
  15. documentation_search_enhanced/site_search.py +1325 -0
  16. documentation_search_enhanced/smart_search.py +473 -0
  17. documentation_search_enhanced/snyk_integration.py +657 -0
  18. documentation_search_enhanced/vector_search.py +303 -0
  19. documentation_search_enhanced/version_resolver.py +189 -0
  20. documentation_search_enhanced/vulnerability_scanner.py +545 -0
  21. documentation_search_enhanced/web_scraper.py +117 -0
  22. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
  23. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
  24. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
  25. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
  26. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
documentation_search_enhanced/vector_search.py
@@ -0,0 +1,303 @@
+"""Vector search engine for semantic documentation search using sentence transformers and FAISS."""
+
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+if TYPE_CHECKING:
+    import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# Try to import vector search dependencies (optional)
+try:
+    import faiss
+    import numpy as np
+    from sentence_transformers import SentenceTransformer
+
+    VECTOR_SEARCH_AVAILABLE = True
+except ImportError as e:
+    VECTOR_SEARCH_AVAILABLE = False
+    logger.warning(
+        f"Vector search dependencies not available: {e}. "
+        "Install with: pip install documentation-search-enhanced[vector]"
+    )
+
+
+class SearchResult:
+    """Container for search results with score and metadata."""
+
+    def __init__(
+        self,
+        doc_id: str,
+        content: str,
+        score: float,
+        metadata: Optional[Dict] = None,
+    ):
+        self.doc_id = doc_id
+        self.content = content
+        self.score = score
+        self.metadata = metadata or {}
+
+    def to_dict(self) -> Dict:
+        """Convert to dictionary representation."""
+        return {
+            "doc_id": self.doc_id,
+            "content": self.content,
+            "score": self.score,
+            "metadata": self.metadata,
+        }
+
+
+class VectorSearchEngine:
+    """
+    Semantic search engine using sentence transformers for embeddings and FAISS for vector similarity.
+
+    Uses the all-MiniLM-L6-v2 model which provides:
+    - 384-dimensional embeddings
+    - Good balance between speed and quality
+    - ~120MB model size
+    - Optimized for semantic search
+    """
+
+    def __init__(
+        self,
+        model_name: str = "all-MiniLM-L6-v2",
+        index_path: Optional[Path] = None,
+    ):
+        """
+        Initialize the vector search engine.
+
+        Args:
+            model_name: Name of the sentence-transformers model to use
+            index_path: Optional path to save/load FAISS index
+        """
+        if not VECTOR_SEARCH_AVAILABLE:
+            raise ImportError(
+                "Vector search dependencies not installed. "
+                "Install with: pip install documentation-search-enhanced[vector]"
+            )
+
+        self.model_name = model_name
+        self.index_path = index_path
+        self.dimension = 384  # all-MiniLM-L6-v2 embedding dimension
+
+        logger.info(f"Loading sentence transformer model: {model_name}")
+        self.model = SentenceTransformer(model_name)
+
+        # Initialize FAISS index (L2 distance for cosine similarity)
+        self.index = faiss.IndexFlatL2(self.dimension)
+
+        # Document store: maps index position to document data
+        self.doc_store: Dict[int, Dict] = {}
+        self.next_id = 0
+
+        # Load existing index if path provided
+        if index_path and index_path.exists():
+            self.load_index(index_path)
+
+    def embed_documents(self, documents: List[str]) -> "np.ndarray":
+        """
+        Generate embeddings for a list of documents.
+
+        Args:
+            documents: List of text documents to embed
+
+        Returns:
+            numpy array of shape (n_documents, embedding_dimension)
+        """
+        logger.debug(f"Embedding {len(documents)} documents")
+        embeddings = self.model.encode(
+            documents,
+            convert_to_numpy=True,
+            show_progress_bar=len(documents) > 100,
+        )
+        return embeddings
+
+    def add_documents(
+        self,
+        documents: List[str],
+        metadata: Optional[List[Dict]] = None,
+        doc_ids: Optional[List[str]] = None,
+    ) -> List[int]:
+        """
+        Add documents to the vector index.
+
+        Args:
+            documents: List of text documents to index
+            metadata: Optional list of metadata dicts for each document
+            doc_ids: Optional list of custom document IDs
+
+        Returns:
+            List of internal index IDs for the added documents
+        """
+        if not documents:
+            return []
+
+        # Generate embeddings
+        embeddings = self.embed_documents(documents)
+
+        # Normalize embeddings for cosine similarity
+        faiss.normalize_L2(embeddings)
+
+        # Add to FAISS index
+        start_id = self.next_id
+        self.index.add(embeddings)
+
+        # Store document data
+        metadata = metadata or [{} for _ in documents]
+        doc_ids = doc_ids or [f"doc_{start_id + i}" for i in range(len(documents))]
+
+        index_ids = []
+        for i, (doc, meta, doc_id) in enumerate(zip(documents, metadata, doc_ids)):
+            internal_id = start_id + i
+            self.doc_store[internal_id] = {
+                "doc_id": doc_id,
+                "content": doc,
+                "metadata": meta,
+            }
+            index_ids.append(internal_id)
+
+        self.next_id += len(documents)
+        logger.info(
+            f"Added {len(documents)} documents to index (total: {self.next_id})"
+        )
+
+        return index_ids
+
+    def search(
+        self,
+        query: str,
+        top_k: int = 10,
+        score_threshold: Optional[float] = None,
+    ) -> List[SearchResult]:
+        """
+        Perform semantic search for similar documents.
+
+        Args:
+            query: Search query text
+            top_k: Number of top results to return
+            score_threshold: Optional minimum similarity score (0-1, higher is more similar)
+
+        Returns:
+            List of SearchResult objects sorted by relevance
+        """
+        if self.index.ntotal == 0:
+            logger.warning("No documents in index")
+            return []
+
+        # Generate query embedding
+        query_embedding = self.model.encode([query], convert_to_numpy=True)
+        faiss.normalize_L2(query_embedding)
+
+        # Search FAISS index
+        k = min(top_k, self.index.ntotal)
+        distances, indices = self.index.search(query_embedding, k)
+
+        # Convert to SearchResult objects
+        results = []
+        for distance, idx in zip(distances[0], indices[0]):
+            if idx == -1:  # FAISS returns -1 for empty slots
+                continue
+
+            doc_data = self.doc_store.get(int(idx))
+            if not doc_data:
+                continue
+
+            # Convert L2 distance to similarity score (0-1, higher is better)
+            # For normalized vectors: L2 distance = sqrt(2 - 2*cosine_similarity)
+            # So: similarity = 1 - (distance^2 / 2)
+            similarity = 1 - (distance**2 / 2)
+
+            # Apply score threshold if provided
+            if score_threshold is not None and similarity < score_threshold:
+                continue

+            results.append(
+                SearchResult(
+                    doc_id=doc_data["doc_id"],
+                    content=doc_data["content"],
+                    score=float(similarity),
+                    metadata=doc_data["metadata"],
+                )
+            )
+
+        logger.debug(f"Found {len(results)} results for query: {query[:50]}...")
+        return results
+
+    def save_index(self, path: Optional[Path] = None):
+        """
+        Save FAISS index and document store to disk.
+
+        Args:
+            path: Path to save index (uses self.index_path if not provided)
+        """
+        save_path = path or self.index_path
+        if not save_path:
+            raise ValueError("No index path provided")
+
+        save_path = Path(save_path)
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Save FAISS index
+        faiss.write_index(self.index, str(save_path))
+
+        # Save document store
+        import pickle
+
+        doc_store_path = save_path.with_suffix(".docstore")
+        with open(doc_store_path, "wb") as f:
+            pickle.dump(
+                {"doc_store": self.doc_store, "next_id": self.next_id},
+                f,
+            )
+
+        logger.info(f"Saved index to {save_path}")
+
+    def load_index(self, path: Path):
+        """
+        Load FAISS index and document store from disk.
+
+        Args:
+            path: Path to load index from
+        """
+        path = Path(path)
+        if not path.exists():
+            raise FileNotFoundError(f"Index not found at {path}")
+
+        # Load FAISS index
+        self.index = faiss.read_index(str(path))
+
+        # Load document store
+        import pickle
+
+        doc_store_path = path.with_suffix(".docstore")
+        with open(doc_store_path, "rb") as f:
+            data = pickle.load(f)
+            self.doc_store = data["doc_store"]
+            self.next_id = data["next_id"]
+
+        logger.info(f"Loaded index from {path} ({self.index.ntotal} documents)")
+
+    def clear(self):
+        """Clear all documents from the index."""
+        self.index = faiss.IndexFlatL2(self.dimension)
+        self.doc_store = {}
+        self.next_id = 0
+        logger.info("Cleared vector index")
+
+    def __len__(self) -> int:
+        """Return number of documents in index."""
+        return self.index.ntotal
+
+
+# Global instance for reuse
+_vector_engine: Optional[VectorSearchEngine] = None
+
+
+def get_vector_engine() -> VectorSearchEngine:
+    """Get or create the global vector search engine instance."""
+    global _vector_engine
+    if _vector_engine is None:
+        _vector_engine = VectorSearchEngine()
+    return _vector_engine
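For orientation, a minimal usage sketch of the VectorSearchEngine and SearchResult API defined in this file, assuming the optional vector extra (faiss, numpy, sentence-transformers) is installed; the documents, metadata, IDs, and index path below are made-up illustrations, not values shipped with the package:

from pathlib import Path

from documentation_search_enhanced.vector_search import VectorSearchEngine

# Hypothetical on-disk location for the FAISS index and its pickled doc store.
engine = VectorSearchEngine(index_path=Path("/tmp/docs.index"))

# Index a couple of documentation snippets with metadata and custom IDs.
engine.add_documents(
    documents=[
        "FastAPI path parameters are declared with Python type hints.",
        "FAISS IndexFlatL2 performs exact L2 nearest-neighbor search.",
    ],
    metadata=[{"library": "fastapi"}, {"library": "faiss"}],
    doc_ids=["fastapi-params", "faiss-flat"],
)

# Semantic search returns SearchResult objects with a 0-1 similarity score.
for result in engine.search("how do I declare URL parameters?", top_k=5, score_threshold=0.3):
    print(result.doc_id, round(result.score, 3))

engine.save_index()  # writes the FAISS index plus the .docstore pickle

Because search() converts L2 distance on normalized vectors via similarity = 1 - d^2 / 2, the score_threshold of 0.3 here keeps only results whose cosine similarity to the query is at least 0.3.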
documentation_search_enhanced/version_resolver.py
@@ -0,0 +1,189 @@
+"""Version resolution for detecting installed package versions."""
+
+import asyncio
+import json
+import re
+from typing import Optional, Dict
+from pathlib import Path
+import sys
+
+
+class VersionResolver:
+    """Resolves library versions from installed packages and project files."""
+
+    def __init__(self):
+        self._cache: Dict[str, str] = {}
+        self._timeout = 5
+
+    async def resolve_version(
+        self,
+        library: str,
+        requested_version: str,
+        auto_detect: bool = True,
+        project_path: str = ".",
+    ) -> str:
+        """Resolve final version to use for documentation search.
+
+        Priority: explicit version > auto-detected > "latest"
+        """
+        if requested_version != "latest":
+            return requested_version
+
+        if auto_detect:
+            cache_key = f"{library}:{project_path}"
+            if cache_key in self._cache:
+                return self._cache[cache_key]
+
+            installed_version = await self.detect_installed_version(library)
+            if installed_version:
+                self._cache[cache_key] = installed_version
+                return installed_version
+
+            project_version = await self.detect_from_project(library, project_path)
+            if project_version:
+                self._cache[cache_key] = project_version
+                return project_version
+
+        return "latest"
+
+    async def detect_installed_version(self, library: str) -> Optional[str]:
+        """Detect version from pip, npm, or Python import."""
+        if pip_version := await self._try_pip_show(library):
+            return pip_version
+        if npm_version := await self._try_npm_list(library):
+            return npm_version
+        if py_version := await self._try_python_import(library):
+            return py_version
+        return None
+
+    async def _run_subprocess(
+        self, *cmd: str, timeout: Optional[int] = None
+    ) -> Optional[str]:
+        """Run subprocess with timeout handling."""
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
+            stdout, _ = await asyncio.wait_for(
+                proc.communicate(), timeout=timeout or self._timeout
+            )
+            if proc.returncode == 0:
+                return stdout.decode().strip()
+        except (asyncio.TimeoutError, Exception):
+            pass
+        return None
+
+    def _to_major_minor(self, version: str) -> str:
+        """Convert version to major.minor format."""
+        parts = version.split(".")
+        if len(parts) >= 2:
+            return f"{parts[0]}.{parts[1]}"
+        return version
+
+    async def _try_pip_show(self, package: str) -> Optional[str]:
+        """Get version via pip show."""
+        output = await self._run_subprocess(
+            sys.executable, "-m", "pip", "show", package
+        )
+        if output:
+            if match := re.search(r"Version:\s*(\S+)", output):
+                return self._to_major_minor(match.group(1))
+        return None
+
+    async def _try_npm_list(self, package: str) -> Optional[str]:
+        """Get version via npm list."""
+        output = await self._run_subprocess(
+            "npm", "list", package, "--depth=0", "--json"
+        )
+        if output:
+            try:
+                data = json.loads(output)
+                if package in data.get("dependencies", {}):
+                    version = (
+                        data["dependencies"][package].get("version", "").lstrip("^~")
+                    )
+                    return self._to_major_minor(version)
+            except json.JSONDecodeError:
+                pass
+        return None
+
+    async def _try_python_import(self, package: str) -> Optional[str]:
+        """Get version via Python import."""
+        output = await self._run_subprocess(
+            sys.executable,
+            "-c",
+            f"import {package}; print(getattr({package}, '__version__', ''))",
+        )
+        if output:
+            return self._to_major_minor(output)
+        return None
+
+    async def detect_from_project(
+        self, library: str, project_path: str
+    ) -> Optional[str]:
+        """Parse project dependency files for version."""
+        project = Path(project_path)
+
+        if (pyproject := project / "pyproject.toml").exists():
+            if version := await self._parse_pyproject(pyproject, library):
+                return version
+
+        if (requirements := project / "requirements.txt").exists():
+            if version := await self._parse_requirements(requirements, library):
+                return version
+
+        if (package_json := project / "package.json").exists():
+            if version := await self._parse_package_json(package_json, library):
+                return version
+
+        return None
+
+    async def _parse_pyproject(self, path: Path, library: str) -> Optional[str]:
+        """Parse pyproject.toml for library version."""
+        try:
+            import tomllib
+
+            with open(path, "rb") as f:
+                data = tomllib.load(f)
+
+            deps = data.get("project", {}).get("dependencies", [])
+            for dep in deps:
+                if library.lower() in dep.lower():
+                    if match := re.search(r">=?(\d+\.\d+)", dep):
+                        return match.group(1)
+        except Exception:
+            pass
+        return None
+
+    async def _parse_requirements(self, path: Path, library: str) -> Optional[str]:
+        """Parse requirements.txt for library version."""
+        try:
+            with open(path, "r") as f:
+                for line in f:
+                    if library.lower() in line.strip().lower():
+                        if match := re.search(r">=?(\d+\.\d+)", line):
+                            return match.group(1)
+        except Exception:
+            pass
+        return None
+
+    async def _parse_package_json(self, path: Path, library: str) -> Optional[str]:
+        """Parse package.json for library version."""
+        try:
+            with open(path, "r") as f:
+                data = json.load(f)
+
+            for dep_type in ["dependencies", "devDependencies"]:
+                if library in data.get(dep_type, {}):
+                    version = data[dep_type][library].lstrip("^~")
+                    return self._to_major_minor(version)
+        except Exception:
+            pass
+        return None
+
+    def clear_cache(self):
+        """Clear version resolution cache."""
+        self._cache.clear()
+
+
+version_resolver = VersionResolver()
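And a brief sketch of how the module-level version_resolver singleton above might be driven from async code; the library name and the example output are illustrative assumptions only:

import asyncio

from documentation_search_enhanced.version_resolver import version_resolver


async def main() -> None:
    # A requested_version of "latest" with auto_detect=True falls back to detection:
    # pip show, npm list, a bare Python import, then pyproject.toml,
    # requirements.txt and package.json under project_path.
    version = await version_resolver.resolve_version(
        library="fastapi",
        requested_version="latest",
        auto_detect=True,
        project_path=".",
    )
    print(version)  # e.g. "0.115" (major.minor), or "latest" if nothing was detected


asyncio.run(main())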