iflow-mcp_anton-prosterity-documentation-search-enhanced 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. documentation_search_enhanced/__init__.py +14 -0
  2. documentation_search_enhanced/__main__.py +6 -0
  3. documentation_search_enhanced/config.json +1674 -0
  4. documentation_search_enhanced/config_manager.py +233 -0
  5. documentation_search_enhanced/config_validator.py +79 -0
  6. documentation_search_enhanced/content_enhancer.py +578 -0
  7. documentation_search_enhanced/docker_manager.py +87 -0
  8. documentation_search_enhanced/logger.py +179 -0
  9. documentation_search_enhanced/main.py +2170 -0
  10. documentation_search_enhanced/project_generator.py +260 -0
  11. documentation_search_enhanced/project_scanner.py +85 -0
  12. documentation_search_enhanced/reranker.py +230 -0
  13. documentation_search_enhanced/site_index_builder.py +274 -0
  14. documentation_search_enhanced/site_index_downloader.py +222 -0
  15. documentation_search_enhanced/site_search.py +1325 -0
  16. documentation_search_enhanced/smart_search.py +473 -0
  17. documentation_search_enhanced/snyk_integration.py +657 -0
  18. documentation_search_enhanced/vector_search.py +303 -0
  19. documentation_search_enhanced/version_resolver.py +189 -0
  20. documentation_search_enhanced/vulnerability_scanner.py +545 -0
  21. documentation_search_enhanced/web_scraper.py +117 -0
  22. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
  23. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
  24. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
  25. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
  26. iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,473 @@
+"""
+Smart search features for documentation-search-enhanced MCP server.
+Adds semantic search, relevance ranking, and contextual filtering.
+"""
+
+import re
+from typing import Dict, List, Any, Optional, Callable, Awaitable
+from dataclasses import dataclass
+
+
+@dataclass
+class SearchResult:
+    """Enhanced search result with relevance scoring"""
+
+    source_library: str
+    url: str
+    title: str
+    snippet: str
+    relevance_score: float
+    content_type: str  # "tutorial", "reference", "example", "guide"
+    difficulty_level: str  # "beginner", "intermediate", "advanced"
+    code_snippets_count: int
+    estimated_read_time: int  # in minutes
+
+
+class SmartSearch:
+    """Enhanced search with semantic understanding and ranking"""
+
+    def __init__(self):
+        self.search_history = []
+        self.user_preferences = {}
+        self._docs_url_map: Dict[str, str] = {}
+        self._search_fn: Optional[Callable[[str, int], Awaitable[Dict[str, Any]]]] = (
+            None
+        )
+        self._results_limit = 5
+
+    def configure(
+        self,
+        docs_url_map: Dict[str, str],
+        search_fn: Callable[[str, int], Awaitable[Dict[str, Any]]],
+        results_limit: int = 5,
+    ) -> None:
+        """Attach configuration provided by the runtime server."""
+
+        self._docs_url_map = docs_url_map
+        self._search_fn = search_fn
+        self._results_limit = results_limit
+
+    async def semantic_search(
+        self, query: str, library: str, context: Optional[str] = None
+    ) -> List[SearchResult]:
+        """Perform semantic search with context awareness"""
+
+        # Expand query with semantic understanding
+        expanded_query = self.expand_query_semantically(query, library, context)
+
+        # Search with expanded query
+        base_query = f"site:{self.get_docs_url(library)} {expanded_query}"
+
+        # Perform the actual search (using existing search infrastructure)
+        raw_results = await self.perform_search(base_query)
+
+        # Enhance and rank results
+        enhanced_results = []
+        for result in raw_results:
+            enhanced_result = await self.enhance_search_result(result, query, library)
+            enhanced_results.append(enhanced_result)
+
+        # Sort by relevance score
+        enhanced_results.sort(key=lambda x: x.relevance_score, reverse=True)
+
+        return enhanced_results
+
+    def expand_query_semantically(
+        self, query: str, library: str, context: Optional[str] = None
+    ) -> str:
+        """Expand query with semantically related terms"""
+        expanded_terms = [query]
+
+        # Library-specific semantic expansions
+        semantic_expansions = {
+            "fastapi": {
+                "auth": ["authentication", "security", "JWT", "OAuth", "middleware"],
+                "database": ["SQLAlchemy", "ORM", "models", "async database"],
+                "api": ["endpoints", "routes", "REST", "OpenAPI", "swagger"],
+                "middleware": ["CORS", "authentication", "logging", "request"],
+                "async": ["asyncio", "concurrent", "await", "asynchronous"],
+            },
+            "react": {
+                "state": ["useState", "setState", "hooks", "context"],
+                "component": ["JSX", "props", "lifecycle", "functional"],
+                "routing": ["React Router", "navigation", "link"],
+                "forms": ["controlled", "uncontrolled", "validation"],
+                "hooks": ["useEffect", "useState", "useContext", "custom hooks"],
+            },
+            "django": {
+                "auth": ["authentication", "permissions", "user model", "login"],
+                "database": ["models", "ORM", "migrations", "queries"],
+                "views": ["class-based", "function-based", "generic views"],
+                "forms": ["ModelForm", "validation", "widgets"],
+                "admin": ["admin interface", "ModelAdmin", "customization"],
+            },
+            "langchain": {
+                "chains": ["LLMChain", "sequential", "pipeline", "workflow"],
+                "agents": ["tools", "ReAct", "planning", "execution"],
+                "memory": ["conversation", "buffer", "summary", "retrieval"],
+                "embeddings": ["vector", "similarity", "semantic search"],
+                "retrieval": ["RAG", "documents", "vector store", "similarity"],
+            },
+        }
+
+        # Add semantic expansions for the library
+        if library in semantic_expansions:
+            for key_term, expansions in semantic_expansions[library].items():
+                if key_term.lower() in query.lower():
+                    expanded_terms.extend(expansions)
+
+        # Add context-based expansions
+        if context:
+            context_terms = self.extract_context_terms(context, library)
+            expanded_terms.extend(context_terms)
+
+        # Common technical term expansions
+        common_expansions = {
+            "error": ["exception", "troubleshooting", "debugging", "fix"],
+            "performance": ["optimization", "speed", "efficient", "benchmark"],
+            "security": ["authentication", "authorization", "encryption", "safety"],
+            "testing": ["unit test", "pytest", "mock", "coverage"],
+            "deployment": ["production", "docker", "hosting", "cloud"],
+        }
+
+        for term, expansions in common_expansions.items():
+            if term in query.lower():
+                expanded_terms.extend(expansions)
+
+        # Limit expansion to avoid over-broad results
+        unique_terms = list(set(expanded_terms))[:8]
+        return " ".join(unique_terms)
+
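As a rough illustration of the expansion (the query below is invented, and note that list(set(...)) returns the surviving terms in arbitrary order):

    from documentation_search_enhanced.smart_search import SmartSearch

    # "auth" triggers the fastapi-specific expansions and "error" the common ones,
    # so expanded_terms holds the original query plus "authentication", "security",
    # "JWT", "OAuth", "middleware", "exception", "troubleshooting", "debugging" and
    # "fix" before deduplication truncates the list to 8 terms in set order.
    expanded = SmartSearch().expand_query_semantically("auth error", "fastapi")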
+    def extract_context_terms(self, context: str, library: str) -> List[str]:
+        """Extract relevant terms from user context"""
+        context_terms = []
+
+        # Extract mentioned technologies
+        tech_patterns = [
+            r"\b(react|vue|angular|svelte)\b",
+            r"\b(fastapi|django|flask|express)\b",
+            r"\b(python|javascript|typescript|node)\b",
+            r"\b(docker|kubernetes|aws|azure)\b",
+            r"\b(postgresql|mysql|mongodb|redis)\b",
+        ]
+
+        for pattern in tech_patterns:
+            matches = re.finditer(pattern, context, re.IGNORECASE)
+            context_terms.extend([match.group(1).lower() for match in matches])
+
+        # Extract use case indicators
+        use_case_patterns = {
+            r"\b(api|rest|graphql)\b": ["endpoint", "server", "client"],
+            r"\b(frontend|ui|interface)\b": ["component", "styling", "interaction"],
+            r"\b(database|data|storage)\b": ["model", "query", "migration"],
+            r"\b(auth|login|user)\b": ["permission", "session", "token"],
+        }
+
+        for pattern, related_terms in use_case_patterns.items():
+            if re.search(pattern, context, re.IGNORECASE):
+                context_terms.extend(related_terms)
+
+        return context_terms[:5]  # Limit context expansion
+
+    async def enhance_search_result(
+        self, raw_result: Dict[str, Any], query: str, library: str
+    ) -> SearchResult:
+        """Enhance a raw search result with additional metadata"""
+
+        # Calculate relevance score
+        relevance_score = self.calculate_relevance_score(raw_result, query, library)
+
+        # Determine content type
+        content_type = self.classify_content_type(
+            raw_result.get("title", ""), raw_result.get("snippet", "")
+        )
+
+        # Assess difficulty level
+        difficulty_level = self.assess_difficulty_level(raw_result.get("snippet", ""))
+
+        # Count code snippets (estimate from snippet)
+        code_snippets_count = self.estimate_code_snippets(raw_result.get("snippet", ""))
+
+        # Estimate reading time
+        estimated_read_time = self.estimate_reading_time(raw_result.get("snippet", ""))
+
+        return SearchResult(
+            source_library=library,
+            url=raw_result.get("link", ""),
+            title=raw_result.get("title", ""),
+            snippet=raw_result.get("snippet", ""),
+            relevance_score=relevance_score,
+            content_type=content_type,
+            difficulty_level=difficulty_level,
+            code_snippets_count=code_snippets_count,
+            estimated_read_time=estimated_read_time,
+        )
+
+    def calculate_relevance_score(
+        self, result: Dict[str, Any], query: str, library: str
+    ) -> float:
+        """Calculate relevance score for a search result"""
+        score = 0.0
+
+        title = result.get("title", "").lower()
+        snippet = result.get("snippet", "").lower()
+        query_lower = query.lower()
+        query_words = set(query_lower.split())
+
+        # Title relevance (high weight)
+        title_words = set(title.split())
+        title_match_ratio = (
+            len(query_words.intersection(title_words)) / len(query_words)
+            if query_words
+            else 0
+        )
+        score += title_match_ratio * 40
+
+        # Snippet relevance (medium weight)
+        snippet_words = set(snippet.split())
+        snippet_match_ratio = (
+            len(query_words.intersection(snippet_words)) / len(query_words)
+            if query_words
+            else 0
+        )
+        score += snippet_match_ratio * 30
+
+        # Exact phrase matches (high bonus)
+        if query_lower in title:
+            score += 20
+        elif query_lower in snippet:
+            score += 15
+
+        # Library-specific bonuses
+        library_keywords = {
+            "fastapi": ["endpoint", "pydantic", "async", "uvicorn", "api"],
+            "react": ["component", "jsx", "hooks", "state", "props"],
+            "django": ["model", "view", "template", "admin", "orm"],
+        }
+
+        if library in library_keywords:
+            for keyword in library_keywords[library]:
+                if keyword in snippet:
+                    score += 5
+
+        # Content type bonuses
+        if "example" in title or "tutorial" in title:
+            score += 10
+        elif "guide" in title or "documentation" in title:
+            score += 5
+
+        # Code presence bonus
+        if "```" in snippet or "<code>" in snippet or "def " in snippet:
+            score += 8
+
+        return min(score, 100.0)  # Cap at 100
+
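The weighting above is simple enough to trace by hand. A worked example against an invented result dict:

    from documentation_search_enhanced.smart_search import SmartSearch

    result = {
        "title": "Async database connections",
        "link": "https://example.invalid/async-db",
        "snippet": "Use an async database session with SQLAlchemy",
    }
    # For query "async database" and library "fastapi":
    #   both query words appear in the title    -> 1.0 * 40 = 40
    #   both query words appear in the snippet  -> 1.0 * 30 = 30
    #   exact phrase match in the title         ->         + 20
    #   fastapi keyword "async" in the snippet  ->         +  5
    score = SmartSearch().calculate_relevance_score(result, "async database", "fastapi")
    assert score == 95.0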
+    def classify_content_type(self, title: str, snippet: str) -> str:
+        """Classify the type of documentation content"""
+        title_lower = title.lower()
+        snippet_lower = snippet.lower()
+
+        # Tutorial indicators
+        if any(
+            word in title_lower
+            for word in ["tutorial", "guide", "walkthrough", "step-by-step"]
+        ):
+            return "tutorial"
+
+        # Example indicators
+        if any(
+            word in title_lower for word in ["example", "demo", "sample", "cookbook"]
+        ):
+            return "example"
+
+        # Reference indicators
+        if any(
+            word in title_lower
+            for word in ["api", "reference", "documentation", "docs"]
+        ):
+            return "reference"
+
+        # Check snippet for patterns
+        if any(
+            phrase in snippet_lower
+            for phrase in ["let's", "first", "getting started", "how to"]
+        ):
+            return "tutorial"
+        elif (
+            "class " in snippet_lower
+            or "function " in snippet_lower
+            or "method " in snippet_lower
+        ):
+            return "reference"
+        elif "example" in snippet_lower or "demo" in snippet_lower:
+            return "example"
+
+        return "guide"  # Default
+
+    def assess_difficulty_level(self, snippet: str) -> str:
+        """Assess the difficulty level of content"""
+        snippet_lower = snippet.lower()
+        difficulty_score = 0
+
+        # Beginner indicators
+        beginner_terms = [
+            "basic",
+            "simple",
+            "introduction",
+            "getting started",
+            "quick start",
+            "hello world",
+        ]
+        difficulty_score -= sum(2 for term in beginner_terms if term in snippet_lower)
+
+        # Advanced indicators
+        advanced_terms = [
+            "advanced",
+            "complex",
+            "optimization",
+            "performance",
+            "architecture",
+            "async",
+            "concurrent",
+            "decorator",
+            "metaclass",
+            "inheritance",
+        ]
+        difficulty_score += sum(1 for term in advanced_terms if term in snippet_lower)
+
+        # Code complexity indicators
+        complex_patterns = [r"\bclass\s+\w+", r"\bdef\s+\w+", r"\basync\s+def", r"@\w+"]
+        difficulty_score += sum(
+            1 for pattern in complex_patterns if re.search(pattern, snippet_lower)
+        )
+
+        if difficulty_score >= 3:
+            return "advanced"
+        elif difficulty_score >= 1:
+            return "intermediate"
+        else:
+            return "beginner"
+
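The difficulty label is just a keyword tally, so a single made-up snippet shows the mechanics:

    from documentation_search_enhanced.smart_search import SmartSearch

    # "advanced", "async" and "decorator" each add 1; no beginner terms or code
    # patterns match, so difficulty_score is 3 and the label is "advanced".
    level = SmartSearch().assess_difficulty_level("Advanced guide to async decorators")
    assert level == "advanced"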
+    def estimate_code_snippets(self, snippet: str) -> int:
+        """Estimate number of code snippets in content"""
+        code_indicators = [
+            "```",
+            "<code>",
+            "<pre>",
+            "def ",
+            "class ",
+            "function ",
+            "const ",
+            "let ",
+            "var ",
+        ]
+
+        count = 0
+        for indicator in code_indicators:
+            count += snippet.count(indicator)
+
+        # Rough estimate: assume each indicator represents part of a snippet
+        return min(count // 2, 10)  # Cap at 10
+
+    def estimate_reading_time(self, snippet: str) -> int:
+        """Estimate reading time in minutes"""
+        words = len(snippet.split())
+
+        # Average reading speed: 200-250 words per minute
+        # Factor in code reading (slower)
+        code_ratio = (snippet.count("```") + snippet.count("<code>")) / max(
+            len(snippet), 1
+        )
+        effective_wpm = 200 - (code_ratio * 100)  # Slower for code-heavy content
+
+        # Estimate full content is 5-10x longer than snippet
+        estimated_full_words = words * 7
+        reading_time = max(1, estimated_full_words / effective_wpm)
+
+        return int(reading_time)
+
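Reading time is an equally rough back-of-the-envelope figure: the snippet's word count is scaled by 7 to approximate the full page, then divided by a words-per-minute rate that is lowered for code-heavy text. For a plain-prose snippet of 100 words with no code markers:

    from documentation_search_enhanced.smart_search import SmartSearch

    # code_ratio = 0, so effective_wpm = 200
    # estimated_full_words = 100 * 7 = 700
    # reading_time = max(1, 700 / 200) = 3.5 -> int() -> 3 minutes
    minutes = SmartSearch().estimate_reading_time("word " * 100)
    assert minutes == 3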
+    async def perform_search(self, query: str) -> List[Dict[str, Any]]:
+        """Perform the actual search (placeholder - integrate with existing search)"""
+        if not self._search_fn:
+            raise RuntimeError("SmartSearch search function not configured")
+
+        search_response = await self._search_fn(query, self._results_limit)
+        organic_results = (
+            search_response.get("organic", [])
+            if isinstance(search_response, dict)
+            else []
+        )
+        return organic_results[: self._results_limit]
+
+    def get_docs_url(self, library: str) -> str:
+        """Get documentation URL for a library"""
+        if self._docs_url_map:
+            return self._docs_url_map.get(library, "docs.example.com")
+        return "docs.example.com"
+
+
+@dataclass
+class SearchFilters:
+    """Filters for refining search results"""
+
+    content_type: Optional[str] = None  # "tutorial", "reference", "example"
+    difficulty_level: Optional[str] = None  # "beginner", "intermediate", "advanced"
+    has_code_examples: Optional[bool] = None
+    max_reading_time: Optional[int] = None  # in minutes
+    language: Optional[str] = None  # programming language
+
+
+class FilteredSearch:
+    """Search with advanced filtering capabilities"""
+
+    def __init__(self, smart_search: SmartSearch):
+        self.smart_search = smart_search
+
+    async def search_with_filters(
+        self, query: str, library: str, filters: SearchFilters
+    ) -> List[SearchResult]:
+        """Perform search with applied filters"""
+
+        # Get base search results
+        results = await self.smart_search.semantic_search(query, library)
+
+        # Apply filters
+        filtered_results = []
+        for result in results:
+            if self.passes_filters(result, filters):
+                filtered_results.append(result)
+
+        return filtered_results
+
+    def passes_filters(self, result: SearchResult, filters: SearchFilters) -> bool:
+        """Check if a result passes all filters"""
+
+        if filters.content_type and result.content_type != filters.content_type:
+            return False
+
+        if (
+            filters.difficulty_level
+            and result.difficulty_level != filters.difficulty_level
+        ):
+            return False
+
+        if filters.has_code_examples is not None:
+            has_code = result.code_snippets_count > 0
+            if filters.has_code_examples != has_code:
+                return False
+
+        if (
+            filters.max_reading_time
+            and result.estimated_read_time > filters.max_reading_time
+        ):
+            return False
+
+        # Language filter would need more sophisticated detection
+        # For now, skip language filtering
+
+        return True
+
+
+# Global instances
+smart_search = SmartSearch()
+filtered_search = FilteredSearch(smart_search)
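A minimal end-to-end sketch of how a caller might wire these module-level instances together. The stub search function and docs URL below are illustrative only; per configure()'s docstring, the runtime server is expected to supply the real search backend:

    import asyncio

    from documentation_search_enhanced.smart_search import (
        SearchFilters,
        filtered_search,
        smart_search,
    )

    async def stub_search(query: str, limit: int) -> dict:
        # Stand-in for the server's web search; perform_search() expects a dict
        # with an "organic" list of {"title", "link", "snippet"} entries.
        return {
            "organic": [
                {
                    "title": "FastAPI Security Tutorial",
                    "link": "https://fastapi.tiangolo.com/tutorial/security/",
                    "snippet": "Getting started with OAuth2 and JWT authentication",
                }
            ]
        }

    async def main() -> None:
        smart_search.configure(
            docs_url_map={"fastapi": "fastapi.tiangolo.com"},
            search_fn=stub_search,
        )
        filters = SearchFilters(content_type="tutorial", max_reading_time=10)
        results = await filtered_search.search_with_filters("auth", "fastapi", filters)
        for r in results:
            print(f"{r.relevance_score:.0f} {r.content_type} {r.url}")

    asyncio.run(main())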