iflow-mcp_anton-prosterity-documentation-search-enhanced 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- documentation_search_enhanced/__init__.py +14 -0
- documentation_search_enhanced/__main__.py +6 -0
- documentation_search_enhanced/config.json +1674 -0
- documentation_search_enhanced/config_manager.py +233 -0
- documentation_search_enhanced/config_validator.py +79 -0
- documentation_search_enhanced/content_enhancer.py +578 -0
- documentation_search_enhanced/docker_manager.py +87 -0
- documentation_search_enhanced/logger.py +179 -0
- documentation_search_enhanced/main.py +2170 -0
- documentation_search_enhanced/project_generator.py +260 -0
- documentation_search_enhanced/project_scanner.py +85 -0
- documentation_search_enhanced/reranker.py +230 -0
- documentation_search_enhanced/site_index_builder.py +274 -0
- documentation_search_enhanced/site_index_downloader.py +222 -0
- documentation_search_enhanced/site_search.py +1325 -0
- documentation_search_enhanced/smart_search.py +473 -0
- documentation_search_enhanced/snyk_integration.py +657 -0
- documentation_search_enhanced/vector_search.py +303 -0
- documentation_search_enhanced/version_resolver.py +189 -0
- documentation_search_enhanced/vulnerability_scanner.py +545 -0
- documentation_search_enhanced/web_scraper.py +117 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/METADATA +195 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/RECORD +26 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/WHEEL +4 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/entry_points.txt +2 -0
- iflow_mcp_anton_prosterity_documentation_search_enhanced-1.9.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,260 @@ documentation_search_enhanced/project_generator.py (new file)

#!/usr/bin/env python3
"""
Generates boilerplate code and file structures for new projects.
"""

import os
from typing import Dict, List, TypedDict, NotRequired

# --- Project Templates ---

TEMPLATES: Dict[str, Dict[str, str]] = {
    "fastapi": {
        "main.py": """
from fastapi import FastAPI

app = FastAPI(
    title="My FastAPI Project",
    description="A new project generated by MCP.",
    version="0.1.0",
)

@app.get("/")
async def read_root():
    return {"message": "Hello, World!"}

@app.get("/items/{item_id}")
async def read_item(item_id: int, q: str | None = None):
    return {"item_id": item_id, "q": q}
""",
        "pyproject.toml": """
[project]
name = "PROJECT_NAME_PLACEHOLDER"
version = "0.1.0"
description = "A new FastAPI project."
authors = [{ name = "Your Name", email = "you@example.com" }]
requires-python = ">=3.12"
dependencies = [
    "fastapi",
    "uvicorn[standard]",
]

[project.optional-dependencies]
dev = ["pytest"]
""",
        "README.md": """
# PROJECT_NAME_PLACEHOLDER

A new FastAPI project generated by Documentation Search Enhanced MCP.

## To run:
1. `uv pip sync`
2. `uv run uvicorn main:app --reload`

## To test:
`uv run pytest`
""",
        ".gitignore": """
__pycache__/
*.pyc
.env
.venv/
dist/
build/
*.egg-info
""",
        "tests/test_main.py": """
from fastapi.testclient import TestClient
from main import app

client = TestClient(app)

def test_read_root():
    response = client.get("/")
    assert response.status_code == 200
    assert response.json() == {"message": "Hello, World!"}
""",
    },
    "react-vite": {
        "index.html": """
<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>PROJECT_NAME_PLACEHOLDER</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.jsx"></script>
  </body>
</html>
""",
        "package.json": """
{
  "name": "PROJECT_NAME_PLACEHOLDER",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "vite build",
    "lint": "eslint . --ext js,jsx --report-unused-disable-directives --max-warnings 0",
    "preview": "vite preview"
  },
  "dependencies": {
    "react": "^18.2.0",
    "react-dom": "^18.2.0"
  },
  "devDependencies": {
    "@types/react": "^18.2.15",
    "@types/react-dom": "^18.2.7",
    "@vitejs/plugin-react": "^4.0.3",
    "eslint": "^8.45.0",
    "eslint-plugin-react": "^7.32.2",
    "eslint-plugin-react-hooks": "^4.6.0",
    "eslint-plugin-react-refresh": "^0.4.3",
    "vite": "^4.4.5"
  }
}
""",
        "vite.config.js": """
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'

// https://vitejs.dev/config/
export default defineConfig({
  plugins: [react()],
})
""",
        ".gitignore": """
# Logs
logs
*.log

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Dependency directories
node_modules/
dist/

# IDE files
.idea/
.vscode/

# Environment variables
.env
.env.local
""",
        "src/App.jsx": """
import './App.css'

function App() {
  return (
    <>
      <h1>PROJECT_NAME_PLACEHOLDER</h1>
      <p className="read-the-docs">
        React + Vite project generated by MCP.
      </p>
    </>
  )
}

export default App
""",
        "src/main.jsx": """
import React from 'react'
import ReactDOM from 'react-dom/client'
import App from './App.jsx'
import './index.css'

ReactDOM.createRoot(document.getElementById('root')).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>,
)
""",
        "src/index.css": """
:root {
  font-family: Inter, system-ui, Avenir, Helvetica, Arial, sans-serif;
}
""",
        "src/App.css": """
#root {
  max-width: 1280px;
  margin: 0 auto;
  padding: 2rem;
  text-align: center;
}
""",
    },
}


class ProjectCreationSummary(TypedDict):
    project_name: str
    template_used: str
    project_path: str
    directories_created: List[str]
    files_created: List[str]
    user_summary: NotRequired[str]


def generate_project(
    project_name: str, template_name: str, base_path: str = "."
) -> ProjectCreationSummary:
    """
    Generates a new project from a template.

    Args:
        project_name: The name of the new project (will be created as a directory).
        template_name: The name of the template to use (e.g., 'fastapi').
        base_path: The path where the project directory will be created.

    Returns:
        A dictionary summarizing the created files and directories.
    """
    if template_name not in TEMPLATES:
        raise ValueError(
            f"Template '{template_name}' not found. Available templates: {list(TEMPLATES.keys())}"
        )

    project_path = os.path.join(base_path, project_name)
    if os.path.exists(project_path):
        raise FileExistsError(f"Directory '{project_path}' already exists.")

    os.makedirs(project_path)

    template = TEMPLATES[template_name]
    created_files = []
    created_dirs = {project_path}

    for file_path, content in template.items():
        # Handle nested directories
        full_path = os.path.join(project_path, file_path)
        dir_name = os.path.dirname(full_path)

        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
            created_dirs.add(dir_name)

        # Replace project name placeholder
        formatted_content = content.replace("PROJECT_NAME_PLACEHOLDER", project_name)

        with open(full_path, "w", encoding="utf-8") as f:
            f.write(formatted_content.strip())

        created_files.append(full_path)

    return {
        "project_name": project_name,
        "template_used": template_name,
        "project_path": project_path,
        "directories_created": sorted(created_dirs),
        "files_created": sorted(created_files),
    }
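A minimal usage sketch (not part of the package) for `generate_project` as defined above; the project name `my-api` is hypothetical. The function raises `ValueError` for an unknown template and `FileExistsError` if the target directory already exists:

```python
from documentation_search_enhanced.project_generator import generate_project

# Scaffold a FastAPI project under the current directory ("my-api" is a
# hypothetical name; "fastapi" is one of the two bundled templates).
summary = generate_project("my-api", "fastapi")

print(summary["project_path"])  # ./my-api
for path in summary["files_created"]:
    print(path)  # ./my-api/.gitignore, ./my-api/README.md, ...
```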
@@ -0,0 +1,85 @@ documentation_search_enhanced/project_scanner.py (new file)

#!/usr/bin/env python3
"""
Scans project directories to find and parse dependency files.
"""

import os
import json
import sys
from typing import Dict, Tuple, Optional
import re
import tomllib


def _parse_requirement(req: str) -> Tuple[str, str]:
    """Parses a requirement string (e.g., 'fastapi==0.1.0' or 'django>=3.2')."""
    match = re.match(r"([a-zA-Z0-9\-_]+)\s*([~<>=!]=?)\s*([0-9\.\*a-zA-Z]+)", req)
    if match:
        name, specifier, version = match.groups()
        return name.strip(), f"{specifier}{version}"
    return req.strip(), "latest"


def parse_pyproject_toml(content: str) -> Dict[str, str]:
    """Parses dependencies from pyproject.toml content."""
    data = tomllib.loads(content)
    dependencies = data.get("project", {}).get("dependencies", [])

    parsed_deps = {}
    for req in dependencies:
        name, version = _parse_requirement(req)
        parsed_deps[name] = version

    return parsed_deps


def parse_requirements_txt(content: str) -> Dict[str, str]:
    """Parses dependencies from requirements.txt content."""
    lines = content.splitlines()
    parsed_deps = {}
    for line in lines:
        line = line.strip()
        if line and not line.startswith("#"):
            name, version = _parse_requirement(line)
            parsed_deps[name] = version
    return parsed_deps


def parse_package_json(content: str) -> Dict[str, str]:
    """Parses dependencies from package.json content."""
    data = json.loads(content)
    deps = data.get("dependencies", {})
    dev_deps = data.get("devDependencies", {})
    deps.update(dev_deps)
    return deps


def find_and_parse_dependencies(
    directory: str,
) -> Optional[Tuple[str, str, Dict[str, str]]]:
    """
    Finds and parses the most relevant dependency file in a directory.

    Returns:
        A tuple of (filename, ecosystem, dependencies_dict) or None.
    """
    supported_files = {
        "pyproject.toml": ("PyPI", parse_pyproject_toml),
        "requirements.txt": ("PyPI", parse_requirements_txt),
        "package.json": ("npm", parse_package_json),
    }

    for filename, (ecosystem, parser_func) in supported_files.items():
        file_path = os.path.join(directory, filename)
        if os.path.exists(file_path):
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                dependencies = parser_func(content)
                return filename, ecosystem, dependencies
            except Exception as e:
                print(f"⚠️ Error parsing {filename}: {e}", file=sys.stderr)
                # Continue to the next file type if parsing fails
                continue

    return None
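A minimal usage sketch (not part of the package) for `find_and_parse_dependencies`; the directory path is hypothetical. Manifests are checked in dict order (pyproject.toml, then requirements.txt, then package.json), and unpinned requirements come back with the version `"latest"`:

```python
from documentation_search_enhanced.project_scanner import find_and_parse_dependencies

# Scan a hypothetical project directory for the first recognized manifest.
found = find_and_parse_dependencies("./my-api")
if found is None:
    print("No supported dependency file found.")
else:
    filename, ecosystem, deps = found
    # For the pyproject.toml generated by the FastAPI template this prints:
    # pyproject.toml (PyPI): {'fastapi': 'latest', 'uvicorn[standard]': 'latest'}
    print(f"{filename} ({ecosystem}): {deps}")
```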
@@ -0,0 +1,230 @@ documentation_search_enhanced/reranker.py (new file)

"""Search result reranking using hybrid scoring (vector + keyword + metadata)."""

import logging
import re
from typing import List, Optional

from .vector_search import get_vector_engine
from .smart_search import SearchResult

logger = logging.getLogger(__name__)


class SearchReranker:
    """
    Rerank search results using a hybrid scoring approach:
    - Semantic similarity (vector embeddings): 50% weight
    - Keyword matching relevance: 30% weight
    - Source authority/freshness: 20% weight
    """

    def __init__(
        self,
        semantic_weight: float = 0.5,
        keyword_weight: float = 0.3,
        metadata_weight: float = 0.2,
    ):
        """
        Initialize the reranker with configurable weights.

        Args:
            semantic_weight: Weight for vector similarity score (0-1)
            keyword_weight: Weight for keyword matching score (0-1)
            metadata_weight: Weight for metadata scoring (0-1)
        """
        self.semantic_weight = semantic_weight
        self.keyword_weight = keyword_weight
        self.metadata_weight = metadata_weight

        # Ensure weights sum to 1.0
        total = semantic_weight + keyword_weight + metadata_weight
        if abs(total - 1.0) > 0.01:
            logger.warning(f"Reranker weights sum to {total}, normalizing to 1.0")
            self.semantic_weight /= total
            self.keyword_weight /= total
            self.metadata_weight /= total

        self.vector_engine = get_vector_engine()

    async def rerank(
        self,
        results: List[SearchResult],
        query: str,
        use_semantic: bool = True,
    ) -> List[SearchResult]:
        """
        Rerank search results using hybrid scoring.

        Args:
            results: List of search results to rerank
            query: Original search query
            use_semantic: Whether to use semantic scoring (can be disabled for speed)

        Returns:
            Reranked list of search results
        """
        if not results:
            return results

        logger.debug(f"Reranking {len(results)} results for query: {query[:50]}...")

        # Calculate scores for each result
        scored_results = []
        for result in results:
            score = 0.0

            # 1. Semantic similarity score (if enabled)
            if use_semantic:
                semantic_score = await self._calculate_semantic_score(
                    query, result.snippet + " " + result.title
                )
                score += semantic_score * self.semantic_weight
            else:
                # If semantic disabled, redistribute weight to keyword matching
                score += result.relevance_score * (
                    self.semantic_weight + self.keyword_weight
                )

            # 2. Keyword matching score (use existing relevance_score); when
            # semantic scoring is disabled, this was already included above
            if use_semantic:
                score += result.relevance_score * self.keyword_weight

            # 3. Metadata scoring (authority, content quality indicators)
            metadata_score = self._calculate_metadata_score(result)
            score += metadata_score * self.metadata_weight

            # Store the hybrid score
            result.relevance_score = score
            scored_results.append(result)

        # Sort by hybrid score
        scored_results.sort(key=lambda r: r.relevance_score, reverse=True)

        logger.debug(
            f"Reranked results. Top score: {scored_results[0].relevance_score:.3f}"
        )
        return scored_results

    async def _calculate_semantic_score(self, query: str, document: str) -> float:
        """
        Calculate semantic similarity between query and document.

        Args:
            query: Search query
            document: Document text (title + snippet)

        Returns:
            Similarity score between 0 and 1
        """
        try:
            # Generate embeddings
            query_embedding = self.vector_engine.embed_documents([query])
            doc_embedding = self.vector_engine.embed_documents([document])

            # Calculate cosine similarity
            import numpy as np

            query_norm = query_embedding / np.linalg.norm(query_embedding)
            doc_norm = doc_embedding / np.linalg.norm(doc_embedding)
            similarity = np.dot(query_norm[0], doc_norm[0])

            # Convert to 0-1 range (cosine similarity is -1 to 1)
            score = (similarity + 1) / 2
            return float(score)

        except Exception as e:
            logger.warning(f"Error calculating semantic score: {e}")
            return 0.5  # Neutral score on error

    def _calculate_metadata_score(self, result: SearchResult) -> float:
        """
        Calculate metadata-based score considering:
        - Source authority (official docs > blogs > forums)
        - Content type (tutorials/guides > reference > examples)
        - Code examples presence
        - Estimated quality indicators

        Args:
            result: Search result to score

        Returns:
            Metadata score between 0 and 1
        """
        score = 0.5  # Base score

        # Source authority scoring
        url_lower = result.url.lower()
        if any(
            domain in url_lower
            for domain in [
                "docs.python.org",
                "fastapi.tiangolo.com",
                "reactjs.org",
                "docs.djangoproject.com",
            ]
        ):
            score += 0.3  # Official documentation
        elif any(
            domain in url_lower
            for domain in ["github.com", "readthedocs.io", "readthedocs.org"]
        ):
            score += 0.2  # Authoritative sources
        elif any(
            domain in url_lower
            for domain in ["stackoverflow.com", "medium.com", "dev.to"]
        ):
            score += 0.1  # Community sources

        # Content type scoring
        content_type_scores = {
            "tutorial": 0.2,
            "guide": 0.2,
            "reference": 0.15,
            "example": 0.1,
        }
        score += content_type_scores.get(result.content_type.lower(), 0)

        # Code examples boost
        if result.code_snippets_count > 0:
            score += 0.1

        # URL structure quality (indicates well-organized docs)
        if self._has_good_url_structure(result.url):
            score += 0.05

        # Normalize to 0-1 range
        return min(1.0, max(0.0, score))

    def _has_good_url_structure(self, url: str) -> bool:
        """
        Check if URL has good structure (versioned, organized).

        Args:
            url: URL to check

        Returns:
            True if URL has good structure
        """
        # Check for version in URL
        has_version = bool(re.search(r"/v?\d+\.\d+/|/stable/|/latest/", url))

        # Check for organized path structure
        path_depth = len([p for p in url.split("/") if p]) - 2  # Exclude domain
        has_good_depth = 2 <= path_depth <= 6

        return has_version or has_good_depth


# Global instance
_reranker: Optional[SearchReranker] = None


def get_reranker() -> SearchReranker:
    """Get or create the global reranker instance."""
    global _reranker
    if _reranker is None:
        _reranker = SearchReranker()
    return _reranker
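To make the weighting concrete: with the default weights, a result with semantic similarity 0.8, keyword relevance 0.6, and metadata score 0.8 receives a hybrid score of 0.5*0.8 + 0.3*0.6 + 0.2*0.8 = 0.74; with `use_semantic=False` the semantic weight is folded into the keyword term instead, giving (0.5+0.3)*0.6 + 0.2*0.8 = 0.64. A minimal usage sketch (not part of the package), assuming `results` is a list of `smart_search.SearchResult` objects produced by the package's own search pipeline:

```python
from documentation_search_enhanced.reranker import get_reranker

async def demo(results, query: str) -> None:
    # Only the SearchResult fields the reranker reads matter here:
    # title, url, snippet, relevance_score, content_type, code_snippets_count.
    reranker = get_reranker()  # default weights: 0.5 / 0.3 / 0.2
    reranked = await reranker.rerank(results, query, use_semantic=True)
    for r in reranked[:3]:
        print(f"{r.relevance_score:.3f}  {r.title}")

# Run with: asyncio.run(demo(results, "fastapi middleware"))
```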