holoviz-mcp 0.0.1a0__py3-none-any.whl → 0.0.1a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of holoviz-mcp might be problematic.
- holoviz_mcp/__init__.py +18 -0
- holoviz_mcp/apps/__init__.py +1 -0
- holoviz_mcp/apps/configuration_viewer.py +116 -0
- holoviz_mcp/apps/search.py +314 -0
- holoviz_mcp/config/__init__.py +31 -0
- holoviz_mcp/config/config.yaml +167 -0
- holoviz_mcp/config/loader.py +308 -0
- holoviz_mcp/config/models.py +216 -0
- holoviz_mcp/config/resources/best-practices/hvplot.md +62 -0
- holoviz_mcp/config/resources/best-practices/panel-material-ui.md +318 -0
- holoviz_mcp/config/resources/best-practices/panel.md +294 -0
- holoviz_mcp/config/schema.json +203 -0
- holoviz_mcp/docs_mcp/__init__.py +1 -0
- holoviz_mcp/docs_mcp/data.py +963 -0
- holoviz_mcp/docs_mcp/models.py +21 -0
- holoviz_mcp/docs_mcp/pages_design.md +407 -0
- holoviz_mcp/docs_mcp/server.py +220 -0
- holoviz_mcp/hvplot_mcp/__init__.py +1 -0
- holoviz_mcp/hvplot_mcp/server.py +152 -0
- holoviz_mcp/panel_mcp/__init__.py +17 -0
- holoviz_mcp/panel_mcp/data.py +316 -0
- holoviz_mcp/panel_mcp/models.py +124 -0
- holoviz_mcp/panel_mcp/server.py +650 -0
- holoviz_mcp/py.typed +0 -0
- holoviz_mcp/serve.py +34 -0
- holoviz_mcp/server.py +77 -0
- holoviz_mcp/shared/__init__.py +1 -0
- holoviz_mcp/shared/extract_tools.py +74 -0
- holoviz_mcp-0.0.1a2.dist-info/METADATA +641 -0
- holoviz_mcp-0.0.1a2.dist-info/RECORD +33 -0
- {holoviz_mcp-0.0.1a0.dist-info → holoviz_mcp-0.0.1a2.dist-info}/WHEEL +1 -2
- holoviz_mcp-0.0.1a2.dist-info/entry_points.txt +4 -0
- holoviz_mcp-0.0.1a2.dist-info/licenses/LICENSE.txt +30 -0
- holoviz_mcp-0.0.1a0.dist-info/METADATA +0 -6
- holoviz_mcp-0.0.1a0.dist-info/RECORD +0 -5
- holoviz_mcp-0.0.1a0.dist-info/top_level.txt +0 -1
- main.py +0 -6
holoviz_mcp/docs_mcp/data.py
@@ -0,0 +1,963 @@
"""Data handling for the HoloViz Documentation MCP server."""

import asyncio
import logging
import os
import re
from pathlib import Path
from typing import Any
from typing import Literal
from typing import Optional

import chromadb
import git
from chromadb.api.collection_configuration import CreateCollectionConfiguration
from fastmcp import Context
from nbconvert import MarkdownExporter
from nbformat import read as nbread
from pydantic import HttpUrl

from holoviz_mcp.config.loader import get_config
from holoviz_mcp.config.models import FolderConfig
from holoviz_mcp.config.models import GitRepository
from holoviz_mcp.docs_mcp.models import Document

logger = logging.getLogger(__name__)

# Todo: Describe DocumentApp
# Todo: Avoid overflow-x in SearchApp sidebar
# Todo: Add bokeh documentation to README extra config

_CROMA_CONFIGURATION = CreateCollectionConfiguration(
    hnsw={
        "space": "cosine",
        "ef_construction": 200,
        "ef_search": 200,
    }
)


async def log_info(message: str, ctx: Context | None = None):
    """Log an info message to the context or logger."""
    if ctx:
        await ctx.info(message)
    else:
        logger.info(message)


async def log_warning(message: str, ctx: Context | None = None):
    """Log a warning message to the context or logger."""
    if ctx:
        await ctx.warning(message)
    else:
        logger.warning(message)


async def log_exception(message: str, ctx: Context | None = None):
    """Log an error message to the context or logger."""
    if ctx:
        await ctx.error(message)
    else:
        logger.error(message)
    raise Exception(message)


def get_best_practices(project: str) -> str:
    """Get best practices for using a project with LLMs.

    This function searches for best practices resources in user and default directories,
    with user resources taking precedence over default ones.

    Args:
        project (str): The name of the project to get best practices for.
            Both hyphenated (e.g., "panel-material-ui") and underscored
            (e.g., "panel_material_ui") names are supported.

    Returns
    -------
    str: A string containing the best practices for the project in Markdown format.

    Raises
    ------
    FileNotFoundError: If no best practices file is found for the project.
    """
    config = get_config()

    # Convert underscored names to hyphenated for file lookup
    project_filename = project.replace("_", "-")

    # Search in user directory first, then default directory
    search_paths = [
        config.best_practices_dir("user"),
        config.best_practices_dir("default"),
    ]

    for search_dir in search_paths:
        best_practices_file = search_dir / f"{project_filename}.md"
        if best_practices_file.exists():
            return best_practices_file.read_text(encoding="utf-8")

    # If not found, raise error with helpful message
    available_files = []
    for search_dir in search_paths:
        if search_dir.exists():
            available_files.extend([f.stem for f in search_dir.glob("*.md")])

    available_str = ", ".join(set(available_files)) if available_files else "None"
    raise FileNotFoundError(
        f"Best practices file for project '{project}' not found. " f"Available projects: {available_str}. " f"Searched in: {[str(p) for p in search_paths]}"
    )


def list_best_practices() -> list[str]:
    """List all available best practices projects.

    This function discovers available best practices from both user and default directories,
    with user resources taking precedence over default ones.

    Returns
    -------
    list[str]: A list of project names that have best practices available.
        Names are returned in hyphenated format (e.g., "panel-material-ui").
    """
    config = get_config()

    # Collect available projects from both directories
    available_projects = set()

    search_paths = [
        config.best_practices_dir("user"),
        config.best_practices_dir("default"),
    ]

    for search_dir in search_paths:
        if search_dir.exists():
            for md_file in search_dir.glob("*.md"):
                available_projects.add(md_file.stem)

    return sorted(list(available_projects))


def remove_leading_number_sep_from_path(p: Path) -> Path:
    """Remove a leading number + underscore or hyphen from the last path component."""
    new_name = re.sub(r"^\d+[_-]", "", p.name)
    return p.with_name(new_name)


def convert_path_to_url(path: Path, remove_first_part: bool = True, url_transform: Literal["holoviz", "plotly", "datashader"] = "holoviz") -> str:
    """Convert a relative file path to a URL path.

    Converts file paths to web URLs by replacing file extensions with .html
    and optionally removing the first path component for legacy compatibility.

    Args:
        path: The file path to convert
        remove_first_part: Whether to remove the first path component (legacy compatibility)
        url_transform: How to transform the file path into a URL:

            - "holoviz": Replace file extension with .html (default)
            - "plotly": Replace file extension with / (e.g., filename.md -> filename/)
            - "datashader": Remove leading index and replace file extension with .html (e.g., 01_filename.md -> filename.html)

    Returns
    -------
    URL path with .html extension

    Examples
    --------
    >>> convert_path_to_url(Path("doc/getting_started.md"))
    "getting_started.html"
    >>> convert_path_to_url(Path("examples/reference/Button.ipynb"), False)
    "examples/reference/Button.html"
    >>> convert_path_to_url(Path("/doc/python/3d-axes.md"), False, "plotly")
    "/doc/python/3d-axes/"
    >>> convert_path_to_url(Path("/examples/user_guide/10_Performance.ipynb"), False, "datashader")
    "/examples/user_guide/Performance.html"
    """
    if url_transform in ["holoviz", "datashader"]:
        path = remove_leading_number_sep_from_path(path)

    # Convert path to URL format
    parts = list(path.parts)

    # Only remove first part if requested (for legacy compatibility)
    if remove_first_part and parts:
        parts.pop(0)

    # Reconstruct path and convert to string
    if parts:
        url_path = str(Path(*parts))
    else:
        url_path = ""

    # Replace file extensions with suffix
    if url_path:
        path_obj = Path(url_path)
        if url_transform == "plotly":
            url_path = str(path_obj.with_suffix(suffix="")) + "/"
        else:
            url_path = str(path_obj.with_suffix(suffix=".html"))

    return url_path


class DocumentationIndexer:
    """Handles cloning, processing, and indexing of documentation."""

    def __init__(self, *, data_dir: Optional[Path] = None, repos_dir: Optional[Path] = None, vector_dir: Optional[Path] = None):
        """Initialize the DocumentationIndexer.

        Args:
            data_dir: Directory to store index data. Defaults to user config directory.
            repos_dir: Directory to store cloned repositories. Defaults to HOLOVIZ_MCP_REPOS_DIR.
            vector_dir: Directory to store vector database. Defaults to config.vector_dir
        """
        # Use unified config for default paths
        config = get_config()

        self.data_dir = data_dir or config.user_dir
        self.data_dir.mkdir(parents=True, exist_ok=True)

        # Use configurable repos directory for repository downloads
        self.repos_dir = repos_dir or config.repos_dir
        self.repos_dir.mkdir(parents=True, exist_ok=True)

        # Use configurable directory for vector database path
        vector_db_path = vector_dir or config.server.vector_db_path
        vector_db_path.parent.mkdir(parents=True, exist_ok=True)

        # Disable ChromaDB telemetry based on config
        if not config.server.anonymized_telemetry:
            os.environ["ANONYMIZED_TELEMETRY"] = "False"

        # Initialize ChromaDB
        self.chroma_client = chromadb.PersistentClient(path=str(vector_db_path))
        self.collection = self.chroma_client.get_or_create_collection("holoviz_docs", configuration=_CROMA_CONFIGURATION)

        # Initialize notebook converter
        self.nb_exporter = MarkdownExporter()

        # Load documentation config from the centralized config system
        self.config = get_config().docs

    def is_indexed(self) -> bool:
        """Check if documentation index exists and is valid."""
        try:
            count = self.collection.count()
            return count > 0
        except Exception:
            return False

    async def ensure_indexed(self, ctx: Context | None = None):
        """Ensure documentation is indexed, creating if necessary."""
        if not self.is_indexed():
            await log_info("Documentation index not found. Creating initial index...", ctx)
            await self.index_documentation()

    async def clone_or_update_repo(self, repo_name: str, repo_config: "GitRepository", ctx: Context | None = None) -> Optional[Path]:
        """Clone or update a single repository."""
        repo_path = self.repos_dir / repo_name

        try:
            if repo_path.exists():
                # Update existing repository
                await log_info(f"Updating {repo_name} repository at {repo_path}...", ctx)
                repo = git.Repo(repo_path)
                repo.remotes.origin.pull()
            else:
                # Clone new repository
                await log_info(f"Cloning {repo_name} repository to {repo_path}...", ctx)
                clone_kwargs: dict[str, Any] = {"depth": 1}  # Shallow clone for efficiency

                # Add branch, tag, or commit if specified
                if repo_config.branch:
                    clone_kwargs["branch"] = repo_config.branch
                elif repo_config.tag:
                    clone_kwargs["branch"] = repo_config.tag
                elif repo_config.commit:
                    # For specific commits, we need to clone and then checkout
                    git.Repo.clone_from(str(repo_config.url), repo_path, **clone_kwargs)
                    repo = git.Repo(repo_path)
                    repo.git.checkout(repo_config.commit)
                    return repo_path

                git.Repo.clone_from(str(repo_config.url), repo_path, **clone_kwargs)

            return repo_path
        except Exception as e:
            msg = f"Failed to clone/update {repo_name}: {e}"
            await log_warning(msg, ctx)  # Changed from log_exception to log_warning so it doesn't raise
            return None

    def _is_reference_document(self, file_path: Path, project: str, folder_name: str = "") -> bool:
        """Check if the document is a reference document using configurable patterns.

        Args:
            file_path: Full path to the file
            project: Project name
            folder_name: Name of the folder this file belongs to

        Returns
        -------
        bool: True if this is a reference document
        """
        repo_config = self.config.repositories[project]
        repo_path = self.repos_dir / project

        try:
            relative_path = file_path.relative_to(repo_path)

            # Check against configured reference patterns
            for pattern in repo_config.reference_patterns:
                if relative_path.match(pattern):
                    return True

            # Fallback to simple "reference" in path check
            return "reference" in relative_path.parts
        except (ValueError, KeyError):
            # If we can't determine relative path or no patterns configured, use simple fallback
            return "reference" in file_path.parts

    def _generate_doc_id(self, project: str, path: Path) -> str:
        """Generate a unique document ID from project and path."""
        readable_path = str(path).replace("/", "___").replace(".", "_")
        readable_id = f"{project}___{readable_path}"

        return readable_id

    def _generate_doc_url(self, project: str, path: Path, folder_name: str = "") -> str:
        """Generate documentation URL for a file.

        This method creates the final URL where the documentation can be accessed online.
        It handles folder URL mapping to ensure proper URL structure for different documentation layouts.

        Args:
            project: Name of the project/repository (e.g., "panel", "hvplot")
            path: Relative path to the file within the repository
            folder_name: Name of the folder containing the file (e.g., "examples/reference", "doc")
                Used for URL path mapping when folders have custom URL structures

        Returns
        -------
        Complete URL to the documentation file

        Examples
        --------
        For Panel reference guides:
        - Input: project="panel", path="examples/reference/widgets/Button.ipynb", folder_name="examples/reference"
        - Output: "https://panel.holoviz.org/reference/widgets/Button.html"

        For regular documentation:
        - Input: project="panel", path="doc/getting_started.md", folder_name="doc"
        - Output: "https://panel.holoviz.org/getting_started.html"
        """
        repo_config = self.config.repositories[project]
        base_url = str(repo_config.base_url).rstrip("/")

        # Get the URL path mapping for this folder
        folder_url_path = repo_config.get_folder_url_path(folder_name)

        # If there's a folder URL mapping, we need to adjust the path
        if folder_url_path and folder_name:
            # Remove the folder name from the beginning of the path
            path_str = str(path)

            # Check if path starts with the folder name
            if path_str.startswith(folder_name + "/"):
                # Remove the folder prefix and leading slash
                remaining_path = path_str[len(folder_name) + 1 :]
                adjusted_path = Path(remaining_path) if remaining_path else Path(".")
            elif path_str == folder_name:
                # The path is exactly the folder name
                adjusted_path = Path(".")
            else:
                # Fallback: try to remove folder parts from the beginning
                path_parts = list(path.parts)
                folder_parts = folder_name.split("/")
                for folder_part in folder_parts:
                    if path_parts and path_parts[0] == folder_part:
                        path_parts = path_parts[1:]
                adjusted_path = Path(*path_parts) if path_parts else Path(".")

            # Don't remove first part since we already adjusted the path
            doc_path = convert_path_to_url(adjusted_path, remove_first_part=False, url_transform=repo_config.url_transform)
        else:
            # Convert file path to URL format normally (remove first part for legacy compatibility)
            doc_path = convert_path_to_url(path, remove_first_part=True, url_transform=repo_config.url_transform)

        # Combine base URL, folder URL path, and document path
        if folder_url_path:
            full_url = f"{base_url}{folder_url_path}/{doc_path}"
        else:
            full_url = f"{base_url}/{doc_path}"

        return full_url.replace("//", "/").replace(":/", "://")  # Fix double slashes

    @staticmethod
    def _to_title(fallback_filename: str = "") -> str:
        """Extract title from a filename or return a default title."""
        title = Path(fallback_filename).stem
        if "_" in title and title.split("_")[0].isdigit():
            title = title.split("_", 1)[-1]
        title = title.replace("_", " ").replace("-", " ").title()
        return title

    @classmethod
    def _extract_title_from_markdown(cls, content: str, fallback_filename: str = "") -> str:
        """Extract title from markdown content, with filename fallback."""
        lines = content.split("\n")
        for line in lines:
            line = line.strip()
            if line.startswith("# "):
                # Return just the title text without the "# " prefix
                return line[2:].strip()
            if line.startswith("##"):
                break

        if fallback_filename:
            return cls._to_title(fallback_filename)

        return "No Title"

    @staticmethod
    def _extract_description_from_markdown(content: str, max_length=200) -> str:
        """Extract description from markdown content."""
        content = content.strip()

        # Plotly documents start with --- ... --- section. Skip the section
        if content.startswith("---"):
            content = content.split("---", 2)[-1].strip()

        lines = content.split("\n")
        clean_lines = []
        in_code_block = False

        for line in lines:
            if line.strip().startswith("```"):
                in_code_block = not in_code_block
                continue

            if in_code_block or line.startswith(("#", " ", "\t", "---", "___")):
                continue

            clean_lines.append(line)

        # Join lines and clean up
        clean_content = "\n".join(clean_lines).strip()

        # Remove extra whitespace and limit length
        clean_content = " ".join(clean_content.split())

        if len(clean_content) > max_length:
            clean_content = clean_content[:max_length].rsplit(" ", 1)[0]
            if not clean_content.endswith("."):
                clean_content += " ..."

        return clean_content

    def convert_notebook_to_markdown(self, notebook_path: Path) -> str:
        """Convert a Jupyter notebook to markdown."""
        try:
            with open(notebook_path, "r", encoding="utf-8") as f:
                notebook = nbread(f, as_version=4)

            (body, resources) = self.nb_exporter.from_notebook_node(notebook)
            return body
        except Exception as e:
            logger.error(f"Failed to convert notebook {notebook_path}: {e}")
            return str(e)

    @staticmethod
    def _to_source_url(file_path: Path, repo_config: GitRepository, raw: bool = False) -> str:
        """Generate source URL for a file based on repository configuration."""
        url = str(repo_config.url)
        branch = repo_config.branch or "main"
        if url.startswith("https://github.com") and url.endswith(".git"):
            url = url.replace("https://github.com/", "").replace(".git", "")
            project, repository = url.split("/")
            if raw:
                return f"https://raw.githubusercontent.com/{project}/{repository}/refs/heads/{branch}/{file_path}"

            return f"https://github.com/{project}/{repository}/blob/{branch}/{file_path}"
        if "dev.azure.com" in url:
            organisation = url.split("/")[3].split("@")[0]
            project = url.split("/")[-3]
            repo_name = url.split("/")[-1]
            if raw:
                return f"https://dev.azure.com/{organisation}/{project}/_apis/sourceProviders/TfsGit/filecontents?repository={repo_name}&path=/{file_path}&commitOrBranch={branch}&api-version=7.0"

            return f"https://dev.azure.com/{organisation}/{project}/_git/{repo_name}?path=/{file_path}&version=GB{branch}"

        raise ValueError(f"Unsupported repository URL format: {url}. Please provide a valid GitHub or Azure DevOps URL.")

    def process_file(self, file_path: Path, project: str, repo_config: GitRepository, folder_name: str = "") -> Optional[dict[str, Any]]:
        """Process a file and extract metadata."""
        try:
            if file_path.suffix == ".ipynb":
                content = self.convert_notebook_to_markdown(file_path)
            elif file_path.suffix in [".md", ".rst", ".txt"]:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
            else:
                logger.debug(f"Skipping unsupported file type: {file_path}")
                return None

            title = self._extract_title_from_markdown(content, file_path.name)
            if not title:
                title = file_path.stem.replace("_", " ").title()

            description = self._extract_description_from_markdown(content)

            repo_path = self.repos_dir / project
            relative_path = file_path.relative_to(repo_path)

            doc_id = self._generate_doc_id(project, relative_path)

            is_reference = self._is_reference_document(file_path, project, folder_name)

            source_url = self._to_source_url(relative_path, repo_config)

            return {
                "id": doc_id,
                "title": title,
                "url": self._generate_doc_url(project, relative_path, folder_name),
                "project": project,
                "source_path": str(relative_path),
                "source_path_stem": file_path.stem,
                "source_url": source_url,
                "description": description,
                "content": content,
                "is_reference": is_reference,
            }
        except Exception as e:
            logger.error(f"Failed to process file {file_path}: {e}")
            return None

    async def extract_docs_from_repo(self, repo_path: Path, project: str, ctx: Context | None = None) -> list[dict[str, Any]]:
        """Extract documentation files from a repository."""
        docs = []
        repo_config = self.config.repositories[project]

        # Use the new folder structure with URL path mapping
        if isinstance(repo_config.folders, dict):
            folders = repo_config.folders
        else:
            # Convert list to dict with default FolderConfig
            folders = {name: FolderConfig() for name in repo_config.folders}

        files: set = set()
        await log_info(f"Processing {project} documentation files in {','.join(folders.keys())}", ctx)

        for folder_name in folders.keys():
            docs_folder: Path = repo_path / folder_name
            if docs_folder.exists():
                # Use index patterns from config
                for pattern in self.config.index_patterns:
                    files.update(docs_folder.glob(pattern))

        for file in files:
            if file.exists() and not file.is_dir():
                # Determine which folder this file belongs to
                folder_name = ""
                for fname in folders.keys():
                    folder_path = repo_path / fname
                    try:
                        file.relative_to(folder_path)
                        folder_name = fname
                        break
                    except ValueError:
                        continue

                doc_data = self.process_file(file, project, repo_config, folder_name)
                if doc_data:
                    docs.append(doc_data)

        # Count reference vs regular documents
        reference_count = sum(1 for doc in docs if doc["is_reference"])
        regular_count = len(docs) - reference_count

        await log_info(f" 📄 {project}: {len(docs)} total documents ({regular_count} regular, {reference_count} reference guides)", ctx)
        return docs

    async def index_documentation(self, ctx: Context | None = None):
        """Indexes all documentation."""
        await log_info("Starting documentation indexing...", ctx)
        await log_info(f"📁 Repositories directory: {self.repos_dir}", ctx)
        await log_info(f"💾 Vector database location: {self.data_dir / 'chroma'}", ctx)

        all_docs = []

        # Clone/update repositories and extract documentation
        for repo_name, repo_config in self.config.repositories.items():
            await log_info(f"Processing {repo_name}...", ctx)
            repo_path = await self.clone_or_update_repo(repo_name, repo_config)
            if repo_path:
                docs = await self.extract_docs_from_repo(repo_path, repo_name, ctx)
                all_docs.extend(docs)

        if not all_docs:
            await log_warning("No documentation found to index", ctx)
            return

        # Validate for duplicate IDs and log details
        await self._validate_unique_ids(all_docs)

        # Clear existing collection
        await log_info("Clearing existing index...", ctx)

        # Only delete if collection has data
        try:
            count = self.collection.count()
            if count > 0:
                # Delete all documents by getting all IDs first
                results = self.collection.get()
                if results["ids"]:
                    self.collection.delete(ids=results["ids"])
        except Exception as e:
            logger.warning(f"Failed to clear existing collection: {e}")
            # If clearing fails, recreate the collection
            try:
                self.chroma_client.delete_collection("holoviz_docs")
                self.collection = self.chroma_client.get_or_create_collection("holoviz_docs", configuration=_CROMA_CONFIGURATION)
            except Exception as e2:
                await log_exception(f"Failed to recreate collection: {e2}", ctx)
                raise

        # Add documents to ChromaDB
        await log_info(f"Adding {len(all_docs)} documents to index...", ctx)

        self.collection.add(
            documents=[doc["content"] for doc in all_docs],
            metadatas=[
                {
                    "title": doc["title"],
                    "url": doc["url"],
                    "project": doc["project"],
                    "source_path": doc["source_path"],
                    "source_path_stem": doc["source_path_stem"],
                    "source_url": doc["source_url"],
                    "description": doc["description"],
                    "is_reference": doc["is_reference"],
                }
                for doc in all_docs
            ],
            ids=[doc["id"] for doc in all_docs],
        )

        await log_info(f"✅ Successfully indexed {len(all_docs)} documents", ctx)
        await log_info(f"📊 Vector database stored at: {self.data_dir / 'chroma'}", ctx)
        await log_info(f"🔍 Index contains {self.collection.count()} total documents", ctx)

        # Show detailed summary table
        await self._log_summary_table(ctx)

    async def _validate_unique_ids(self, all_docs: list[dict[str, Any]], ctx: Context | None = None) -> None:
        """Validate that all document IDs are unique and log duplicates."""
        seen_ids: dict = {}
        duplicates = []

        for doc in all_docs:
            doc_id = doc["id"]
            if doc_id in seen_ids:
                duplicates.append(
                    {
                        "id": doc_id,
                        "first_doc": seen_ids[doc_id],
                        "duplicate_doc": {"project": doc["project"], "source_path": doc["source_path"], "title": doc["title"]},
                    }
                )

                await log_warning(f"DUPLICATE ID FOUND: {doc_id}", ctx)
                await log_warning(f" First document: {seen_ids[doc_id]['project']}/{seen_ids[doc_id]['path']} - {seen_ids[doc_id]['title']}", ctx)
                await log_warning(f" Duplicate document: {doc['project']}/{doc['path']} - {doc['title']}", ctx)
            else:
                seen_ids[doc_id] = {"project": doc["project"], "source_path": doc["source_path"], "title": doc["title"]}

        if duplicates:
            error_msg = f"Found {len(duplicates)} duplicate document IDs"
            await log_exception(error_msg, ctx)

            # Log all duplicates for debugging
            for dup in duplicates:
                await log_exception(
                    f"Duplicate ID '{dup['id']}': {dup['first_doc']['project']}/{dup['first_doc']['path']} vs {dup['duplicate_doc']['project']}/{dup['duplicate_doc']['path']}",  # noqa: D401, E501
                    ctx,
                )

            raise ValueError(f"Document ID collision detected. {len(duplicates)} duplicate IDs found. Check logs for details.")

    async def search_get_reference_guide(self, component: str, project: Optional[str] = None, content: bool = True, ctx: Context | None = None) -> list[Document]:
        """Search for reference guides for a specific component."""
        await self.ensure_indexed()

        # Build search strategies
        filters: list[dict[str, Any]] = []
        if project:
            filters.append({"project": str(project)})
        filters.append({"source_path_stem": str(component)})
        filters.append({"is_reference": True})
        where_clause: dict[str, Any] = {"$and": filters} if len(filters) > 1 else filters[0]

        all_results = []

        filename_results = self.collection.query(query_texts=[component], n_results=1000, where=where_clause)
        if filename_results["ids"] and filename_results["ids"][0]:
            for i, _ in enumerate(filename_results["ids"][0]):
                if filename_results["metadatas"] and filename_results["metadatas"][0]:
                    metadata = filename_results["metadatas"][0][i]
                    # Include content if requested
                    content_text = filename_results["documents"][0][i] if (content and filename_results["documents"]) else None

                    # Safe URL construction
                    url_value = metadata.get("url", "https://example.com")
                    if not url_value or url_value == "None" or not isinstance(url_value, str):
                        url_value = "https://example.com"

                    # Give exact filename matches a high relevance score
                    relevance_score = 1.0  # Highest priority for exact filename matches

                    document = Document(
                        title=str(metadata["title"]),
                        url=HttpUrl(url_value),
                        project=str(metadata["project"]),
                        source_path=str(metadata["source_path"]),
                        source_url=HttpUrl(str(metadata.get("source_url", ""))),
                        description=str(metadata["description"]),
                        is_reference=bool(metadata["is_reference"]),
                        content=content_text,
                        relevance_score=relevance_score,
                    )

                    if project and document.project != project:
                        await log_exception(f"Project mismatch for component '{component}': expected '{project}', got '{document.project}'", ctx)
                    elif metadata["source_path_stem"] != component:
                        await log_exception(f"Path stem mismatch for component '{component}': expected '{component}', got '{metadata['source_path_stem']}'", ctx)
                    else:
                        all_results.append(document)
        return all_results

    async def search(self, query: str, project: Optional[str] = None, content: bool = True, max_results: int = 5, ctx: Context | None = None) -> list[Document]:
        """Search the documentation using semantic similarity."""
        await self.ensure_indexed(ctx=ctx)

        # Build where clause for filtering
        where_clause = {"project": str(project)} if project else None

        try:
            # Perform vector similarity search
            results = self.collection.query(query_texts=[query], n_results=max_results, where=where_clause)  # type: ignore[arg-type]

            documents = []
            if results["ids"] and results["ids"][0]:
                for i, _ in enumerate(results["ids"][0]):
                    if results["metadatas"] and results["metadatas"][0]:
                        metadata = results["metadatas"][0][i]

                        # Include content if requested
                        content_text = results["documents"][0][i] if (content and results["documents"]) else None

                        # Safe URL construction
                        url_value = metadata.get("url", "https://example.com")
                        if not url_value or url_value == "None" or not isinstance(url_value, str):
                            url_value = "https://example.com"

                        # Safe relevance score calculation
                        relevance_score = None
                        if (
                            results.get("distances")
                            and isinstance(results["distances"], list)
                            and len(results["distances"]) > 0
                            and isinstance(results["distances"][0], list)
                            and len(results["distances"][0]) > i
                        ):
                            try:
                                relevance_score = (2.0 - float(results["distances"][0][i])) / 2.0
                            except (ValueError, TypeError):
                                relevance_score = None

                        document = Document(
                            title=str(metadata["title"]),
                            url=HttpUrl(url_value),
                            project=str(metadata["project"]),
                            source_path=str(metadata["source_path"]),
                            source_url=HttpUrl(str(metadata.get("source_url", ""))),
                            description=str(metadata["description"]),
                            is_reference=bool(metadata["is_reference"]),
                            content=content_text,
                            relevance_score=relevance_score,
                        )
                        documents.append(document)
            return documents
        except Exception as e:
            raise e

    async def get_document(self, path: str, project: str, ctx: Context | None = None) -> Document:
        """Get a specific document."""
        await self.ensure_indexed(ctx=ctx)

        # Build where clause for filtering
        filters: list[dict[str, str]] = [{"project": str(project)}, {"source_path": str(path)}]
        where_clause: dict[str, Any] = {"$and": filters}

        # Perform vector similarity search
        results = self.collection.query(query_texts=[""], n_results=3, where=where_clause)

        documents = []
        if results["ids"] and results["ids"][0]:
            for i, _ in enumerate(results["ids"][0]):
                if results["metadatas"] and results["metadatas"][0]:
                    metadata = results["metadatas"][0][i]

                    # Include content if requested
                    content_text = results["documents"][0][i] if results["documents"] else None

                    # Safe URL construction
                    url_value = metadata.get("url", "https://example.com")
                    if not url_value or url_value == "None" or not isinstance(url_value, str):
                        url_value = "https://example.com"

                    # Safe relevance score calculation
                    relevance_score = None
                    if (
                        results.get("distances")
                        and isinstance(results["distances"], list)
                        and len(results["distances"]) > 0
                        and isinstance(results["distances"][0], list)
                        and len(results["distances"][0]) > i
                    ):
                        try:
                            relevance_score = 1.0 - float(results["distances"][0][i])
                        except (ValueError, TypeError):
                            relevance_score = None

                    document = Document(
                        title=str(metadata["title"]),
                        url=HttpUrl(url_value),
                        project=str(metadata["project"]),
                        source_path=str(metadata["source_path"]),
                        source_url=HttpUrl(str(metadata.get("source_url", ""))),
                        description=str(metadata["description"]),
                        is_reference=bool(metadata["is_reference"]),
                        content=content_text,
                        relevance_score=relevance_score,
                    )
                    documents.append(document)

        if len(documents) > 1:
            raise ValueError(f"Multiple documents found for path '{path}' in project '{project}'. Please ensure unique paths.")
        elif len(documents) == 0:
            raise ValueError(f"No document found for path '{path}' in project '{project}'.")
        return documents[0]

    async def list_projects(self) -> list[str]:
        """List all available projects with documentation in the index.

        Returns
        -------
        list[str]: A list of project names that have documentation available.
            Names are returned in hyphenated format (e.g., "panel-material-ui").
        """
        await self.ensure_indexed()

        try:
            # Get all documents from the collection to extract unique project names
            results = self.collection.get()

            if not results["metadatas"]:
                return []

            # Extract unique project names
            projects = set()
            for metadata in results["metadatas"]:
                project = metadata.get("project")
                if project:
                    # Convert underscored names to hyphenated format for consistency
                    project_name = str(project).replace("_", "-")
                    projects.add(project_name)

            # Return sorted list
            return sorted(projects)

        except Exception as e:
            logger.error(f"Failed to list projects: {e}")
            return []

    async def _log_summary_table(self, ctx: Context | None = None):
        """Log a summary table showing document counts by repository."""
        try:
            # Get all documents from the collection
            results = self.collection.get()

            if not results["metadatas"]:
                await log_info("No documents found in index", ctx)
                return

            # Count documents by project and type
            project_stats: dict[str, dict[str, int]] = {}
            for metadata in results["metadatas"]:
                project = str(metadata.get("project", "unknown"))
                is_reference = metadata.get("is_reference", False)

                if project not in project_stats:
                    project_stats[project] = {"total": 0, "regular": 0, "reference": 0}

                project_stats[project]["total"] += 1
                if is_reference:
                    project_stats[project]["reference"] += 1
                else:
                    project_stats[project]["regular"] += 1

            # Log summary table
            await log_info("", ctx)
            await log_info("📊 Document Summary by Repository:", ctx)
            await log_info("=" * 60, ctx)
            await log_info(f"{'Repository':<20} {'Total':<8} {'Regular':<8} {'Reference':<10}", ctx)
            await log_info("-" * 60, ctx)

            total_docs = 0
            total_regular = 0
            total_reference = 0

            for project in sorted(project_stats.keys()):
                stats = project_stats[project]
                await log_info(f"{project:<20} {stats['total']:<8} {stats['regular']:<8} {stats['reference']:<10}", ctx)
                total_docs += stats["total"]
                total_regular += stats["regular"]
                total_reference += stats["reference"]

            await log_info("-" * 60, ctx)
            await log_info(f"{'TOTAL':<20} {total_docs:<8} {total_regular:<8} {total_reference:<10}", ctx)
            await log_info("=" * 60, ctx)

        except Exception as e:
            await log_warning(f"Failed to generate summary table: {e}", ctx)

    def run(self):
        """Update the DocumentationIndexer."""
        # Configure logging for the CLI
        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()])

        logger.info("🚀 HoloViz MCP Documentation Indexer")
        logger.info("=" * 50)

        async def run_indexer(indexer=self):
            logger.info(f"📁 Repository directory: {indexer.repos_dir}")
            logger.info(f"💾 Vector database: {indexer.data_dir / 'chroma'}")
            logger.info(f"🔧 Configured repositories: {len(indexer.config.repositories)}")
            logger.info("")

            await indexer.index_documentation()

            # Final summary
            count = indexer.collection.count()
            logger.info("")
            logger.info("=" * 50)
            logger.info("✅ Indexing completed successfully!")
            logger.info(f"📊 Total documents in database: {count}")
            logger.info("=" * 50)

        asyncio.run(run_indexer())


if __name__ == "__main__":
    DocumentationIndexer().run()