holoviz-mcp 0.0.1a0__py3-none-any.whl → 0.0.1a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. holoviz_mcp/__init__.py +18 -0
  2. holoviz_mcp/apps/__init__.py +1 -0
  3. holoviz_mcp/apps/configuration_viewer.py +116 -0
  4. holoviz_mcp/apps/search.py +314 -0
  5. holoviz_mcp/config/__init__.py +31 -0
  6. holoviz_mcp/config/config.yaml +167 -0
  7. holoviz_mcp/config/loader.py +308 -0
  8. holoviz_mcp/config/models.py +216 -0
  9. holoviz_mcp/config/resources/best-practices/hvplot.md +62 -0
  10. holoviz_mcp/config/resources/best-practices/panel-material-ui.md +318 -0
  11. holoviz_mcp/config/resources/best-practices/panel.md +294 -0
  12. holoviz_mcp/config/schema.json +203 -0
  13. holoviz_mcp/docs_mcp/__init__.py +1 -0
  14. holoviz_mcp/docs_mcp/data.py +963 -0
  15. holoviz_mcp/docs_mcp/models.py +21 -0
  16. holoviz_mcp/docs_mcp/pages_design.md +407 -0
  17. holoviz_mcp/docs_mcp/server.py +220 -0
  18. holoviz_mcp/hvplot_mcp/__init__.py +1 -0
  19. holoviz_mcp/hvplot_mcp/server.py +152 -0
  20. holoviz_mcp/panel_mcp/__init__.py +17 -0
  21. holoviz_mcp/panel_mcp/data.py +316 -0
  22. holoviz_mcp/panel_mcp/models.py +124 -0
  23. holoviz_mcp/panel_mcp/server.py +650 -0
  24. holoviz_mcp/py.typed +0 -0
  25. holoviz_mcp/serve.py +34 -0
  26. holoviz_mcp/server.py +77 -0
  27. holoviz_mcp/shared/__init__.py +1 -0
  28. holoviz_mcp/shared/extract_tools.py +74 -0
  29. holoviz_mcp-0.0.1a2.dist-info/METADATA +641 -0
  30. holoviz_mcp-0.0.1a2.dist-info/RECORD +33 -0
  31. {holoviz_mcp-0.0.1a0.dist-info → holoviz_mcp-0.0.1a2.dist-info}/WHEEL +1 -2
  32. holoviz_mcp-0.0.1a2.dist-info/entry_points.txt +4 -0
  33. holoviz_mcp-0.0.1a2.dist-info/licenses/LICENSE.txt +30 -0
  34. holoviz_mcp-0.0.1a0.dist-info/METADATA +0 -6
  35. holoviz_mcp-0.0.1a0.dist-info/RECORD +0 -5
  36. holoviz_mcp-0.0.1a0.dist-info/top_level.txt +0 -1
  37. main.py +0 -6
holoviz_mcp/docs_mcp/data.py
@@ -0,0 +1,963 @@
+ """Data handling for the HoloViz Documentation MCP server."""
+
+ import asyncio
+ import logging
+ import os
+ import re
+ from pathlib import Path
+ from typing import Any
+ from typing import Literal
+ from typing import Optional
+
+ import chromadb
+ import git
+ from chromadb.api.collection_configuration import CreateCollectionConfiguration
+ from fastmcp import Context
+ from nbconvert import MarkdownExporter
+ from nbformat import read as nbread
+ from pydantic import HttpUrl
+
+ from holoviz_mcp.config.loader import get_config
+ from holoviz_mcp.config.models import FolderConfig
+ from holoviz_mcp.config.models import GitRepository
+ from holoviz_mcp.docs_mcp.models import Document
+
+ logger = logging.getLogger(__name__)
+
+ # Todo: Describe DocumentApp
+ # Todo: Avoid overflow-x in SearchApp sidebar
+ # Todo: Add bokeh documentation to README extra config
+
+ _CHROMA_CONFIGURATION = CreateCollectionConfiguration(
+     hnsw={
+         "space": "cosine",
+         "ef_construction": 200,
+         "ef_search": 200,
+     }
+ )
+
+
+ async def log_info(message: str, ctx: Context | None = None):
+     """Log an info message to the context or logger."""
+     if ctx:
+         await ctx.info(message)
+     else:
+         logger.info(message)
+
+
+ async def log_warning(message: str, ctx: Context | None = None):
+     """Log a warning message to the context or logger."""
+     if ctx:
+         await ctx.warning(message)
+     else:
+         logger.warning(message)
+
+
+ async def log_exception(message: str, ctx: Context | None = None):
+     """Log an error message to the context or logger, then raise it as an exception."""
+     if ctx:
+         await ctx.error(message)
+     else:
+         logger.error(message)
+     raise Exception(message)
+
+
+ def get_best_practices(project: str) -> str:
+     """Get best practices for using a project with LLMs.
+
+     This function searches for best practices resources in user and default directories,
+     with user resources taking precedence over default ones.
+
+     Args:
+         project (str): The name of the project to get best practices for.
+             Both hyphenated (e.g., "panel-material-ui") and underscored
+             (e.g., "panel_material_ui") names are supported.
+
+     Returns
+     -------
+     str: A string containing the best practices for the project in Markdown format.
+
+     Raises
+     ------
+     FileNotFoundError: If no best practices file is found for the project.
+     """
+     config = get_config()
+
+     # Convert underscored names to hyphenated for file lookup
+     project_filename = project.replace("_", "-")
+
+     # Search in user directory first, then default directory
+     search_paths = [
+         config.best_practices_dir("user"),
+         config.best_practices_dir("default"),
+     ]
+
+     for search_dir in search_paths:
+         best_practices_file = search_dir / f"{project_filename}.md"
+         if best_practices_file.exists():
+             return best_practices_file.read_text(encoding="utf-8")
+
+     # If not found, raise error with helpful message
+     available_files = []
+     for search_dir in search_paths:
+         if search_dir.exists():
+             available_files.extend([f.stem for f in search_dir.glob("*.md")])
+
+     available_str = ", ".join(set(available_files)) if available_files else "None"
+     raise FileNotFoundError(
+         f"Best practices file for project '{project}' not found. "
+         f"Available projects: {available_str}. "
+         f"Searched in: {[str(p) for p in search_paths]}"
+     )
+
+
+ def list_best_practices() -> list[str]:
+     """List all available best practices projects.
+
+     This function discovers available best practices from both user and default directories,
+     with user resources taking precedence over default ones.
+
+     Returns
+     -------
+     list[str]: A list of project names that have best practices available.
+         Names are returned in hyphenated format (e.g., "panel-material-ui").
+     """
+     config = get_config()
+
+     # Collect available projects from both directories
+     available_projects = set()
+
+     search_paths = [
+         config.best_practices_dir("user"),
+         config.best_practices_dir("default"),
+     ]
+
+     for search_dir in search_paths:
+         if search_dir.exists():
+             for md_file in search_dir.glob("*.md"):
+                 available_projects.add(md_file.stem)
+
+     return sorted(available_projects)
+
+
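A rough usage sketch for the two helpers above (assuming the packaged best-practices files listed in this release: hvplot.md, panel.md, panel-material-ui.md):

    from holoviz_mcp.docs_mcp.data import get_best_practices, list_best_practices

    print(list_best_practices())                    # e.g. ['hvplot', 'panel', 'panel-material-ui']
    text = get_best_practices("panel_material_ui")  # underscores are normalized to hyphens
    print(text[:200])                               # Markdown content of panel-material-ui.md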
+ def remove_leading_number_sep_from_path(p: Path) -> Path:
+     """Remove a leading number + underscore or hyphen from the last path component."""
+     new_name = re.sub(r"^\d+[_-]", "", p.name)
+     return p.with_name(new_name)
+
+
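The leading-ordinal strip only touches the final path component, for example:

    remove_leading_number_sep_from_path(Path("examples/user_guide/10_Performance.ipynb"))
    # -> Path("examples/user_guide/Performance.ipynb")
    remove_leading_number_sep_from_path(Path("doc/getting_started.md"))
    # -> Path("doc/getting_started.md")  (no leading number, so unchanged)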
+ def convert_path_to_url(path: Path, remove_first_part: bool = True, url_transform: Literal["holoviz", "plotly", "datashader"] = "holoviz") -> str:
+     """Convert a relative file path to a URL path.
+
+     Converts file paths to web URLs by replacing file extensions with .html
+     and optionally removing the first path component for legacy compatibility.
+
+     Args:
+         path: The file path to convert
+         remove_first_part: Whether to remove the first path component (legacy compatibility)
+         url_transform: How to transform the file path into a URL:
+
+             - "holoviz": Replace file extension with .html (default)
+             - "plotly": Replace file extension with / (e.g., filename.md -> filename/)
+             - "datashader": Remove leading index and replace file extension with .html (e.g., 01_filename.md -> filename.html)
+
+     Returns
+     -------
+     URL path with the transformed suffix (e.g., .html)
+
+     Examples
+     --------
+     >>> convert_path_to_url(Path("doc/getting_started.md"))
+     "getting_started.html"
+     >>> convert_path_to_url(Path("examples/reference/Button.ipynb"), False)
+     "examples/reference/Button.html"
+     >>> convert_path_to_url(Path("/doc/python/3d-axes.md"), False, "plotly")
+     "/doc/python/3d-axes/"
+     >>> convert_path_to_url(Path("/examples/user_guide/10_Performance.ipynb"), False, "datashader")
+     "/examples/user_guide/Performance.html"
+     """
+     if url_transform in ["holoviz", "datashader"]:
+         path = remove_leading_number_sep_from_path(path)
+
+     # Convert path to URL format
+     parts = list(path.parts)
+
+     # Only remove first part if requested (for legacy compatibility)
+     if remove_first_part and parts:
+         parts.pop(0)
+
+     # Reconstruct path and convert to string
+     if parts:
+         url_path = str(Path(*parts))
+     else:
+         url_path = ""
+
+     # Replace file extensions with suffix
+     if url_path:
+         path_obj = Path(url_path)
+         if url_transform == "plotly":
+             url_path = str(path_obj.with_suffix(suffix="")) + "/"
+         else:
+             url_path = str(path_obj.with_suffix(suffix=".html"))
+
+     return url_path
+
+
+ class DocumentationIndexer:
+     """Handles cloning, processing, and indexing of documentation."""
+
+     def __init__(self, *, data_dir: Optional[Path] = None, repos_dir: Optional[Path] = None, vector_dir: Optional[Path] = None):
+         """Initialize the DocumentationIndexer.
+
+         Args:
+             data_dir: Directory to store index data. Defaults to user config directory.
+             repos_dir: Directory to store cloned repositories. Defaults to HOLOVIZ_MCP_REPOS_DIR.
+             vector_dir: Directory to store the vector database. Defaults to config.server.vector_db_path.
+         """
+         # Use unified config for default paths
+         config = get_config()
+
+         self.data_dir = data_dir or config.user_dir
+         self.data_dir.mkdir(parents=True, exist_ok=True)
+
+         # Use configurable repos directory for repository downloads
+         self.repos_dir = repos_dir or config.repos_dir
+         self.repos_dir.mkdir(parents=True, exist_ok=True)
+
+         # Use configurable directory for vector database path
+         self.vector_db_path = vector_dir or config.server.vector_db_path
+         self.vector_db_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Disable ChromaDB telemetry based on config
+         if not config.server.anonymized_telemetry:
+             os.environ["ANONYMIZED_TELEMETRY"] = "False"
+
+         # Initialize ChromaDB
+         self.chroma_client = chromadb.PersistentClient(path=str(self.vector_db_path))
+         self.collection = self.chroma_client.get_or_create_collection("holoviz_docs", configuration=_CHROMA_CONFIGURATION)
+
+         # Initialize notebook converter
+         self.nb_exporter = MarkdownExporter()
+
+         # Load documentation config from the centralized config system
+         self.config = get_config().docs
+
+     def is_indexed(self) -> bool:
+         """Check if documentation index exists and is valid."""
+         try:
+             count = self.collection.count()
+             return count > 0
+         except Exception:
+             return False
+
+     async def ensure_indexed(self, ctx: Context | None = None):
+         """Ensure documentation is indexed, creating if necessary."""
+         if not self.is_indexed():
+             await log_info("Documentation index not found. Creating initial index...", ctx)
+             await self.index_documentation(ctx)
+
+     async def clone_or_update_repo(self, repo_name: str, repo_config: "GitRepository", ctx: Context | None = None) -> Optional[Path]:
+         """Clone or update a single repository."""
+         repo_path = self.repos_dir / repo_name
+
+         try:
+             if repo_path.exists():
+                 # Update existing repository
+                 await log_info(f"Updating {repo_name} repository at {repo_path}...", ctx)
+                 repo = git.Repo(repo_path)
+                 repo.remotes.origin.pull()
+             else:
+                 # Clone new repository
+                 await log_info(f"Cloning {repo_name} repository to {repo_path}...", ctx)
+                 clone_kwargs: dict[str, Any] = {"depth": 1}  # Shallow clone for efficiency
+
+                 # Add branch, tag, or commit if specified
+                 if repo_config.branch:
+                     clone_kwargs["branch"] = repo_config.branch
+                 elif repo_config.tag:
+                     clone_kwargs["branch"] = repo_config.tag
+                 elif repo_config.commit:
+                     # For specific commits, we need to clone and then checkout
+                     git.Repo.clone_from(str(repo_config.url), repo_path, **clone_kwargs)
+                     repo = git.Repo(repo_path)
+                     repo.git.checkout(repo_config.commit)
+                     return repo_path
+
+                 git.Repo.clone_from(str(repo_config.url), repo_path, **clone_kwargs)
+
+             return repo_path
+         except Exception as e:
+             msg = f"Failed to clone/update {repo_name}: {e}"
+             # Warn instead of raising so a single failed repository does not abort indexing
+             await log_warning(msg, ctx)
+             return None
+
+     def _is_reference_document(self, file_path: Path, project: str, folder_name: str = "") -> bool:
+         """Check if the document is a reference document using configurable patterns.
+
+         Args:
+             file_path: Full path to the file
+             project: Project name
+             folder_name: Name of the folder this file belongs to
+
+         Returns
+         -------
+         bool: True if this is a reference document
+         """
+         repo_config = self.config.repositories[project]
+         repo_path = self.repos_dir / project
+
+         try:
+             relative_path = file_path.relative_to(repo_path)
+
+             # Check against configured reference patterns
+             for pattern in repo_config.reference_patterns:
+                 if relative_path.match(pattern):
+                     return True
+
+             # Fallback to simple "reference" in path check
+             return "reference" in relative_path.parts
+         except (ValueError, KeyError):
+             # If we can't determine relative path or no patterns configured, use simple fallback
+             return "reference" in file_path.parts
+
+     def _generate_doc_id(self, project: str, path: Path) -> str:
+         """Generate a unique document ID from project and path."""
+         readable_path = str(path).replace("/", "___").replace(".", "_")
+         readable_id = f"{project}___{readable_path}"
+
+         return readable_id
+
+     def _generate_doc_url(self, project: str, path: Path, folder_name: str = "") -> str:
+         """Generate documentation URL for a file.
+
+         This method creates the final URL where the documentation can be accessed online.
+         It handles folder URL mapping to ensure proper URL structure for different documentation layouts.
+
+         Args:
+             project: Name of the project/repository (e.g., "panel", "hvplot")
+             path: Relative path to the file within the repository
+             folder_name: Name of the folder containing the file (e.g., "examples/reference", "doc").
+                 Used for URL path mapping when folders have custom URL structures.
+
+         Returns
+         -------
+         Complete URL to the documentation file
+
+         Examples
+         --------
+         For Panel reference guides:
+         - Input: project="panel", path="examples/reference/widgets/Button.ipynb", folder_name="examples/reference"
+         - Output: "https://panel.holoviz.org/reference/widgets/Button.html"
+
+         For regular documentation:
+         - Input: project="panel", path="doc/getting_started.md", folder_name="doc"
+         - Output: "https://panel.holoviz.org/getting_started.html"
+         """
+         repo_config = self.config.repositories[project]
+         base_url = str(repo_config.base_url).rstrip("/")
+
+         # Get the URL path mapping for this folder
+         folder_url_path = repo_config.get_folder_url_path(folder_name)
+
+         # If there's a folder URL mapping, we need to adjust the path
+         if folder_url_path and folder_name:
+             # Remove the folder name from the beginning of the path
+             path_str = str(path)
+
+             # Check if path starts with the folder name
+             if path_str.startswith(folder_name + "/"):
+                 # Remove the folder prefix and leading slash
+                 remaining_path = path_str[len(folder_name) + 1 :]
+                 adjusted_path = Path(remaining_path) if remaining_path else Path(".")
+             elif path_str == folder_name:
+                 # The path is exactly the folder name
+                 adjusted_path = Path(".")
+             else:
+                 # Fallback: try to remove folder parts from the beginning
+                 path_parts = list(path.parts)
+                 folder_parts = folder_name.split("/")
+                 for folder_part in folder_parts:
+                     if path_parts and path_parts[0] == folder_part:
+                         path_parts = path_parts[1:]
+                 adjusted_path = Path(*path_parts) if path_parts else Path(".")
+
+             # Don't remove first part since we already adjusted the path
+             doc_path = convert_path_to_url(adjusted_path, remove_first_part=False, url_transform=repo_config.url_transform)
+         else:
+             # Convert file path to URL format normally (remove first part for legacy compatibility)
+             doc_path = convert_path_to_url(path, remove_first_part=True, url_transform=repo_config.url_transform)
+
+         # Combine base URL, folder URL path, and document path
+         if folder_url_path:
+             full_url = f"{base_url}{folder_url_path}/{doc_path}"
+         else:
+             full_url = f"{base_url}/{doc_path}"
+
+         return full_url.replace("//", "/").replace(":/", "://")  # Fix double slashes
+
+     @staticmethod
+     def _to_title(fallback_filename: str = "") -> str:
+         """Extract title from a filename or return a default title."""
+         title = Path(fallback_filename).stem
+         if "_" in title and title.split("_")[0].isdigit():
+             title = title.split("_", 1)[-1]
+         title = title.replace("_", " ").replace("-", " ").title()
+         return title
+
+     @classmethod
+     def _extract_title_from_markdown(cls, content: str, fallback_filename: str = "") -> str:
+         """Extract title from markdown content, with filename fallback."""
+         lines = content.split("\n")
+         for line in lines:
+             line = line.strip()
+             if line.startswith("# "):
+                 # Return just the title text without the "# " prefix
+                 return line[2:].strip()
+             if line.startswith("##"):
+                 break
+
+         if fallback_filename:
+             return cls._to_title(fallback_filename)
+
+         return "No Title"
+
+     @staticmethod
+     def _extract_description_from_markdown(content: str, max_length: int = 200) -> str:
+         """Extract description from markdown content."""
+         content = content.strip()
+
+         # Plotly documents start with a --- ... --- front matter section. Skip it.
+         if content.startswith("---"):
+             content = content.split("---", 2)[-1].strip()
+
+         lines = content.split("\n")
+         clean_lines = []
+         in_code_block = False
+
+         for line in lines:
+             if line.strip().startswith("```"):
+                 in_code_block = not in_code_block
+                 continue
+
+             if in_code_block or line.startswith(("#", " ", "\t", "---", "___")):
+                 continue
+
+             clean_lines.append(line)
+
+         # Join lines and clean up
+         clean_content = "\n".join(clean_lines).strip()
+
+         # Remove extra whitespace and limit length
+         clean_content = " ".join(clean_content.split())
+
+         if len(clean_content) > max_length:
+             clean_content = clean_content[:max_length].rsplit(" ", 1)[0]
+             if not clean_content.endswith("."):
+                 clean_content += " ..."
+
+         return clean_content
+
+     def convert_notebook_to_markdown(self, notebook_path: Path) -> str:
+         """Convert a Jupyter notebook to markdown."""
+         try:
+             with open(notebook_path, "r", encoding="utf-8") as f:
+                 notebook = nbread(f, as_version=4)
+
+             (body, resources) = self.nb_exporter.from_notebook_node(notebook)
+             return body
+         except Exception as e:
+             logger.error(f"Failed to convert notebook {notebook_path}: {e}")
+             return str(e)
+
+     @staticmethod
+     def _to_source_url(file_path: Path, repo_config: GitRepository, raw: bool = False) -> str:
+         """Generate source URL for a file based on repository configuration."""
+         url = str(repo_config.url)
+         branch = repo_config.branch or "main"
+         if url.startswith("https://github.com") and url.endswith(".git"):
+             url = url.replace("https://github.com/", "").replace(".git", "")
+             project, repository = url.split("/")
+             if raw:
+                 return f"https://raw.githubusercontent.com/{project}/{repository}/refs/heads/{branch}/{file_path}"
+
+             return f"https://github.com/{project}/{repository}/blob/{branch}/{file_path}"
+         if "dev.azure.com" in url:
+             organisation = url.split("/")[3].split("@")[0]
+             project = url.split("/")[-3]
+             repo_name = url.split("/")[-1]
+             if raw:
+                 return f"https://dev.azure.com/{organisation}/{project}/_apis/sourceProviders/TfsGit/filecontents?repository={repo_name}&path=/{file_path}&commitOrBranch={branch}&api-version=7.0"
+
+             return f"https://dev.azure.com/{organisation}/{project}/_git/{repo_name}?path=/{file_path}&version=GB{branch}"
+
+         raise ValueError(f"Unsupported repository URL format: {url}. Please provide a valid GitHub or Azure DevOps URL.")
+
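A sketch of the URL mapping this implements, assuming a GitRepository configured with url="https://github.com/holoviz/panel.git" and branch="main":

    DocumentationIndexer._to_source_url(Path("doc/index.md"), repo_config)
    # -> "https://github.com/holoviz/panel/blob/main/doc/index.md"
    DocumentationIndexer._to_source_url(Path("doc/index.md"), repo_config, raw=True)
    # -> "https://raw.githubusercontent.com/holoviz/panel/refs/heads/main/doc/index.md"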
+     def process_file(self, file_path: Path, project: str, repo_config: GitRepository, folder_name: str = "") -> Optional[dict[str, Any]]:
+         """Process a file and extract metadata."""
+         try:
+             if file_path.suffix == ".ipynb":
+                 content = self.convert_notebook_to_markdown(file_path)
+             elif file_path.suffix in [".md", ".rst", ".txt"]:
+                 with open(file_path, "r", encoding="utf-8") as f:
+                     content = f.read()
+             else:
+                 logger.debug(f"Skipping unsupported file type: {file_path}")
+                 return None
+
+             title = self._extract_title_from_markdown(content, file_path.name)
+             if not title:
+                 title = file_path.stem.replace("_", " ").title()
+
+             description = self._extract_description_from_markdown(content)
+
+             repo_path = self.repos_dir / project
+             relative_path = file_path.relative_to(repo_path)
+
+             doc_id = self._generate_doc_id(project, relative_path)
+
+             is_reference = self._is_reference_document(file_path, project, folder_name)
+
+             source_url = self._to_source_url(relative_path, repo_config)
+
+             return {
+                 "id": doc_id,
+                 "title": title,
+                 "url": self._generate_doc_url(project, relative_path, folder_name),
+                 "project": project,
+                 "source_path": str(relative_path),
+                 "source_path_stem": file_path.stem,
+                 "source_url": source_url,
+                 "description": description,
+                 "content": content,
+                 "is_reference": is_reference,
+             }
+         except Exception as e:
+             logger.error(f"Failed to process file {file_path}: {e}")
+             return None
+
+     async def extract_docs_from_repo(self, repo_path: Path, project: str, ctx: Context | None = None) -> list[dict[str, Any]]:
+         """Extract documentation files from a repository."""
+         docs = []
+         repo_config = self.config.repositories[project]
+
+         # Use the new folder structure with URL path mapping
+         if isinstance(repo_config.folders, dict):
+             folders = repo_config.folders
+         else:
+             # Convert list to dict with default FolderConfig
+             folders = {name: FolderConfig() for name in repo_config.folders}
+
+         files: set = set()
+         await log_info(f"Processing {project} documentation files in {','.join(folders.keys())}", ctx)
+
+         for folder_name in folders.keys():
+             docs_folder: Path = repo_path / folder_name
+             if docs_folder.exists():
+                 # Use index patterns from config
+                 for pattern in self.config.index_patterns:
+                     files.update(docs_folder.glob(pattern))
+
+         for file in files:
+             if file.exists() and not file.is_dir():
+                 # Determine which folder this file belongs to
+                 folder_name = ""
+                 for fname in folders.keys():
+                     folder_path = repo_path / fname
+                     try:
+                         file.relative_to(folder_path)
+                         folder_name = fname
+                         break
+                     except ValueError:
+                         continue
+
+                 doc_data = self.process_file(file, project, repo_config, folder_name)
+                 if doc_data:
+                     docs.append(doc_data)
+
+         # Count reference vs regular documents
+         reference_count = sum(1 for doc in docs if doc["is_reference"])
+         regular_count = len(docs) - reference_count
+
+         await log_info(f" 📄 {project}: {len(docs)} total documents ({regular_count} regular, {reference_count} reference guides)", ctx)
+         return docs
+
+     async def index_documentation(self, ctx: Context | None = None):
+         """Index all documentation."""
+         await log_info("Starting documentation indexing...", ctx)
+         await log_info(f"📁 Repositories directory: {self.repos_dir}", ctx)
+         await log_info(f"💾 Vector database location: {self.vector_db_path}", ctx)
+
+         all_docs = []
+
+         # Clone/update repositories and extract documentation
+         for repo_name, repo_config in self.config.repositories.items():
+             await log_info(f"Processing {repo_name}...", ctx)
+             repo_path = await self.clone_or_update_repo(repo_name, repo_config, ctx)
+             if repo_path:
+                 docs = await self.extract_docs_from_repo(repo_path, repo_name, ctx)
+                 all_docs.extend(docs)
+
+         if not all_docs:
+             await log_warning("No documentation found to index", ctx)
+             return
+
+         # Validate for duplicate IDs and log details
+         await self._validate_unique_ids(all_docs, ctx)
+
+         # Clear existing collection
+         await log_info("Clearing existing index...", ctx)
+
+         # Only delete if collection has data
+         try:
+             count = self.collection.count()
+             if count > 0:
+                 # Delete all documents by getting all IDs first
+                 results = self.collection.get()
+                 if results["ids"]:
+                     self.collection.delete(ids=results["ids"])
+         except Exception as e:
+             logger.warning(f"Failed to clear existing collection: {e}")
+             # If clearing fails, recreate the collection
+             try:
+                 self.chroma_client.delete_collection("holoviz_docs")
+                 self.collection = self.chroma_client.get_or_create_collection("holoviz_docs", configuration=_CHROMA_CONFIGURATION)
+             except Exception as e2:
+                 await log_exception(f"Failed to recreate collection: {e2}", ctx)
+                 raise
+
+         # Add documents to ChromaDB
+         await log_info(f"Adding {len(all_docs)} documents to index...", ctx)
+
+         self.collection.add(
+             documents=[doc["content"] for doc in all_docs],
+             metadatas=[
+                 {
+                     "title": doc["title"],
+                     "url": doc["url"],
+                     "project": doc["project"],
+                     "source_path": doc["source_path"],
+                     "source_path_stem": doc["source_path_stem"],
+                     "source_url": doc["source_url"],
+                     "description": doc["description"],
+                     "is_reference": doc["is_reference"],
+                 }
+                 for doc in all_docs
+             ],
+             ids=[doc["id"] for doc in all_docs],
+         )
+
+         await log_info(f"✅ Successfully indexed {len(all_docs)} documents", ctx)
+         await log_info(f"📊 Vector database stored at: {self.vector_db_path}", ctx)
+         await log_info(f"🔍 Index contains {self.collection.count()} total documents", ctx)
+
+         # Show detailed summary table
+         await self._log_summary_table(ctx)
+
+     async def _validate_unique_ids(self, all_docs: list[dict[str, Any]], ctx: Context | None = None) -> None:
+         """Validate that all document IDs are unique and log duplicates."""
+         seen_ids: dict = {}
+         duplicates = []
+
+         for doc in all_docs:
+             doc_id = doc["id"]
+             if doc_id in seen_ids:
+                 duplicates.append(
+                     {
+                         "id": doc_id,
+                         "first_doc": seen_ids[doc_id],
+                         "duplicate_doc": {"project": doc["project"], "source_path": doc["source_path"], "title": doc["title"]},
+                     }
+                 )
+
+                 await log_warning(f"DUPLICATE ID FOUND: {doc_id}", ctx)
+                 await log_warning(f" First document: {seen_ids[doc_id]['project']}/{seen_ids[doc_id]['source_path']} - {seen_ids[doc_id]['title']}", ctx)
+                 await log_warning(f" Duplicate document: {doc['project']}/{doc['source_path']} - {doc['title']}", ctx)
+             else:
+                 seen_ids[doc_id] = {"project": doc["project"], "source_path": doc["source_path"], "title": doc["title"]}
+
+         if duplicates:
+             error_msg = f"Found {len(duplicates)} duplicate document IDs"
+             # Warn (rather than raise immediately) so every duplicate gets logged before the ValueError below
+             await log_warning(error_msg, ctx)
+
+             # Log all duplicates for debugging
+             for dup in duplicates:
+                 await log_warning(
+                     f"Duplicate ID '{dup['id']}': {dup['first_doc']['project']}/{dup['first_doc']['source_path']} vs {dup['duplicate_doc']['project']}/{dup['duplicate_doc']['source_path']}",  # noqa: E501
+                     ctx,
+                 )
+
+             raise ValueError(f"Document ID collision detected. {len(duplicates)} duplicate IDs found. Check logs for details.")
+
+     async def search_get_reference_guide(self, component: str, project: Optional[str] = None, content: bool = True, ctx: Context | None = None) -> list[Document]:
+         """Search for reference guides for a specific component."""
+         await self.ensure_indexed(ctx=ctx)
+
+         # Build search filters
+         filters: list[dict[str, Any]] = []
+         if project:
+             filters.append({"project": str(project)})
+         filters.append({"source_path_stem": str(component)})
+         filters.append({"is_reference": True})
+         where_clause: dict[str, Any] = {"$and": filters} if len(filters) > 1 else filters[0]
+
+         all_results = []
+
+         filename_results = self.collection.query(query_texts=[component], n_results=1000, where=where_clause)
+         if filename_results["ids"] and filename_results["ids"][0]:
+             for i, _ in enumerate(filename_results["ids"][0]):
+                 if filename_results["metadatas"] and filename_results["metadatas"][0]:
+                     metadata = filename_results["metadatas"][0][i]
+                     # Include content if requested
+                     content_text = filename_results["documents"][0][i] if (content and filename_results["documents"]) else None
+
+                     # Safe URL construction
+                     url_value = metadata.get("url", "https://example.com")
+                     if not url_value or url_value == "None" or not isinstance(url_value, str):
+                         url_value = "https://example.com"
+
+                     # Give exact filename matches a high relevance score
+                     relevance_score = 1.0  # Highest priority for exact filename matches
+
+                     document = Document(
+                         title=str(metadata["title"]),
+                         url=HttpUrl(url_value),
+                         project=str(metadata["project"]),
+                         source_path=str(metadata["source_path"]),
+                         source_url=HttpUrl(str(metadata.get("source_url", ""))),
+                         description=str(metadata["description"]),
+                         is_reference=bool(metadata["is_reference"]),
+                         content=content_text,
+                         relevance_score=relevance_score,
+                     )
+
+                     if project and document.project != project:
+                         await log_exception(f"Project mismatch for component '{component}': expected '{project}', got '{document.project}'", ctx)
+                     elif metadata["source_path_stem"] != component:
+                         await log_exception(f"Path stem mismatch for component '{component}': expected '{component}', got '{metadata['source_path_stem']}'", ctx)
+                     else:
+                         all_results.append(document)
+         return all_results
+
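A minimal sketch of looking up a reference guide by component name (assuming the default configuration indexes the Panel repository):

    import asyncio

    from holoviz_mcp.docs_mcp.data import DocumentationIndexer

    async def main():
        indexer = DocumentationIndexer()
        guides = await indexer.search_get_reference_guide("Button", project="panel")
        for doc in guides:
            print(doc.title, doc.url)

    asyncio.run(main())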
+     async def search(self, query: str, project: Optional[str] = None, content: bool = True, max_results: int = 5, ctx: Context | None = None) -> list[Document]:
+         """Search the documentation using semantic similarity."""
+         await self.ensure_indexed(ctx=ctx)
+
+         # Build where clause for filtering
+         where_clause = {"project": str(project)} if project else None
+
+         # Perform vector similarity search
+         results = self.collection.query(query_texts=[query], n_results=max_results, where=where_clause)  # type: ignore[arg-type]
+
+         documents = []
+         if results["ids"] and results["ids"][0]:
+             for i, _ in enumerate(results["ids"][0]):
+                 if results["metadatas"] and results["metadatas"][0]:
+                     metadata = results["metadatas"][0][i]
+
+                     # Include content if requested
+                     content_text = results["documents"][0][i] if (content and results["documents"]) else None
+
+                     # Safe URL construction
+                     url_value = metadata.get("url", "https://example.com")
+                     if not url_value or url_value == "None" or not isinstance(url_value, str):
+                         url_value = "https://example.com"
+
+                     # Safe relevance score calculation: map cosine distance in [0, 2] to a similarity in [0, 1]
+                     relevance_score = None
+                     if (
+                         results.get("distances")
+                         and isinstance(results["distances"], list)
+                         and len(results["distances"]) > 0
+                         and isinstance(results["distances"][0], list)
+                         and len(results["distances"][0]) > i
+                     ):
+                         try:
+                             relevance_score = (2.0 - float(results["distances"][0][i])) / 2.0
+                         except (ValueError, TypeError):
+                             relevance_score = None
+
+                     document = Document(
+                         title=str(metadata["title"]),
+                         url=HttpUrl(url_value),
+                         project=str(metadata["project"]),
+                         source_path=str(metadata["source_path"]),
+                         source_url=HttpUrl(str(metadata.get("source_url", ""))),
+                         description=str(metadata["description"]),
+                         is_reference=bool(metadata["is_reference"]),
+                         content=content_text,
+                         relevance_score=relevance_score,
+                     )
+                     documents.append(document)
+         return documents
+
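And a sketch of a free-text semantic search against the same index (the query string and project filter are illustrative):

    async def demo_search(indexer: DocumentationIndexer):
        docs = await indexer.search("how to build a dashboard", project="panel", content=False, max_results=3)
        for doc in docs:
            # relevance_score near 1.0 means a close semantic match, near 0.0 an unrelated one
            print(doc.relevance_score, doc.title, doc.url)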
+     async def get_document(self, path: str, project: str, ctx: Context | None = None) -> Document:
+         """Get a specific document."""
+         await self.ensure_indexed(ctx=ctx)
+
+         # Build where clause for filtering
+         filters: list[dict[str, str]] = [{"project": str(project)}, {"source_path": str(path)}]
+         where_clause: dict[str, Any] = {"$and": filters}
+
+         # Query by metadata filters (the query text itself is irrelevant here)
+         results = self.collection.query(query_texts=[""], n_results=3, where=where_clause)
+
+         documents = []
+         if results["ids"] and results["ids"][0]:
+             for i, _ in enumerate(results["ids"][0]):
+                 if results["metadatas"] and results["metadatas"][0]:
+                     metadata = results["metadatas"][0][i]
+
+                     # Always include content
+                     content_text = results["documents"][0][i] if results["documents"] else None
+
+                     # Safe URL construction
+                     url_value = metadata.get("url", "https://example.com")
+                     if not url_value or url_value == "None" or not isinstance(url_value, str):
+                         url_value = "https://example.com"
+
+                     # Safe relevance score calculation
+                     relevance_score = None
+                     if (
+                         results.get("distances")
+                         and isinstance(results["distances"], list)
+                         and len(results["distances"]) > 0
+                         and isinstance(results["distances"][0], list)
+                         and len(results["distances"][0]) > i
+                     ):
+                         try:
+                             relevance_score = 1.0 - float(results["distances"][0][i])
+                         except (ValueError, TypeError):
+                             relevance_score = None
+
+                     document = Document(
+                         title=str(metadata["title"]),
+                         url=HttpUrl(url_value),
+                         project=str(metadata["project"]),
+                         source_path=str(metadata["source_path"]),
+                         source_url=HttpUrl(str(metadata.get("source_url", ""))),
+                         description=str(metadata["description"]),
+                         is_reference=bool(metadata["is_reference"]),
+                         content=content_text,
+                         relevance_score=relevance_score,
+                     )
+                     documents.append(document)
+
+         if len(documents) > 1:
+             raise ValueError(f"Multiple documents found for path '{path}' in project '{project}'. Please ensure unique paths.")
+         elif len(documents) == 0:
+             raise ValueError(f"No document found for path '{path}' in project '{project}'.")
+         return documents[0]
+
+     async def list_projects(self) -> list[str]:
+         """List all available projects with documentation in the index.
+
+         Returns
+         -------
+         list[str]: A list of project names that have documentation available.
+             Names are returned in hyphenated format (e.g., "panel-material-ui").
+         """
+         await self.ensure_indexed()
+
+         try:
+             # Get all documents from the collection to extract unique project names
+             results = self.collection.get()
+
+             if not results["metadatas"]:
+                 return []
+
+             # Extract unique project names
+             projects = set()
+             for metadata in results["metadatas"]:
+                 project = metadata.get("project")
+                 if project:
+                     # Convert underscored names to hyphenated format for consistency
+                     project_name = str(project).replace("_", "-")
+                     projects.add(project_name)
+
+             # Return sorted list
+             return sorted(projects)
+
+         except Exception as e:
+             logger.error(f"Failed to list projects: {e}")
+             return []
+
+     async def _log_summary_table(self, ctx: Context | None = None):
+         """Log a summary table showing document counts by repository."""
+         try:
+             # Get all documents from the collection
+             results = self.collection.get()
+
+             if not results["metadatas"]:
+                 await log_info("No documents found in index", ctx)
+                 return
+
+             # Count documents by project and type
+             project_stats: dict[str, dict[str, int]] = {}
+             for metadata in results["metadatas"]:
+                 project = str(metadata.get("project", "unknown"))
+                 is_reference = metadata.get("is_reference", False)
+
+                 if project not in project_stats:
+                     project_stats[project] = {"total": 0, "regular": 0, "reference": 0}
+
+                 project_stats[project]["total"] += 1
+                 if is_reference:
+                     project_stats[project]["reference"] += 1
+                 else:
+                     project_stats[project]["regular"] += 1
+
+             # Log summary table
+             await log_info("", ctx)
+             await log_info("📊 Document Summary by Repository:", ctx)
+             await log_info("=" * 60, ctx)
+             await log_info(f"{'Repository':<20} {'Total':<8} {'Regular':<8} {'Reference':<10}", ctx)
+             await log_info("-" * 60, ctx)
+
+             total_docs = 0
+             total_regular = 0
+             total_reference = 0
+
+             for project in sorted(project_stats.keys()):
+                 stats = project_stats[project]
+                 await log_info(f"{project:<20} {stats['total']:<8} {stats['regular']:<8} {stats['reference']:<10}", ctx)
+                 total_docs += stats["total"]
+                 total_regular += stats["regular"]
+                 total_reference += stats["reference"]
+
+             await log_info("-" * 60, ctx)
+             await log_info(f"{'TOTAL':<20} {total_docs:<8} {total_regular:<8} {total_reference:<10}", ctx)
+             await log_info("=" * 60, ctx)
+
+         except Exception as e:
+             await log_warning(f"Failed to generate summary table: {e}", ctx)
+
+     def run(self):
+         """Run the indexer from the command line, rebuilding the documentation index."""
+         # Configure logging for the CLI
+         logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()])
+
+         logger.info("🚀 HoloViz MCP Documentation Indexer")
+         logger.info("=" * 50)
+
+         async def run_indexer(indexer=self):
+             logger.info(f"📁 Repository directory: {indexer.repos_dir}")
+             logger.info(f"💾 Vector database: {indexer.vector_db_path}")
+             logger.info(f"🔧 Configured repositories: {len(indexer.config.repositories)}")
+             logger.info("")
+
+             await indexer.index_documentation()
+
+             # Final summary
+             count = indexer.collection.count()
+             logger.info("")
+             logger.info("=" * 50)
+             logger.info("✅ Indexing completed successfully!")
+             logger.info(f"📊 Total documents in database: {count}")
+             logger.info("=" * 50)
+
+         asyncio.run(run_indexer())
+
+
+ if __name__ == "__main__":
+     DocumentationIndexer().run()
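Given the __main__ guard, the index can be rebuilt with python -m holoviz_mcp.docs_mcp.data, or programmatically; a minimal sketch:

    from holoviz_mcp.docs_mcp.data import DocumentationIndexer

    # Clones/updates the configured repositories, rebuilds the ChromaDB index, and logs a summary table.
    DocumentationIndexer().run()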