holoviz-mcp 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. holoviz_mcp/__init__.py +18 -0
  2. holoviz_mcp/apps/__init__.py +1 -0
  3. holoviz_mcp/apps/configuration_viewer.py +116 -0
  4. holoviz_mcp/apps/holoviz_get_best_practices.py +173 -0
  5. holoviz_mcp/apps/holoviz_search.py +319 -0
  6. holoviz_mcp/apps/hvplot_get_docstring.py +255 -0
  7. holoviz_mcp/apps/hvplot_get_signature.py +252 -0
  8. holoviz_mcp/apps/hvplot_list_plot_types.py +83 -0
  9. holoviz_mcp/apps/panel_get_component.py +496 -0
  10. holoviz_mcp/apps/panel_get_component_parameters.py +467 -0
  11. holoviz_mcp/apps/panel_list_components.py +311 -0
  12. holoviz_mcp/apps/panel_list_packages.py +71 -0
  13. holoviz_mcp/apps/panel_search_components.py +312 -0
  14. holoviz_mcp/cli.py +75 -0
  15. holoviz_mcp/client.py +94 -0
  16. holoviz_mcp/config/__init__.py +29 -0
  17. holoviz_mcp/config/config.yaml +178 -0
  18. holoviz_mcp/config/loader.py +316 -0
  19. holoviz_mcp/config/models.py +208 -0
  20. holoviz_mcp/config/resources/best-practices/holoviews.md +423 -0
  21. holoviz_mcp/config/resources/best-practices/hvplot.md +465 -0
  22. holoviz_mcp/config/resources/best-practices/panel-material-ui.md +318 -0
  23. holoviz_mcp/config/resources/best-practices/panel.md +562 -0
  24. holoviz_mcp/config/schema.json +228 -0
  25. holoviz_mcp/holoviz_mcp/__init__.py +1 -0
  26. holoviz_mcp/holoviz_mcp/data.py +970 -0
  27. holoviz_mcp/holoviz_mcp/models.py +21 -0
  28. holoviz_mcp/holoviz_mcp/pages_design.md +407 -0
  29. holoviz_mcp/holoviz_mcp/server.py +220 -0
  30. holoviz_mcp/hvplot_mcp/__init__.py +1 -0
  31. holoviz_mcp/hvplot_mcp/server.py +146 -0
  32. holoviz_mcp/panel_mcp/__init__.py +17 -0
  33. holoviz_mcp/panel_mcp/data.py +319 -0
  34. holoviz_mcp/panel_mcp/models.py +124 -0
  35. holoviz_mcp/panel_mcp/server.py +443 -0
  36. holoviz_mcp/py.typed +0 -0
  37. holoviz_mcp/serve.py +36 -0
  38. holoviz_mcp/server.py +86 -0
  39. holoviz_mcp/shared/__init__.py +1 -0
  40. holoviz_mcp/shared/extract_tools.py +74 -0
  41. holoviz_mcp/thumbnails/configuration_viewer.png +0 -0
  42. holoviz_mcp/thumbnails/holoviz_get_best_practices.png +0 -0
  43. holoviz_mcp/thumbnails/holoviz_search.png +0 -0
  44. holoviz_mcp/thumbnails/hvplot_get_docstring.png +0 -0
  45. holoviz_mcp/thumbnails/hvplot_get_signature.png +0 -0
  46. holoviz_mcp/thumbnails/hvplot_list_plot_types.png +0 -0
  47. holoviz_mcp/thumbnails/panel_get_component.png +0 -0
  48. holoviz_mcp/thumbnails/panel_get_component_parameters.png +0 -0
  49. holoviz_mcp/thumbnails/panel_list_components.png +0 -0
  50. holoviz_mcp/thumbnails/panel_list_packages.png +0 -0
  51. holoviz_mcp/thumbnails/panel_search_components.png +0 -0
  52. holoviz_mcp-0.4.0.dist-info/METADATA +216 -0
  53. holoviz_mcp-0.4.0.dist-info/RECORD +56 -0
  54. holoviz_mcp-0.4.0.dist-info/WHEEL +4 -0
  55. holoviz_mcp-0.4.0.dist-info/entry_points.txt +2 -0
  56. holoviz_mcp-0.4.0.dist-info/licenses/LICENSE.txt +30 -0
@@ -0,0 +1,970 @@
1
+ """Data handling for the HoloViz Documentation MCP server."""
2
+
3
+ import asyncio
4
+ import logging
5
+ import os
6
+ import re
7
+ from pathlib import Path
8
+ from typing import Any
9
+ from typing import Literal
10
+ from typing import Optional
11
+
12
+ import chromadb
13
+ import git
14
+ from chromadb.api.collection_configuration import CreateCollectionConfiguration
15
+ from fastmcp import Context
16
+ from nbconvert import MarkdownExporter
17
+ from nbformat import read as nbread
18
+ from pydantic import HttpUrl
19
+
20
+ from holoviz_mcp.config.loader import get_config
21
+ from holoviz_mcp.config.models import FolderConfig
22
+ from holoviz_mcp.config.models import GitRepository
23
+ from holoviz_mcp.holoviz_mcp.models import Document
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # Todo: Describe DocumentApp
28
+ # Todo: Avoid overflow-x in SearchApp sidebar
29
+ # Todo: Add bokeh documentation to README extra config
30
+
31
# HNSW index settings passed to ChromaDB whenever this module gets or creates
# the "holoviz_docs" collection: cosine distance, with both the construction
# and the search breadth set to 200.
# NOTE(review): the name is spelled "_CROMA_" (not "_CHROMA_"); it is left
# unchanged because other code in this module refers to it by this name.
_CROMA_CONFIGURATION = CreateCollectionConfiguration(
    hnsw={
        "space": "cosine",
        "ef_construction": 200,
        "ef_search": 200,
    }
)
38
+
39
+
40
async def log_info(message: str, ctx: Context | None = None):
    """Report an info message via the MCP context, or the module logger when no context is given."""
    if not ctx:
        logger.info(message)
        return
    await ctx.info(message)
46
+
47
+
48
async def log_warning(message: str, ctx: Context | None = None):
    """Report a warning via the MCP context, or the module logger when no context is given."""
    if not ctx:
        logger.warning(message)
        return
    await ctx.warning(message)
54
+
55
+
56
async def log_exception(message: str, ctx: Context | None = None):
    """Log an error message to the context or logger.

    With a context, the message is reported through ``ctx.error``. Without one,
    it is written to the module logger and then raised as a generic ``Exception``.

    NOTE(review): the diff rendering this file was recovered from drops indentation,
    so the nesting of the final ``raise`` is an assumption. It is placed in the
    ``else`` branch because callers in this module issue their own ``raise`` right
    after calling this function - confirm against the released file.
    """
    if ctx:
        await ctx.error(message)
    else:
        logger.error(message)
        raise Exception(message)
63
+
64
+
65
def get_best_practices(project: str) -> str:
    """Get best practices for using a project with LLMs.

    The user best-practices directory is searched before the default one, so a
    user-provided file shadows the packaged file of the same name.

    Args:
        project (str): The name of the project to get best practices for.
            Both hyphenated (e.g., "panel-material-ui") and underscored
            (e.g., "panel_material_ui") names are supported.

    Returns
    -------
    str: The best practices for the project in Markdown format.

    Raises
    ------
    FileNotFoundError: If no best practices file is found for the project. The
        message lists the available projects (sorted) and the searched directories.
    """
    config = get_config()

    # Resource files are named with hyphens, so normalise underscored names.
    project_filename = project.replace("_", "-")

    # Order matters: user resources take precedence over the defaults.
    search_paths = [
        config.best_practices_dir("user"),
        config.best_practices_dir("default"),
    ]

    for search_dir in search_paths:
        best_practices_file = search_dir / f"{project_filename}.md"
        if best_practices_file.exists():
            return best_practices_file.read_text(encoding="utf-8")

    # Not found: collect what is available so the error is actionable.
    available_projects: set[str] = set()
    for search_dir in search_paths:
        if search_dir.exists():
            available_projects.update(md_file.stem for md_file in search_dir.glob("*.md"))

    # Sorted so the message is deterministic (set iteration order is not).
    available_str = ", ".join(sorted(available_projects)) if available_projects else "None"
    raise FileNotFoundError(
        f"Best practices file for project '{project}' not found. " f"Available projects: {available_str}. " f"Searched in: {[str(p) for p in search_paths]}"
    )
110
+
111
+
112
def list_best_practices() -> list[str]:
    """List all projects that have a best practices file.

    Markdown files in the user and default best-practices directories are
    collected; a name present in both directories is reported once.

    Returns
    -------
    list[str]: Sorted project names, taken from the file stems
        (hyphenated format, e.g., "panel-material-ui").
    """
    config = get_config()
    candidate_dirs = (
        config.best_practices_dir("user"),
        config.best_practices_dir("default"),
    )
    names = {md_file.stem for folder in candidate_dirs if folder.exists() for md_file in folder.glob("*.md")}
    return sorted(names)
139
+
140
+
141
def remove_leading_number_sep_from_path(p: Path) -> Path:
    """Strip a leading "<digits>_" or "<digits>-" prefix from the final path component."""
    filename = p.name
    prefix = re.match(r"^\d+[_-]", filename)
    if prefix:
        filename = filename[prefix.end() :]
    return p.with_name(filename)
145
+
146
+
147
def convert_path_to_url(path: Path, remove_first_part: bool = True, url_transform: Literal["holoviz", "plotly", "datashader"] = "holoviz") -> str:
    """Convert a relative file path to a URL path.

    Args:
        path: The file path to convert.
        remove_first_part: Whether to drop the first path component (legacy compatibility).
        url_transform: How to transform the file path into a URL:

            - "holoviz" (default) and "datashader": remove a leading numeric index from
              the file name and replace the extension with .html
              (e.g., 01_filename.md -> filename.html)
            - "plotly": replace the extension with a trailing slash
              (e.g., filename.md -> filename/); a page named ``index`` maps to its
              directory (e.g., python/index.md -> python/)

    Returns
    -------
    URL path using forward slashes, or an empty string when no path component is left.

    Examples
    --------
    >>> convert_path_to_url(Path("doc/getting_started.md"))
    "getting_started.html"
    >>> convert_path_to_url(Path("examples/reference/Button.ipynb"), False)
    "examples/reference/Button.html"
    >>> convert_path_to_url(Path("/doc/python/3d-axes.md"), False, "plotly")
    "/doc/python/3d-axes/"
    >>> convert_path_to_url(Path("/examples/user_guide/10_Performance.ipynb"), False, "datashader")
    "/examples/user_guide/Performance.html"
    """
    # Numeric ordering prefixes are stripped for these transforms. Paths without a
    # final component (e.g. Path(".")) are left alone, since they cannot be renamed.
    if url_transform in ("holoviz", "datashader") and path.name:
        path = remove_leading_number_sep_from_path(path)

    parts = list(path.parts)

    # Only remove first part if requested (for legacy compatibility)
    if remove_first_part and parts:
        parts.pop(0)
    if not parts:
        return ""

    relative = Path(*parts)
    if url_transform != "plotly":
        # as_posix keeps URL separators as "/" regardless of the host OS.
        return relative.with_suffix(".html").as_posix()

    page = relative.with_suffix("")
    if page.name != "index":
        return f"{page.as_posix()}/"

    # Compare the page *name* rather than a string suffix, so that e.g. "reindex.md"
    # is not mistaken for an index page, and return the directory with one slash.
    parent = page.parent
    if parent == Path("."):
        return "/"
    return parent.as_posix().rstrip("/") + "/"
204
+
205
+
206
+ class DocumentationIndexer:
207
+ """Handles cloning, processing, and indexing of documentation."""
208
+
209
def __init__(self, *, data_dir: Optional[Path] = None, repos_dir: Optional[Path] = None, vector_dir: Optional[Path] = None):
    """Initialize the DocumentationIndexer.

    Creates the working directories, opens the persistent ChromaDB client with the
    "holoviz_docs" collection, and loads the documentation configuration.

    Args:
        data_dir: Directory to store index data. Defaults to ``config.user_dir``.
        repos_dir: Directory to store cloned repositories. Defaults to ``config.repos_dir``.
        vector_dir: Path of the vector database. Defaults to ``config.server.vector_db_path``.
    """
    config = get_config()
    self._holoviz_mcp_config = config

    # Anything not passed explicitly falls back to the unified configuration.
    self.data_dir = data_dir or config.user_dir
    self.data_dir.mkdir(parents=True, exist_ok=True)

    self.repos_dir = repos_dir or config.repos_dir
    self.repos_dir.mkdir(parents=True, exist_ok=True)

    # Only the parent directory is created here; the database path itself is handed to ChromaDB.
    db_location = vector_dir or config.server.vector_db_path
    db_location.parent.mkdir(parents=True, exist_ok=True)

    # Set before the client is created so the telemetry opt-out is already in the environment.
    if not config.server.anonymized_telemetry:
        os.environ["ANONYMIZED_TELEMETRY"] = "False"

    client = chromadb.PersistentClient(path=str(db_location))
    self.chroma_client = client
    self.collection = client.get_or_create_collection("holoviz_docs", configuration=_CROMA_CONFIGURATION)

    # Converter used to turn notebooks into markdown.
    self.nb_exporter = MarkdownExporter()

    # Documentation settings from the centralized config system.
    self.config = get_config().docs
244
+
245
def is_indexed(self) -> bool:
    """Return True when the collection holds at least one document.

    Any error raised while counting is treated as "not indexed".
    """
    try:
        return self.collection.count() > 0
    except Exception:
        return False
252
+
253
async def ensure_indexed(self, ctx: Context | None = None):
    """Ensure documentation is indexed, building the index first if it is missing.

    Args:
        ctx: Optional MCP context used for progress messages.
    """
    if self.is_indexed():
        return

    await log_info("Documentation index not found. Creating initial index...", ctx)
    # Forward ctx so messages from the initial indexing run reach the same context.
    await self.index_documentation(ctx=ctx)
258
+
259
async def clone_or_update_repo(self, repo_name: str, repo_config: "GitRepository", ctx: Context | None = None) -> Optional[Path]:
    """Clone or update a single repository.

    An existing checkout under ``self.repos_dir / repo_name`` is updated with a pull
    from ``origin``. Otherwise the repository is cloned, honouring (in this order of
    precedence) the configured branch, tag, or commit.

    Args:
        repo_name: Name of the repository; also the directory name of the checkout.
        repo_config: Repository settings (url and optional branch/tag/commit).
        ctx: Optional MCP context used for progress messages.

    Returns
    -------
    The checkout path, or None when cloning/updating failed (the failure is
    logged as a warning rather than raised).
    """
    repo_path = self.repos_dir / repo_name

    try:
        if repo_path.exists():
            await log_info(f"Updating {repo_name} repository at {repo_path}...", ctx)
            git.Repo(repo_path).remotes.origin.pull()
            return repo_path

        await log_info(f"Cloning {repo_name} repository to {repo_path}...", ctx)
        url = str(repo_config.url)
        ref = repo_config.branch or repo_config.tag

        if ref:
            # Shallow clone of the requested branch or tag.
            git.Repo.clone_from(url, repo_path, depth=1, branch=ref)
        elif repo_config.commit:
            # A depth-1 clone only contains the tip of the default branch, so an
            # arbitrary commit cannot be checked out from it. Clone with full history.
            repo = git.Repo.clone_from(url, repo_path)
            repo.git.checkout(repo_config.commit)
        else:
            # Shallow clone for efficiency
            git.Repo.clone_from(url, repo_path, depth=1)

        return repo_path
    except Exception as e:
        msg = f"Failed to clone/update {repo_name}: {e}"
        # Deliberately a warning: one failing repository must not abort the caller.
        await log_warning(msg, ctx)
        return None
293
+
294
def _is_reference_document(self, file_path: Path, project: str, folder_name: str = "") -> bool:
    """Check if the document is a reference document using configurable patterns.

    Args:
        file_path: Full path to the file
        project: Project name
        folder_name: Name of the folder this file belongs to (currently unused)

    Returns
    -------
    bool: True if the path relative to the repository matches one of the project's
        ``reference_patterns`` or contains a "reference" component. If the project is
        not configured, or the file is not inside the project's checkout, only the
        "reference" component check on the full path is applied.
    """
    try:
        # The lookup lives inside the try so an unknown project takes the fallback
        # below instead of propagating KeyError.
        repo_config = self.config.repositories[project]
        relative_path = file_path.relative_to(self.repos_dir / project)

        if any(relative_path.match(pattern) for pattern in repo_config.reference_patterns):
            return True

        # Fallback to simple "reference" in path check
        return "reference" in relative_path.parts
    except (ValueError, KeyError):
        return "reference" in file_path.parts
322
+
323
def _generate_doc_id(self, project: str, path: Path) -> str:
    """Build a readable document ID from the project name and the file path.

    Every "/" in the path becomes "___" and every "." becomes "_"; the result is
    prefixed with the project name and "___".
    """
    flattened = str(path).translate(str.maketrans({"/": "___", ".": "_"}))
    return "___".join((project, flattened))
329
+
330
def _generate_doc_url(self, project: str, path: Path, folder_name: str = "") -> str:
    """Generate the online documentation URL for a file.

    The project's base URL is combined with the folder's URL path mapping (if any)
    and the file path converted with ``convert_path_to_url``.

    Args:
        project: Name of the project/repository (e.g., "panel", "hvplot")
        path: Relative path to the file within the repository
        folder_name: Name of the folder containing the file (e.g., "examples/reference", "doc").
            Used to look up the folder's URL path mapping.

    Returns
    -------
    Complete URL to the documentation file

    Examples
    --------
    For Panel reference guides:
    - Input: project="panel", path="examples/reference/widgets/Button.ipynb", folder_name="examples/reference"
    - Output: "https://panel.holoviz.org/reference/widgets/Button.html"

    For regular documentation:
    - Input: project="panel", path="doc/getting_started.md", folder_name="doc"
    - Output: "https://panel.holoviz.org/getting_started.html"
    """
    repo_config = self.config.repositories[project]
    site_root = str(repo_config.base_url).rstrip("/")
    url_prefix = repo_config.get_folder_url_path(folder_name)

    if not (url_prefix and folder_name):
        # No folder mapping applies: convert normally, dropping the first path component.
        doc_path = convert_path_to_url(path, remove_first_part=True, url_transform=repo_config.url_transform)
    else:
        # The mapping replaces the folder, so the folder prefix is removed from the path first.
        as_text = str(path)
        if as_text.startswith(folder_name + "/"):
            tail = as_text[len(folder_name) + 1 :]
            trimmed = Path(tail) if tail else Path(".")
        elif as_text == folder_name:
            trimmed = Path(".")
        else:
            # Fallback: peel off leading components that match the folder's segments.
            remaining = list(path.parts)
            for segment in folder_name.split("/"):
                if remaining and remaining[0] == segment:
                    remaining = remaining[1:]
            trimmed = Path(*remaining) if remaining else Path(".")

        # The path is already adjusted, so nothing more is removed here.
        doc_path = convert_path_to_url(trimmed, remove_first_part=False, url_transform=repo_config.url_transform)

    joined = f"{site_root}{url_prefix}/{doc_path}" if url_prefix else f"{site_root}/{doc_path}"

    # Collapse doubled slashes, then restore the "//" after the URL scheme.
    return joined.replace("//", "/").replace(":/", "://")
397
+
398
@staticmethod
def _to_title(fallback_filename: str = "") -> str:
    """Derive a title from a filename.

    The extension is dropped, a leading "<digits>_" prefix is removed, underscores
    and hyphens become spaces, and the result is title-cased.
    """
    stem = Path(fallback_filename).stem
    head, separator, tail = stem.partition("_")
    if separator and head.isdigit():
        stem = tail
    return stem.replace("_", " ").replace("-", " ").title()
406
+
407
@classmethod
def _extract_title_from_markdown(cls, content: str, fallback_filename: str = "") -> str:
    """Extract the title from markdown content, with filename fallback.

    The text of the first "# " heading is returned, provided it appears before any
    line starting with "##". Otherwise the title is derived from
    ``fallback_filename``, or "No Title" is returned when no filename is given.
    """
    for raw_line in content.split("\n"):
        candidate = raw_line.strip()
        if candidate.startswith("# "):
            # Drop the "# " marker and surrounding whitespace.
            return candidate[2:].strip()
        if candidate.startswith("##"):
            # A sub-heading came first; stop looking for a top-level title.
            break

    return cls._to_title(fallback_filename) if fallback_filename else "No Title"
423
+
424
@staticmethod
def _extract_description_from_markdown(content: str, max_length=200) -> str:
    """Extract a short plain-text description from markdown content.

    A leading "--- ... ---" section is skipped, as are fenced code blocks, headings,
    indented lines and horizontal rules. The remaining text is collapsed to single
    spaces and, when longer than ``max_length``, cut at a word boundary with " ..."
    appended unless the cut text already ends with a period.
    """
    text = content.strip()

    # Plotly documents start with a "--- ... ---" section; keep only what follows it.
    if text.startswith("---"):
        text = text.split("---", 2)[-1].strip()

    skipped_prefixes = ("#", " ", "\t", "---", "___")
    kept = []
    inside_fence = False
    for line in text.split("\n"):
        if line.strip().startswith("```"):
            # Fence markers toggle code-block state and are never kept themselves.
            inside_fence = not inside_fence
        elif not inside_fence and not line.startswith(skipped_prefixes):
            kept.append(line)

    # split()/join collapses all runs of whitespace, including the newlines.
    summary = " ".join("\n".join(kept).split())
    if len(summary) <= max_length:
        return summary

    summary = summary[:max_length].rsplit(" ", 1)[0]
    return summary if summary.endswith(".") else summary + " ..."
459
+
460
def convert_notebook_to_markdown(self, notebook_path: Path) -> str:
    """Convert a Jupyter notebook to markdown.

    Returns
    -------
    The markdown body. If reading or converting fails, the error is logged and
    the error message text is returned instead (no exception is raised).
    """
    try:
        with open(notebook_path, "r", encoding="utf-8") as handle:
            notebook = nbread(handle, as_version=4)
        markdown, _resources = self.nb_exporter.from_notebook_node(notebook)
    except Exception as e:
        logger.error(f"Failed to convert notebook {notebook_path}: {e}")
        return str(e)
    return markdown
471
+
472
@staticmethod
def _to_source_url(file_path: Path, repo_config: GitRepository, raw: bool = False) -> str:
    """Generate the source URL for a file based on the repository configuration.

    Args:
        file_path: Path of the file relative to the repository root.
        repo_config: Repository settings; ``url`` and ``branch`` are read
            (the branch defaults to "main").
        raw: Return the raw-content URL instead of the browsable page.

    Returns
    -------
    URL of the file on GitHub or Azure DevOps.

    Raises
    ------
    ValueError: If the repository URL is neither a "https://github.com/....git"
        URL nor an Azure DevOps ("dev.azure.com") URL.
    """
    url = str(repo_config.url)
    branch = repo_config.branch or "main"
    # URLs always use "/" separators, independent of the host OS.
    rel_path = Path(file_path).as_posix()

    if url.startswith("https://github.com") and url.endswith(".git"):
        # Strip only the prefix and the trailing ".git"; a blanket replace would also
        # hit names that merely contain ".git", such as "org.github.io".
        slug = url.removeprefix("https://github.com/").removesuffix(".git")
        project, repository = slug.split("/")
        if raw:
            return f"https://raw.githubusercontent.com/{project}/{repository}/refs/heads/{branch}/{rel_path}"

        return f"https://github.com/{project}/{repository}/blob/{branch}/{rel_path}"

    if "dev.azure.com" in url:
        segments = url.split("/")
        organisation = segments[3].split("@")[0]
        project = segments[-3]
        repo_name = segments[-1]
        if raw:
            return f"https://dev.azure.com/{organisation}/{project}/_apis/sourceProviders/TfsGit/filecontents?repository={repo_name}&path=/{rel_path}&commitOrBranch={branch}&api-version=7.0"

        return f"https://dev.azure.com/{organisation}/{project}/_git/{repo_name}?path=/{rel_path}&version=GB{branch}"

    raise ValueError(f"Unsupported repository URL format: {url}. Please provide a valid GitHub or Azure DevOps URL.")
494
+
495
def process_file(self, file_path: Path, project: str, repo_config: GitRepository, folder_name: str = "") -> Optional[dict[str, Any]]:
    """Process a file and extract its content and metadata.

    Notebooks (.ipynb) are converted to markdown; .md, .rst and .txt files are read
    as UTF-8 text. Other file types are skipped.

    Args:
        file_path: Full path to the file inside the project's checkout.
        project: Project name.
        repo_config: Repository settings used to build the source URL.
        folder_name: Configured folder the file belongs to.

    Returns
    -------
    A dict with the document's id, title, url, project, source path/stem/url,
    description, content and reference flag, or None when the file type is
    unsupported or processing failed (the failure is logged).
    """
    try:
        suffix = file_path.suffix
        if suffix == ".ipynb":
            content = self.convert_notebook_to_markdown(file_path)
        elif suffix in [".md", ".rst", ".txt"]:
            content = file_path.read_text(encoding="utf-8")
        else:
            logger.debug(f"Skipping unsupported file type: {file_path}")
            return None

        title = self._extract_title_from_markdown(content, file_path.name) or file_path.stem.replace("_", " ").title()
        description = self._extract_description_from_markdown(content)

        # All identifiers and URLs are derived from the path relative to the checkout.
        relative_path = file_path.relative_to(self.repos_dir / project)
        doc_id = self._generate_doc_id(project, relative_path)
        is_reference = self._is_reference_document(file_path, project, folder_name)
        source_url = self._to_source_url(relative_path, repo_config)
        doc_url = self._generate_doc_url(project, relative_path, folder_name)

        return {
            "id": doc_id,
            "title": title,
            "url": doc_url,
            "project": project,
            "source_path": str(relative_path),
            "source_path_stem": file_path.stem,
            "source_url": source_url,
            "description": description,
            "content": content,
            "is_reference": is_reference,
        }
    except Exception as e:
        logger.error(f"Failed to process file {file_path}: {e}")
        return None
537
+
538
async def extract_docs_from_repo(self, repo_path: Path, project: str, ctx: Context | None = None) -> list[dict[str, Any]]:
    """Extract documentation files from a repository.

    Files matching the configured index patterns are collected from every configured
    folder that exists in the checkout, and each one is processed into a document dict.

    Args:
        repo_path: Path of the repository checkout.
        project: Project name; used to look up the repository configuration.
        ctx: Optional MCP context used for progress messages.

    Returns
    -------
    The document dicts of all files that were processed successfully.
    """
    repo_config = self.config.repositories[project]

    # Folders may be configured as a mapping or as a plain list of names;
    # a list is expanded to default folder settings.
    configured = repo_config.folders
    folders = configured if isinstance(configured, dict) else {name: FolderConfig() for name in configured}

    await log_info(f"Processing {project} documentation files in {','.join(folders.keys())}", ctx)

    # A set, so a file matched by several patterns is only processed once.
    candidates: set = set()
    for folder_name in folders:
        root: Path = repo_path / folder_name
        if not root.exists():
            continue
        for pattern in self.config.index_patterns:
            candidates.update(root.glob(pattern))

    docs = []
    for file in candidates:
        if not file.exists() or file.is_dir():
            continue

        # The first configured folder containing the file wins; "" if none does.
        owner = next((name for name in folders if file.is_relative_to(repo_path / name)), "")

        doc_data = self.process_file(file, project, repo_config, owner)
        if doc_data:
            docs.append(doc_data)

    reference_count = sum(1 for doc in docs if doc["is_reference"])
    regular_count = len(docs) - reference_count

    await log_info(f" 📄 {project}: {len(docs)} total documents ({regular_count} regular, {reference_count} reference guides)", ctx)
    return docs
583
+
584
async def index_documentation(self, ctx: Context | None = None):
    """Index all documentation.

    Clones or updates every configured repository, extracts its documents, validates
    that document IDs are unique, replaces the contents of the collection with the new
    documents, and logs a summary.

    Args:
        ctx: Optional MCP context; forwarded to every step so all progress and
            warning messages reach the same context.

    Raises
    ------
    Exception: If duplicate IDs are found, or if the collection can neither be
        cleared nor recreated.
    """
    await log_info("Starting documentation indexing...", ctx)

    all_docs = []

    # Clone/update repositories and extract documentation
    for repo_name, repo_config in self.config.repositories.items():
        await log_info(f"Processing {repo_name}...", ctx)
        repo_path = await self.clone_or_update_repo(repo_name, repo_config, ctx)
        if repo_path:
            docs = await self.extract_docs_from_repo(repo_path, repo_name, ctx)
            all_docs.extend(docs)

    if not all_docs:
        await log_warning("No documentation found to index", ctx)
        return

    # Validate before touching the existing index, so a collision leaves it intact.
    await self._validate_unique_ids(all_docs, ctx)

    await log_info("Clearing existing index...", ctx)

    try:
        # Only delete if the collection has data
        if self.collection.count() > 0:
            # Documents are deleted by ID, so all IDs are fetched first.
            results = self.collection.get()
            if results["ids"]:
                self.collection.delete(ids=results["ids"])
    except Exception as e:
        logger.warning(f"Failed to clear existing collection: {e}")
        # If clearing fails, recreate the collection
        try:
            self.chroma_client.delete_collection("holoviz_docs")
            self.collection = self.chroma_client.get_or_create_collection("holoviz_docs", configuration=_CROMA_CONFIGURATION)
        except Exception as e2:
            await log_exception(f"Failed to recreate collection: {e2}", ctx)
            raise

    await log_info(f"Adding {len(all_docs)} documents to index...", ctx)

    # "content" is stored as the document text and "id" as the ID; every other field goes into the metadata.
    metadata_keys = ("title", "url", "project", "source_path", "source_path_stem", "source_url", "description", "is_reference")
    self.collection.add(
        documents=[doc["content"] for doc in all_docs],
        metadatas=[{key: doc[key] for key in metadata_keys} for doc in all_docs],
        ids=[doc["id"] for doc in all_docs],
    )

    await log_info(f"✅ Successfully indexed {len(all_docs)} documents", ctx)
    await log_info(f"📊 Vector database stored at: {self.data_dir / 'chroma'}", ctx)
    await log_info(f"🔍 Index contains {self.collection.count()} total documents", ctx)

    # Show detailed summary table
    await self._log_summary_table(ctx)
653
+
654
async def _validate_unique_ids(self, all_docs: list[dict[str, Any]], ctx: Context | None = None) -> None:
    """Validate that all document IDs are unique and log duplicates.

    Args:
        all_docs: Document dicts; the "id", "project", "source_path" and "title"
            keys are read.
        ctx: Optional MCP context used for the log messages.

    Raises
    ------
    ValueError: If at least one document ID occurs more than once.
        NOTE(review): ``log_exception`` may itself raise before this point,
        depending on whether a context is given - confirm against that helper.
    """
    seen_ids: dict = {}
    duplicates = []

    for doc in all_docs:
        doc_id = doc["id"]
        summary = {"project": doc["project"], "source_path": doc["source_path"], "title": doc["title"]}

        first = seen_ids.get(doc_id)
        if first is None:
            seen_ids[doc_id] = summary
            continue

        duplicates.append({"id": doc_id, "first_doc": first, "duplicate_doc": summary})

        # The summaries store the path under "source_path"; reading "path" would
        # raise KeyError and hide the actual collision.
        await log_warning(f"DUPLICATE ID FOUND: {doc_id}", ctx)
        await log_warning(f" First document: {first['project']}/{first['source_path']} - {first['title']}", ctx)
        await log_warning(f" Duplicate document: {doc['project']}/{doc['source_path']} - {doc['title']}", ctx)

    if duplicates:
        error_msg = f"Found {len(duplicates)} duplicate document IDs"
        await log_exception(error_msg, ctx)

        # Log all duplicates for debugging
        for dup in duplicates:
            await log_exception(
                f"Duplicate ID '{dup['id']}': {dup['first_doc']['project']}/{dup['first_doc']['source_path']} vs {dup['duplicate_doc']['project']}/{dup['duplicate_doc']['source_path']}",  # noqa: D401, E501
                ctx,
            )

        raise ValueError(f"Document ID collision detected. {len(duplicates)} duplicate IDs found. Check logs for details.")
688
+
689
async def search_get_reference_guide(self, component: str, project: Optional[str] = None, content: bool = True, ctx: Context | None = None) -> list[Document]:
    """Search for reference guides for a specific component.

    Only documents flagged as reference documents whose source file stem equals
    ``component`` are returned, optionally restricted to one project.

    Args:
        component: Component name; matched against the source file stem.
        project: Optional project name to restrict the search to.
        content: Whether to include the document content in the results.
        ctx: Optional MCP context used for log messages, including those of the
            initial indexing when no index exists yet.

    Returns
    -------
    The matching documents, each with a relevance score of 1.0.
    """
    # Forward ctx so that an initial indexing run reports to the same context.
    await self.ensure_indexed(ctx=ctx)

    # Build the metadata filter
    filters: list[dict[str, Any]] = []
    if project:
        filters.append({"project": str(project)})
    filters.append({"source_path_stem": str(component)})
    filters.append({"is_reference": True})
    where_clause: dict[str, Any] = {"$and": filters} if len(filters) > 1 else filters[0]

    all_results = []

    filename_results = self.collection.query(query_texts=[component], n_results=1000, where=where_clause)
    if filename_results["ids"] and filename_results["ids"][0]:
        for i, _ in enumerate(filename_results["ids"][0]):
            if filename_results["metadatas"] and filename_results["metadatas"][0]:
                metadata = filename_results["metadatas"][0][i]
                # Include content if requested
                content_text = filename_results["documents"][0][i] if (content and filename_results["documents"]) else None

                # Fall back to a placeholder when no usable URL is stored
                url_value = metadata.get("url", "https://example.com")
                if not url_value or url_value == "None" or not isinstance(url_value, str):
                    url_value = "https://example.com"

                # Exact filename matches get the highest relevance score
                relevance_score = 1.0

                document = Document(
                    title=str(metadata["title"]),
                    url=HttpUrl(url_value),
                    project=str(metadata["project"]),
                    source_path=str(metadata["source_path"]),
                    source_url=HttpUrl(str(metadata.get("source_url", ""))),
                    description=str(metadata["description"]),
                    is_reference=bool(metadata["is_reference"]),
                    content=content_text,
                    relevance_score=relevance_score,
                )

                # Re-check the filter conditions on the returned metadata; mismatches
                # are reported and the document is not returned.
                if project and document.project != project:
                    await log_exception(f"Project mismatch for component '{component}': expected '{project}', got '{document.project}'", ctx)
                elif metadata["source_path_stem"] != component:
                    await log_exception(f"Path stem mismatch for component '{component}': expected '{component}', got '{metadata['source_path_stem']}'", ctx)
                else:
                    all_results.append(document)
    return all_results
738
+
739
async def search(self, query: str, project: Optional[str] = None, content: bool = True, max_results: int = 5, ctx: Context | None = None) -> list[Document]:
    """Search the documentation using semantic similarity.

    Parameters
    ----------
    query : str
        Free-text query; sent to the collection as the single query text.
    project : str, optional
        When given, only entries whose ``project`` metadata equals this value are queried.
    content : bool
        When true, each returned document carries the stored document text;
        otherwise its ``content`` is ``None``.
    max_results : int
        Forwarded to the collection query as ``n_results``.
    ctx : Context, optional
        Forwarded to ``ensure_indexed``.

    Returns
    -------
    list[Document]
        One ``Document`` per hit, in the order the collection returned them.
        ``relevance_score`` is ``(2 - distance) / 2`` when a distance is
        available for the hit and ``None`` otherwise.

    Notes
    -----
    Errors raised by the collection query or by ``Document`` construction
    propagate to the caller unchanged.
    """
    await self.ensure_indexed(ctx=ctx)

    # Restrict the query to one project only when a project was requested.
    where_clause = {"project": str(project)} if project else None

    results = self.collection.query(query_texts=[query], n_results=max_results, where=where_clause)  # type: ignore[arg-type]

    # The query was issued with a single text, so only the first row of each result field is used.
    ids = results["ids"][0] if results["ids"] else []
    metadatas = results["metadatas"][0] if (ids and results["metadatas"]) else []
    if not ids or not metadatas:
        return []

    # Stored texts are only attached when the caller asked for content.
    texts = results["documents"][0] if (content and results["documents"]) else None

    # Distances are optional; fall back to an empty row so every hit gets relevance_score=None.
    distances = results.get("distances")
    distance_row = distances[0] if (isinstance(distances, list) and distances and isinstance(distances[0], list)) else []

    documents = []
    for i, metadata in enumerate(metadatas):
        content_text = texts[i] if texts is not None else None

        # Replace a missing, empty, "None" or non-string URL with a placeholder.
        url_value = metadata.get("url", "https://example.com")
        if not url_value or url_value == "None" or not isinstance(url_value, str):
            url_value = "https://example.com"

        relevance_score = None
        if i < len(distance_row):
            try:
                relevance_score = (2.0 - float(distance_row[i])) / 2.0
            except (ValueError, TypeError):
                relevance_score = None

        # NOTE(review): a missing "source_url" becomes HttpUrl(""), which presumably
        # fails validation - confirm whether a placeholder is wanted here as for "url".
        documents.append(
            Document(
                title=str(metadata["title"]),
                url=HttpUrl(url_value),
                project=str(metadata["project"]),
                source_path=str(metadata["source_path"]),
                source_url=HttpUrl(str(metadata.get("source_url", ""))),
                description=str(metadata["description"]),
                is_reference=bool(metadata["is_reference"]),
                content=content_text,
                relevance_score=relevance_score,
            )
        )
    return documents
793
+
794
async def get_document(self, path: str, project: str, ctx: Context | None = None) -> Document:
    """Return the single indexed document stored under ``path`` in ``project``.

    Parameters
    ----------
    path : str
        Value matched against the ``source_path`` metadata field.
    project : str
        Value matched against the ``project`` metadata field.
    ctx : Context, optional
        Forwarded to ``ensure_indexed``.

    Returns
    -------
    Document
        The matching document, always including its stored text when the
        collection returned documents.

    Raises
    ------
    ValueError
        If no entry, or more than one entry, matches ``path`` and ``project``.
    """
    await self.ensure_indexed(ctx=ctx)

    # Both metadata fields must match.
    filters: list[dict[str, str]] = [{"project": str(project)}, {"source_path": str(path)}]
    where_clause: dict[str, Any] = {"$and": filters}

    # The query text is empty: selection is driven by the metadata filter.
    # Up to three hits are requested so that duplicates can be detected below.
    results = self.collection.query(query_texts=[""], n_results=3, where=where_clause)

    matches = []
    has_hits = bool(results["ids"] and results["ids"][0])
    if has_hits and results["metadatas"] and results["metadatas"][0]:
        metadata_row = results["metadatas"][0]
        text_row = results["documents"][0] if results["documents"] else None

        # Distances are optional; an empty row leaves every relevance_score as None.
        raw_distances = results.get("distances")
        if raw_distances and isinstance(raw_distances, list) and len(raw_distances) > 0 and isinstance(raw_distances[0], list):
            distance_row = raw_distances[0]
        else:
            distance_row = []

        for index, _hit_id in enumerate(results["ids"][0]):
            metadata = metadata_row[index]
            content_text = text_row[index] if text_row is not None else None

            # Replace a missing, empty, "None" or non-string URL with a placeholder.
            url_value = metadata.get("url", "https://example.com")
            if not url_value or url_value == "None" or not isinstance(url_value, str):
                url_value = "https://example.com"

            # NOTE(review): this uses 1 - distance, whereas search() uses (2 - distance) / 2 - confirm intent.
            relevance_score = None
            if len(distance_row) > index:
                try:
                    relevance_score = 1.0 - float(distance_row[index])
                except (ValueError, TypeError):
                    relevance_score = None

            matches.append(
                Document(
                    title=str(metadata["title"]),
                    url=HttpUrl(url_value),
                    project=str(metadata["project"]),
                    source_path=str(metadata["source_path"]),
                    source_url=HttpUrl(str(metadata.get("source_url", ""))),
                    description=str(metadata["description"]),
                    is_reference=bool(metadata["is_reference"]),
                    content=content_text,
                    relevance_score=relevance_score,
                )
            )

    if len(matches) > 1:
        raise ValueError(f"Multiple documents found for path '{path}' in project '{project}'. Please ensure unique paths.")
    if not matches:
        raise ValueError(f"No document found for path '{path}' in project '{project}'.")
    return matches[0]
851
+
852
async def list_projects(self) -> list[str]:
    """List the projects that have documentation in the index.

    Returns
    -------
    list[str]
        Sorted, de-duplicated project names taken from the ``project``
        metadata field, with underscores replaced by hyphens
        (e.g., "panel-material-ui"). An empty list is returned when the
        collection holds no metadata or when reading it fails; the failure
        is logged.
    """
    await self.ensure_indexed()

    try:
        # Read every entry of the collection; only the metadata is used.
        metadatas = self.collection.get()["metadatas"]
        if not metadatas:
            return []

        # Entries without a truthy "project" value are skipped.
        raw_names = (entry.get("project") for entry in metadatas)
        hyphenated = {str(name).replace("_", "-") for name in raw_names if name}
        return sorted(hyphenated)

    except Exception as e:
        logger.error(f"Failed to list projects: {e}")
        return []
884
+
885
async def _log_summary_table(self, ctx: Context | None = None):
    """Log a table of document counts per project.

    For every distinct ``project`` metadata value (``"unknown"`` when absent)
    the table shows the total number of entries and how many have a truthy
    or falsy ``is_reference`` flag, followed by a grand-total row. Any
    failure is reported through ``log_warning`` instead of being raised.

    Parameters
    ----------
    ctx : Context, optional
        Forwarded to every ``log_info`` / ``log_warning`` call.
    """
    try:
        metadatas = self.collection.get()["metadatas"]
        if not metadatas:
            await log_info("No documents found in index", ctx)
            return

        # Tally entries per project, split by the is_reference flag.
        per_project: dict[str, dict[str, int]] = {}
        for entry in metadatas:
            name = str(entry.get("project", "unknown"))
            counts = per_project.setdefault(name, {"total": 0, "regular": 0, "reference": 0})
            counts["total"] += 1
            bucket = "reference" if entry.get("is_reference", False) else "regular"
            counts[bucket] += 1

        heavy_rule = "=" * 60
        light_rule = "-" * 60

        # Table header
        await log_info("", ctx)
        await log_info("📊 Document Summary by Repository:", ctx)
        await log_info(heavy_rule, ctx)
        await log_info(f"{'Repository':<20} {'Total':<8} {'Regular':<8} {'Reference':<10}", ctx)
        await log_info(light_rule, ctx)

        # One row per project, alphabetically
        for name in sorted(per_project):
            counts = per_project[name]
            await log_info(f"{name:<20} {counts['total']:<8} {counts['regular']:<8} {counts['reference']:<10}", ctx)

        # Grand totals across all projects
        all_counts = list(per_project.values())
        grand_total = sum(counts["total"] for counts in all_counts)
        grand_regular = sum(counts["regular"] for counts in all_counts)
        grand_reference = sum(counts["reference"] for counts in all_counts)

        await log_info(light_rule, ctx)
        await log_info(f"{'TOTAL':<20} {grand_total:<8} {grand_regular:<8} {grand_reference:<10}", ctx)
        await log_info(heavy_rule, ctx)

    except Exception as e:
        await log_warning(f"Failed to generate summary table: {e}", ctx)
934
+
935
def run(self):
    """Index the documentation from the command line and log progress.

    Configures root logging at INFO level with a stream handler, logs the
    configuration file paths, repository directory, vector database
    location and number of configured repositories, awaits
    ``index_documentation`` and finally logs the number of entries in the
    collection.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()])

    separator = "=" * 50
    logger.info("🚀 HoloViz MCP Documentation Indexer")
    logger.info(separator)

    async def _index_and_report():
        # Describe the environment before the indexing starts.
        logger.info(f"📦 Default config: {self._holoviz_mcp_config.config_file_path(location='default')}")
        logger.info(f"🏠 User config: {self._holoviz_mcp_config.config_file_path(location='user')}")
        logger.info(f"📁 Repository directory: {self.repos_dir}")
        logger.info(f"💾 Vector database: {self.data_dir / 'chroma'}")
        logger.info(f"🔧 Configured repositories: {len(self.config.repositories)}")
        logger.info("")

        await self.index_documentation()

        # Closing summary with the size of the collection.
        total = self.collection.count()
        logger.info("")
        logger.info(separator)
        logger.info("✅ Indexing completed successfully!")
        logger.info(f"📊 Total documents in database: {total}")
        logger.info(separator)

    asyncio.run(_index_and_report())
962
+
963
+
964
def main():
    """Create a ``DocumentationIndexer`` with default arguments and run it."""
    indexer = DocumentationIndexer()
    indexer.run()


# Run the indexer when this module is executed as a script.
if __name__ == "__main__":
    main()