mcp_vector_search-0.15.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mcp-vector-search might be problematic.

Files changed (86)
  1. mcp_vector_search/__init__.py +10 -0
  2. mcp_vector_search/cli/__init__.py +1 -0
  3. mcp_vector_search/cli/commands/__init__.py +1 -0
  4. mcp_vector_search/cli/commands/auto_index.py +397 -0
  5. mcp_vector_search/cli/commands/chat.py +534 -0
  6. mcp_vector_search/cli/commands/config.py +393 -0
  7. mcp_vector_search/cli/commands/demo.py +358 -0
  8. mcp_vector_search/cli/commands/index.py +762 -0
  9. mcp_vector_search/cli/commands/init.py +658 -0
  10. mcp_vector_search/cli/commands/install.py +869 -0
  11. mcp_vector_search/cli/commands/install_old.py +700 -0
  12. mcp_vector_search/cli/commands/mcp.py +1254 -0
  13. mcp_vector_search/cli/commands/reset.py +393 -0
  14. mcp_vector_search/cli/commands/search.py +796 -0
  15. mcp_vector_search/cli/commands/setup.py +1133 -0
  16. mcp_vector_search/cli/commands/status.py +584 -0
  17. mcp_vector_search/cli/commands/uninstall.py +404 -0
  18. mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
  19. mcp_vector_search/cli/commands/visualize/cli.py +265 -0
  20. mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
  21. mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
  22. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
  23. mcp_vector_search/cli/commands/visualize/graph_builder.py +709 -0
  24. mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
  25. mcp_vector_search/cli/commands/visualize/server.py +201 -0
  26. mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
  27. mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
  28. mcp_vector_search/cli/commands/visualize/templates/base.py +218 -0
  29. mcp_vector_search/cli/commands/visualize/templates/scripts.py +3670 -0
  30. mcp_vector_search/cli/commands/visualize/templates/styles.py +779 -0
  31. mcp_vector_search/cli/commands/visualize.py.original +2536 -0
  32. mcp_vector_search/cli/commands/watch.py +287 -0
  33. mcp_vector_search/cli/didyoumean.py +520 -0
  34. mcp_vector_search/cli/export.py +320 -0
  35. mcp_vector_search/cli/history.py +295 -0
  36. mcp_vector_search/cli/interactive.py +342 -0
  37. mcp_vector_search/cli/main.py +484 -0
  38. mcp_vector_search/cli/output.py +414 -0
  39. mcp_vector_search/cli/suggestions.py +375 -0
  40. mcp_vector_search/config/__init__.py +1 -0
  41. mcp_vector_search/config/constants.py +24 -0
  42. mcp_vector_search/config/defaults.py +200 -0
  43. mcp_vector_search/config/settings.py +146 -0
  44. mcp_vector_search/core/__init__.py +1 -0
  45. mcp_vector_search/core/auto_indexer.py +298 -0
  46. mcp_vector_search/core/config_utils.py +394 -0
  47. mcp_vector_search/core/connection_pool.py +360 -0
  48. mcp_vector_search/core/database.py +1237 -0
  49. mcp_vector_search/core/directory_index.py +318 -0
  50. mcp_vector_search/core/embeddings.py +294 -0
  51. mcp_vector_search/core/exceptions.py +89 -0
  52. mcp_vector_search/core/factory.py +318 -0
  53. mcp_vector_search/core/git_hooks.py +345 -0
  54. mcp_vector_search/core/indexer.py +1002 -0
  55. mcp_vector_search/core/llm_client.py +453 -0
  56. mcp_vector_search/core/models.py +294 -0
  57. mcp_vector_search/core/project.py +350 -0
  58. mcp_vector_search/core/scheduler.py +330 -0
  59. mcp_vector_search/core/search.py +952 -0
  60. mcp_vector_search/core/watcher.py +322 -0
  61. mcp_vector_search/mcp/__init__.py +5 -0
  62. mcp_vector_search/mcp/__main__.py +25 -0
  63. mcp_vector_search/mcp/server.py +752 -0
  64. mcp_vector_search/parsers/__init__.py +8 -0
  65. mcp_vector_search/parsers/base.py +296 -0
  66. mcp_vector_search/parsers/dart.py +605 -0
  67. mcp_vector_search/parsers/html.py +413 -0
  68. mcp_vector_search/parsers/javascript.py +643 -0
  69. mcp_vector_search/parsers/php.py +694 -0
  70. mcp_vector_search/parsers/python.py +502 -0
  71. mcp_vector_search/parsers/registry.py +223 -0
  72. mcp_vector_search/parsers/ruby.py +678 -0
  73. mcp_vector_search/parsers/text.py +186 -0
  74. mcp_vector_search/parsers/utils.py +265 -0
  75. mcp_vector_search/py.typed +1 -0
  76. mcp_vector_search/utils/__init__.py +42 -0
  77. mcp_vector_search/utils/gitignore.py +250 -0
  78. mcp_vector_search/utils/gitignore_updater.py +212 -0
  79. mcp_vector_search/utils/monorepo.py +339 -0
  80. mcp_vector_search/utils/timing.py +338 -0
  81. mcp_vector_search/utils/version.py +47 -0
  82. mcp_vector_search-0.15.7.dist-info/METADATA +884 -0
  83. mcp_vector_search-0.15.7.dist-info/RECORD +86 -0
  84. mcp_vector_search-0.15.7.dist-info/WHEEL +4 -0
  85. mcp_vector_search-0.15.7.dist-info/entry_points.txt +3 -0
  86. mcp_vector_search-0.15.7.dist-info/licenses/LICENSE +21 -0
mcp_vector_search/parsers/html.py
@@ -0,0 +1,413 @@
+"""HTML parser for MCP Vector Search."""
+
+from html.parser import HTMLParser as BaseHTMLParser
+from pathlib import Path
+
+from ..core.models import CodeChunk
+from .base import BaseParser
+
+
+class HTMLContentParser(BaseHTMLParser):
+    """HTML parser for extracting semantic content from HTML documents.
+
+    Extracts meaningful content from semantic HTML tags while ignoring
+    scripts, styles, and other non-content elements.
+    """
+
+    def __init__(self) -> None:
+        """Initialize the HTML content parser."""
+        super().__init__()
+        self.sections: list[dict] = []
+        self.current_section: dict | None = None
+        self.current_tag: str | None = None
+        self.current_attrs: dict = {}
+        self.tag_stack: list[tuple[str, dict]] = []
+        self.ignore_content = False
+        self.line_number = 1
+
+        # Semantic tags that define sections
+        self.section_tags = {
+            "h1",
+            "h2",
+            "h3",
+            "h4",
+            "h5",
+            "h6",
+            "section",
+            "article",
+            "main",
+            "aside",
+            "header",
+            "footer",
+            "nav",
+        }
+        # Tags to ignore completely
+        self.ignore_tags = {"script", "style", "noscript"}
+        # Inline text tags
+        self.inline_tags = {
+            "span",
+            "a",
+            "strong",
+            "em",
+            "b",
+            "i",
+            "code",
+            "pre",
+            "small",
+        }
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """Handle opening HTML tags.
+
+        Args:
+            tag: Tag name
+            attrs: List of (name, value) tuples for tag attributes
+        """
+        tag = tag.lower()
+        attr_dict = {k: v for k, v in attrs if v is not None}
+
+        # Track tag stack
+        self.tag_stack.append((tag, attr_dict))
+
+        # Ignore script/style content
+        if tag in self.ignore_tags:
+            self.ignore_content = True
+            return
+
+        # Start new section for semantic tags
+        if tag in self.section_tags:
+            # Save current section if exists
+            if self.current_section and self.current_section.get("content", "").strip():
+                self._finalize_current_section()
+
+            # Start new section
+            tag_id = attr_dict.get("id", "")
+            tag_class = attr_dict.get("class", "")
+
+            # Create tag identifier
+            tag_info = tag
+            if tag_id:
+                tag_info = f"{tag}#{tag_id}"
+            elif tag_class:
+                tag_info = f"{tag}.{tag_class.split()[0]}"
+
+            self.current_section = {
+                "tag": tag,
+                "tag_info": tag_info,
+                "content": "",
+                "start_line": self.getpos()[0],
+                "attrs": attr_dict,
+            }
+
+        # Handle paragraph tags
+        if tag == "p":
+            if not self.current_section:
+                # Create implicit section for orphan paragraph
+                self.current_section = {
+                    "tag": "p",
+                    "tag_info": "p",
+                    "content": "",
+                    "start_line": self.getpos()[0],
+                    "attrs": {},
+                }
+
+        self.current_tag = tag
+
+    def handle_endtag(self, tag: str) -> None:
+        """Handle closing HTML tags.
+
+        Args:
+            tag: Tag name
+        """
+        tag = tag.lower()
+
+        # Pop from stack
+        if self.tag_stack and self.tag_stack[-1][0] == tag:
+            self.tag_stack.pop()
+
+        # Stop ignoring content after script/style
+        if tag in self.ignore_tags:
+            self.ignore_content = False
+            return
+
+        # Finalize section for semantic tags
+        if tag in self.section_tags or tag == "p":
+            if self.current_section and self.current_section.get("content", "").strip():
+                self.current_section["end_line"] = self.getpos()[0]
+                self._finalize_current_section()
+
+    def handle_data(self, data: str) -> None:
+        """Handle text content between tags.
+
+        Args:
+            data: Text content
+        """
+        if self.ignore_content or not data.strip():
+            return
+
+        # Add content to current section
+        if self.current_section is not None:
+            # Add spacing between inline elements
+            if self.current_section["content"] and not self.current_section[
+                "content"
+            ].endswith(" "):
+                self.current_section["content"] += " "
+            self.current_section["content"] += data.strip()
+
+    def _finalize_current_section(self) -> None:
+        """Finalize and save the current section."""
+        if not self.current_section:
+            return
+
+        content = self.current_section["content"].strip()
+
+        # Only save sections with meaningful content (min 20 chars)
+        if len(content) >= 20:
+            self.sections.append(
+                {
+                    "tag": self.current_section["tag"],
+                    "tag_info": self.current_section["tag_info"],
+                    "content": content,
+                    "start_line": self.current_section["start_line"],
+                    "end_line": self.current_section.get(
+                        "end_line", self.current_section["start_line"]
+                    ),
+                    "attrs": self.current_section["attrs"],
+                }
+            )
+
+        self.current_section = None
+
+    def get_sections(self) -> list[dict]:
+        """Get all extracted sections.
+
+        Returns:
+            List of section dictionaries
+        """
+        # Finalize any remaining section
+        if self.current_section and self.current_section.get("content", "").strip():
+            self._finalize_current_section()
+
+        return self.sections
+
+
+class HTMLParser(BaseParser):
+    """Parser for HTML files (.html, .htm).
+
+    Extracts semantic content from HTML documents by parsing
+    heading hierarchy, sections, articles, and paragraphs.
+    """
+
+    def __init__(self) -> None:
+        """Initialize HTML parser."""
+        super().__init__("html")
+
+    async def parse_file(self, file_path: Path) -> list[CodeChunk]:
+        """Parse an HTML file and extract semantic chunks.
+
+        Args:
+            file_path: Path to the HTML file
+
+        Returns:
+            List of semantic content chunks
+        """
+        try:
+            with open(file_path, encoding="utf-8", errors="ignore") as f:
+                content = f.read()
+            return await self.parse_content(content, file_path)
+        except Exception:
+            # Return empty list if file can't be read
+            return []
+
+    async def parse_content(self, content: str, file_path: Path) -> list[CodeChunk]:
+        """Parse HTML content into semantic chunks.
+
+        Extracts content from semantic HTML tags (headings, sections, articles)
+        while ignoring scripts, styles, and other non-content elements.
+
+        Args:
+            content: HTML content to parse
+            file_path: Path to the source file
+
+        Returns:
+            List of semantic content chunks
+        """
+        if not content.strip():
+            return []
+
+        # Parse HTML content
+        parser = HTMLContentParser()
+        try:
+            parser.feed(content)
+        except Exception:
+            # If parsing fails, fall back to simple text extraction
+            return await self._fallback_parse(content, file_path)
+
+        sections = parser.get_sections()
+
+        if not sections:
+            # No semantic sections found, try fallback
+            return await self._fallback_parse(content, file_path)
+
+        # Convert sections to chunks
+        chunks = []
+
+        # Merge small sections for better semantic coherence
+        merged_sections = self._merge_small_sections(sections)
+
+        for section in merged_sections:
+            chunk_type = self._get_chunk_type(section["tag"])
+
+            # Create descriptive metadata
+            metadata = {
+                "chunk_type": chunk_type,
+                "function_name": section["tag_info"],  # Use tag_info as identifier
+            }
+
+            # Add class name for sections with specific IDs
+            if section["attrs"].get("id"):
+                metadata["class_name"] = section["attrs"]["id"]
+
+            chunk = self._create_chunk(
+                content=section["content"],
+                file_path=file_path,
+                start_line=section["start_line"],
+                end_line=section["end_line"],
+                **metadata,
+            )
+            chunks.append(chunk)
+
+        return chunks
+
+    def _get_chunk_type(self, tag: str) -> str:
+        """Determine chunk type based on HTML tag.
+
+        Args:
+            tag: HTML tag name
+
+        Returns:
+            Chunk type string
+        """
+        if tag in {"h1", "h2", "h3", "h4", "h5", "h6"}:
+            return "heading"
+        elif tag in {"section", "article", "main"}:
+            return "section"
+        elif tag == "p":
+            return "paragraph"
+        else:
+            return "content"
+
+    def _merge_small_sections(
+        self, sections: list[dict], target_size: int = 200, max_size: int = 500
+    ) -> list[dict]:
+        """Merge small sections to create optimal chunk sizes.
+
+        Args:
+            sections: List of section dictionaries
+            target_size: Target minimum size for chunks in characters
+            max_size: Maximum size for chunks in characters
+
+        Returns:
+            List of merged section dictionaries
+        """
+        if not sections:
+            return []
+
+        merged = []
+        current_merge: dict | None = None
+
+        for section in sections:
+            section_len = len(section["content"])
+
+            # Always start new section for h1 tags
+            if section["tag"] == "h1":
+                if current_merge:
+                    merged.append(current_merge)
+                current_merge = section.copy()
+                continue
+
+            if current_merge is None:
+                current_merge = section.copy()
+            elif len(current_merge["content"]) + section_len < max_size:
+                # Merge with current if under max size
+                current_merge["content"] += "\n\n" + section["content"]
+                current_merge["end_line"] = section["end_line"]
+
+                # Update tag_info to reflect merged content
+                if current_merge["tag_info"] != section["tag_info"]:
+                    current_merge["tag_info"] = (
+                        f"{current_merge['tag_info']}+{section['tag_info']}"
+                    )
+            else:
+                # Start new section if max size would be exceeded
+                if len(current_merge["content"]) >= target_size:
+                    merged.append(current_merge)
+                    current_merge = section.copy()
+
+        # Add last section
+        if current_merge and len(current_merge["content"]) >= 20:
+            merged.append(current_merge)
+
+        return merged
+
+    async def _fallback_parse(self, content: str, file_path: Path) -> list[CodeChunk]:
+        """Fallback parsing for malformed HTML.
+
+        Strips HTML tags and creates simple text chunks.
+
+        Args:
+            content: HTML content
+            file_path: Path to source file
+
+        Returns:
+            List of text chunks
+        """
+        # Simple HTML tag removal
+        import re
+
+        # Remove script and style tags with content
+        content = re.sub(
+            r"<script[^>]*>.*?</script>", "", content, flags=re.DOTALL | re.IGNORECASE
+        )
+        content = re.sub(
+            r"<style[^>]*>.*?</style>", "", content, flags=re.DOTALL | re.IGNORECASE
+        )
+
+        # Remove HTML tags
+        content = re.sub(r"<[^>]+>", " ", content)
+
+        # Normalize whitespace
+        content = re.sub(r"\s+", " ", content)
+
+        # Split into chunks
+        lines = content.split(".")
+        chunks = []
+        current_chunk = []
+        start_line = 1
+
+        for i, line in enumerate(lines, 1):
+            current_chunk.append(line)
+            chunk_text = ". ".join(current_chunk)
+
+            if len(chunk_text) >= 200 or i == len(lines):
+                if chunk_text.strip():
+                    chunk = self._create_chunk(
+                        content=chunk_text.strip(),
+                        file_path=file_path,
+                        start_line=start_line,
+                        end_line=i,
+                        chunk_type="text",
+                    )
+                    chunks.append(chunk)
+                current_chunk = []
+                start_line = i + 1
+
+        return chunks
+
+    def get_supported_extensions(self) -> list[str]:
+        """Get list of supported file extensions.
+
+        Returns:
+            List of supported extensions
+        """
+        return [".html", ".htm"]