mcp-vector-search 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcp-vector-search might be problematic. Click here for more details.
- mcp_vector_search/__init__.py +9 -0
- mcp_vector_search/cli/__init__.py +1 -0
- mcp_vector_search/cli/commands/__init__.py +1 -0
- mcp_vector_search/cli/commands/config.py +303 -0
- mcp_vector_search/cli/commands/index.py +304 -0
- mcp_vector_search/cli/commands/init.py +212 -0
- mcp_vector_search/cli/commands/search.py +395 -0
- mcp_vector_search/cli/commands/status.py +340 -0
- mcp_vector_search/cli/commands/watch.py +288 -0
- mcp_vector_search/cli/main.py +117 -0
- mcp_vector_search/cli/output.py +242 -0
- mcp_vector_search/config/__init__.py +1 -0
- mcp_vector_search/config/defaults.py +175 -0
- mcp_vector_search/config/settings.py +108 -0
- mcp_vector_search/core/__init__.py +1 -0
- mcp_vector_search/core/database.py +431 -0
- mcp_vector_search/core/embeddings.py +250 -0
- mcp_vector_search/core/exceptions.py +66 -0
- mcp_vector_search/core/indexer.py +310 -0
- mcp_vector_search/core/models.py +174 -0
- mcp_vector_search/core/project.py +304 -0
- mcp_vector_search/core/search.py +324 -0
- mcp_vector_search/core/watcher.py +320 -0
- mcp_vector_search/mcp/__init__.py +1 -0
- mcp_vector_search/parsers/__init__.py +1 -0
- mcp_vector_search/parsers/base.py +180 -0
- mcp_vector_search/parsers/javascript.py +238 -0
- mcp_vector_search/parsers/python.py +407 -0
- mcp_vector_search/parsers/registry.py +187 -0
- mcp_vector_search/py.typed +1 -0
- mcp_vector_search-0.0.3.dist-info/METADATA +333 -0
- mcp_vector_search-0.0.3.dist-info/RECORD +35 -0
- mcp_vector_search-0.0.3.dist-info/WHEEL +4 -0
- mcp_vector_search-0.0.3.dist-info/entry_points.txt +2 -0
- mcp_vector_search-0.0.3.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""JavaScript/TypeScript parser for MCP Vector Search."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
from loguru import logger
|
|
8
|
+
|
|
9
|
+
from ..core.models import CodeChunk
|
|
10
|
+
from .base import BaseParser
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class JavaScriptParser(BaseParser):
    """JavaScript/TypeScript parser with fallback regex-based parsing.

    This is a heuristic parser, not a real grammar: declarations are
    located with regular expressions and block extents are recovered by
    brace counting, so it tolerates files a strict parser would reject.
    """

    # Control-flow / reserved words that the bare "name() {" method pattern
    # would otherwise misreport as function definitions (e.g. "if (x) {").
    _RESERVED_NAMES = frozenset(
        {"if", "else", "for", "while", "switch", "catch", "do", "with", "return", "try", "finally"}
    )

    def __init__(self, language: str = "javascript") -> None:
        """Initialize JavaScript parser.

        Args:
            language: "javascript" (default) or "typescript". TypeScript
                mode additionally extracts interfaces and reports the
                .ts/.tsx extensions.
        """
        super().__init__(language)

    async def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a JavaScript/TypeScript file and extract code chunks.

        Returns an empty list (after logging the error) when the file
        cannot be read or decoded as UTF-8.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            return await self.parse_content(content, file_path)
        except Exception as e:
            logger.error(f"Failed to read file {file_path}: {e}")
            return []

    async def parse_content(self, content: str, file_path: Path) -> List[CodeChunk]:
        """Parse JavaScript/TypeScript content and extract code chunks."""
        if not content.strip():
            return []

        return await self._regex_parse(content, file_path)

    async def _regex_parse(self, content: str, file_path: Path) -> List[CodeChunk]:
        """Parse JavaScript/TypeScript using regex patterns.

        Extracts functions, classes and (for TypeScript) interfaces; when
        nothing is recognized, the whole file becomes one "module" chunk.
        """
        chunks: List[CodeChunk] = []
        lines = self._split_into_lines(content)

        # JavaScript/TypeScript declaration patterns
        function_patterns = [
            re.compile(r"^\s*function\s+(\w+)\s*\(", re.MULTILINE),  # function name()
            re.compile(r"^\s*const\s+(\w+)\s*=\s*\([^)]*\)\s*=>\s*{", re.MULTILINE),  # const name = () => {
            re.compile(r"^\s*const\s+(\w+)\s*=\s*function\s*\(", re.MULTILINE),  # const name = function(
            re.compile(r"^\s*(\w+)\s*:\s*function\s*\(", re.MULTILINE),  # name: function(
            re.compile(r"^\s*(\w+)\s*\([^)]*\)\s*{", re.MULTILINE),  # name() { (method)
            re.compile(r"^\s*async\s+function\s+(\w+)\s*\(", re.MULTILINE),  # async function name()
            re.compile(r"^\s*async\s+(\w+)\s*\([^)]*\)\s*{", re.MULTILINE),  # async name() {
        ]

        class_patterns = [
            re.compile(r"^\s*class\s+(\w+)", re.MULTILINE),  # class Name
            re.compile(r"^\s*export\s+class\s+(\w+)", re.MULTILINE),  # export class Name
            re.compile(r"^\s*export\s+default\s+class\s+(\w+)", re.MULTILINE),  # export default class Name
        ]

        interface_patterns = [
            re.compile(r"^\s*interface\s+(\w+)", re.MULTILINE),  # interface Name (TypeScript)
            re.compile(r"^\s*export\s+interface\s+(\w+)", re.MULTILINE),  # export interface Name
        ]

        import_pattern = re.compile(r"^\s*(import|export).*", re.MULTILINE)

        # Extract import/export lines; they are attached to every chunk below.
        imports = [match.group(0).strip() for match in import_pattern.finditer(content)]

        # Guards against emitting the same declaration twice when it matches
        # more than one of the overlapping patterns above.
        seen: set = set()

        # Extract functions
        for pattern in function_patterns:
            for match in pattern.finditer(content):
                function_name = match.group(1)

                # BUGFIX: the generic "name() {" pattern also matches
                # control-flow statements such as "if (x) {"; skip those.
                if function_name in self._RESERVED_NAMES:
                    continue

                start_line = content[:match.start()].count("\n") + 1
                key = ("function", function_name, start_line)
                if key in seen:
                    continue
                seen.add(key)

                # Find end of function by brace matching
                end_line = self._find_block_end(lines, start_line, "{", "}")
                func_content = self._get_line_range(lines, start_line, end_line)

                if func_content.strip():
                    # Extract JSDoc comment, if any, directly above the declaration
                    jsdoc = self._extract_jsdoc(lines, start_line)

                    chunk = self._create_chunk(
                        content=func_content,
                        file_path=file_path,
                        start_line=start_line,
                        end_line=end_line,
                        chunk_type="function",
                        function_name=function_name,
                        docstring=jsdoc,
                    )
                    chunk.imports = imports
                    chunks.append(chunk)

        # Extract classes
        for pattern in class_patterns:
            for match in pattern.finditer(content):
                class_name = match.group(1)
                start_line = content[:match.start()].count("\n") + 1
                key = ("class", class_name, start_line)
                if key in seen:
                    continue
                seen.add(key)

                # Find end of class by brace matching
                end_line = self._find_block_end(lines, start_line, "{", "}")
                class_content = self._get_line_range(lines, start_line, end_line)

                if class_content.strip():
                    jsdoc = self._extract_jsdoc(lines, start_line)

                    chunk = self._create_chunk(
                        content=class_content,
                        file_path=file_path,
                        start_line=start_line,
                        end_line=end_line,
                        chunk_type="class",
                        class_name=class_name,
                        docstring=jsdoc,
                    )
                    chunk.imports = imports
                    chunks.append(chunk)

        # Extract interfaces (TypeScript only)
        if self.language == "typescript":
            for pattern in interface_patterns:
                for match in pattern.finditer(content):
                    interface_name = match.group(1)
                    start_line = content[:match.start()].count("\n") + 1
                    key = ("interface", interface_name, start_line)
                    if key in seen:
                        continue
                    seen.add(key)

                    # Find end of interface by brace matching
                    end_line = self._find_block_end(lines, start_line, "{", "}")
                    interface_content = self._get_line_range(lines, start_line, end_line)

                    if interface_content.strip():
                        jsdoc = self._extract_jsdoc(lines, start_line)

                        chunk = self._create_chunk(
                            content=interface_content,
                            file_path=file_path,
                            start_line=start_line,
                            end_line=end_line,
                            chunk_type="interface",
                            class_name=interface_name,  # Use class_name field for interface
                            docstring=jsdoc,
                        )
                        chunk.imports = imports
                        chunks.append(chunk)

        # If no specific chunks found, create a single chunk for the whole file
        if not chunks:
            chunks.append(
                self._create_chunk(
                    content=content,
                    file_path=file_path,
                    start_line=1,
                    end_line=len(lines),
                    chunk_type="module",
                )
            )

        return chunks

    def _find_block_end(self, lines: List[str], start_line: int, open_char: str, close_char: str) -> int:
        """Find the end of a block by matching braces.

        Args:
            lines: File split into lines.
            start_line: 1-based line where the block starts.
            open_char: Opening delimiter (e.g. "{").
            close_char: Closing delimiter (e.g. "}").

        Returns:
            The 1-based line number where the block's braces balance out,
            or the last line of the file if they never do.

        NOTE(review): braces inside strings or comments are counted too —
        acceptable for a heuristic parser, but can overshoot.
        """
        if start_line > len(lines):
            return len(lines)

        brace_count = 0
        found_opening = False

        for i in range(start_line - 1, len(lines)):
            line = lines[i]

            for char in line:
                if char == open_char:
                    brace_count += 1
                    found_opening = True
                elif char == close_char:
                    brace_count -= 1

            if found_opening and brace_count == 0:
                return i + 1  # Return 1-based line number

        return len(lines)

    def _extract_jsdoc(self, lines: List[str], start_line: int) -> Optional[str]:
        """Extract the JSDoc comment immediately preceding a declaration.

        Scans up to ~8 lines backwards for a closing "*/", then collects
        lines back to the opening "/**". Returns the comment text with
        delimiters and the leading "*" gutter removed, or None.
        """
        if start_line <= 1:
            return None

        # Look backwards for JSDoc comment
        for i in range(start_line - 2, max(-1, start_line - 10), -1):
            line = lines[i].strip()

            if line.endswith("*/"):
                # Found end of JSDoc, collect the comment
                jsdoc_lines = []
                for j in range(i, -1, -1):
                    comment_line = lines[j].strip()
                    jsdoc_lines.insert(0, comment_line)

                    if comment_line.startswith("/**"):
                        # Found start of JSDoc.
                        # BUGFIX: only strip the /** and */ delimiters and the
                        # leading "*" gutter; the old replace("*", "") also
                        # deleted asterisks inside the comment text itself.
                        cleaned_lines = []
                        for raw in jsdoc_lines:
                            cleaned = re.sub(r"^/\*\*", "", raw)
                            cleaned = re.sub(r"\*/$", "", cleaned)
                            cleaned = cleaned.strip().lstrip("*").strip()
                            if cleaned:
                                cleaned_lines.append(cleaned)

                        return " ".join(cleaned_lines) if cleaned_lines else None

            # If we hit non-comment code, stop looking
            elif line and not line.startswith("//") and not line.startswith("*"):
                break

        return None

    def get_supported_extensions(self) -> List[str]:
        """Get supported file extensions for the configured language."""
        if self.language == "typescript":
            return [".ts", ".tsx"]
        else:
            return [".js", ".jsx", ".mjs"]
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class TypeScriptParser(JavaScriptParser):
    """TypeScript parser.

    A thin specialization of :class:`JavaScriptParser` that fixes the
    language to "typescript", which enables interface extraction and
    switches the reported file extensions to .ts/.tsx.
    """

    def __init__(self) -> None:
        """Initialize the parser preconfigured for TypeScript."""
        # All parsing behavior is inherited; only the language differs.
        super().__init__(language="typescript")
|
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
"""Python parser using Tree-sitter for MCP Vector Search."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
from loguru import logger
|
|
8
|
+
|
|
9
|
+
from ..core.models import CodeChunk
|
|
10
|
+
from .base import BaseParser
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PythonParser(BaseParser):
    """Python parser using Tree-sitter for AST-based code analysis.

    Falls back to regex/indentation heuristics when Tree-sitter (or the
    tree-sitter-languages wheel) is unavailable at runtime.
    """

    def __init__(self) -> None:
        """Initialize Python parser and try to load a Tree-sitter backend."""
        super().__init__("python")
        self._parser = None  # Tree-sitter parser instance, or None when unavailable
        self._language = None  # Tree-sitter Language handle, or None
        self._initialize_parser()

    def _initialize_parser(self) -> None:
        """Initialize Tree-sitter parser for Python.

        Leaves ``self._parser``/``self._language`` as None when no backend
        can be loaded; ``parse_content`` then uses the regex fallback.
        """
        try:
            # Try the tree-sitter-languages package first
            import tree_sitter_languages

            self._language = tree_sitter_languages.get_language("python")
            self._parser = tree_sitter_languages.get_parser("python")
            logger.debug("Python Tree-sitter parser initialized via tree-sitter-languages")
            return
        except Exception as e:
            logger.debug(f"tree-sitter-languages failed: {e}")

        try:
            # Fallback to manual tree-sitter setup (requires language binaries).
            # The import is a presence probe only; the manual path is not
            # implemented yet, so we still end up on the regex fallback.
            import tree_sitter  # noqa: F401

            logger.debug("Manual tree-sitter setup not implemented yet")
            self._parser = None
            self._language = None
        except Exception as e:
            logger.debug(f"Manual tree-sitter setup failed: {e}")
            self._parser = None
            self._language = None

        logger.info("Using fallback regex-based parsing for Python (Tree-sitter unavailable)")

    async def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a Python file and extract code chunks.

        Returns an empty list (after logging the error) when the file
        cannot be read or decoded as UTF-8.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            return await self.parse_content(content, file_path)
        except Exception as e:
            logger.error(f"Failed to read file {file_path}: {e}")
            return []

    async def parse_content(self, content: str, file_path: Path) -> List[CodeChunk]:
        """Parse Python content and extract code chunks.

        Uses Tree-sitter when available; otherwise (or when Tree-sitter
        raises) falls back to regex-based parsing.
        """
        if not content.strip():
            return []

        # If Tree-sitter is not available, fall back to simple parsing
        if not self._parser:
            return await self._fallback_parse(content, file_path)

        try:
            # Parse with Tree-sitter
            tree = self._parser.parse(content.encode("utf-8"))
            return self._extract_chunks_from_tree(tree, content, file_path)
        except Exception as e:
            logger.warning(f"Tree-sitter parsing failed for {file_path}: {e}")
            return await self._fallback_parse(content, file_path)

    def _extract_chunks_from_tree(
        self, tree, content: str, file_path: Path
    ) -> List[CodeChunk]:
        """Extract code chunks from a Tree-sitter AST.

        Walks the tree collecting function, class, and module-level
        (import) chunks; methods are tagged with their enclosing class.
        """
        chunks: List[CodeChunk] = []
        lines = self._split_into_lines(content)

        def visit_node(node, current_class=None):
            """Recursively visit AST nodes, carrying the enclosing class name."""
            node_type = node.type

            if node_type == "function_definition":
                chunks.extend(
                    self._extract_function(node, lines, file_path, current_class)
                )
            elif node_type == "class_definition":
                class_chunks = self._extract_class(node, lines, file_path)
                chunks.extend(class_chunks)

                # Visit class members with class context so methods get
                # class_name set.
                class_name = self._get_node_name(node)
                for child in node.children:
                    visit_node(child, class_name)
            elif node_type == "module":
                # Extract module-level code (imports)
                module_chunk = self._extract_module_chunk(node, lines, file_path)
                if module_chunk:
                    chunks.append(module_chunk)

                # Visit all children
                for child in node.children:
                    visit_node(child)
            else:
                # Visit children for other node types, preserving context
                for child in node.children:
                    visit_node(child, current_class)

        # Start traversal from root
        visit_node(tree.root_node)

        # If no specific chunks found, create a single chunk for the whole file
        if not chunks:
            chunks.append(
                self._create_chunk(
                    content=content,
                    file_path=file_path,
                    start_line=1,
                    end_line=len(lines),
                    chunk_type="module",
                )
            )

        return chunks

    def _extract_function(
        self, node, lines: List[str], file_path: Path, class_name: Optional[str] = None
    ) -> List[CodeChunk]:
        """Extract a function definition node as a single chunk.

        Args:
            node: Tree-sitter ``function_definition`` node.
            lines: File split into lines.
            file_path: Source file path.
            class_name: Enclosing class name for methods, if any.
        """
        chunks = []

        function_name = self._get_node_name(node)
        # Tree-sitter points are 0-based; chunks use 1-based line numbers.
        start_line = node.start_point[0] + 1
        end_line = node.end_point[0] + 1

        # Get function content
        content = self._get_line_range(lines, start_line, end_line)

        # Extract docstring if present
        docstring = self._extract_docstring(node, lines)

        chunk = self._create_chunk(
            content=content,
            file_path=file_path,
            start_line=start_line,
            end_line=end_line,
            chunk_type="function",
            function_name=function_name,
            class_name=class_name,
            docstring=docstring,
        )
        chunks.append(chunk)

        return chunks

    def _extract_class(self, node, lines: List[str], file_path: Path) -> List[CodeChunk]:
        """Extract a class definition node as a single chunk."""
        chunks = []

        class_name = self._get_node_name(node)
        # Tree-sitter points are 0-based; chunks use 1-based line numbers.
        start_line = node.start_point[0] + 1
        end_line = node.end_point[0] + 1

        # Get class content
        content = self._get_line_range(lines, start_line, end_line)

        # Extract docstring if present
        docstring = self._extract_docstring(node, lines)

        chunk = self._create_chunk(
            content=content,
            file_path=file_path,
            start_line=start_line,
            end_line=end_line,
            chunk_type="class",
            class_name=class_name,
            docstring=docstring,
        )
        chunks.append(chunk)

        return chunks

    def _extract_module_chunk(
        self, node, lines: List[str], file_path: Path
    ) -> Optional[CodeChunk]:
        """Extract module-level import statements as one "imports" chunk.

        Returns None when the module has no import statements.

        NOTE(review): end_line is the count of collected import lines, not
        a real file position, so the chunk's span is synthetic — confirm
        downstream consumers only use it for display.
        """
        # Look for module-level statements (not inside functions/classes)
        module_lines = []

        for child in node.children:
            if child.type in ["import_statement", "import_from_statement"]:
                start_line = child.start_point[0] + 1
                end_line = child.end_point[0] + 1
                import_content = self._get_line_range(lines, start_line, end_line)
                module_lines.append(import_content.strip())

        if module_lines:
            content = "\n".join(module_lines)
            return self._create_chunk(
                content=content,
                file_path=file_path,
                start_line=1,
                end_line=len(module_lines),
                chunk_type="imports",
            )

        return None

    def _get_node_name(self, node) -> Optional[str]:
        """Extract the identifier child of a named node (function, class)."""
        for child in node.children:
            if child.type == "identifier":
                return child.text.decode("utf-8")
        return None

    def _extract_docstring(self, node, lines: List[str]) -> Optional[str]:
        """Extract the docstring from a function or class node.

        Per PEP 257, only a string literal that is the FIRST statement in
        the body counts as a docstring.
        """
        for child in node.children:
            if child.type == "block":
                for stmt in child.children:
                    # Skip comments preceding the first real statement.
                    if stmt.type == "comment":
                        continue
                    if stmt.type == "expression_statement":
                        for expr_child in stmt.children:
                            if expr_child.type == "string":
                                # Extract string content
                                start_line = expr_child.start_point[0] + 1
                                end_line = expr_child.end_point[0] + 1
                                docstring = self._get_line_range(lines, start_line, end_line)
                                # Clean up docstring (remove quotes)
                                return self._clean_docstring(docstring)
                    # BUGFIX: the first real statement is not a string, so
                    # there is no docstring — previously any bare string
                    # later in the body was misreported as the docstring.
                    break
        return None

    def _clean_docstring(self, docstring: str) -> str:
        """Clean up an extracted docstring by removing surrounding quotes."""
        # BUGFIX: the previous pattern used {{3}}, which matches a quote
        # followed by the literal text "{3}"; {3} is the correct repetition
        # quantifier, so triple quotes are now actually stripped.
        cleaned = re.sub(r'^["\']{3}|["\']{3}$', "", docstring.strip())
        cleaned = re.sub(r'^["\']|["\']$', "", cleaned.strip())
        return cleaned.strip()

    async def _fallback_parse(self, content: str, file_path: Path) -> List[CodeChunk]:
        """Fallback parsing using regex when Tree-sitter is not available.

        Finds `def`/`class` statements with regexes and recovers block
        extents from indentation.
        """
        chunks: List[CodeChunk] = []
        lines = self._split_into_lines(content)

        # Enhanced regex patterns
        function_pattern = re.compile(r"^\s*def\s+(\w+)\s*\(", re.MULTILINE)
        class_pattern = re.compile(r"^\s*class\s+(\w+)\s*[:\(]", re.MULTILINE)
        import_pattern = re.compile(r"^\s*(from\s+\S+\s+)?import\s+(.+)", re.MULTILINE)

        # Extract imports first; they are attached to every chunk below.
        imports = []
        for match in import_pattern.finditer(content):
            import_line = match.group(0).strip()
            imports.append(import_line)

        # Find functions
        for match in function_pattern.finditer(content):
            function_name = match.group(1)
            # The match may start with leading whitespace; locate the 'def'
            # keyword itself so the line count is exact.
            match_text = match.group(0)
            def_pos_in_match = match_text.find('def')
            actual_def_pos = match.start() + def_pos_in_match
            start_line = content[:actual_def_pos].count("\n") + 1

            # Find end of function (simple indentation heuristic)
            end_line = self._find_function_end(lines, start_line)

            func_content = self._get_line_range(lines, start_line, end_line)

            if func_content.strip():  # Only add if content is not empty
                # Extract docstring using regex
                docstring = self._extract_docstring_regex(func_content)

                chunk = self._create_chunk(
                    content=func_content,
                    file_path=file_path,
                    start_line=start_line,
                    end_line=end_line,
                    chunk_type="function",
                    function_name=function_name,
                    docstring=docstring,
                )
                chunk.imports = imports  # Add imports to chunk
                chunks.append(chunk)

        # Find classes
        for match in class_pattern.finditer(content):
            class_name = match.group(1)
            # Locate the 'class' keyword itself for an exact line count.
            match_text = match.group(0)
            class_pos_in_match = match_text.find('class')
            actual_class_pos = match.start() + class_pos_in_match
            start_line = content[:actual_class_pos].count("\n") + 1

            # Find end of class (simple indentation heuristic)
            end_line = self._find_class_end(lines, start_line)

            class_content = self._get_line_range(lines, start_line, end_line)

            if class_content.strip():  # Only add if content is not empty
                # Extract class docstring
                docstring = self._extract_docstring_regex(class_content)

                chunk = self._create_chunk(
                    content=class_content,
                    file_path=file_path,
                    start_line=start_line,
                    end_line=end_line,
                    chunk_type="class",
                    class_name=class_name,
                    docstring=docstring,
                )
                chunk.imports = imports  # Add imports to chunk
                chunks.append(chunk)

        # If no functions or classes found, create one chunk for the whole file
        if not chunks:
            chunks.append(
                self._create_chunk(
                    content=content,
                    file_path=file_path,
                    start_line=1,
                    end_line=len(lines),
                    chunk_type="module",
                )
            )

        return chunks

    def _find_function_end(self, lines: List[str], start_line: int) -> int:
        """Find the end line of a function using indentation.

        Returns the 1-based number of the last line belonging to the
        definition that starts at ``start_line`` (the line before the
        first non-empty line indented at or below the def's level).
        """
        if start_line > len(lines):
            return len(lines)

        # Get initial indentation of the def line
        start_idx = start_line - 1
        if start_idx >= len(lines):
            return len(lines)

        def_line = lines[start_idx]
        def_indent = len(def_line) - len(def_line.lstrip())

        # Find end by looking for a non-empty line with indentation <= the
        # def's indentation, starting from the line after the def line.
        for i in range(start_idx + 1, len(lines)):
            line = lines[i]
            if line.strip():  # Skip empty lines
                current_indent = len(line) - len(line.lstrip())
                if current_indent <= def_indent:
                    # i is a 0-based index, so this is the 1-based number
                    # of the PREVIOUS line — the last line of the body.
                    return i

        # If we reach here, the function goes to the end of the file
        return len(lines)

    def _find_class_end(self, lines: List[str], start_line: int) -> int:
        """Find the end line of a class using indentation (same heuristic)."""
        return self._find_function_end(lines, start_line)

    def _extract_docstring_regex(self, content: str) -> Optional[str]:
        """Extract a docstring from def/class content using string scanning.

        Looks at the first few lines after the header for a triple-quoted
        string; returns the cleaned text or None.
        """
        lines = content.splitlines()
        if len(lines) < 2:
            return None

        # Skip the def/class line and look for docstring in subsequent lines
        for i in range(1, min(len(lines), 5)):  # Check first few lines
            line = lines[i].strip()
            if not line:
                continue

            # Check for triple-quoted docstrings
            if line.startswith('"""') or line.startswith("'''"):
                quote_type = line[:3]

                # Single-line docstring. BUGFIX: >= 6 (not > 6) so the
                # empty docstring '""""""' is handled here instead of
                # falling through to the multi-line path.
                if line.endswith(quote_type) and len(line) >= 6:
                    return line[3:-3].strip()

                # Multi-line docstring: collect until the closing quotes
                docstring_lines = [line[3:]]
                for j in range(i + 1, len(lines)):
                    next_line = lines[j].strip()
                    if next_line.endswith(quote_type):
                        docstring_lines.append(next_line[:-3])
                        break
                    docstring_lines.append(next_line)

                return " ".join(docstring_lines).strip()

            # If we hit non-docstring code, stop looking
            if line and not line.startswith('#'):
                break

        return None

    def get_supported_extensions(self) -> List[str]:
        """Get supported file extensions."""
        return [".py", ".pyw"]
|