codegraph-cli 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph_cli/__init__.py +4 -0
- codegraph_cli/agents.py +191 -0
- codegraph_cli/bug_detector.py +386 -0
- codegraph_cli/chat_agent.py +352 -0
- codegraph_cli/chat_session.py +220 -0
- codegraph_cli/cli.py +330 -0
- codegraph_cli/cli_chat.py +367 -0
- codegraph_cli/cli_diagnose.py +133 -0
- codegraph_cli/cli_refactor.py +230 -0
- codegraph_cli/cli_setup.py +470 -0
- codegraph_cli/cli_test.py +177 -0
- codegraph_cli/cli_v2.py +267 -0
- codegraph_cli/codegen_agent.py +265 -0
- codegraph_cli/config.py +31 -0
- codegraph_cli/config_manager.py +341 -0
- codegraph_cli/context_manager.py +500 -0
- codegraph_cli/crew_agents.py +123 -0
- codegraph_cli/crew_chat.py +159 -0
- codegraph_cli/crew_tools.py +497 -0
- codegraph_cli/diff_engine.py +265 -0
- codegraph_cli/embeddings.py +241 -0
- codegraph_cli/graph_export.py +144 -0
- codegraph_cli/llm.py +642 -0
- codegraph_cli/models.py +47 -0
- codegraph_cli/models_v2.py +185 -0
- codegraph_cli/orchestrator.py +49 -0
- codegraph_cli/parser.py +800 -0
- codegraph_cli/performance_analyzer.py +223 -0
- codegraph_cli/project_context.py +230 -0
- codegraph_cli/rag.py +200 -0
- codegraph_cli/refactor_agent.py +452 -0
- codegraph_cli/security_scanner.py +366 -0
- codegraph_cli/storage.py +390 -0
- codegraph_cli/templates/graph_interactive.html +257 -0
- codegraph_cli/testgen_agent.py +316 -0
- codegraph_cli/validation_engine.py +285 -0
- codegraph_cli/vector_store.py +293 -0
- codegraph_cli-2.0.0.dist-info/METADATA +318 -0
- codegraph_cli-2.0.0.dist-info/RECORD +43 -0
- codegraph_cli-2.0.0.dist-info/WHEEL +5 -0
- codegraph_cli-2.0.0.dist-info/entry_points.txt +2 -0
- codegraph_cli-2.0.0.dist-info/licenses/LICENSE +21 -0
- codegraph_cli-2.0.0.dist-info/top_level.txt +1 -0
codegraph_cli/parser.py
ADDED
|
@@ -0,0 +1,800 @@
|
|
|
1
|
+
"""Semantic code parser using Tree-sitter for multi-language AST extraction.
|
|
2
|
+
|
|
3
|
+
Replaces the legacy ast-based parser with Tree-sitter for:
|
|
4
|
+
- Error-tolerant parsing (handles broken / incomplete syntax gracefully)
|
|
5
|
+
- Multi-language support (Python now; JS/TS/Go extensible)
|
|
6
|
+
- Semantic chunking by function / class definition (not line-count windows)
|
|
7
|
+
|
|
8
|
+
Falls back to Python's built-in ``ast`` module when tree-sitter is unavailable.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import ast
|
|
14
|
+
import logging
|
|
15
|
+
from abc import ABC, abstractmethod
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
18
|
+
|
|
19
|
+
from .models import Edge, Node
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# Language <-> file-extension mapping (extensible)
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# File extension (including the leading dot) -> language name used as the key
# into a parser's loaded grammars.  Insertion order matters: parse_project
# scans extensions in this order, which determines node ordering.
LANGUAGE_MAP: Dict[str, str] = {
    # Python
    ".py": "python",
    # JavaScript / TypeScript family
    ".js": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
    ".jsx": "javascript",
    # Compiled / other languages
    ".go": "go",
    ".rs": "rust",
    ".java": "java",
    ".rb": "ruby",
    ".cpp": "cpp",
    ".c": "c",
    ".cs": "c_sharp",
}
|
|
40
|
+
|
|
41
|
+
# Path components that exclude a file from project scans (virtualenvs,
# VCS metadata, tool caches, build output and codegraph's own data).
SKIP_DIRS: Set[str] = {
    # environments / installed packages
    ".venv",
    "venv",
    "site-packages",
    "node_modules",
    # version control and tool caches
    ".git",
    "__pycache__",
    ".tox",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    "htmlcov",
    # build artefacts
    "build",
    "dist",
    ".eggs",
    "egg-info",
    # codegraph's own storage
    ".codegraph",
    "lancedb",
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ===================================================================
|
|
50
|
+
# Abstract Parser Interface
|
|
51
|
+
# ===================================================================
|
|
52
|
+
|
|
53
|
+
class Parser(ABC):
    """Interface shared by every parser backend in this module.

    Concrete backends must implement all three methods below; the class
    cannot be instantiated directly.
    """

    @abstractmethod
    def parse_file(self, file_path: Path, source: Optional[str] = None) -> Tuple[List[Node], List[Edge]]:
        """Turn one file into ``(nodes, edges)``.

        Args:
            file_path: Location of the file to parse.
            source: Optional pre-read file contents.
        """

    @abstractmethod
    def parse_project(self) -> Tuple[List[Node], List[Edge]]:
        """Turn the whole project into ``(nodes, edges)``."""

    @abstractmethod
    def supports_language(self, language: str) -> bool:
        """Tell whether this backend can handle *language*."""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ===================================================================
|
|
77
|
+
# Tree-sitter Parser (Primary)
|
|
78
|
+
# ===================================================================
|
|
79
|
+
|
|
80
|
+
class TreeSitterParser(Parser):
    """Error-tolerant, multi-language parser built on Tree-sitter.

    Grammars are loaded from the per-language ``tree_sitter_<lang>``
    packages named in ``_GRAMMAR_MODULES``.  A language whose package is
    missing or fails to load is skipped with a warning, so
    ``supports_language`` reports what was actually loaded.

    Only Python has definition / import / call extraction implemented;
    files of any other loaded language yield a single module node.
    """

    # Map language name -> module that provides the tree-sitter Language.
    _GRAMMAR_MODULES: Dict[str, str] = {
        "python": "tree_sitter_python",
        "javascript": "tree_sitter_javascript",
        "typescript": "tree_sitter_typescript",
        "tsx": "tree_sitter_typescript",
    }

    # Grammar packages normally expose ``language()``.  The TypeScript package
    # ships two grammars and names its factories differently.
    _GRAMMAR_FACTORIES: Dict[str, str] = {
        "typescript": "language_typescript",
        "tsx": "language_tsx",
    }

    def __init__(
        self,
        project_root: Path,
        languages: Optional[List[str]] = None,
    ) -> None:
        """Create the parser and eagerly load the requested grammars.

        Args:
            project_root: Directory that relative paths and module names
                are computed against.
            languages: Language names to load; defaults to ``["python"]``.
        """
        self.project_root = project_root
        self._parsers: Dict[str, Any] = {}
        self._requested_languages = languages or ["python"]
        self._init_parsers()

    # ------------------------------------------------------------------
    # Initialisation
    # ------------------------------------------------------------------

    def _init_parsers(self) -> None:
        """Populate ``self._parsers`` with one Tree-sitter parser per language.

        Never raises for a missing dependency: problems are logged and the
        affected language is simply left unsupported.
        """
        try:
            from tree_sitter import Language, Parser as TSParser  # type: ignore[import-untyped]
        except ImportError:
            logger.warning(
                "tree-sitter is not installed -- "
                "Tree-sitter parsing unavailable. "
                "Install with: pip install tree-sitter tree-sitter-python"
            )
            return

        import importlib

        for lang in self._requested_languages:
            mod_name = self._GRAMMAR_MODULES.get(lang)
            if mod_name is None:
                logger.warning("No grammar module mapped for language '%s'", lang)
                continue
            try:
                mod = importlib.import_module(mod_name)
                # tree-sitter >=0.22 per-language packages expose a factory
                # function that returns the Language capsule.
                factory = getattr(mod, self._GRAMMAR_FACTORIES.get(lang, "language"))
                self._parsers[lang] = TSParser(Language(factory()))
                logger.debug("Loaded tree-sitter parser for %s", lang)
            except ImportError:
                logger.warning(
                    "Grammar package '%s' not installed for language '%s'. "
                    "Install with: pip install %s",
                    mod_name, lang, mod_name.replace('_', '-'),
                )
            except Exception as exc:
                logger.warning("Could not load tree-sitter grammar for %s: %s", lang, exc)

    def supports_language(self, language: str) -> bool:
        """Return True if a grammar for *language* was loaded successfully."""
        return language in self._parsers

    # ------------------------------------------------------------------
    # Project-level parsing
    # ------------------------------------------------------------------

    def _is_skipped(self, file_path: Path) -> bool:
        """Return True if *file_path* lies inside an excluded directory.

        Only components below ``project_root`` are inspected, so a project
        that itself lives under e.g. ``/home/me/build/`` is not skipped
        wholesale.  ``*.egg-info`` directories are matched by suffix because
        their real names are ``<dist>.egg-info``.
        """
        try:
            parts = file_path.relative_to(self.project_root).parts
        except ValueError:
            # Not under the project root: fall back to the full path.
            parts = file_path.parts
        return any(part in SKIP_DIRS or part.endswith(".egg-info") for part in parts)

    def parse_project(self) -> Tuple[List[Node], List[Edge]]:
        """Parse every supported file under ``project_root``.

        Files are visited per extension in ``LANGUAGE_MAP`` order, sorted by
        path.  A file that fails to parse is logged and skipped.  Call edges
        are resolved across the whole project before returning.

        Returns:
            ``(nodes, edges)`` for the whole project.
        """
        all_nodes: List[Node] = []
        all_edges: List[Edge] = []

        for ext, lang in LANGUAGE_MAP.items():
            if lang not in self._parsers:
                continue
            for file_path in sorted(self.project_root.rglob(f"*{ext}")):
                if self._is_skipped(file_path):
                    continue
                try:
                    nodes, edges = self.parse_file(file_path)
                except Exception as exc:
                    # Best effort: one unreadable file must not abort the scan.
                    logger.warning("Failed to parse %s: %s", file_path, exc)
                    continue
                all_nodes.extend(nodes)
                all_edges.extend(edges)

        all_edges = _resolve_call_edges(all_nodes, all_edges)
        return all_nodes, all_edges

    # ------------------------------------------------------------------
    # File-level parsing
    # ------------------------------------------------------------------

    def parse_file(
        self,
        file_path: Path,
        source: Optional[str] = None,
    ) -> Tuple[List[Node], List[Edge]]:
        """Parse one file into a module node plus its definitions.

        Args:
            file_path: File to parse; must be located under ``project_root``.
            source: File contents; read from disk (UTF-8, undecodable bytes
                ignored) when omitted.

        Returns:
            ``(nodes, edges)``; both empty when the file's language has no
            loaded grammar.

        Raises:
            ValueError: If *file_path* is not inside ``project_root``.
        """
        if source is None:
            source = file_path.read_text(encoding="utf-8", errors="ignore")

        lang = LANGUAGE_MAP.get(file_path.suffix)
        if not lang or lang not in self._parsers:
            return [], []

        tree = self._parsers[lang].parse(source.encode("utf-8"))

        rel = file_path.relative_to(self.project_root)
        rel_path = str(rel)
        lines = source.splitlines()

        # -- Module node --------------------------------------------------
        # Build the dotted name from path components so the result does not
        # depend on the OS separator or on the file's extension.
        module_name = ".".join(rel.with_suffix("").parts)
        module_id = f"module:{module_name}"
        module_node = Node(
            node_id=module_id,
            node_type="module",
            name=module_name.split(".")[-1],
            qualname=module_name,
            file_path=rel_path,
            start_line=1,
            end_line=max(len(lines), 1),
            code=source,
            docstring=self._extract_module_docstring(tree.root_node),
        )

        nodes: List[Node] = [module_node]
        edges: List[Edge] = []

        # -- Language-specific extraction ---------------------------------
        if lang == "python":
            self._walk_python(
                tree.root_node,
                scope_stack=[module_name],
                scope_id_stack=[module_id],
                rel_path=rel_path,
                lines=lines,
                nodes=nodes,
                edges=edges,
            )
            self._extract_python_imports(tree.root_node, module_id, edges)
        # Future: elif lang in ("javascript", "typescript"): ...

        return nodes, edges

    # ------------------------------------------------------------------
    # Python: recursive definition walker
    # ------------------------------------------------------------------

    def _walk_python(
        self,
        ts_node: Any,
        scope_stack: List[str],
        scope_id_stack: List[str],
        rel_path: str,
        lines: List[str],
        nodes: List[Node],
        edges: List[Edge],
    ) -> None:
        """Extract class / function definitions among *ts_node*'s children.

        Only direct children are inspected (recursion happens through the
        bodies of the definitions found), so definitions wrapped in a
        compound statement such as ``if`` or ``try`` are not visited.
        Results are appended to *nodes* and *edges* in place.
        """
        for child in ts_node.children:
            definition = child
            # Unwrap @decorated_definition -> inner function/class, but keep
            # the outer node so line spans include the decorators.
            if child.type == "decorated_definition":
                definition = child.child_by_field_name("definition")
                if definition is None:
                    continue

            if definition.type == "function_definition":
                self._process_python_function(
                    child, definition, scope_stack, scope_id_stack,
                    rel_path, lines, nodes, edges,
                )
            elif definition.type == "class_definition":
                self._process_python_class(
                    child, definition, scope_stack, scope_id_stack,
                    rel_path, lines, nodes, edges,
                )

    def _emit_definition(
        self,
        kind: str,
        outer_node: Any,
        def_node: Any,
        scope_stack: List[str],
        scope_id_stack: List[str],
        rel_path: str,
        lines: List[str],
        nodes: List[Node],
        edges: List[Edge],
    ) -> Optional[Tuple[str, str]]:
        """Append the Node and ``contains`` edge for one definition.

        Args:
            kind: ``"function"`` or ``"class"``; used as node type and id prefix.
            outer_node: Node whose span is recorded (includes decorators).
            def_node: The bare definition node (name, body).

        Returns:
            ``(name, node_id)``, or ``None`` when the definition has no name
            node, in which case nothing is appended.
        """
        name_node = def_node.child_by_field_name("name")
        if name_node is None:
            return None
        name: str = name_node.text.decode("utf-8")
        qualname = ".".join(scope_stack + [name])
        node_id = f"{kind}:{qualname}"

        # Tree-sitter rows are 0-based; stored line numbers are 1-based.
        start_line = outer_node.start_point[0] + 1
        end_line = outer_node.end_point[0] + 1

        nodes.append(Node(
            node_id=node_id,
            node_type=kind,
            name=name,
            qualname=qualname,
            file_path=rel_path,
            start_line=start_line,
            end_line=end_line,
            code="\n".join(lines[start_line - 1: end_line]),
            docstring=self._extract_docstring(def_node),
        ))
        edges.append(Edge(src=scope_id_stack[-1], dst=node_id, edge_type="contains"))
        return name, node_id

    def _process_python_function(
        self,
        outer_node: Any,
        func_node: Any,
        scope_stack: List[str],
        scope_id_stack: List[str],
        rel_path: str,
        lines: List[str],
        nodes: List[Node],
        edges: List[Edge],
    ) -> None:
        """Record a function, its unresolved call edges and nested definitions."""
        emitted = self._emit_definition(
            "function", outer_node, func_node, scope_stack, scope_id_stack,
            rel_path, lines, nodes, edges,
        )
        if emitted is None:
            return
        name, node_id = emitted

        # Call edges carry the raw callee name; they are resolved to node ids
        # later by _resolve_call_edges.
        for call_name in self._collect_calls(func_node):
            edges.append(Edge(src=node_id, dst=call_name, edge_type="calls"))

        # Recurse into body for nested definitions
        body = func_node.child_by_field_name("body")
        if body is not None:
            self._walk_python(
                body,
                scope_stack + [name],
                scope_id_stack + [node_id],
                rel_path, lines, nodes, edges,
            )

    def _process_python_class(
        self,
        outer_node: Any,
        class_node: Any,
        scope_stack: List[str],
        scope_id_stack: List[str],
        rel_path: str,
        lines: List[str],
        nodes: List[Node],
        edges: List[Edge],
    ) -> None:
        """Record a class and walk its body for methods / nested classes."""
        emitted = self._emit_definition(
            "class", outer_node, class_node, scope_stack, scope_id_stack,
            rel_path, lines, nodes, edges,
        )
        if emitted is None:
            return
        name, node_id = emitted

        body = class_node.child_by_field_name("body")
        if body is not None:
            self._walk_python(
                body,
                scope_stack + [name],
                scope_id_stack + [node_id],
                rel_path, lines, nodes, edges,
            )

    # ------------------------------------------------------------------
    # Python: imports
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_python_imports(
        root: Any,
        module_id: str,
        edges: List[Edge],
    ) -> None:
        """Append a ``depends_on`` edge for each top-level import under *root*.

        Only direct children of *root* are examined.  For relative imports
        the leading dots are dropped and only the dotted name (if any) is
        used; ``from . import x`` therefore produces no edge.
        """
        def add_dependency(mod: str) -> None:
            edges.append(Edge(
                src=module_id, dst=f"module:{mod}", edge_type="depends_on",
            ))

        for child in root.children:
            if child.type == "import_statement":
                for sub in child.children:
                    if sub.type == "dotted_name":
                        add_dependency(sub.text.decode("utf-8"))
                    elif sub.type == "aliased_import":
                        name_n = sub.child_by_field_name("name")
                        if name_n is not None:
                            add_dependency(name_n.text.decode("utf-8"))

            elif child.type == "import_from_statement":
                mod_node = child.child_by_field_name("module_name")
                if mod_node is None:
                    continue
                if mod_node.type == "relative_import":
                    # Keep the last dotted_name child, or "" if there is none.
                    mod = ""
                    for sub in mod_node.children:
                        if sub.type == "dotted_name":
                            mod = sub.text.decode("utf-8")
                else:
                    mod = mod_node.text.decode("utf-8")
                if mod:
                    add_dependency(mod)

    # ------------------------------------------------------------------
    # Call extraction
    # ------------------------------------------------------------------

    @staticmethod
    def _collect_calls(func_node: Any) -> List[str]:
        """Return the names called inside *func_node*'s body, in source order.

        Nested function / class definitions are not descended into, so their
        calls are attributed only to the nested definition itself.
        """
        calls: List[str] = []
        nested_definitions = (
            "function_definition",
            "class_definition",
            "decorated_definition",
        )

        def _find(node: Any) -> None:
            if node.type == "call":
                func = node.child_by_field_name("function")
                if func is not None:
                    name = _resolve_ts_call_name(func)
                    if name:
                        calls.append(name)
            for ch in node.children:
                if ch.type not in nested_definitions:
                    _find(ch)

        body = func_node.child_by_field_name("body")
        if body is not None:
            _find(body)
        return calls

    # ------------------------------------------------------------------
    # Docstring helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _unquote_docstring(raw: str) -> str:
        """Strip prefix letters and quotes from a string literal's source text.

        Returns ``""`` for bytes and f-string literals, which Python does not
        treat as docstrings.  Escape sequences are left as written.
        """
        literal = raw.lstrip("rRuUbBfF")
        prefix = raw[: len(raw) - len(literal)].lower()
        if "b" in prefix or "f" in prefix:
            return ""
        # Triple quotes must be tried first: '"""x"""' also starts with '"'.
        for quote in ('"""', "'''", '"', "'"):
            if (
                len(literal) >= 2 * len(quote)
                and literal.startswith(quote)
                and literal.endswith(quote)
            ):
                return literal[len(quote):-len(quote)].strip()
        return literal.strip()

    @staticmethod
    def _leading_docstring(statements: Any) -> str:
        """Return the docstring that opens *statements*, or ``""``.

        Comments are skipped; only the first non-comment statement is
        considered, and it must be an expression statement with a direct
        string child.
        """
        for stmt in statements:
            if stmt.type == "comment":
                continue
            if stmt.type == "expression_statement":
                for expr in stmt.children:
                    if expr.type == "string":
                        return TreeSitterParser._unquote_docstring(
                            expr.text.decode("utf-8")
                        )
            break
        return ""

    @staticmethod
    def _extract_docstring(def_node: Any) -> str:
        """Extract the docstring from a function / class definition node."""
        body = def_node.child_by_field_name("body")
        if body is None:
            return ""
        return TreeSitterParser._leading_docstring(body.children)

    @staticmethod
    def _extract_module_docstring(root: Any) -> str:
        """Extract the module-level docstring."""
        return TreeSitterParser._leading_docstring(root.children)
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
# ===================================================================
|
|
485
|
+
# AST Fallback Parser (when tree-sitter is not installed)
|
|
486
|
+
# ===================================================================
|
|
487
|
+
|
|
488
|
+
class ASTFallbackParser(Parser):
    """Pure-Python fallback using the built-in ``ast`` module.

    Only supports Python.  Unlike Tree-sitter, ``ast`` cannot parse files
    with syntax errors; such files are logged and contribute no nodes.
    """

    def __init__(self, project_root: Path) -> None:
        """Remember *project_root*, the base for relative paths and module names."""
        self.project_root = project_root

    def supports_language(self, language: str) -> bool:
        """Return True only for ``"python"``."""
        return language == "python"

    def parse_project(self) -> Tuple[List[Node], List[Edge]]:
        """Parse every ``*.py`` file under ``project_root``.

        Files inside excluded directories are skipped; a file that fails to
        parse is logged and skipped.  Call edges are resolved across the
        whole project before returning.
        """
        nodes: List[Node] = []
        edges: List[Edge] = []
        for fp in sorted(self.project_root.rglob("*.py")):
            # Inspect only components below the project root, so a project
            # located under e.g. ``/srv/build/`` is not skipped wholesale.
            # ``*.egg-info`` is matched by suffix (real names are
            # ``<dist>.egg-info``).
            rel_parts = fp.relative_to(self.project_root).parts
            if any(part in SKIP_DIRS or part.endswith(".egg-info") for part in rel_parts):
                continue
            try:
                n, e = self.parse_file(fp)
            except Exception as exc:
                # Best effort: one bad file must not abort the scan.
                logger.warning("AST parse failed for %s: %s", fp, exc)
                continue
            nodes.extend(n)
            edges.extend(e)
        edges = _resolve_call_edges(nodes, edges)
        return nodes, edges

    def parse_file(
        self,
        file_path: Path,
        source: Optional[str] = None,
    ) -> Tuple[List[Node], List[Edge]]:
        """Parse one Python file into a module node plus its definitions.

        Args:
            file_path: File to parse; must be located under ``project_root``.
            source: File contents; read from disk (UTF-8, undecodable bytes
                ignored) when omitted.

        Returns:
            ``(nodes, edges)``; both empty when the source cannot be parsed.

        Raises:
            ValueError: If *file_path* is not inside ``project_root``.
        """
        if source is None:
            source = file_path.read_text(encoding="utf-8", errors="ignore")

        try:
            tree = ast.parse(source)
        except (SyntaxError, ValueError) as exc:
            # ValueError: older Pythons reject source containing NUL bytes
            # with ValueError rather than SyntaxError.
            logger.warning("SyntaxError in %s: %s", file_path, exc)
            return [], []

        rel = file_path.relative_to(self.project_root)
        rel_path = str(rel)
        lines = source.splitlines()
        # Build the dotted name from path components so the result does not
        # depend on the OS path separator.
        module_name = ".".join(rel.with_suffix("").parts)
        module_id = f"module:{module_name}"

        module_node = Node(
            node_id=module_id,
            node_type="module",
            name=module_name.split(".")[-1],
            qualname=module_name,
            file_path=rel_path,
            start_line=1,
            end_line=max(len(lines), 1),
            code=source,
            docstring=ast.get_docstring(tree) or "",
        )

        visitor = _ASTVisitor(module_id, module_name, rel_path, lines)
        visitor.visit(tree)

        nodes = [module_node] + visitor.nodes
        edges = list(visitor.edges)

        # Only top-level imports are recorded.  ``from . import x`` has no
        # module name and produces no edge.
        for stmt in tree.body:
            if isinstance(stmt, ast.Import):
                for alias in stmt.names:
                    edges.append(Edge(
                        src=module_id, dst=f"module:{alias.name}", edge_type="depends_on",
                    ))
            elif isinstance(stmt, ast.ImportFrom) and stmt.module:
                edges.append(Edge(
                    src=module_id, dst=f"module:{stmt.module}", edge_type="depends_on",
                ))

        return nodes, edges
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
# ===================================================================
|
|
567
|
+
# Backward-Compatible Alias
|
|
568
|
+
# ===================================================================
|
|
569
|
+
|
|
570
|
+
class PythonGraphParser(Parser):
    """Facade that keeps the legacy ``PythonGraphParser`` name working.

    At construction it tries the Tree-sitter backend for Python and, if that
    backend reports Python as unsupported, uses ``ASTFallbackParser``
    instead.  Every ``Parser`` method is forwarded to the chosen backend.
    """

    def __init__(self, project_root: Path) -> None:
        self.project_root = project_root
        tree_sitter_backend = TreeSitterParser(project_root, languages=["python"])
        if not tree_sitter_backend.supports_language("python"):
            self._delegate: Parser = ASTFallbackParser(project_root)
            logger.info("Using AST fallback parser (Python only)")
        else:
            self._delegate = tree_sitter_backend
            logger.info("Using Tree-sitter parser (error-tolerant, semantic chunking)")

    def parse_file(self, file_path: Path, source: Optional[str] = None) -> Tuple[List[Node], List[Edge]]:
        """Forward to the selected backend's ``parse_file``."""
        backend = self._delegate
        return backend.parse_file(file_path, source)

    def parse_project(self) -> Tuple[List[Node], List[Edge]]:
        """Forward to the selected backend's ``parse_project``."""
        backend = self._delegate
        return backend.parse_project()

    def supports_language(self, language: str) -> bool:
        """Forward to the selected backend's ``supports_language``."""
        backend = self._delegate
        return backend.supports_language(language)
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
# ===================================================================
|
|
602
|
+
# Shared Helpers
|
|
603
|
+
# ===================================================================
|
|
604
|
+
|
|
605
|
+
def _resolve_ts_call_name(func_node: Any) -> Optional[str]:
|
|
606
|
+
"""Resolve a Tree-sitter call-function node to a dotted name string."""
|
|
607
|
+
if func_node.type == "identifier":
|
|
608
|
+
return func_node.text.decode("utf-8")
|
|
609
|
+
if func_node.type == "attribute":
|
|
610
|
+
parts: List[str] = []
|
|
611
|
+
current = func_node
|
|
612
|
+
while current is not None and current.type == "attribute":
|
|
613
|
+
attr = current.child_by_field_name("attribute")
|
|
614
|
+
if attr is not None:
|
|
615
|
+
parts.append(attr.text.decode("utf-8"))
|
|
616
|
+
current = current.child_by_field_name("object")
|
|
617
|
+
if current is not None and current.type == "identifier":
|
|
618
|
+
parts.append(current.text.decode("utf-8"))
|
|
619
|
+
return ".".join(reversed(parts)) if parts else None
|
|
620
|
+
if func_node.type == "call":
|
|
621
|
+
inner = func_node.child_by_field_name("function")
|
|
622
|
+
if inner is not None:
|
|
623
|
+
return _resolve_ts_call_name(inner)
|
|
624
|
+
return None
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
def _resolve_call_edges(nodes: List[Node], edges: List[Edge]) -> List[Edge]:
|
|
628
|
+
"""Resolve symbolic call destinations to concrete node IDs.
|
|
629
|
+
|
|
630
|
+
Language-agnostic post-processing shared by every parser backend.
|
|
631
|
+
"""
|
|
632
|
+
qual_by_name: Dict[str, List[str]] = {}
|
|
633
|
+
qual_by_qualname: Dict[str, str] = {}
|
|
634
|
+
for n in nodes:
|
|
635
|
+
qual_by_name.setdefault(n.name, []).append(n.node_id)
|
|
636
|
+
qual_by_qualname[n.qualname] = n.node_id
|
|
637
|
+
node_ids = {n.node_id for n in nodes}
|
|
638
|
+
|
|
639
|
+
resolved: List[Edge] = []
|
|
640
|
+
for edge in edges:
|
|
641
|
+
if edge.edge_type != "calls":
|
|
642
|
+
resolved.append(edge)
|
|
643
|
+
continue
|
|
644
|
+
if edge.dst in node_ids:
|
|
645
|
+
resolved.append(edge)
|
|
646
|
+
continue
|
|
647
|
+
|
|
648
|
+
resolved_dst: Optional[str] = None
|
|
649
|
+
|
|
650
|
+
# --- dotted calls (self.method, obj.method) ----------------------
|
|
651
|
+
if "." in edge.dst:
|
|
652
|
+
parts = edge.dst.split(".")
|
|
653
|
+
method_name = parts[-1]
|
|
654
|
+
|
|
655
|
+
# self.method -> resolve inside same class
|
|
656
|
+
if parts[0] == "self" and edge.src.startswith("function:"):
|
|
657
|
+
src_qualname = edge.src.removeprefix("function:")
|
|
658
|
+
if "." in src_qualname:
|
|
659
|
+
class_qualname = ".".join(src_qualname.split(".")[:-1])
|
|
660
|
+
target_qualname = f"{class_qualname}.{method_name}"
|
|
661
|
+
if target_qualname in qual_by_qualname:
|
|
662
|
+
resolved_dst = qual_by_qualname[target_qualname]
|
|
663
|
+
|
|
664
|
+
if resolved_dst is None and method_name in qual_by_name:
|
|
665
|
+
candidates = qual_by_name[method_name]
|
|
666
|
+
src_parts = edge.src.split(":")[1].split(".") if ":" in edge.src else []
|
|
667
|
+
for cand in candidates:
|
|
668
|
+
cand_parts = cand.split(":")[1].split(".") if ":" in cand else []
|
|
669
|
+
if src_parts and cand_parts and src_parts[:-1] == cand_parts[:-1]:
|
|
670
|
+
resolved_dst = cand
|
|
671
|
+
break
|
|
672
|
+
if resolved_dst is None:
|
|
673
|
+
resolved_dst = candidates[0]
|
|
674
|
+
|
|
675
|
+
# --- simple name lookups -----------------------------------------
|
|
676
|
+
elif edge.dst in qual_by_name:
|
|
677
|
+
resolved_dst = qual_by_name[edge.dst][0]
|
|
678
|
+
elif edge.dst in qual_by_qualname:
|
|
679
|
+
resolved_dst = qual_by_qualname[edge.dst]
|
|
680
|
+
|
|
681
|
+
if resolved_dst:
|
|
682
|
+
resolved.append(Edge(src=edge.src, dst=resolved_dst, edge_type="calls"))
|
|
683
|
+
else:
|
|
684
|
+
resolved.append(edge)
|
|
685
|
+
|
|
686
|
+
return resolved
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
# ===================================================================
|
|
690
|
+
# Legacy AST visitor (used by ASTFallbackParser)
|
|
691
|
+
# ===================================================================
|
|
692
|
+
|
|
693
|
+
class _ASTVisitor(ast.NodeVisitor):
|
|
694
|
+
"""Walks a Python AST and collects Node / Edge objects."""
|
|
695
|
+
|
|
696
|
+
def __init__(
|
|
697
|
+
self,
|
|
698
|
+
module_id: str,
|
|
699
|
+
module_name: str,
|
|
700
|
+
rel_path: str,
|
|
701
|
+
lines: List[str],
|
|
702
|
+
) -> None:
|
|
703
|
+
self.module_id = module_id
|
|
704
|
+
self.module_name = module_name
|
|
705
|
+
self.rel_path = rel_path
|
|
706
|
+
self.lines = lines
|
|
707
|
+
self.scope_stack: List[str] = [module_name]
|
|
708
|
+
self.scope_id_stack: List[str] = [module_id]
|
|
709
|
+
self.nodes: List[Node] = []
|
|
710
|
+
self.edges: List[Edge] = []
|
|
711
|
+
|
|
712
|
+
def visit_ClassDef(self, node: ast.ClassDef) -> None:
|
|
713
|
+
qualname = self._mk_qualname(node.name)
|
|
714
|
+
node_id = f"class:{qualname}"
|
|
715
|
+
self.nodes.append(Node(
|
|
716
|
+
node_id=node_id, node_type="class", name=node.name,
|
|
717
|
+
qualname=qualname, file_path=self.rel_path,
|
|
718
|
+
start_line=node.lineno,
|
|
719
|
+
end_line=getattr(node, "end_lineno", node.lineno),
|
|
720
|
+
code=self._snippet(node),
|
|
721
|
+
docstring=ast.get_docstring(node) or "",
|
|
722
|
+
))
|
|
723
|
+
self.edges.append(Edge(
|
|
724
|
+
src=self.scope_id_stack[-1], dst=node_id, edge_type="contains",
|
|
725
|
+
))
|
|
726
|
+
self.scope_stack.append(node.name)
|
|
727
|
+
self.scope_id_stack.append(node_id)
|
|
728
|
+
self.generic_visit(node)
|
|
729
|
+
self.scope_stack.pop()
|
|
730
|
+
self.scope_id_stack.pop()
|
|
731
|
+
|
|
732
|
+
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
|
733
|
+
self._visit_function(node)
|
|
734
|
+
|
|
735
|
+
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
|
|
736
|
+
self._visit_function(node)
|
|
737
|
+
|
|
738
|
+
def _visit_function(self, node: ast.AST) -> None:
|
|
739
|
+
assert isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
|
|
740
|
+
qualname = self._mk_qualname(node.name)
|
|
741
|
+
node_id = f"function:{qualname}"
|
|
742
|
+
self.nodes.append(Node(
|
|
743
|
+
node_id=node_id, node_type="function", name=node.name,
|
|
744
|
+
qualname=qualname, file_path=self.rel_path,
|
|
745
|
+
start_line=node.lineno,
|
|
746
|
+
end_line=getattr(node, "end_lineno", node.lineno),
|
|
747
|
+
code=self._snippet(node),
|
|
748
|
+
docstring=ast.get_docstring(node) or "",
|
|
749
|
+
))
|
|
750
|
+
self.edges.append(Edge(
|
|
751
|
+
src=self.scope_id_stack[-1], dst=node_id, edge_type="contains",
|
|
752
|
+
))
|
|
753
|
+
for call_name in _ast_collect_calls(node):
|
|
754
|
+
self.edges.append(Edge(src=node_id, dst=call_name, edge_type="calls"))
|
|
755
|
+
|
|
756
|
+
self.scope_stack.append(node.name)
|
|
757
|
+
self.scope_id_stack.append(node_id)
|
|
758
|
+
self.generic_visit(node)
|
|
759
|
+
self.scope_stack.pop()
|
|
760
|
+
self.scope_id_stack.pop()
|
|
761
|
+
|
|
762
|
+
def _snippet(self, node: ast.AST) -> str:
|
|
763
|
+
start = max(getattr(node, "lineno", 1) - 1, 0)
|
|
764
|
+
end = getattr(node, "end_lineno", start + 1)
|
|
765
|
+
return "\n".join(self.lines[start:end])
|
|
766
|
+
|
|
767
|
+
def _mk_qualname(self, name: str) -> str:
|
|
768
|
+
return ".".join(self.scope_stack + [name])
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
def _ast_collect_calls(node: ast.AST) -> List[str]:
|
|
772
|
+
names: List[str] = []
|
|
773
|
+
|
|
774
|
+
class _CV(ast.NodeVisitor):
|
|
775
|
+
def visit_Call(self, call_node: ast.Call) -> None:
|
|
776
|
+
n = _ast_name_from_expr(call_node.func)
|
|
777
|
+
if n:
|
|
778
|
+
names.append(n)
|
|
779
|
+
self.generic_visit(call_node)
|
|
780
|
+
|
|
781
|
+
_CV().visit(node)
|
|
782
|
+
return names
|
|
783
|
+
|
|
784
|
+
|
|
785
|
+
def _ast_name_from_expr(expr: ast.AST) -> Optional[str]:
|
|
786
|
+
if isinstance(expr, ast.Name):
|
|
787
|
+
return expr.id
|
|
788
|
+
if isinstance(expr, ast.Attribute):
|
|
789
|
+
parts: List[str] = []
|
|
790
|
+
current: ast.AST = expr
|
|
791
|
+
while isinstance(current, ast.Attribute):
|
|
792
|
+
parts.append(current.attr)
|
|
793
|
+
current = current.value
|
|
794
|
+
if isinstance(current, ast.Name):
|
|
795
|
+
parts.append(current.id)
|
|
796
|
+
return ".".join(reversed(parts)) if parts else None
|
|
797
|
+
if isinstance(expr, ast.Call):
|
|
798
|
+
return _ast_name_from_expr(expr.func)
|
|
799
|
+
return None
|
|
800
|
+
|