codebase-intel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_intel/__init__.py +3 -0
- codebase_intel/analytics/__init__.py +1 -0
- codebase_intel/analytics/benchmark.py +406 -0
- codebase_intel/analytics/feedback.py +496 -0
- codebase_intel/analytics/tracker.py +439 -0
- codebase_intel/cli/__init__.py +1 -0
- codebase_intel/cli/main.py +740 -0
- codebase_intel/contracts/__init__.py +1 -0
- codebase_intel/contracts/auto_generator.py +438 -0
- codebase_intel/contracts/evaluator.py +531 -0
- codebase_intel/contracts/models.py +433 -0
- codebase_intel/contracts/registry.py +225 -0
- codebase_intel/core/__init__.py +1 -0
- codebase_intel/core/config.py +248 -0
- codebase_intel/core/exceptions.py +454 -0
- codebase_intel/core/types.py +375 -0
- codebase_intel/decisions/__init__.py +1 -0
- codebase_intel/decisions/miner.py +297 -0
- codebase_intel/decisions/models.py +302 -0
- codebase_intel/decisions/store.py +411 -0
- codebase_intel/drift/__init__.py +1 -0
- codebase_intel/drift/detector.py +443 -0
- codebase_intel/graph/__init__.py +1 -0
- codebase_intel/graph/builder.py +391 -0
- codebase_intel/graph/parser.py +1232 -0
- codebase_intel/graph/query.py +377 -0
- codebase_intel/graph/storage.py +736 -0
- codebase_intel/mcp/__init__.py +1 -0
- codebase_intel/mcp/server.py +710 -0
- codebase_intel/orchestrator/__init__.py +1 -0
- codebase_intel/orchestrator/assembler.py +649 -0
- codebase_intel-0.1.0.dist-info/METADATA +361 -0
- codebase_intel-0.1.0.dist-info/RECORD +36 -0
- codebase_intel-0.1.0.dist-info/WHEEL +4 -0
- codebase_intel-0.1.0.dist-info/entry_points.txt +2 -0
- codebase_intel-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1232 @@
|
|
|
1
|
+
"""Tree-sitter based source code parser — extracts nodes and edges from files.
|
|
2
|
+
|
|
3
|
+
This is the most edge-case-heavy module in the system. Source code is wild:
|
|
4
|
+
people do bizarre things with imports, naming, and structure.
|
|
5
|
+
|
|
6
|
+
Edge cases handled:
|
|
7
|
+
- Binary files disguised as source (UTF-8 decode fails): skip gracefully
|
|
8
|
+
- Mixed encodings in a file: try UTF-8, fall back to latin-1, then skip
|
|
9
|
+
- Syntax errors in user code: tree-sitter still produces a partial AST,
|
|
10
|
+
we extract what we can and flag errors
|
|
11
|
+
- Generated code: detected via header markers, tagged is_generated=True
|
|
12
|
+
- Huge files (>1MB): skip entirely, log warning
|
|
13
|
+
- Empty files: valid — produce MODULE node with no children
|
|
14
|
+
- Circular imports: not our problem at parse time (graph handles it)
|
|
15
|
+
- Dynamic imports: `importlib.import_module("x")`, `__import__("x")`,
|
|
16
|
+
`require(variable)` — extract target string if it's a literal, flag
|
|
17
|
+
as dynamic_import with lower confidence if it's a variable
|
|
18
|
+
- Conditional imports: `if TYPE_CHECKING:`, `try/except ImportError:` —
|
|
19
|
+
tagged as type_only or optional
|
|
20
|
+
- Re-exports: barrel files that `from x import *` or explicit re-export
|
|
21
|
+
- Star imports: `from x import *` — edge exists but target is the module,
|
|
22
|
+
not specific symbols (can't resolve without runtime)
|
|
23
|
+
- Relative imports: `from . import x`, `from ..utils import y` — resolved
|
|
24
|
+
relative to file position in the project
|
|
25
|
+
- Decorator detection: @app.route, @router.get — used to identify endpoints
|
|
26
|
+
- Async vs sync: tracked in metadata for quality contracts
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import logging
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import TYPE_CHECKING
|
|
34
|
+
|
|
35
|
+
import xxhash
|
|
36
|
+
|
|
37
|
+
from codebase_intel.core.exceptions import ErrorContext, ParseError, UnsupportedLanguageError
|
|
38
|
+
from codebase_intel.core.types import (
|
|
39
|
+
EdgeKind,
|
|
40
|
+
GraphEdge,
|
|
41
|
+
GraphNode,
|
|
42
|
+
Language,
|
|
43
|
+
LineRange,
|
|
44
|
+
NodeKind,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
if TYPE_CHECKING:
|
|
48
|
+
from codebase_intel.core.config import ParserConfig
|
|
49
|
+
|
|
50
|
+
logger = logging.getLogger(__name__)
|
|
51
|
+
|
|
52
|
+
# File extension → Language mapping (19 languages).
# Keys are lowercase suffixes including the leading dot; detect_language()
# lowercases the file suffix before looking it up here, so matching is
# case-insensitive. Several extensions may map to the same Language.
EXTENSION_MAP: dict[str, Language] = {
    ".py": Language.PYTHON,
    ".pyi": Language.PYTHON,  # type stubs are treated like regular Python
    ".js": Language.JAVASCRIPT,
    ".mjs": Language.JAVASCRIPT,  # ES modules
    ".cjs": Language.JAVASCRIPT,  # CommonJS
    ".jsx": Language.JAVASCRIPT,
    ".ts": Language.TYPESCRIPT,
    ".tsx": Language.TSX,  # separate grammar from plain TypeScript
    ".go": Language.GO,
    ".rs": Language.RUST,
    ".java": Language.JAVA,
    ".rb": Language.RUBY,
    ".c": Language.C,
    ".h": Language.C,  # NOTE: .h is always classified as C here
    ".cpp": Language.CPP,
    ".cc": Language.CPP,
    ".cxx": Language.CPP,
    ".hpp": Language.CPP,
    ".cs": Language.CSHARP,
    ".php": Language.PHP,
    ".swift": Language.SWIFT,
    ".kt": Language.KOTLIN,
    ".kts": Language.KOTLIN,
    ".scala": Language.SCALA,
    ".lua": Language.LUA,
    ".dart": Language.DART,
    ".ex": Language.ELIXIR,
    ".exs": Language.ELIXIR,
    ".hs": Language.HASKELL,
}
|
|
84
|
+
|
|
85
|
+
# tree-sitter-language-pack grammar names.
# Each value is the grammar identifier string handed to get_parser().
# Language.UNKNOWN deliberately has no entry: membership in this map is what
# parse_file() uses to decide whether a language can be parsed at all.
LANGUAGE_GRAMMAR_MAP: dict[Language, str] = {
    Language.PYTHON: "python",
    Language.JAVASCRIPT: "javascript",
    Language.TYPESCRIPT: "typescript",
    Language.TSX: "tsx",
    Language.GO: "go",
    Language.RUST: "rust",
    Language.JAVA: "java",
    Language.RUBY: "ruby",
    Language.C: "c",
    Language.CPP: "cpp",
    Language.CSHARP: "c_sharp",
    Language.PHP: "php",
    Language.SWIFT: "swift",
    Language.KOTLIN: "kotlin",
    Language.SCALA: "scala",
    Language.LUA: "lua",
    Language.DART: "dart",
    Language.ELIXIR: "elixir",
    Language.HASKELL: "haskell",
}
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def detect_language(file_path: Path) -> Language:
    """Map a file's extension to a Language.

    The suffix is lowercased before the lookup, so ``Foo.PY`` and ``foo.py``
    are treated alike. Extensions missing from EXTENSION_MAP (including files
    with no extension) yield ``Language.UNKNOWN``.

    Notable mappings: ``.tsx`` is TSX rather than TypeScript (different
    grammar), ``.mjs``/``.cjs`` are JavaScript, and ``.pyi`` is Python.
    """
    suffix = file_path.suffix.lower()
    if suffix in EXTENSION_MAP:
        return EXTENSION_MAP[suffix]
    return Language.UNKNOWN
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def compute_file_hash(content: bytes) -> str:
    """Return the xxHash64 hex digest of *content*.

    Used as a content-addressable fingerprint; xxhash is chosen for speed,
    not for cryptographic strength.
    """
    hasher = xxhash.xxh64()
    hasher.update(content)
    return hasher.hexdigest()
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class ParseResult:
    """Outcome of parsing one source file.

    Collects the graph nodes and edges extracted from the file, any warnings
    raised along the way, and file-level facts (content hash, size, and the
    generated / syntax-error flags).
    """

    def __init__(self, file_path: Path, language: Language) -> None:
        # Identity of the parsed file.
        self.file_path = file_path
        self.language = language

        # File-level facts; they start at neutral defaults and are filled in
        # by the parser once the file has been read.
        self.content_hash: str = ""
        self.size_bytes: int = 0
        self.is_generated: bool = False
        self.had_syntax_errors: bool = False

        # Extraction output, appended to as parsing proceeds.
        self.nodes: list[GraphNode] = []
        self.edges: list[GraphEdge] = []
        self.warnings: list[str] = []

    @property
    def module_node_id(self) -> str:
        """Return the ID of this file's MODULE node (named after the file stem)."""
        path = self.file_path
        return GraphNode.make_id(path, NodeKind.MODULE, path.stem)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class FileParser:
|
|
145
|
+
"""Parses source files into graph nodes and edges.
|
|
146
|
+
|
|
147
|
+
Uses tree-sitter for language-aware AST parsing. Falls back to
|
|
148
|
+
regex-based extraction for unsupported languages (limited but
|
|
149
|
+
better than nothing).
|
|
150
|
+
"""
|
|
151
|
+
|
|
152
|
+
def __init__(self, config: ParserConfig, project_root: Path) -> None:
|
|
153
|
+
self._config = config
|
|
154
|
+
self._project_root = project_root
|
|
155
|
+
self._grammars: dict[Language, object] = {}
|
|
156
|
+
|
|
157
|
+
def _is_ignored(self, file_path: Path) -> bool:
|
|
158
|
+
"""Check if file matches any ignore pattern.
|
|
159
|
+
|
|
160
|
+
Edge case: patterns are relative to project root.
|
|
161
|
+
Edge case: symlinks — we resolve before checking to avoid
|
|
162
|
+
processing the same file twice via different paths.
|
|
163
|
+
"""
|
|
164
|
+
import fnmatch
|
|
165
|
+
|
|
166
|
+
try:
|
|
167
|
+
rel = file_path.resolve().relative_to(self._project_root)
|
|
168
|
+
except ValueError:
|
|
169
|
+
return True # Outside project root — skip
|
|
170
|
+
|
|
171
|
+
rel_str = str(rel)
|
|
172
|
+
return any(
|
|
173
|
+
fnmatch.fnmatch(rel_str, pattern)
|
|
174
|
+
for pattern in self._config.ignored_patterns
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
def _is_generated(self, content: str) -> bool:
|
|
178
|
+
"""Check if file is generated code by examining its header.
|
|
179
|
+
|
|
180
|
+
Edge case: some generated files put the marker on line 3 (after
|
|
181
|
+
a shebang and encoding declaration). We check first 5 lines.
|
|
182
|
+
"""
|
|
183
|
+
header = "\n".join(content.split("\n")[:5]).lower()
|
|
184
|
+
return any(marker.lower() in header for marker in self._config.generated_markers)
|
|
185
|
+
|
|
186
|
+
def _read_file_safe(self, file_path: Path) -> tuple[bytes, str] | None:
|
|
187
|
+
"""Read file with encoding fallback.
|
|
188
|
+
|
|
189
|
+
Edge cases:
|
|
190
|
+
- Binary file: UTF-8 decode fails, latin-1 produces garbage but doesn't crash
|
|
191
|
+
- Mixed encoding: some lines UTF-8, some not — we get partial content
|
|
192
|
+
- Null bytes in file: strong indicator of binary, skip
|
|
193
|
+
- Symlink to file outside project: resolve and check
|
|
194
|
+
|
|
195
|
+
Returns (raw_bytes, decoded_text) or None if unreadable.
|
|
196
|
+
"""
|
|
197
|
+
try:
|
|
198
|
+
raw = file_path.read_bytes()
|
|
199
|
+
except (OSError, PermissionError) as exc:
|
|
200
|
+
logger.warning("Cannot read %s: %s", file_path, exc)
|
|
201
|
+
return None
|
|
202
|
+
|
|
203
|
+
# Size check
|
|
204
|
+
if len(raw) > self._config.max_file_size_bytes:
|
|
205
|
+
logger.info(
|
|
206
|
+
"Skipping %s: %d bytes exceeds limit %d",
|
|
207
|
+
file_path,
|
|
208
|
+
len(raw),
|
|
209
|
+
self._config.max_file_size_bytes,
|
|
210
|
+
)
|
|
211
|
+
return None
|
|
212
|
+
|
|
213
|
+
# Binary detection: null bytes in first 8KB
|
|
214
|
+
if b"\x00" in raw[:8192]:
|
|
215
|
+
logger.debug("Skipping binary file: %s", file_path)
|
|
216
|
+
return None
|
|
217
|
+
|
|
218
|
+
# Decode with fallback
|
|
219
|
+
try:
|
|
220
|
+
text = raw.decode("utf-8")
|
|
221
|
+
except UnicodeDecodeError:
|
|
222
|
+
try:
|
|
223
|
+
text = raw.decode("latin-1")
|
|
224
|
+
logger.debug("Fell back to latin-1 for %s", file_path)
|
|
225
|
+
except UnicodeDecodeError:
|
|
226
|
+
logger.warning("Cannot decode %s with any encoding", file_path)
|
|
227
|
+
return None
|
|
228
|
+
|
|
229
|
+
return raw, text
|
|
230
|
+
|
|
231
|
+
async def parse_file(self, file_path: Path) -> ParseResult | None:
    """Parse one file into graph nodes and edges.

    This is the main entry point of the parser. Ignored or unreadable files
    yield None. Every other file yields a ParseResult that contains at least
    a MODULE node; language-specific extraction then adds to it:

    - Python → ``_extract_python``
    - JavaScript / TypeScript / TSX → ``_extract_javascript_family``
    - any other language listed in LANGUAGE_GRAMMAR_MAP → ``_extract_generic``
    - unknown or unmapped languages → a warning only
    """
    if self._is_ignored(file_path):
        return None

    language = detect_language(file_path)
    result = ParseResult(file_path, language)

    loaded = self._read_file_safe(file_path)
    if loaded is None:
        return None
    raw_bytes, text = loaded

    result.content_hash = compute_file_hash(raw_bytes)
    result.size_bytes = len(raw_bytes)
    result.is_generated = self._is_generated(text)

    # Every readable file gets a MODULE node, even when nothing else is
    # extracted (e.g. an empty file spans the single line 1-1).
    is_test = self._is_test_file(file_path)
    last_line = max(1, text.count("\n") + 1)
    result.nodes.append(
        GraphNode(
            node_id=result.module_node_id,
            kind=NodeKind.MODULE,
            name=file_path.stem,
            qualified_name=self._qualified_module_name(file_path),
            file_path=file_path,
            line_range=LineRange(start=1, end=last_line),
            language=language,
            content_hash=result.content_hash,
            is_generated=result.is_generated,
            is_test=is_test,
            is_entry_point=self._is_entry_point(file_path, text),
        )
    )

    # Language-specific extraction, dispatched with guard clauses.
    if language == Language.PYTHON:
        await self._extract_python(text, file_path, result)
        return result

    if language in (Language.JAVASCRIPT, Language.TYPESCRIPT, Language.TSX):
        await self._extract_javascript_family(text, file_path, result)
        return result

    if language == Language.UNKNOWN:
        result.warnings.append(f"No parser for {file_path.suffix}")
        return result

    if language in LANGUAGE_GRAMMAR_MAP:
        # All other supported languages — basic structure via tree-sitter.
        await self._extract_generic(text, file_path, language, result)
        return result

    result.warnings.append(f"Language {language.value} not in enabled_languages")
    return result
|
|
287
|
+
|
|
288
|
+
def _is_test_file(self, file_path: Path) -> bool:
|
|
289
|
+
"""Detect if a file is a test file.
|
|
290
|
+
|
|
291
|
+
Edge cases:
|
|
292
|
+
- test_*.py, *_test.py, *_spec.ts, *.test.js — all common patterns
|
|
293
|
+
- Files inside tests/, __tests__/, spec/ directories
|
|
294
|
+
- conftest.py is test infrastructure, not a test itself (still flagged)
|
|
295
|
+
"""
|
|
296
|
+
name = file_path.stem.lower()
|
|
297
|
+
parts = [p.lower() for p in file_path.parts]
|
|
298
|
+
|
|
299
|
+
is_test_name = (
|
|
300
|
+
name.startswith("test_")
|
|
301
|
+
or name.endswith("_test")
|
|
302
|
+
or name.endswith("_spec")
|
|
303
|
+
or name.endswith(".test")
|
|
304
|
+
or name.endswith(".spec")
|
|
305
|
+
or name == "conftest"
|
|
306
|
+
)
|
|
307
|
+
is_test_dir = any(
|
|
308
|
+
p in ("tests", "test", "__tests__", "spec", "specs") for p in parts
|
|
309
|
+
)
|
|
310
|
+
return is_test_name or is_test_dir
|
|
311
|
+
|
|
312
|
+
def _is_entry_point(self, file_path: Path, content: str) -> bool:
|
|
313
|
+
"""Detect if a file is an application entry point.
|
|
314
|
+
|
|
315
|
+
Edge cases:
|
|
316
|
+
- Python: `if __name__ == "__main__"`, main.py, app.py, manage.py
|
|
317
|
+
- JS/TS: package.json "main" field (not detectable here — handled in builder)
|
|
318
|
+
- Multiple entry points: CLI, web server, worker — all valid
|
|
319
|
+
"""
|
|
320
|
+
name = file_path.stem.lower()
|
|
321
|
+
if name in ("main", "app", "manage", "server", "worker", "cli"):
|
|
322
|
+
return True
|
|
323
|
+
if '__name__' in content and '__main__' in content:
|
|
324
|
+
return True
|
|
325
|
+
return False
|
|
326
|
+
|
|
327
|
+
def _qualified_module_name(self, file_path: Path) -> str:
|
|
328
|
+
"""Build a Python-style qualified module name from file path.
|
|
329
|
+
|
|
330
|
+
Edge case: __init__.py represents the package, not a module named "init".
|
|
331
|
+
Edge case: files outside src/ — use path from project root.
|
|
332
|
+
"""
|
|
333
|
+
try:
|
|
334
|
+
rel = file_path.resolve().relative_to(self._project_root)
|
|
335
|
+
except ValueError:
|
|
336
|
+
return file_path.stem
|
|
337
|
+
|
|
338
|
+
parts = list(rel.with_suffix("").parts)
|
|
339
|
+
|
|
340
|
+
# Remove common source directories from the qualified name
|
|
341
|
+
for prefix in ("src", "lib", "app"):
|
|
342
|
+
if parts and parts[0] == prefix:
|
|
343
|
+
parts = parts[1:]
|
|
344
|
+
break
|
|
345
|
+
|
|
346
|
+
# __init__ represents the package
|
|
347
|
+
if parts and parts[-1] == "__init__":
|
|
348
|
+
parts = parts[:-1]
|
|
349
|
+
|
|
350
|
+
# index.ts/js represents the directory
|
|
351
|
+
if parts and parts[-1] in ("index", "mod"):
|
|
352
|
+
parts = parts[:-1]
|
|
353
|
+
|
|
354
|
+
return ".".join(parts) if parts else file_path.stem
|
|
355
|
+
|
|
356
|
+
# -------------------------------------------------------------------
|
|
357
|
+
# Python extraction
|
|
358
|
+
# -------------------------------------------------------------------
|
|
359
|
+
|
|
360
|
+
async def _extract_python(
    self, source: str, file_path: Path, result: ParseResult
) -> None:
    """Extract Python nodes and edges from *source* using tree-sitter.

    Only the direct children of the module root are inspected:

    - ``import`` / ``from ... import`` statements → IMPORTS edges.
    - ``if`` statements whose condition mentions ``TYPE_CHECKING`` → the
      imports in the block body become type-only IMPORTS edges.
    - class definitions → a class node plus a DEPENDS_ON edge from the module.
    - function definitions → a function node.
    - expression statements / assignments → handed to
      ``_parse_python_assignment`` (``__all__`` handling).

    Decorated functions and classes are wrapped by tree-sitter in a
    ``decorated_definition`` node; that wrapper is unwrapped here so that
    decorated top-level definitions are extracted instead of being dropped.

    If tree-sitter-language-pack is not installed, a warning is recorded and
    extraction is delegated to ``_extract_python_regex``. Syntax errors do
    not abort extraction: tree-sitter still yields a partial tree, and the
    result is flagged with ``had_syntax_errors``.
    """
    try:
        from tree_sitter_language_pack import get_parser
    except ImportError:
        result.warnings.append("tree-sitter-language-pack not installed")
        await self._extract_python_regex(source, file_path, result)
        return

    parser = get_parser("python")
    tree = parser.parse(source.encode("utf-8"))

    if tree.root_node.has_error:
        result.had_syntax_errors = True
        result.warnings.append(f"Syntax errors in {file_path} — partial extraction")

    module_id = result.module_node_id

    for child in tree.root_node.children:
        # Unwrap `@decorator` wrappers to reach the class/function itself.
        # NOTE(review): the inner definition node is what gets passed to the
        # class/function parsers; confirm _has_decorator finds decorators
        # from there (e.g. via node.parent).
        if child.type == "decorated_definition":
            definition = child.child_by_field_name("definition")
            if definition is None:
                continue
            child = definition

        node_type = child.type

        # --- Imports ---
        if node_type in ("import_statement", "import_from_statement"):
            result.edges.extend(
                self._parse_python_import(
                    child, source, file_path, module_id, is_type_only=False
                )
            )

        # --- If TYPE_CHECKING block ---
        elif node_type == "if_statement":
            condition_text = self._node_text(
                child.child_by_field_name("condition"), source
            )
            if condition_text and "TYPE_CHECKING" in condition_text:
                # Imports inside this block exist only for type checkers.
                body = child.child_by_field_name("consequence")
                if body:
                    for stmt in body.children:
                        if stmt.type in ("import_statement", "import_from_statement"):
                            result.edges.extend(
                                self._parse_python_import(
                                    stmt, source, file_path, module_id,
                                    is_type_only=True,
                                )
                            )

        # --- Classes ---
        elif node_type == "class_definition":
            class_node = self._parse_python_class(child, source, file_path, result)
            if class_node:
                result.nodes.append(class_node)
                result.edges.append(GraphEdge(
                    source_id=module_id,
                    target_id=class_node.node_id,
                    kind=EdgeKind.DEPENDS_ON,
                ))

        # --- Functions ---
        elif node_type == "function_definition":
            func_node = self._parse_python_function(
                child, source, file_path, parent_qualified=""
            )
            if func_node:
                result.nodes.append(func_node)

        # --- Assignments (module-level constants, __all__) ---
        elif node_type in ("expression_statement", "assignment"):
            self._parse_python_assignment(child, source, file_path, result)
|
|
442
|
+
|
|
443
|
+
def _node_text(self, node: object | None, source: str) -> str | None:
|
|
444
|
+
"""Extract text from a tree-sitter node safely."""
|
|
445
|
+
if node is None:
|
|
446
|
+
return None
|
|
447
|
+
start = getattr(node, "start_byte", 0)
|
|
448
|
+
end = getattr(node, "end_byte", 0)
|
|
449
|
+
return source[start:end]
|
|
450
|
+
|
|
451
|
+
def _parse_python_import(
    self,
    node: object,
    source: str,
    file_path: Path,
    module_id: str,
    is_type_only: bool = False,
) -> list[GraphEdge]:
    """Turn one Python import statement into IMPORTS edges.

    At most one edge is produced, pointing from the importing module to the
    imported *module* (never to individual symbols):

    - ``from __future__ import ...`` is skipped — not a real dependency.
    - ``import x.y.z`` targets module ``x.y.z``; aliases (``as y``) do not
      change the target.
    - ``from x import *`` / ``from x import a`` both target module ``x``.
    - Relative imports are resolved by ``_extract_import_module``.

    Args:
        node: tree-sitter node of the import statement.
        source: Full source text of the file.
        file_path: Path of the importing file (for relative imports).
        module_id: Node ID of the importing module (edge source).
        is_type_only: Mark the edge as type-only (e.g. TYPE_CHECKING block).

    Returns:
        A list with zero or one GraphEdge.
    """
    raw_text = self._node_text(node, source)
    if not raw_text or "__future__" in raw_text:
        return []

    module_name = self._extract_import_module(raw_text, file_path)
    if not module_name:
        return []

    target_id = GraphNode.make_id(
        self._resolve_module_path(module_name),
        NodeKind.MODULE,
        module_name.split(".")[-1],
    )

    # Static import statements are never dynamic, so confidence is full.
    import_edge = GraphEdge(
        source_id=module_id,
        target_id=target_id,
        kind=EdgeKind.IMPORTS,
        confidence=1.0,
        is_type_only=is_type_only,
        metadata={"raw_import": raw_text.strip()},
    )
    return [import_edge]
|
|
500
|
+
|
|
501
|
+
def _extract_import_module(self, import_text: str, file_path: Path) -> str | None:
|
|
502
|
+
"""Extract module name from import statement text.
|
|
503
|
+
|
|
504
|
+
Edge cases:
|
|
505
|
+
- `from .sibling import func` → resolve to package.sibling
|
|
506
|
+
- `from .. import parent_func` → resolve to parent package
|
|
507
|
+
- `import os.path` → "os.path"
|
|
508
|
+
- `from typing import List` → "typing"
|
|
509
|
+
"""
|
|
510
|
+
import re
|
|
511
|
+
|
|
512
|
+
# from X import Y
|
|
513
|
+
match = re.match(r"from\s+([\w.]+)\s+import", import_text)
|
|
514
|
+
if match:
|
|
515
|
+
module = match.group(1)
|
|
516
|
+
# Handle relative imports
|
|
517
|
+
if import_text.strip().startswith("from ."):
|
|
518
|
+
dots = re.match(r"from\s+(\.+)", import_text)
|
|
519
|
+
if dots:
|
|
520
|
+
level = len(dots.group(1))
|
|
521
|
+
package_parts = self._get_package_parts(file_path)
|
|
522
|
+
if level <= len(package_parts):
|
|
523
|
+
base = ".".join(package_parts[: -level] if level > 0 else package_parts)
|
|
524
|
+
rest = re.match(r"from\s+\.+\s*([\w.]*)\s+import", import_text)
|
|
525
|
+
if rest and rest.group(1):
|
|
526
|
+
return f"{base}.{rest.group(1)}" if base else rest.group(1)
|
|
527
|
+
return base
|
|
528
|
+
return module
|
|
529
|
+
|
|
530
|
+
# import X
|
|
531
|
+
match = re.match(r"import\s+([\w.]+)", import_text)
|
|
532
|
+
if match:
|
|
533
|
+
return match.group(1)
|
|
534
|
+
|
|
535
|
+
return None
|
|
536
|
+
|
|
537
|
+
def _get_package_parts(self, file_path: Path) -> list[str]:
|
|
538
|
+
"""Get the package path components for resolving relative imports."""
|
|
539
|
+
try:
|
|
540
|
+
rel = file_path.resolve().relative_to(self._project_root)
|
|
541
|
+
except ValueError:
|
|
542
|
+
return []
|
|
543
|
+
|
|
544
|
+
parts = list(rel.parent.parts)
|
|
545
|
+
# Remove common source directories
|
|
546
|
+
for prefix in ("src", "lib"):
|
|
547
|
+
if parts and parts[0] == prefix:
|
|
548
|
+
parts = parts[1:]
|
|
549
|
+
break
|
|
550
|
+
return parts
|
|
551
|
+
|
|
552
|
+
def _resolve_module_path(self, module_name: str) -> Path:
|
|
553
|
+
"""Best-effort resolution of a module name to a file path.
|
|
554
|
+
|
|
555
|
+
Edge case: module might be a package (directory with __init__.py)
|
|
556
|
+
or a file. We try both and return whichever exists, defaulting
|
|
557
|
+
to file path if neither exists (the node will be created as a
|
|
558
|
+
placeholder).
|
|
559
|
+
"""
|
|
560
|
+
parts = module_name.split(".")
|
|
561
|
+
# Try as file
|
|
562
|
+
file_path = self._project_root / Path(*parts).with_suffix(".py")
|
|
563
|
+
if file_path.exists():
|
|
564
|
+
return file_path
|
|
565
|
+
|
|
566
|
+
# Try common source directories
|
|
567
|
+
for src_dir in ("src", "lib"):
|
|
568
|
+
file_path = self._project_root / src_dir / Path(*parts).with_suffix(".py")
|
|
569
|
+
if file_path.exists():
|
|
570
|
+
return file_path
|
|
571
|
+
|
|
572
|
+
# Try as package
|
|
573
|
+
pkg_path = self._project_root / Path(*parts) / "__init__.py"
|
|
574
|
+
if pkg_path.exists():
|
|
575
|
+
return pkg_path
|
|
576
|
+
|
|
577
|
+
# Return a synthetic path — the node will exist but may not resolve to a real file
|
|
578
|
+
return self._project_root / Path(*parts).with_suffix(".py")
|
|
579
|
+
|
|
580
|
+
def _parse_python_class(
    self,
    node: object,
    source: str,
    file_path: Path,
    result: ParseResult,
) -> GraphNode | None:
    """Build a graph node for a Python class definition.

    - The node kind is INTERFACE when any listed superclass is exactly
      ``Protocol``, ``ABC`` or ``ABCMeta``; otherwise CLASS.
    - Metadata records the superclass list and whether the class carries a
      ``dataclass`` decorator.
    - One INHERITS edge per superclass is appended to ``result.edges``
      (so multiple inheritance yields multiple edges).
    - ``is_test`` is copied from the first node in ``result.nodes`` (the
      module node, when present).

    Returns:
        The class node, or None when the class name cannot be read.
    """
    field_lookup = getattr(node, "child_by_field_name", lambda _: None)
    name_node = field_lookup("name")
    name = self._node_text(name_node, source) if name_node else None
    if not name:
        return None

    # tree-sitter rows are 0-based; graph line numbers are 1-based.
    first_row = getattr(node, "start_point", (0, 0))[0]
    last_row = getattr(node, "end_point", (0, 0))[0]

    superclasses = self._extract_superclasses(node, source)
    is_interface = any(
        base in ("Protocol", "ABC", "ABCMeta") for base in superclasses
    )
    kind = NodeKind.INTERFACE if is_interface else NodeKind.CLASS

    docstring = self._extract_docstring(node, source)
    qualified = f"{self._qualified_module_name(file_path)}.{name}"
    module_is_test = result.nodes[0].is_test if result.nodes else False

    class_node = GraphNode(
        node_id=GraphNode.make_id(file_path, kind, name),
        kind=kind,
        name=name,
        qualified_name=qualified,
        file_path=file_path,
        line_range=LineRange(start=first_row + 1, end=last_row + 1),
        language=Language.PYTHON,
        docstring=docstring,
        is_generated=result.is_generated,
        is_test=module_is_test,
        metadata={
            "superclasses": superclasses,
            "is_dataclass": self._has_decorator(node, source, "dataclass"),
        },
    )

    # One INHERITS edge per base class; the target ID is derived from the
    # base's (possibly dotted) name.
    for base in superclasses:
        base_id = GraphNode.make_id(
            self._resolve_module_path(base),
            NodeKind.CLASS,
            base.split(".")[-1],
        )
        inherits = GraphEdge(
            source_id=class_node.node_id,
            target_id=base_id,
            kind=EdgeKind.INHERITS,
        )
        result.edges.append(inherits)

    return class_node
|
|
647
|
+
|
|
648
|
+
def _parse_python_function(
    self,
    node: object,
    source: str,
    file_path: Path,
    parent_qualified: str = "",
) -> GraphNode | None:
    """Build a graph node for a Python function definition.

    - ``@overload`` stubs are skipped (None is returned).
    - Kind is METHOD when *parent_qualified* is given; otherwise ENDPOINT
      when a ``route`` / ``get`` / ``post`` decorator is present; otherwise
      FUNCTION.
    - The qualified name is ``<parent_qualified>.<name>`` for nested
      definitions and ``<module qualified name>.<name>`` otherwise.
    - Metadata records ``is_async`` (node text starts with ``async ``) and
      the ``staticmethod`` / ``classmethod`` / ``property`` decorator flags.

    Returns:
        The function node, or None when the name cannot be read or the
        function is an overload stub.
    """
    field_lookup = getattr(node, "child_by_field_name", lambda _: None)
    name_node = field_lookup("name")
    name = self._node_text(name_node, source) if name_node else None
    if not name:
        return None

    # Overload stubs carry no implementation.
    if self._has_decorator(node, source, "overload"):
        return None

    # tree-sitter rows are 0-based; graph line numbers are 1-based.
    first_row = getattr(node, "start_point", (0, 0))[0]
    last_row = getattr(node, "end_point", (0, 0))[0]

    is_endpoint = any(
        self._has_decorator(node, source, decorator)
        for decorator in ("route", "get", "post")
    )

    if parent_qualified:
        # Nested definition: METHOD wins over ENDPOINT.
        kind = NodeKind.METHOD
        qualified = f"{parent_qualified}.{name}"
    else:
        kind = NodeKind.ENDPOINT if is_endpoint else NodeKind.FUNCTION
        qualified = f"{self._qualified_module_name(file_path)}.{name}"

    definition_text = self._node_text(node, source)
    if definition_text:
        is_async = definition_text.strip().startswith("async ")
    else:
        is_async = False

    return GraphNode(
        node_id=GraphNode.make_id(file_path, kind, qualified),
        kind=kind,
        name=name,
        qualified_name=qualified,
        file_path=file_path,
        line_range=LineRange(start=first_row + 1, end=last_row + 1),
        language=Language.PYTHON,
        docstring=self._extract_docstring(node, source),
        metadata={
            "is_async": is_async,
            "is_staticmethod": self._has_decorator(node, source, "staticmethod"),
            "is_classmethod": self._has_decorator(node, source, "classmethod"),
            "is_property": self._has_decorator(node, source, "property"),
        },
    )
|
|
711
|
+
|
|
712
|
+
def _parse_python_assignment(
    self,
    node: object,
    source: str,
    file_path: Path,
    result: ParseResult,
) -> None:
    """Handle a module-level assignment; currently only ``__all__`` matters.

    When the statement text mentions ``__all__``, every quoted identifier in
    it (e.g. ``__all__ = ["Foo", "Bar"]``) produces a RE_EXPORTS edge from
    the module to a FUNCTION-kind node ID for that symbol in the same file.
    All other statements are ignored.
    """
    statement = self._node_text(node, source)
    if not statement or "__all__" not in statement:
        return

    import re

    module_id = result.module_node_id
    for symbol in re.findall(r'["\'](\w+)["\']', statement):
        export_edge = GraphEdge(
            source_id=module_id,
            target_id=GraphNode.make_id(file_path, NodeKind.FUNCTION, symbol),
            kind=EdgeKind.RE_EXPORTS,
        )
        result.edges.append(export_edge)
|
|
739
|
+
|
|
740
|
+
# -------------------------------------------------------------------
|
|
741
|
+
# JavaScript/TypeScript extraction
|
|
742
|
+
# -------------------------------------------------------------------
|
|
743
|
+
|
|
744
|
+
async def _extract_javascript_family(
    self, source: str, file_path: Path, result: ParseResult
) -> None:
    """Extract nodes and edges from a JS/TS/TSX file.

    The file is parsed with the tree-sitter grammar mapped to its detected
    language (falling back to ``"javascript"``) and the tree is handed to
    ``_walk_js_tree``. If anything in that path raises, a warning is
    recorded and ``_extract_js_regex`` is run instead.

    JavaScript-ecosystem quirks to keep in mind when working on this path:
    - require() vs import: CommonJS and ESM can coexist in one codebase
    - Dynamic import(): ``import("./module")`` is async and is not a
      static import statement
    - Path aliases (tsconfig paths, webpack aliases) cannot be fully
      resolved, only extracted and flagged
    - Barrel files: an index.ts re-exporting from many files
    - Default vs named exports change how dependents reference a symbol
    - JSX: ``<Component />`` is an implicit dependency on the component
    - Type-only imports: ``import type { Foo }`` in TypeScript
    - Namespace imports: ``import * as utils from "./utils"``
    - Side-effect imports: ``import "./polyfill"`` has no named bindings
    """
    try:
        from tree_sitter_language_pack import get_parser

        language = detect_language(file_path)
        js_parser = get_parser(LANGUAGE_GRAMMAR_MAP.get(language, "javascript"))
        root = js_parser.parse(source.encode("utf-8")).root_node

        # A tree with errors is still walked; the result is only flagged.
        if root.has_error:
            result.had_syntax_errors = True
            result.warnings.append(f"Syntax errors in {file_path}")

        await self._walk_js_tree(root, source, file_path, result)
    except Exception as exc:  # ImportError is already an Exception subclass
        result.warnings.append(f"tree-sitter grammar not available for {file_path.suffix}: {exc}")
        await self._extract_js_regex(source, file_path, result)
|
|
778
|
+
|
|
779
|
+
async def _walk_js_tree(
|
|
780
|
+
self, root: object, source: str, file_path: Path, result: ParseResult
|
|
781
|
+
) -> None:
|
|
782
|
+
"""Walk a JS/TS AST and extract nodes and edges."""
|
|
783
|
+
module_id = result.module_node_id
|
|
784
|
+
|
|
785
|
+
for child in getattr(root, "children", []):
|
|
786
|
+
child_type = getattr(child, "type", "")
|
|
787
|
+
|
|
788
|
+
# --- ESM imports ---
|
|
789
|
+
if child_type == "import_statement":
|
|
790
|
+
edge = self._parse_js_import(child, source, file_path, module_id)
|
|
791
|
+
if edge:
|
|
792
|
+
result.edges.append(edge)
|
|
793
|
+
|
|
794
|
+
# --- Exports (function, class, variable) ---
|
|
795
|
+
elif child_type == "export_statement":
|
|
796
|
+
# Contains the actual declaration
|
|
797
|
+
declaration = getattr(child, "child_by_field_name", lambda _: None)("declaration")
|
|
798
|
+
if declaration:
|
|
799
|
+
decl_type = getattr(declaration, "type", "")
|
|
800
|
+
if decl_type in ("function_declaration", "generator_function_declaration"):
|
|
801
|
+
node = self._parse_js_function(declaration, source, file_path)
|
|
802
|
+
if node:
|
|
803
|
+
result.nodes.append(node)
|
|
804
|
+
elif decl_type == "class_declaration":
|
|
805
|
+
node = self._parse_js_class(declaration, source, file_path)
|
|
806
|
+
if node:
|
|
807
|
+
result.nodes.append(node)
|
|
808
|
+
|
|
809
|
+
# --- Top-level functions ---
|
|
810
|
+
elif child_type in ("function_declaration", "generator_function_declaration"):
|
|
811
|
+
node = self._parse_js_function(child, source, file_path)
|
|
812
|
+
if node:
|
|
813
|
+
result.nodes.append(node)
|
|
814
|
+
|
|
815
|
+
# --- Top-level classes ---
|
|
816
|
+
elif child_type == "class_declaration":
|
|
817
|
+
node = self._parse_js_class(child, source, file_path)
|
|
818
|
+
if node:
|
|
819
|
+
result.nodes.append(node)
|
|
820
|
+
|
|
821
|
+
# --- CommonJS require() ---
|
|
822
|
+
elif child_type == "lexical_declaration" or child_type == "variable_declaration":
|
|
823
|
+
text = self._node_text(child, source) or ""
|
|
824
|
+
if "require(" in text:
|
|
825
|
+
edge = self._parse_require(text, file_path, module_id)
|
|
826
|
+
if edge:
|
|
827
|
+
result.edges.append(edge)
|
|
828
|
+
|
|
829
|
+
def _parse_js_import(
    self, node: object, source: str, file_path: Path, module_id: str
) -> GraphEdge | None:
    """Build an IMPORTS edge from an ESM ``import`` statement node.

    Handled forms:
    - ``import ... from "x"``: the specifier after ``from`` is used
    - ``import "./side-effect"``: the specifier directly after ``import``
    - ``import type ...``: the edge is flagged ``is_type_only``

    Dynamic ``import("./lazy")`` is a different AST node type and is not
    handled here.

    Args:
        node: The ``import_statement`` syntax node.
        source: Full source text of the file.
        file_path: Path of the importing file, used to resolve the specifier.
        module_id: Node id of the importing module (edge source).

    Returns:
        The import edge, or ``None`` when no quoted specifier is found.
    """
    import re

    text = self._node_text(node, source) or ""

    # A plain substring test ('"import type" in text') also fires on
    # `import typeDefs from "./schema"` and on a default import bound to the
    # name `type` (`import type from "./type"`). Require `type` to be a whole
    # word that is followed by an actual binding rather than by `from "..."`.
    is_type_only = (
        re.match(r"""\s*import\s+type\b\s*(?!from\s*['"])[\w$*{]""", text) is not None
    )

    # Module specifier: `... from "x"` first, then the side-effect form `import "x"`.
    match = re.search(r'''from\s+['"]([^'"]+)['"]''', text)
    if not match:
        match = re.search(r'''import\s+['"]([^'"]+)['"]''', text)
    if not match:
        return None

    module_path = match.group(1)
    target_path = self._resolve_js_module(module_path, file_path)
    target_id = GraphNode.make_id(target_path, NodeKind.MODULE, target_path.stem)

    return GraphEdge(
        source_id=module_id,
        target_id=target_id,
        kind=EdgeKind.IMPORTS,
        is_type_only=is_type_only,
        metadata={"raw_import": text.strip(), "module_specifier": module_path},
    )
|
|
863
|
+
|
|
864
|
+
def _parse_require(
    self, text: str, file_path: Path, module_id: str
) -> GraphEdge | None:
    """Build an edge from the first CommonJS ``require()`` call in ``text``.

    A string-literal argument is resolved to a path and yields an IMPORTS
    edge. A bare identifier argument (``require(variable)``) cannot be
    resolved, so it yields a DYNAMIC_IMPORT edge with confidence 0.3 and the
    variable name in the metadata. Anything else yields ``None``.
    """
    import re

    literal_call = re.search(r'''require\(['"]([^'"]+)['"]\)''', text)
    if literal_call:
        module_path = literal_call.group(1)
        resolved = self._resolve_js_module(module_path, file_path)
        return GraphEdge(
            source_id=module_id,
            target_id=GraphNode.make_id(resolved, NodeKind.MODULE, resolved.stem),
            kind=EdgeKind.IMPORTS,
            metadata={"raw_import": text.strip(), "style": "commonjs"},
        )

    # No literal specifier: see whether the argument is a plain variable.
    variable_call = re.search(r"require\((\w+)\)", text)
    if variable_call is None:
        return None

    variable = variable_call.group(1)
    return GraphEdge(
        source_id=module_id,
        target_id="unresolved:" + variable,
        kind=EdgeKind.DYNAMIC_IMPORT,
        confidence=0.3,
        metadata={"dynamic_variable": variable},
    )
|
|
898
|
+
|
|
899
|
+
def _resolve_js_module(self, specifier: str, from_file: Path) -> Path:
|
|
900
|
+
"""Resolve a JS module specifier to a file path.
|
|
901
|
+
|
|
902
|
+
Edge cases:
|
|
903
|
+
- Relative: "./foo" → look for foo.ts, foo.tsx, foo.js, foo/index.ts, etc.
|
|
904
|
+
- Bare: "react" → node_modules/react (external, tracked as is_external)
|
|
905
|
+
- Alias: "@/utils" → depends on tsconfig/webpack (flag, can't fully resolve)
|
|
906
|
+
- Missing extension: JS allows importing without extension
|
|
907
|
+
"""
|
|
908
|
+
if specifier.startswith("."):
|
|
909
|
+
# Relative import
|
|
910
|
+
base = from_file.parent
|
|
911
|
+
candidate = base / specifier
|
|
912
|
+
# Try common extensions
|
|
913
|
+
for ext in (".ts", ".tsx", ".js", ".jsx", "/index.ts", "/index.tsx", "/index.js"):
|
|
914
|
+
full = candidate.parent / (candidate.name + ext)
|
|
915
|
+
if full.exists():
|
|
916
|
+
return full
|
|
917
|
+
return candidate.with_suffix(".ts") # Default assumption
|
|
918
|
+
else:
|
|
919
|
+
# Bare specifier (package import) or alias
|
|
920
|
+
return Path(f"node_modules/{specifier}/index.js")
|
|
921
|
+
|
|
922
|
+
def _parse_js_function(
    self, node: object, source: str, file_path: Path
) -> GraphNode | None:
    """Parse a JS/TS function declaration into a FUNCTION graph node.

    Args:
        node: A ``function_declaration`` or
            ``generator_function_declaration`` syntax node.
        source: Full source text of the file.
        file_path: Path of the file being parsed.

    Returns:
        The graph node, or ``None`` when the declaration has no name.
        Metadata carries ``is_async``, ``is_generator`` and ``is_exported``.
    """
    name_node = getattr(node, "child_by_field_name", lambda _: None)("name")
    name = self._node_text(name_node, source) if name_node else None
    if not name:
        return None

    start_line = getattr(node, "start_point", (0, 0))[0] + 1
    end_line = getattr(node, "end_point", (0, 0))[0] + 1
    text = self._node_text(node, source) or ""
    header = text[:50]

    # This parser is called both for `export function f()` and for plain
    # top-level functions, so the flag must come from the tree rather than
    # being hard-coded: exported declarations sit under an export_statement.
    parent = getattr(node, "parent", None)
    is_exported = getattr(parent, "type", "") == "export_statement"

    # The node type is authoritative (`function *gen()` has no "function*"
    # substring); the text check is kept as a fallback.
    is_generator = (
        getattr(node, "type", "") == "generator_function_declaration"
        or "function*" in header
    )

    return GraphNode(
        node_id=GraphNode.make_id(file_path, NodeKind.FUNCTION, name),
        kind=NodeKind.FUNCTION,
        name=name,
        qualified_name=f"{self._qualified_module_name(file_path)}.{name}",
        file_path=file_path,
        line_range=LineRange(start=start_line, end=end_line),
        language=detect_language(file_path),
        metadata={
            "is_async": "async " in header,
            "is_generator": is_generator,
            "is_exported": is_exported,
        },
    )
|
|
949
|
+
|
|
950
|
+
def _parse_js_class(
    self, node: object, source: str, file_path: Path
) -> GraphNode | None:
    """Turn a JS/TS class declaration into a CLASS graph node.

    Returns ``None`` when the declaration has no ``name`` field or the name
    text is empty. Line numbers in the node's range are 1-based.
    """
    field_lookup = getattr(node, "child_by_field_name", lambda _: None)
    identifier = field_lookup("name")
    class_name = self._node_text(identifier, source) if identifier else None
    if not class_name:
        return None

    # start_point/end_point rows are 0-based; the graph uses 1-based lines.
    first_row = getattr(node, "start_point", (0, 0))[0]
    last_row = getattr(node, "end_point", (0, 0))[0]

    return GraphNode(
        node_id=GraphNode.make_id(file_path, NodeKind.CLASS, class_name),
        kind=NodeKind.CLASS,
        name=class_name,
        qualified_name=f"{self._qualified_module_name(file_path)}.{class_name}",
        file_path=file_path,
        line_range=LineRange(start=first_row + 1, end=last_row + 1),
        language=detect_language(file_path),
    )
|
|
971
|
+
|
|
972
|
+
# -------------------------------------------------------------------
|
|
973
|
+
# Regex fallbacks (when tree-sitter grammar not available)
|
|
974
|
+
# -------------------------------------------------------------------
|
|
975
|
+
|
|
976
|
+
async def _extract_python_regex(
    self, source: str, file_path: Path, result: ParseResult
) -> None:
    """Regex-based Python extraction, used as a fallback for tree-sitter.

    This is intentionally limited. The source is scanned line by line and
    three things are recognised: ``import``/``from`` lines (IMPORTS edges,
    confidence 0.9), ``class`` definitions and ``def``/``async def``
    definitions (nodes spanning only their own line). Nesting, decorators
    and complex constructs are not understood, so methods are recorded as
    plain functions.

    Args:
        source: Full source text of the file.
        file_path: Path of the file being parsed.
        result: Parse result that receives the nodes and edges.
    """
    import re

    class_re = re.compile(r"class\s+(\w+)")
    # Group 1 captures the `async` prefix so asyncness is read from the
    # keyword itself, not from a substring search that would also match
    # names such as `def run_async(...)`.
    def_re = re.compile(r"(async\s+)?def\s+(\w+)")

    module_id = result.module_node_id

    for line_no, line in enumerate(source.split("\n"), 1):
        stripped = line.strip()

        # Imports
        if stripped.startswith(("import ", "from ")):
            module_name = self._extract_import_module(stripped, file_path)
            if module_name:
                target_path = self._resolve_module_path(module_name)
                target_id = GraphNode.make_id(target_path, NodeKind.MODULE, module_name.split(".")[-1])
                result.edges.append(GraphEdge(
                    source_id=module_id,
                    target_id=target_id,
                    kind=EdgeKind.IMPORTS,
                    confidence=0.9,  # Lower confidence for regex extraction
                    metadata={"extraction": "regex"},
                ))
            continue  # an import line cannot also be a class/def line

        # Class definitions
        class_match = class_re.match(stripped)
        if class_match:
            name = class_match.group(1)
            result.nodes.append(GraphNode(
                node_id=GraphNode.make_id(file_path, NodeKind.CLASS, name),
                kind=NodeKind.CLASS,
                name=name,
                qualified_name=f"{self._qualified_module_name(file_path)}.{name}",
                file_path=file_path,
                line_range=LineRange(start=line_no, end=line_no),
                language=Language.PYTHON,
                metadata={"extraction": "regex"},
            ))
            continue

        # Function definitions
        def_match = def_re.match(stripped)
        if def_match:
            name = def_match.group(2)
            result.nodes.append(GraphNode(
                node_id=GraphNode.make_id(file_path, NodeKind.FUNCTION, name),
                kind=NodeKind.FUNCTION,
                name=name,
                qualified_name=f"{self._qualified_module_name(file_path)}.{name}",
                file_path=file_path,
                line_range=LineRange(start=line_no, end=line_no),
                language=Language.PYTHON,
                metadata={"is_async": def_match.group(1) is not None, "extraction": "regex"},
            ))
|
|
1035
|
+
|
|
1036
|
+
async def _extract_js_regex(
    self, source: str, file_path: Path, result: ParseResult
) -> None:
    """Line-by-line regex fallback for JS/TS extraction.

    Each line can contribute an IMPORTS edge (any ``from "<specifier>"``
    on the line, confidence 0.9) and a FUNCTION node (a line starting with
    an optionally exported / async ``function <name>``). Nodes span only
    the line they were found on.
    """
    import re

    import_pattern = re.compile(r'''from\s+['"]([^'"]+)['"]''')
    function_pattern = re.compile(r"(?:export\s+)?(?:async\s+)?function\s+(\w+)")
    owner_id = result.module_node_id

    for line_no, raw_line in enumerate(source.split("\n"), 1):
        code = raw_line.strip()

        # ESM imports
        import_hit = import_pattern.search(code)
        if import_hit is not None:
            resolved = self._resolve_js_module(import_hit.group(1), file_path)
            result.edges.append(GraphEdge(
                source_id=owner_id,
                target_id=GraphNode.make_id(resolved, NodeKind.MODULE, resolved.stem),
                kind=EdgeKind.IMPORTS,
                confidence=0.9,
                metadata={"extraction": "regex"},
            ))

        # Function declarations
        function_hit = function_pattern.match(code)
        if function_hit is None:
            continue
        function_name = function_hit.group(1)
        result.nodes.append(GraphNode(
            node_id=GraphNode.make_id(file_path, NodeKind.FUNCTION, function_name),
            kind=NodeKind.FUNCTION,
            name=function_name,
            qualified_name=f"{self._qualified_module_name(file_path)}.{function_name}",
            file_path=file_path,
            line_range=LineRange(start=line_no, end=line_no),
            language=detect_language(file_path),
            metadata={"extraction": "regex"},
        ))
|
|
1074
|
+
|
|
1075
|
+
# -------------------------------------------------------------------
|
|
1076
|
+
# Generic language extraction (all 19 languages via tree-sitter-language-pack)
|
|
1077
|
+
# -------------------------------------------------------------------
|
|
1078
|
+
|
|
1079
|
+
async def _extract_generic(
    self, source: str, file_path: Path, language: Language, result: ParseResult
) -> None:
    """Extract basic structure from any tree-sitter supported language.

    Intended for Go, Rust, Java, C, C++, C#, PHP, Swift, Kotlin, Scala,
    Lua, Dart, Elixir, Haskell and Ruby.

    Only the direct children of the root node are inspected:
    - function/method definitions become FUNCTION nodes
    - class/struct/interface/trait/enum definitions become CLASS nodes, or
      INTERFACE nodes when the node type contains "interface" or "trait"
    - import/include/use statements become IMPORTS edges (confidence 0.7)

    This is intentionally broad: it keys on node type names that are
    common across grammars and skips whatever it does not recognise.
    Definitions without a ``name`` field are skipped as well.

    Args:
        source: Full source text of the file.
        file_path: Path of the file being parsed.
        language: Detected language; selects the grammar.
        result: Parse result that receives nodes, edges and warnings.
    """
    grammar_name = LANGUAGE_GRAMMAR_MAP.get(language)
    if not grammar_name:
        return

    try:
        from tree_sitter_language_pack import get_parser
        parser = get_parser(grammar_name)
    except Exception as exc:
        result.warnings.append(f"No grammar for {language.value}: {exc}")
        return

    import zlib

    tree = parser.parse(source.encode("utf-8"))
    if tree.root_node.has_error:
        result.had_syntax_errors = True

    module_id = result.module_node_id

    # Node types that represent function/method definitions across languages
    function_types = {
        "function_definition", "function_declaration", "method_definition",
        "method_declaration", "function_item", "fun_spec",
    }
    # Node types that represent class/struct/interface definitions
    class_types = {
        "class_definition", "class_declaration", "struct_item",
        "interface_declaration", "trait_item", "type_declaration",
        "struct_declaration", "enum_declaration", "enum_item",
        "object_declaration",
    }
    # Node types that represent imports
    import_types = {
        "import_declaration", "import_statement", "use_declaration",
        "include_statement", "require_expression", "using_directive",
        "import_from_statement", "preproc_include",
    }

    for child in tree.root_node.children:
        child_type = getattr(child, "type", "")

        if child_type in import_types:
            # Record a basic import edge
            text = self._node_text(child, source)
            if text:
                # The builtin hash() of a str is salted per process, which
                # made this id differ from run to run. A CRC of the import
                # text is stable, and keeping all 32 bits (instead of
                # `% 10000`) makes collisions within a file unlikely.
                digest = zlib.crc32(text.encode("utf-8", errors="replace"))
                result.edges.append(GraphEdge(
                    source_id=module_id,
                    target_id=GraphNode.make_id(file_path, NodeKind.MODULE, f"_import_{digest:08x}"),
                    kind=EdgeKind.IMPORTS,
                    confidence=0.7,
                    metadata={"raw_import": text.strip()[:200], "language": language.value},
                ))
            continue

        if child_type in function_types:
            kind = NodeKind.FUNCTION
        elif child_type in class_types:
            kind = NodeKind.INTERFACE if "interface" in child_type or "trait" in child_type else NodeKind.CLASS
        else:
            continue

        # Functions and classes share the same name/range extraction.
        name_node = getattr(child, "child_by_field_name", lambda _: None)("name")
        name = self._node_text(name_node, source) if name_node else None
        if not name:
            continue

        start_line = getattr(child, "start_point", (0, 0))[0] + 1
        end_line = getattr(child, "end_point", (0, 0))[0] + 1
        result.nodes.append(GraphNode(
            node_id=GraphNode.make_id(file_path, kind, name),
            kind=kind,
            name=name,
            qualified_name=f"{self._qualified_module_name(file_path)}.{name}",
            file_path=file_path,
            line_range=LineRange(start=start_line, end=end_line),
            language=language,
        ))
|
|
1179
|
+
|
|
1180
|
+
# -------------------------------------------------------------------
|
|
1181
|
+
# Utility methods
|
|
1182
|
+
# -------------------------------------------------------------------
|
|
1183
|
+
|
|
1184
|
+
def _extract_superclasses(self, class_node: object, source: str) -> list[str]:
|
|
1185
|
+
"""Extract superclass names from a class definition node."""
|
|
1186
|
+
superclasses_node = getattr(class_node, "child_by_field_name", lambda _: None)("superclasses")
|
|
1187
|
+
if not superclasses_node:
|
|
1188
|
+
return []
|
|
1189
|
+
|
|
1190
|
+
text = self._node_text(superclasses_node, source)
|
|
1191
|
+
if not text:
|
|
1192
|
+
return []
|
|
1193
|
+
|
|
1194
|
+
# Remove parens and split
|
|
1195
|
+
import re
|
|
1196
|
+
text = text.strip("()")
|
|
1197
|
+
return [s.strip() for s in re.split(r",\s*", text) if s.strip()]
|
|
1198
|
+
|
|
1199
|
+
def _extract_docstring(self, node: object, source: str) -> str | None:
|
|
1200
|
+
"""Extract docstring from a class or function body."""
|
|
1201
|
+
body = getattr(node, "child_by_field_name", lambda _: None)("body")
|
|
1202
|
+
if not body:
|
|
1203
|
+
return None
|
|
1204
|
+
|
|
1205
|
+
children = getattr(body, "children", [])
|
|
1206
|
+
if not children:
|
|
1207
|
+
return None
|
|
1208
|
+
|
|
1209
|
+
first = children[0]
|
|
1210
|
+
if getattr(first, "type", "") == "expression_statement":
|
|
1211
|
+
inner_children = getattr(first, "children", [])
|
|
1212
|
+
if inner_children and getattr(inner_children[0], "type", "") == "string":
|
|
1213
|
+
text = self._node_text(inner_children[0], source)
|
|
1214
|
+
if text:
|
|
1215
|
+
return text.strip("'\"").strip()
|
|
1216
|
+
return None
|
|
1217
|
+
|
|
1218
|
+
def _has_decorator(self, node: object, source: str, decorator_name: str) -> bool:
|
|
1219
|
+
"""Check if a node has a specific decorator."""
|
|
1220
|
+
# Look for decorator nodes before this node
|
|
1221
|
+
parent = getattr(node, "parent", None)
|
|
1222
|
+
if not parent:
|
|
1223
|
+
return False
|
|
1224
|
+
|
|
1225
|
+
for child in getattr(parent, "children", []):
|
|
1226
|
+
if getattr(child, "type", "") == "decorator":
|
|
1227
|
+
text = self._node_text(child, source) or ""
|
|
1228
|
+
if decorator_name in text:
|
|
1229
|
+
return True
|
|
1230
|
+
if child is node:
|
|
1231
|
+
break
|
|
1232
|
+
return False
|